In [2]:
import numpy as np
np.random.seed(42)  # fixed seed (42) for NumPy's global random number generator

In [3]:
import pandas as pd

# Read the salaries CSV (path is relative to the notebook's working directory).
csv_path = 'Data Science Jobs Salaries.csv'
df = pd.read_csv(csv_path)

# Preview the first rows to sanity-check the columns that were loaded.
preview = df.head()
print(preview)

FileNotFoundError: [Errno 2] No such file or directory: 'Data Science Jobs Salaries.csv'

In [None]:


# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Task 1: Convert 'work_year' to a consistent integer format by removing any
# 'e' characters from the raw strings before casting.
# NOTE(review): assumes the column is read as strings (e.g. '2021e'); confirm against the CSV.
df['work_year'] = df['work_year'].str.replace('e', '', regex=False).astype(int)

# Task 2: Convert 'salary_currency' to uppercase for consistency
df['salary_currency'] = df['salary_currency'].str.upper()

# Task 3: Convert 'experience_level' to an ordered categorical (EN < MI < SE < EX).
# Any value outside these four codes becomes NaN.
df['experience_level'] = pd.Categorical(df['experience_level'], categories=['EN', 'MI', 'SE', 'EX'], ordered=True)

# Task 4: Rescale 'remote_ratio' by dividing by 100, i.e. to a fraction rather than a percentage.
# NOTE(review): presumably the CSV stores 0-100 values; confirm.
df['remote_ratio'] = df['remote_ratio'] / 100.0

# Task 5: Handle missing values by filling 'salary' with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not update `df`
# when pandas Copy-on-Write is active.
df['salary'] = df['salary'].fillna(df['salary'].mean())

# Task 6: Drop unnecessary columns
df = df.drop(columns=['salary_in_usd'])

# Display the preprocessed dataset
print("\nPreprocessed Dataset:")
print(df.head())


## **Task 1: Identify Big Data Job Families**

In [None]:
# Collect the distinct job titles present in the data and show them under a heading.
unique_job_titles = df['job_title'].unique()
print("Unique Job Titles:", unique_job_titles, sep="\n")

# Task 1: Identify Big Data Job Families
# Assume keywords related to Big Data job families (customize this based on your data).
# Order matters: the first keyword found in a title wins.
big_data_keywords = [
    'Data Scientist',
    'Machine Learning',
    'Data Analyst',
    'Data Engineer',
    'Data Science Engineer',
    'Data Analytics',
    'ML Engineer',
    'AI Scientist',
    'Big Data',
    'Data Architect',
]


def assign_job_family(title):
    """Map a job title to the first matching Big Data keyword.

    The match is a case-insensitive substring test, tried in the order the
    keywords appear in ``big_data_keywords``.

    Parameters
    ----------
    title : str or missing value
        Raw job title. Non-string values (e.g. ``None`` or the float NaN that
        pandas uses for missing cells) are treated as having no match.

    Returns
    -------
    str
        The matching keyword exactly as spelled in ``big_data_keywords``,
        or ``'Other'`` when nothing matches.
    """
    # Missing cells are not strings and have no .lower(); classify them as 'Other'.
    if not isinstance(title, str):
        return 'Other'

    # Lower-case the title once instead of on every loop iteration.
    lowered_title = title.lower()
    for keyword in big_data_keywords:
        if keyword.lower() in lowered_title:
            return keyword
    return 'Other'

# Label every row with the job family derived from its title.
df['job_family'] = df['job_title'].map(assign_job_family)

# Preview the title -> family mapping for the first rows.
print("\nUpdated Dataset with Job Families:")
family_preview = df[['job_title', 'job_family']].head()
print(family_preview)


# **Task 2: Extract Big Data Skills**

In [None]:
import nltk

# Download the 'stopwords' corpus only when it is not already installed, so
# re-running the notebook does not trigger a download when the data is present.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')


In [None]:
from nltk.corpus import stopwords

# Combine all job titles into a single lower-cased string.
# Missing titles are dropped first: a NaN entry would make ' '.join() raise TypeError.
all_job_titles = ' '.join(df['job_title'].dropna().str.lower())

# Split the string into words
words = all_job_titles.split()

# Remove stopwords (a set gives constant-time membership tests)
stop_words = set(stopwords.words('english'))
filtered_skills = [word for word in words if word not in stop_words]

# Extract unique words as potential skills
unique_skills = set(filtered_skills)

# Convert the skills to a DataFrame. The set is sorted because set iteration
# order for strings can change between interpreter sessions, which would
# shuffle the rows and make anything computed from their order non-reproducible.
skills_df = pd.DataFrame({'Big Data Skills': sorted(unique_skills)})

# Display the refined list of extracted skills in table form
print("\nRefined Extracted Big Data Skills:")
print(skills_df)


# **Task 3: Homogeneous Groups of Big Data Skills (Cluster Analysis or Association Analysis)**
You may use machine learning clustering techniques or association rules mining to identify groups of skills.
This step can be more complex and depends on the specific techniques you choose.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [None]:
# Use TF-IDF vectorization to convert skills into numerical features
vectorizer = TfidfVectorizer()
skills_tfidf = vectorizer.fit_transform(skills_df['Big Data Skills'])

# Apply KMeans clustering
num_clusters = 3  # You can adjust the number of clusters based on your preference
# n_init is set explicitly: its default differs between scikit-learn releases
# (and emits a FutureWarning in some), so pinning it keeps the run consistent.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
skills_df['Cluster'] = kmeans.fit_predict(skills_tfidf)

# Display the results
print("\nHomogeneous Groups of Big Data Skills (Cluster Analysis):")
print(skills_df.sort_values(by='Cluster'))

The clustering results group the skills into three clusters (0, 1, and 2). Each skill is assigned to exactly one cluster, and each cluster can be interpreted as a homogeneous group of Big Data skills.

# **Task 4: Characterize Job Families by Competence Level**
This assumes the 'experience_level' column can be used to characterize competence levels:

```python
competence_levels = df.groupby('job_title')['experience_level'].unique()
print("Competence Levels by Job Family:")
print(competence_levels)
```

In [None]:
# Group by job title and collect the unique competence levels seen for each title
competence_levels = df.groupby('job_title')['experience_level'].unique()

# Display the competence levels by job family.
# Missing levels (NaN) are skipped and each level is converted with str():
# str.join() raises TypeError on any non-string item.
print("\nCompetence Levels by Job Family:")
for job_title, levels in competence_levels.items():
    level_labels = [str(level) for level in levels if pd.notna(level)]
    print(f"{job_title}: {', '.join(level_labels)}")

*In this section, we characterized job families in the Big Data field based on the provided dataset. The analysis involved identifying homogeneous groups of Big Data skills through a clustering approach, revealing three distinct skill clusters. Subsequently, we delved into the competence levels associated with each job family using the 'experience_level' column. The results unveiled a comprehensive mapping of competence levels to specific job titles, offering valuable insights into the expertise required for various roles in the Big Data domain. This categorization facilitates a nuanced understanding of the skills and experience levels demanded by different job families, serving as a valuable resource for HR departments seeking to optimize recruitment strategies in the ever-evolving landscape of Big Data professions.*

## **Task 5: Data Analysis and Visualization**
You can use libraries like Matplotlib or Seaborn for visualization.
Analyze and visualize the distribution of competence levels, most valued skills, etc.

**1. Distribution of Competence Levels:**

Visualizing the distribution of competence levels across all job families can provide insights into the overall workforce composition.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
#

# Distribution of Competence Levels: one bar per experience level,
# ordered from the most to the least frequent level.
level_order = df['experience_level'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='experience_level', order=level_order, ax=ax)
ax.set_title('Distribution of Competence Levels')
ax.set_xlabel('Experience Level')
ax.set_ylabel('Count')
plt.show()

*The visualization is a bar plot showing the distribution of competence levels based on the 'experience_level' column.*

In [None]:
# Most Valued Skills
# The 'job_title' column is used as the source of skill words.
# Missing titles are dropped first: a NaN entry would make ' '.join() raise TypeError.
all_job_titles = ' '.join(df['job_title'].dropna().str.lower())
words = all_job_titles.split()

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension would re-evaluate it, and scan a list, for every single word.
stop_words = set(stopwords.words('english'))
filtered_skills = [word for word in words if word not in stop_words]
unique_skills = set(filtered_skills)

**2. Most Valued Skills Word Cloud:**

Creating a word cloud can visually represent the most frequently occurring skills across all job titles, providing a quick overview of the prominent skills.

In [None]:
# Plotting Word Cloud for Most Valued Skills
from collections import Counter

from wordcloud import WordCloud

# Size each word by how often it occurs across all job titles. Building the
# cloud from the de-duplicated skill set would give every word a count of one,
# so the picture could not show which skills are the most common.
skill_counts = Counter(filtered_skills)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(skill_counts)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Most Valued Skills')
plt.show()

*The visualization is a word cloud depicting the most valued skills extracted from the 'job_title' column.*

**3. Competence Levels by Job Family Heatmap:**

A heatmap displaying the association between job families and competence levels.

In [None]:
# Create a pivot table for the heatmap: rows are job titles, columns are
# experience levels, cells are record counts.
# observed=False is passed explicitly so that every declared experience-level
# category gets a column even when it has no rows; relying on the implicit
# default for categorical groupers is deprecated in recent pandas releases.
heatmap_data = (
    df.groupby(['job_title', 'experience_level'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Plotting Heatmap for Competence Levels by Job Family
plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data, cmap='viridis', annot=True, fmt='d')
plt.title('Competence Levels by Job Family')
plt.xlabel('Experience Level')
plt.ylabel('Job Family')
plt.show()


# **Task 6: Recommendations for Recruitment**
Based on your analysis, provide recommendations for recruitment strategies.

In [None]:
!pip install tabulate

In [None]:
from collections import Counter

from tabulate import tabulate

# Calculate the frequency of each skill in a single pass over the word list
# (calling list.count() once per unique skill rescans the whole list each time).
skill_frequencies = Counter(filtered_skills)

# Rank the skills based on frequency (1 is the highest rank).
# most_common() orders by count, and skills with equal counts keep the order
# in which they first appear in the data, so the ranking is repeatable.
ranked_skills = [skill for skill, _ in skill_frequencies.most_common()]
numbered_skills = [{'Skill': skill, 'Rank': rank} for rank, skill in enumerate(ranked_skills, start=1)]

# Display the list of skills with ranks in a visually attractive table
table = tabulate(numbered_skills, headers="keys", tablefmt="pretty")
print(table)

In [None]:
from tabulate import tabulate

# Each lower-cased job title is treated as one job family.
job_families = df['job_title'].str.lower()

# Number of records per job family, as a plain dict.
family_counts = job_families.value_counts()
job_family_frequencies = family_counts.to_dict()

# Order the families from most to least frequent (rank 1 = most frequent).
ranked_job_families = sorted(
    job_family_frequencies,
    key=lambda family: job_family_frequencies[family],
    reverse=True,
)
numbered_job_families = [
    {'Job Family': job_family, 'Rank': rank}
    for rank, job_family in enumerate(ranked_job_families, start=1)
]

# Render the ranking as a text table and print it.
table = tabulate(numbered_job_families, headers="keys", tablefmt="pretty")
print(table)

# **Based on your analysis, provide recommendations for recruitment strategies.**



***Focus on In-Demand Job Families:***

*Prioritize recruitment efforts for job families that are highly prevalent and in demand, such as "Data Scientist," "Data Engineer," and "Machine Learning Engineer."*

***Emphasize Key Skills in Recruitment:***

*Identify and emphasize the most valued skills in the Big Data domain, as highlighted by the word cloud and skill ranking analysis.*
*Tailor recruitment messages to highlight the importance of these key skills for the specific job families.*

***Consider Remote Work Opportunities:***

*Acknowledge the trend of remote work, as indicated by the "remote_ratio" column in the dataset.*
*Consider offering remote work opportunities or flexibility to attract a broader pool of candidates.*

***Target Mid-Level and Senior-Level Candidates:***

*Given the distribution of experience levels in the dataset, target mid-level (MI) and senior-level (SE) candidates for key roles.*
*Develop recruitment strategies that align with the experience level preferences of each job family.*

***Invest in Skill Development Programs:***

*Recognize the importance of specific skills and competencies for various job families.*
*Develop tailored skill acquisition programs or training sessions to enhance these skills among potential candidates, as highlighted by the word cloud and skill ranking.*

***Utilize Online Platforms for Job Posts:***

*Leverage online platforms and job boards for posting job opportunities to reach a wider audience.*
*Craft compelling job descriptions that clearly outline the skills and qualifications required for each role.*

***Collaborate with Educational Institutions:***

*Establish partnerships with educational institutions to tap into emerging talent.*
*Collaborate with universities and training programs to identify and attract candidates with relevant skills.*

***Stay Competitive with Compensation:***

*Stay informed about industry salary trends and offer competitive compensation packages.*
*Highlight additional benefits, such as professional development opportunities and a positive work culture.*

***Promote Diversity and Inclusion:***

*Foster a diverse and inclusive workplace by actively seeking candidates from diverse backgrounds.*
*Highlight the company's commitment to diversity in recruitment materials.*

***Continuous Monitoring and Adjustment:***

*Regularly monitor recruitment outcomes and adjust strategies based on the evolving needs of the organization and the job market.*


*These recommendations aim to enhance the effectiveness of recruitment strategies, aligning them with the specific characteristics of the Big Data job market and the preferences of potential candidates.*

In [None]:


# Import libraries
import plotly.express as px


# Derive the skill words of each row by lower-casing and splitting its job title
df['extracted_skills'] = df['job_title'].str.lower().str.split()

# Flatten the list of lists into a single list of skills.
# Rows with a missing job title hold NaN instead of a list and are skipped;
# iterating over them would raise TypeError.
all_skills = [skill for skills_list in df['extracted_skills'].dropna() for skill in skills_list]

# Create a DataFrame with skills and their frequencies
skill_frequencies_df = pd.DataFrame(all_skills, columns=['skill']).value_counts().reset_index(name='frequency')

# Job Families Overview: share of records per job title
fig1 = px.pie(df, names='job_title', title='Job Families Overview')

# Skill Clusters and Rankings: frequency of each skill word
fig2 = px.bar(skill_frequencies_df, x='skill', y='frequency', title='Skill Clusters and Rankings')

# Display the plots
fig1.show()
fig2.show()
