In [None]:
# Install the third-party packages imported by this notebook into the kernel's
# environment (%pip targets the running kernel, unlike !pip).
# NOTE(review): no versions are pinned, and numpy is imported but not installed
# explicitly here - presumably it arrives as a pandas dependency; confirm.
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install scikit-learn

In [197]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

In [198]:
# Load all datasets: postings, posting->skill links, company details and the
# skill lookup table, read in that order.
job_postings, job_skills, companies, skills = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/job_postings.csv',
        'data/job_details/job_skills.csv',
        'data/company_details/companies.csv',
        'data/maps/skills.csv',
    )
)

In [199]:
# Count the distinct values of one key column per dataset.
# Note: `skills` is counted on 'skill_name', while the merge with job_skills
# further down joins on 'skill_abr'.
unique_values = {
    label: frame[key_column].nunique()
    for label, frame, key_column in (
        ('job_postings', job_postings, 'job_id'),
        ('job_skills', job_skills, 'job_id'),
        ('companies', companies, 'company_id'),
        ('skills', skills, 'skill_name'),
    )
}

unique_values

{'job_postings': 33246, 'job_skills': 32422, 'companies': 11361, 'skills': 35}

In [200]:
# Resolve every skill abbreviation in job_skills to its full skill name, then
# discard the abbreviation column that was only needed as the join key.
merged_skill = (
    job_skills
    .merge(skills, on='skill_abr', how='left')
    .drop(columns='skill_abr')
)
print(merged_skill)

           job_id              skill_name
0      3690843087     Accounting/Auditing
1      3690843087                 Finance
2      3691763971              Management
3      3691763971           Manufacturing
4      3691775263              Management
...           ...                     ...
56586  3757780487    Health Care Provider
56587  3757934256                  Design
56588  3757934256            Art/Creative
56589  3757934256  Information Technology
56590  3757498232          Administrative

[56591 rows x 2 columns]


In [201]:
# merged_skill has 56591 rows but only 32422 distinct job ids, so a posting
# listed there carries one or more skills.

# Collapse to one row per job_id, joining that posting's skill names into a
# single comma-separated string.
grouped_skills = (
    merged_skill
    .groupby('job_id')['skill_name']
    .agg(', '.join)
    .reset_index()
)

# Show the one-row-per-posting result
print(grouped_skills)


           job_id                                    skill_name
0         3958427  Design, Art/Creative, Information Technology
1        85008768                   Sales, Business Development
2       102339515                   Business Development, Sales
3       108965123                                Administrative
4       133114754                   Sales, Business Development
...           ...                                           ...
32417  3757937095                        Information Technology
32418  3757938018                     Management, Manufacturing
32419  3757938019                                   Engineering
32420  3757940025                     Management, Manufacturing
32421  3757940104                                         Other

[32422 rows x 2 columns]


In [202]:
# Number of job postings that have no company_id
missing_id = job_postings['company_id'].isna().sum()
missing_id

654

In [203]:
# Keep only the postings that reference a company (company_id present)
job_postings = job_postings.dropna(subset=['company_id'])
job_postings.shape

(32592, 28)

In [204]:
# Attach the concatenated skill string to every posting. The left join keeps
# postings that have no row in grouped_skills; their skill_name stays NaN.
merged_jobs = job_postings.merge(grouped_skills, on='job_id', how='left')

# Posting columns that are not carried forward
columns_to_drop = [
    'description', 'max_salary', 'med_salary', 'min_salary',
    'pay_period', 'applies', 'formatted_work_type', 'remote_allowed',
    'job_posting_url', 'views', 'title', 'application_url', 'application_type',
    'expiry', 'skills_desc', 'posting_domain', 'sponsored', 'original_listed_time',
    'work_type', 'currency', 'compensation_type', 'scraped', 'closed_time', 'listed_time',
]
merged_jobs = merged_jobs.drop(columns=columns_to_drop)
merged_jobs.head()



Unnamed: 0,job_id,company_id,location,formatted_experience_level,skill_name
0,3757940104,553718.0,"Little River, SC",Entry level,Other
1,3757940025,2192142.0,"Beaver Dam, WI",,"Management, Manufacturing"
2,3757938019,474443.0,"Bessemer, AL",,Engineering
3,3757938018,18213359.0,"Aliso Viejo, CA",Entry level,"Management, Manufacturing"
4,3757937095,437225.0,United States,Mid-Senior level,Information Technology


In [205]:
# Bring in company details for each posting. The left join keeps postings
# whose company_id has no match in `companies`; their company columns are NaN.
merged_data = merged_jobs.merge(companies, on='company_id', how='left')

# Company columns that are not carried forward (only `name` is kept)
columns_to_drop = [
    'description', 'company_size', 'zip_code', 'url',
    'address', 'state', 'country', 'city',
]
merged_data = merged_data.drop(columns=columns_to_drop)
merged_data.head()

Unnamed: 0,job_id,company_id,location,formatted_experience_level,skill_name,name
0,3757940104,553718.0,"Little River, SC",Entry level,Other,HearingLife
1,3757940025,2192142.0,"Beaver Dam, WI",,"Management, Manufacturing","Metalcraft of Mayville, Inc."
2,3757938019,474443.0,"Bessemer, AL",,Engineering,"U.S. Tsubaki Power Transmission, LLC"
3,3757938018,18213359.0,"Aliso Viejo, CA",Entry level,"Management, Manufacturing",Episcopal Communities & Services
4,3757937095,437225.0,United States,Mid-Senior level,Information Technology,"iHerb, LLC"


In [206]:
# Reorder the columns for readability: identifiers first, then company name,
# location, skills and experience level. Selecting with a list also acts as a
# check that all six expected columns exist (a missing one raises KeyError).
column_order = ['job_id','company_id','name','location','skill_name','formatted_experience_level']
merged_data = merged_data[column_order]
merged_data.head()

Unnamed: 0,job_id,company_id,name,location,skill_name,formatted_experience_level
0,3757940104,553718.0,HearingLife,"Little River, SC",Other,Entry level
1,3757940025,2192142.0,"Metalcraft of Mayville, Inc.","Beaver Dam, WI","Management, Manufacturing",
2,3757938019,474443.0,"U.S. Tsubaki Power Transmission, LLC","Bessemer, AL",Engineering,
3,3757938018,18213359.0,Episcopal Communities & Services,"Aliso Viejo, CA","Management, Manufacturing",Entry level
4,3757937095,437225.0,"iHerb, LLC",United States,Information Technology,Mid-Senior level


In [207]:
# Per-column count of missing values in the merged dataset
missing_data = merged_data.isna().sum()

# Keep only the columns that actually have gaps, largest gap first
has_missing = missing_data.gt(0)
significant_missing_columns = missing_data.loc[has_missing].sort_values(ascending=False)
significant_missing_columns

formatted_experience_level    8622
skill_name                    1023
name                            51
dtype: int64

In [208]:
# Rows whose company name is missing after the company merge.
# `missing` counts those rows; `null_name_indices` holds their company_id
# values (indexed by row label), i.e. the ids that found no company name.
name_is_missing = merged_data['name'].isna()
missing = name_is_missing.sum()

null_name_indices = merged_data.loc[name_is_missing, 'company_id']
print("Number of missing ID's:", missing)
print(null_name_indices)

Number of missing ID's: 51
7305     76999667.0
15698     3641332.0
15706     3641332.0
15968     3641332.0
16405     3641332.0
20379      165957.0
20382     2313067.0
20384     3079381.0
20385      272676.0
20387    64734122.0
20389    27116461.0
20391     4316275.0
20392     1124883.0
20393    19115854.0
20394     1485063.0
25366        2815.0
25369     3514329.0
25380     9215353.0
25381    26489605.0
25382    79378951.0
25388    34771768.0
32290    91187899.0
32294     9516195.0
32296    89908682.0
32297    18872958.0
32298       35602.0
32300    18630069.0
32302        3657.0
32304    14615655.0
32305       88684.0
32306    10563070.0
32307      371180.0
32308     2902815.0
32309    90633414.0
32310    20338460.0
32311     2899710.0
32312      145145.0
32314       90844.0
32315     1434753.0
32316     7573454.0
32317     4781041.0
32318      718651.0
32319    86746333.0
32320    82296828.0
32321    82684341.0
32322    96649998.0
32323     2641066.0
32355     6049228.0
32525    1003

In [209]:
# 51 postings reference a company_id with no matching row in the companies
# dataset, so their company name is missing after the left merge.
# Dropping those rows as noisy data (in place, on the existing frame).
merged_data.dropna(subset=['name'], inplace=True)

In [210]:
# Re-run the missing-name check: after the drop, the count should be 0 and the
# listing of affected company_id values should be empty.
name_is_missing = merged_data['name'].isna()
missing = name_is_missing.sum()

null_name_indices = merged_data.loc[name_is_missing, 'company_id']
print("Number of missing ID's:", missing)
print(null_name_indices)

Number of missing ID's: 0
Series([], Name: company_id, dtype: float64)


In [211]:
# Handling missing values
#
# The fills go through DataFrame.fillna and the result is re-bound to
# `merged_data`. The previous form, `merged_data[col].fillna(..., inplace=True)`,
# mutates a temporary Series obtained by column selection: pandas 2.x flags it
# with a chained-assignment FutureWarning, and with Copy-on-Write enabled (the
# pandas 3 default) the frame itself is left unchanged.

# Categorical columns with missing values are filled with "Not Specified"
col_experience_fill_not_specified = ['formatted_experience_level']

# Postings without any skill entry are filled with "Other". This is also a
# skill name that already occurs in the data, so both end up in one category.
col_skill_fill_other = ['skill_name']

# One {column: fill value} mapping so every fill happens in a single pass
fill_values = {column: "Not Specified" for column in col_experience_fill_not_specified}
fill_values.update({column: "Other" for column in col_skill_fill_other})
merged_data = merged_data.fillna(value=fill_values)


# Check remaining missing values (expected: an empty Series)
remaining_missing = merged_data.isnull().sum()
remaining_missing_cols = remaining_missing[remaining_missing > 0].sort_values(ascending=False)
remaining_missing_cols

Series([], dtype: int64)

In [212]:
# List every distinct experience level (in order of first appearance) so the
# categories can be encoded in the next step.
experience_column = merged_data['formatted_experience_level']
distinct_values = experience_column.unique()

print("Distinct values in 'formatted_experience_level':")
for value in distinct_values:
    print(value)


Distinct values in 'formatted_experience_level':
Entry level
Not Specified
Mid-Senior level
Director
Associate
Executive
Internship


In [213]:
# Encode the experience level as an integer code with Series.map().
# Codes 1-6 follow the order of the list below; "Not Specified" takes the last
# code, 7, so it sorts above 'Executive' rather than below 'Internship'.
# NOTE: the column is overwritten, so re-running this cell maps the integer
# codes (which are not keys of the mapping) instead of the original labels.
experience_levels_in_order = [
    'Internship',
    'Entry level',
    'Associate',
    'Mid-Senior level',
    'Director',
    'Executive',
    'Not Specified',
]
experience_mapping = {
    level: code
    for code, level in enumerate(experience_levels_in_order, start=1)
}

merged_data['formatted_experience_level'] = (
    merged_data['formatted_experience_level'].map(experience_mapping)
)

In [214]:
# Pre-processing company name
def clean_company_names(name):
    """Normalize a company name string.

    Steps, in order:
      1. Hyphens are replaced with spaces.
      2. Every character that is not a word character, whitespace or a dot is
         removed (this covers apostrophes, commas, ampersands, ...).
      3. Runs of whitespace are collapsed to a single space and the result is
         stripped at both ends.

    Args:
        name: Company name as a string.

    Returns:
        The cleaned company name.
    """
    # Hyphens must be turned into spaces *before* the character filter runs;
    # otherwise the filter deletes them and the two halves are glued together
    # ("Coca-Cola" -> "CocaCola" instead of "Coca Cola").
    name = name.replace('-', ' ')

    # Remove special characters, except spaces and dots. Apostrophes are not
    # word characters, so they are removed here as well.
    name = re.sub(r'[^\w\s.]', '', name)

    # Remove extra spaces left behind by the two steps above
    return re.sub(r'\s+', ' ', name).strip()


# Clean every company name element-wise and write the result back to 'name'
merged_data['name'] = merged_data['name'].map(clean_company_names)

In [215]:
# Extract location from City, State format
def extract_city(location):
    """Extract a city (or area) label from a free-form location string.

    Rules, checked in order:
      * non-string input (None, NaN)          -> None
      * exactly one whitespace-separated word -> None
      * "City, State[, ...]"                  -> text before the first ", "
      * contains "Metropolitan Area"          -> text before " Metropolitan Area"
      * contains "Area" or "Greater"          -> the location unchanged
      * anything else (e.g. "United States")  -> None

    Args:
        location: Location text, or a missing value.

    Returns:
        The extracted city/area string, or None when no city can be derived.
    """
    # Missing values have no .split(); treat them as "no city" instead of
    # raising AttributeError inside Series.apply.
    if not isinstance(location, str):
        return None

    # A single word carries no "City, State" structure to extract from
    if len(location.split()) == 1:
        return None

    # "City, State" (and longer comma-separated forms): keep the first part
    parts = location.split(', ')
    if len(parts) > 1:
        return parts[0]

    # "<City> Metropolitan Area" -> "<City>"
    if "Metropolitan Area" in location:
        return location.split(" Metropolitan Area")[0]

    # Other area-style names ("... Area", "Greater ...") are kept as they are
    if "Area" in location or "Greater" in location:
        return location

    return None

# Replace 'location' with the extracted city and discard the rows for which
# extract_city returned None.
# NOTE: 'location' is overwritten, so a second run of this cell feeds the
# already-extracted values (e.g. "Little River", which has no ", ") back into
# extract_city.
merged_data = (
    merged_data
    .assign(location=merged_data['location'].apply(extract_city))
    .dropna(subset=['location'])
)

merged_data.shape

(30175, 6)

In [216]:
# Drop rows that are exact duplicates across all columns, keeping the first
# occurrence of each.
merged_data_cleaned = merged_data.drop_duplicates(keep='first')

# Row/column count after de-duplication
merged_data_cleaned.shape

(30175, 6)

In [217]:
# Give the columns their final names, then order the rows by job_id,
# highest id first.
column_renames = {
    'name': 'company',
    'skill_name': 'required_skill',
    'formatted_experience_level': 'experience_level',
}
merged_data_cleaned = (
    merged_data_cleaned
    .rename(columns=column_renames)
    .sort_values(by='job_id', ascending=False)
)


In [218]:
# Display the final table, sorted by job_id descending; pandas truncates the
# middle rows of a long frame in the rendered output.
merged_data_cleaned


Unnamed: 0,job_id,company_id,company,location,required_skill,experience_level
0,3757940104,553718.0,HearingLife,Little River,Other,2
1,3757940025,2192142.0,Metalcraft of Mayville Inc.,Beaver Dam,"Management, Manufacturing",7
2,3757938019,474443.0,U.S. Tsubaki Power Transmission LLC,Bessemer,Engineering,7
3,3757938018,18213359.0,Episcopal Communities Services,Aliso Viejo,"Management, Manufacturing",2
6,3757937004,10515052.0,Boyd Group Services Inc.,Daytona Beach,"Management, Manufacturing",2
...,...,...,...,...,...,...
32587,381055942,96654609.0,First Baptist Church Forney,Forney,Other,7
32588,133196985,1089558.0,Employvision Inc.,New York,"Accounting/Auditing, Finance",7
32589,133114754,77766802.0,CargoLogin.,Santa Clarita,"Sales, Business Development",7
32590,102339515,52132271.0,DryerVentz DuctVentz,Greater Boston,"Business Development, Sales",7


In [219]:
# Persist the pre-processed table next to the notebook; index=False keeps the
# row index out of the file.
merged_data_cleaned.to_csv(path_or_buf='data_preprocessed.csv', index=False)