### Data preprocessing of LinkedIn Job Postings Dataset
#### Setup and Library Installation

In [None]:
%pip install pandas

#### Library Imports



In [1]:
import os
import re

import pandas as pd

#### Loading Data

In [2]:
# Read the raw CSV tables used throughout the notebook.
# Job postings and the per-posting skill abbreviations:
job_postings = pd.read_csv('data/job_postings.csv')
job_skills = pd.read_csv('data/job_details/job_skills.csv')
# Company details and the lookup from skill abbreviation to full skill name:
companies = pd.read_csv('data/company_details/companies.csv')
mapping_skills = pd.read_csv('data/maps/skills.csv')

#### Dataset summary

In [3]:
# Number of distinct key values per table, as a quick size overview.
unique_values = dict(
    job_postings=job_postings['job_id'].nunique(),   # details of job postings
    job_skills=job_skills['job_id'].nunique(),       # job postings with the required skill abbreviated
    companies=companies['company_id'].nunique(),     # information on the company
    skills=mapping_skills['skill_name'].nunique(),   # maps skill abbreviations to full skill names
)
unique_values

{'job_postings': 33246, 'job_skills': 32422, 'companies': 11361, 'skills': 35}

#### Handling the integration of job_skills with mapping_skills
Merging job_skills with mapping_skills

In [4]:
# Attach the full skill name to every (job_id, skill_abr) row, then drop the
# abbreviation so only the readable skill_name remains.
merged_skills = (
    job_skills
    .merge(mapping_skills, on='skill_abr', how='left')
    .drop(columns='skill_abr')
)
merged_skills.head()

Unnamed: 0,job_id,skill_name
0,3690843087,Accounting/Auditing
1,3690843087,Finance
2,3691763971,Management
3,3691763971,Manufacturing
4,3691775263,Management


Aggregating Skills by Job ID

In [5]:
# A job can require several skills, so merged_skills holds multiple rows per 'job_id'.
# Collapse them into a single row per job with a comma-separated list of skill names.
#
# Rows without a skill name are dropped first: the preceding left merge leaves NaN for
# any abbreviation missing from the mapping, and str.join raises TypeError on NaN.
grouped_skills = (
    merged_skills
    .dropna(subset=['skill_name'])
    .groupby('job_id')['skill_name']
    .agg(', '.join)
    .reset_index()
)

# Display the resulting DataFrame with unique 'job_id' rows and concatenated skills
print(grouped_skills)

           job_id                                    skill_name
0         3958427  Design, Art/Creative, Information Technology
1        85008768                   Sales, Business Development
2       102339515                   Business Development, Sales
3       108965123                                Administrative
4       133114754                   Sales, Business Development
...           ...                                           ...
32417  3757937095                        Information Technology
32418  3757938018                     Management, Manufacturing
32419  3757938019                                   Engineering
32420  3757940025                     Management, Manufacturing
32421  3757940104                                         Other

[32422 rows x 2 columns]


#### Handling the integration of job_postings with grouped_skills
Checking for missing company IDs

In [6]:
# Count the postings that have no company_id.
company_id_is_missing = job_postings['company_id'].isna()
missing_id = company_id_is_missing.sum()
missing_id

654

Dropping rows with a missing company_id

In [7]:
# Keep only postings that reference a company. Reassigning (instead of
# dropna(inplace=True)) avoids mutating the loaded frame in place and keeps the
# step chainable.
job_postings = job_postings.dropna(subset=['company_id'])
job_postings.shape

(32592, 28)

In [8]:
# Re-count after the drop; the result should now be zero.
missing_id = job_postings['company_id'].isna().sum()
missing_id

0

Merging job_postings with grouped_skills

In [9]:
# Left join keeps every posting, including those without any listed skill.
merged_job_postings_skills = job_postings.merge(grouped_skills, on='job_id', how='left')

# pandas stores an integer column as float once it contains missing values,
# so cast company_id to the nullable Int64 dtype to get whole-number IDs back.
if 'company_id' in merged_job_postings_skills.columns:
    merged_job_postings_skills = merged_job_postings_skills.astype({'company_id': 'Int64'})

merged_job_postings_skills.head()

Unnamed: 0,job_id,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,...,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,scraped,skill_name
0,3757940104,553718,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,,5250.0,,MONTHLY,Full-time,"Little River, SC",...,Entry level,,1699090000000.0,careers-demant.icims.com,0,FULL_TIME,USD,BASE_SALARY,1699138101,Other
1,3757940025,2192142,Shipping & Receiving Associate 2nd shift (Beav...,Metalcraft of Mayville\nMetalcraft of Mayville...,,,,,Full-time,"Beaver Dam, WI",...,,,1699080000000.0,www.click2apply.net,0,FULL_TIME,,,1699085420,"Management, Manufacturing"
2,3757938019,474443,"Manager, Engineering",\nThe TSUBAKI name is synonymous with excellen...,,,,,Full-time,"Bessemer, AL",...,,Bachelor's Degree in Mechanical Engineering pr...,1699080000000.0,www.click2apply.net,0,FULL_TIME,,,1699085644,Engineering
3,3757938018,18213359,Cook,descriptionTitle\n\n Looking for a great oppor...,,22.27,,HOURLY,Full-time,"Aliso Viejo, CA",...,Entry level,,1699080000000.0,jobs.apploi.com,0,FULL_TIME,USD,BASE_SALARY,1699087461,"Management, Manufacturing"
4,3757937095,437225,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",275834.0,,205956.0,YEARLY,Full-time,United States,...,Mid-Senior level,,1699090000000.0,careers.iherb.com,0,FULL_TIME,USD,BASE_SALARY,1699085346,Information Technology


Reducing dimensionality of merged_job_postings_skills by dropping unnecessary columns

In [10]:
# Posting attributes that are not needed for the preprocessed dataset.
columns_to_drop = [
    'description',
    'max_salary', 'med_salary', 'min_salary', 'pay_period',
    'applies', 'formatted_work_type', 'remote_allowed',
    'job_posting_url', 'views', 'title',
    'application_url', 'application_type',
    'expiry', 'skills_desc', 'posting_domain', 'sponsored',
    'original_listed_time', 'work_type', 'currency',
    'compensation_type', 'scraped', 'closed_time', 'listed_time',
]
# `columns=` already targets the column axis, so no separate `axis` argument is needed.
merged_job_postings_skills = merged_job_postings_skills.drop(columns=columns_to_drop)
merged_job_postings_skills.head()


Unnamed: 0,job_id,company_id,location,formatted_experience_level,skill_name
0,3757940104,553718,"Little River, SC",Entry level,Other
1,3757940025,2192142,"Beaver Dam, WI",,"Management, Manufacturing"
2,3757938019,474443,"Bessemer, AL",,Engineering
3,3757938018,18213359,"Aliso Viejo, CA",Entry level,"Management, Manufacturing"
4,3757937095,437225,United States,Mid-Senior level,Information Technology


#### Handling the integration of merged_job_postings_skills with companies
Merging merged_job_postings_skills with companies


In [11]:
# Left join keeps postings whose company_id has no match in `companies`;
# their company columns are left empty.
merged_data = merged_job_postings_skills.merge(companies, on='company_id', how='left')
merged_data.head()

Unnamed: 0,job_id,company_id,location,formatted_experience_level,skill_name,name,description,company_size,state,country,city,zip_code,address,url
0,3757940104,553718,"Little River, SC",Entry level,Other,HearingLife,HearingLife is a national hearing care company...,5.0,New Jersey,US,Somerset,8873,580 Howard Avenue,https://www.linkedin.com/company/hearing-life
1,3757940025,2192142,"Beaver Dam, WI",,"Management, Manufacturing","Metalcraft of Mayville, Inc.","Headquartered in Mayville, Wisconsin, Metalcra...",4.0,WI,US,Mayville,53050,1000 Metalcraft Drive,https://www.linkedin.com/company/metalcraft-of...
2,3757938019,474443,"Bessemer, AL",,Engineering,"U.S. Tsubaki Power Transmission, LLC","U.S. Tsubaki Power Transmission, LLC is a subs...",4.0,Illinois,US,Wheeling,60090,301E Marquardt Drive,https://www.linkedin.com/company/u.s.-tsubaki-...
3,3757938018,18213359,"Aliso Viejo, CA",Entry level,"Management, Manufacturing",Episcopal Communities & Services,Episcopal Communities & Services (ECS) has bec...,4.0,California,US,Altadena,91001,2212 El Molino Ave,https://www.linkedin.com/company/episcopal-com...
4,3757937095,437225,United States,Mid-Senior level,Information Technology,"iHerb, LLC",iHerb is on a mission to make health and welln...,5.0,California,US,Irvine,92618,17400 Laguna Canyon Rd,https://www.linkedin.com/company/iherb


Reducing dimensionality by dropping unnecessary columns

In [12]:
# Company attributes that are not needed; only the company name is kept.
columns_to_drop = [
    'description', 'company_size', 'zip_code', 'url',
    'address', 'state', 'country', 'city',
]
# `columns=` already targets the column axis, so no separate `axis` argument is needed.
merged_data = merged_data.drop(columns=columns_to_drop)
merged_data.head()

Unnamed: 0,job_id,company_id,location,formatted_experience_level,skill_name,name
0,3757940104,553718,"Little River, SC",Entry level,Other,HearingLife
1,3757940025,2192142,"Beaver Dam, WI",,"Management, Manufacturing","Metalcraft of Mayville, Inc."
2,3757938019,474443,"Bessemer, AL",,Engineering,"U.S. Tsubaki Power Transmission, LLC"
3,3757938018,18213359,"Aliso Viejo, CA",Entry level,"Management, Manufacturing",Episcopal Communities & Services
4,3757937095,437225,United States,Mid-Senior level,Information Technology,"iHerb, LLC"


Displaying data in a better order

In [13]:
# Identifier columns first, then the descriptive attributes.
column_order = [
    'job_id',
    'company_id',
    'name',
    'location',
    'skill_name',
    'formatted_experience_level',
]
merged_data = merged_data[column_order]
merged_data.head()

Unnamed: 0,job_id,company_id,name,location,skill_name,formatted_experience_level
0,3757940104,553718,HearingLife,"Little River, SC",Other,Entry level
1,3757940025,2192142,"Metalcraft of Mayville, Inc.","Beaver Dam, WI","Management, Manufacturing",
2,3757938019,474443,"U.S. Tsubaki Power Transmission, LLC","Bessemer, AL",Engineering,
3,3757938018,18213359,Episcopal Communities & Services,"Aliso Viejo, CA","Management, Manufacturing",Entry level
4,3757937095,437225,"iHerb, LLC",United States,Information Technology,Mid-Senior level


#### Identify remaining missing data in the merged dataset
Display the columns that still contain missing values

In [14]:
# Missing-value count per column, restricted to columns that have at least
# one missing value and sorted from most to least affected.
missing_data = merged_data.isna().sum()
significant_missing_columns = (
    missing_data.loc[missing_data.gt(0)].sort_values(ascending=False)
)
significant_missing_columns

formatted_experience_level    8622
skill_name                    1023
name                            51
dtype: int64

Handling missing values of 'name' column

In [15]:
# Rows whose company name is empty after the join with `companies`.
name_is_missing = merged_data['name'].isna()
missing_names = name_is_missing.sum()

# Despite its name, this holds the company_id values of the affected rows
# (indexed by row label).
null_name_indices = merged_data.loc[name_is_missing, 'company_id']
print("Number of missing ID's:", missing_names)
print(null_name_indices)

Number of missing ID's: 51
7305     76999667
15698     3641332
15706     3641332
15968     3641332
16405     3641332
20379      165957
20382     2313067
20384     3079381
20385      272676
20387    64734122
20389    27116461
20391     4316275
20392     1124883
20393    19115854
20394     1485063
25366        2815
25369     3514329
25380     9215353
25381    26489605
25382    79378951
25388    34771768
32290    91187899
32294     9516195
32296    89908682
32297    18872958
32298       35602
32300    18630069
32302        3657
32304    14615655
32305       88684
32306    10563070
32307      371180
32308     2902815
32309    90633414
32310    20338460
32311     2899710
32312      145145
32314       90844
32315     1434753
32316     7573454
32317     4781041
32318      718651
32319    86746333
32320    82296828
32321    82684341
32322    96649998
32323     2641066
32355     6049228
32525    10033339
32536    81149246
32580    16265700
Name: company_id, dtype: Int64


51 job postings reference a company_id that does not exist in the companies dataset, so no company name could be joined for them.

Dropping rows where name is null

In [16]:
# Remove postings without a company name. merged_data was produced by a column
# selection, and an in-place drop on such a frame can trigger
# SettingWithCopyWarning; reassigning the result avoids that.
merged_data = merged_data.dropna(subset=['name'])

Handling missing values of 'skill_name' column

In [17]:
# Postings without any listed skill get the placeholder skill "Other".
# The fill is applied on the DataFrame and assigned back: calling
# fillna(inplace=True) on the `merged_data[col]` selection is chained assignment,
# which emits a FutureWarning in pandas 2.x and no longer updates merged_data
# once Copy-on-Write is enabled.
col_skill_fill_other = ['skill_name']
merged_data = merged_data.fillna({col: "Other" for col in col_skill_fill_other})

Handling missing values of 'formatted_experience_level' column

In [18]:
# Categorical columns with missing values are filled with "Not Specified".
# The fill is applied on the DataFrame and assigned back: calling
# fillna(inplace=True) on the `merged_data[col]` selection is chained assignment,
# which emits a FutureWarning in pandas 2.x and no longer updates merged_data
# once Copy-on-Write is enabled.
col_experience_fill_not_specified = ['formatted_experience_level']
merged_data = merged_data.fillna(
    {col: "Not Specified" for col in col_experience_fill_not_specified}
)


Check for remaining missing values

In [19]:
# Same per-column missing-value summary as before; an empty Series means
# nothing is left to handle.
remaining_missing = merged_data.isna().sum()
remaining_missing_cols = (
    remaining_missing.loc[remaining_missing.gt(0)].sort_values(ascending=False)
)
remaining_missing_cols

Series([], dtype: int64)

#### Categorizing experience level

In [20]:
# List the distinct experience levels, one per line, in order of first appearance.
experience_levels = merged_data['formatted_experience_level'].unique()

for level in experience_levels:
    print(level)


Entry level
Not Specified
Mid-Senior level
Director
Associate
Executive
Internship


Mapping experience levels to their new numeric values

In [21]:
# Numeric code for each experience level, from 1 (Internship) to 6 (Executive);
# the "Not Specified" placeholder gets its own code, 7.
# NOTE(review): 7 sorts above 'Executive' - confirm that downstream code does not
# interpret the codes as a seniority ranking.
experience_mapping = dict(
    zip(
        [
            'Internship',
            'Entry level',
            'Associate',
            'Mid-Senior level',
            'Director',
            'Executive',
            'Not Specified',
        ],
        range(1, 8),
    )
)

# Values that are not keys of the mapping become NaN.
merged_data['formatted_experience_level'] = merged_data['formatted_experience_level'].map(experience_mapping)

#### Cleaning data in 'name' attribute

In [22]:
def clean_company_names(name):
    """Normalise a company name for consistent comparison.

    Steps, in order:
      1. Hyphens are replaced by spaces ("Coca-Cola" -> "Coca Cola").
      2. Every character that is not a word character, whitespace or a dot is
         removed (apostrophes, commas, ampersands, ...).
      3. Runs of whitespace are collapsed to a single space and the result is
         stripped.

    Args:
        name: Company name as a string.

    Returns:
        The cleaned company name.
    """
    # Hyphens must be turned into spaces *before* punctuation is stripped;
    # otherwise the stripping step deletes them and joins the words together.
    name = name.replace('-', ' ')

    # Remove special characters, except spaces and dots. `\w` already covers
    # letters, digits and the underscore, so no explicit ranges are needed.
    name = re.sub(r'[^\w\s.]', '', name)

    # Remove extra spaces (including those introduced by the hyphen replacement).
    return re.sub(r'\s+', ' ', name).strip()

# Replace every company name with its cleaned form.
merged_data['name'] = merged_data['name'].map(clean_company_names)

#### Extract City from 'location' attribute

In [23]:
def extract_city(location):
    """Derive a city (or named area) from a free-text location string.

    Rules, checked in order:
      * a location consisting of exactly one whitespace-separated word yields None;
      * "City, State[, ...]" yields the text before the first ", ";
      * "<X> Metropolitan Area" yields "<X>";
      * any other location containing "Area" or "Greater" is returned unchanged;
      * everything else yields None.

    Args:
        location: Location string.

    Returns:
        The extracted city/area string, or None when none can be derived.
    """
    # A single-word location carries no separate city part.
    if len(location.split()) == 1:
        return None

    # "City, State" style: keep what precedes the first ", ".
    city, separator, _ = location.partition(', ')
    if separator:
        return city

    # "<X> Metropolitan Area": strip the suffix.
    if "Metropolitan Area" in location:
        return location.partition(" Metropolitan Area")[0]

    # Other named regions ("... Area", "Greater ...") are kept as they are.
    is_named_region = any(marker in location for marker in ("Area", "Greater"))
    return location if is_named_region else None

# Overwrite 'location' with the extracted city, then discard the postings for
# which extract_city returned None.
merged_data['location'] = merged_data['location'].map(extract_city)
merged_data = merged_data.dropna(subset=['location'])

merged_data.shape

(30175, 6)

#### Preprocessed Dataset

Shape of the cleaned data

In [24]:
# Bind a second name to the same DataFrame object; this does not create a copy.
merged_data_cleaned = merged_data
merged_data_cleaned.shape

(30175, 6)

Rename columns and sort by job_id

In [25]:

# Give the columns their final names and order the rows by job_id, highest first.
merged_data_cleaned = (
    merged_data_cleaned
    .rename(columns={
        'name': 'company',
        'skill_name': 'required_skill',
        'formatted_experience_level': 'experience_level',
    })
    .sort_values('job_id', ascending=False)
)



Display

In [26]:
# Show the cleaned, renamed and sorted dataset.
merged_data_cleaned

Unnamed: 0,job_id,company_id,company,location,required_skill,experience_level
0,3757940104,553718,HearingLife,Little River,Other,2
1,3757940025,2192142,Metalcraft of Mayville Inc.,Beaver Dam,"Management, Manufacturing",7
2,3757938019,474443,U.S. Tsubaki Power Transmission LLC,Bessemer,Engineering,7
3,3757938018,18213359,Episcopal Communities Services,Aliso Viejo,"Management, Manufacturing",2
6,3757937004,10515052,Boyd Group Services Inc.,Daytona Beach,"Management, Manufacturing",2
...,...,...,...,...,...,...
32587,381055942,96654609,First Baptist Church Forney,Forney,Other,7
32588,133196985,1089558,Employvision Inc.,New York,"Accounting/Auditing, Finance",7
32589,133114754,77766802,CargoLogin.,Santa Clarita,"Sales, Business Development",7
32590,102339515,52132271,DryerVentz DuctVentz,Greater Boston,"Business Development, Sales",7


Save the preprocessed dataset to data_preprocessed/job_postings.csv

In [32]:

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing; otherwise a fresh checkout fails here.
os.makedirs('data_preprocessed', exist_ok=True)
merged_data_cleaned.to_csv('data_preprocessed/job_postings.csv', index=False)