In [9]:
import pandas as pd
import numpy as np

In [10]:
# Load the datasets using the specified paths
job_posting = pd.read_csv('./Raw_Data/postings.csv')
companies = pd.read_csv('./Raw_Data/companies/companies.csv')
job_skills = pd.read_csv('./Raw_Data/jobs/job_skills.csv')
mapping_skills = pd.read_csv('./Raw_Data/mappings/skills.csv')

In [40]:
# Ensure 'company_id' is a nullable integer and 'company_size' is a float in the companies DataFrame
companies['company_id'] = companies['company_id'].astype('Int64')  # Sets as nullable integer for missing values
companies['company_size'] = pd.to_numeric(companies['company_size'], errors='coerce')  # Converts to float, setting non-numeric values as NaN


In [42]:
# Merge job_skills with mapping_skills to include full skill names
job_skills_detailed = job_skills.merge(mapping_skills, on="skill_abr", how="left")

# Display the first few rows of the merged data for verification
job_skills_detailed.head()


Unnamed: 0,job_id,skill_abr,skill_name
0,3884428798,MRKT,Marketing
1,3884428798,PR,Public Relations
2,3884428798,WRT,Writing/Editing
3,3887473071,SALE,Sales
4,3887465684,FIN,Finance


In [41]:
# Check for missing values in each DataFrame
missing_values_companies = companies.isnull().sum()
missing_values_job_skills = job_skills.isnull().sum()
missing_values_skills = mapping_skills.isnull().sum()
missing_values_postings = job_posting.isnull().sum()

In [43]:
# Display summaries of missing values after handling for verification
print("Missing values in companies:\n", missing_values_companies)
print("\nMissing values in job_skills:\n", missing_values_job_skills)
print("\nMissing values in mapping_skills:\n", missing_values_skills)
print("\nMissing values in job_posting:\n", missing_values_postings)


Missing values in companies:
 company_id      0
name            0
description     0
company_size    0
state           0
country         0
city            0
zip_code        0
address         0
url             0
dtype: int64

Missing values in job_skills:
 job_id       0
skill_abr    0
dtype: int64

Missing values in mapping_skills:
 skill_abr     0
skill_name    0
dtype: int64

Missing values in job_posting:
 job_id                             0
company_name                    1719
title                              0
description                        7
max_salary                     94056
pay_period                     87776
location                           0
company_id                      1717
views                           1689
med_salary                    117569
min_salary                     94056
formatted_work_type                0
applies                       100529
original_listed_time               0
remote_allowed                108603
job_posting_url                  

In [45]:
# Fill missing values in the companies DataFrame
companies['company_size'].fillna(companies['company_size'].median(), inplace=True)
companies['description'].fillna("No description provided", inplace=True)
companies['state'].fillna("Unknown", inplace=True)
companies['city'].fillna("Unknown", inplace=True)
companies['zip_code'].fillna("Unknown", inplace=True)
companies['address'].fillna("Unknown", inplace=True)

# Drop rows in postings where 'company_id' is missing and create a separate copy
postings = job_posting.dropna(subset=['company_id']).copy()

# Convert 'company_id' to nullable integer to avoid float conversion
postings['company_id'] = postings['company_id'].astype('Int64')


In [47]:
# Print initial missing values in postings
print("Initial missing values in postings:\n", postings.isnull().sum())



Initial missing values in postings:
 job_id                             0
company_name                       2
title                              0
description                        6
max_salary                     92793
pay_period                     86568
location                           0
company_id                         0
views                           1627
med_salary                    115907
min_salary                     92793
formatted_work_type                0
applies                        99236
original_listed_time               0
remote_allowed                107318
job_posting_url                    0
application_url                35063
application_type                   0
expiry                             0
closed_time                   121076
formatted_experience_level     27949
skills_desc                   119696
listed_time                        0
posting_domain                 38252
sponsored                          0
work_type                          0
c

In [52]:
# Fill key missing values in postings
postings['company_name'].fillna("Unknown Company", inplace=True)
postings['description'].fillna("No description provided", inplace=True)

# Optionally, fill salary-related columns with median salary values
salary_median = postings[['max_salary', 'min_salary']].median()
postings['max_salary'].fillna(salary_median['max_salary'], inplace=True)
postings['min_salary'].fillna(salary_median['min_salary'], inplace=True)

# Optionally drop columns with too many missing values (e.g., if over 80% missing)
# Drop columns if they exist; ignore any that aren't in the DataFrame
postings.drop(columns=['med_salary', 'skills_desc', 'remote_allowed', 'closed_time'], inplace=True, errors='ignore')


# Verify cleaned postings DataFrame
print("Postings after handling missing values:")
print(postings.info())
print("\nRemaining missing values in postings:\n", postings.isnull().sum())


Postings after handling missing values:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 122132 entries, 0 to 123848
Data columns (total 27 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      122132 non-null  int64  
 1   company_name                122132 non-null  object 
 2   title                       122132 non-null  object 
 3   description                 122132 non-null  object 
 4   max_salary                  122132 non-null  float64
 5   pay_period                  35564 non-null   object 
 6   location                    122132 non-null  object 
 7   company_id                  122132 non-null  Int64  
 8   views                       120505 non-null  float64
 9   min_salary                  122132 non-null  float64
 10  formatted_work_type         122132 non-null  object 
 11  applies                     22896 non-null   float64
 12  original_listed_time        1221

In [54]:
# 1.Integration
# Join job_skills with mapping_skills to include the full skill name for each skill abbreviation
job_skills_detailed = job_skills.merge(mapping_skills, on="skill_abr", how="left")

# Ensure 'company_id' in both DataFrames is of the same nullable integer type (Int64)
job_data_combined['company_id'] = job_data_combined['company_id'].astype('Int64')
companies['company_id'] = companies['company_id'].astype('Int64')

# Now merge the DataFrames
full_data = job_data_combined.merge(companies, on="company_id", how="left")


In [56]:
# Example: Aggregate to get the total number of job postings per company
job_count_per_company = full_data.groupby("company_id")['job_id'].count().reset_index()
job_count_per_company.rename(columns={"job_id": "total_jobs"}, inplace=True)
# Display the first few rows of the aggregated data
print("Total job postings per company:\n", job_count_per_company.head())

Total job postings per company:
    company_id  total_jobs
0        1009          33
1        1016          85
2        1025          24
3        1028         127
4        1033          37


In [57]:
# Aggregating to get a count of skills required across all jobs
skill_count = job_skills_detailed['skill_name'].value_counts().reset_index()
skill_count.columns = ['skill_name', 'count']

# Display the top required skills
print("Top required skills across all jobs:\n", skill_count.head())

Top required skills across all jobs:
                skill_name  count
0  Information Technology  26137
1                   Sales  22475
2              Management  20861
3           Manufacturing  18185
4    Health Care Provider  17369


In [60]:
# Check for and drop duplicate rows if any(Cleaning)
full_data.drop_duplicates(inplace=True)

In [61]:
# Sampling 
# Randomly sample 5% of the full_data for quick analysis or visualizat
sampled_data = full_data.sample(frac=0.05, random_state=42).copy()

# Display the first few rows of the sample to confirm
#print(sampled_data.head())


In [63]:
# Final overview after all preprocessing steps
print("\nAggregated job counts per company:\n", job_count_per_company.head())
print("\nSample of fully processed data:\n", sampled_data.head())


Aggregated job counts per company:
    company_id  total_jobs
0        1009          33
1        1016          85
2        1025          24
3        1028         127
4        1033          37

Sample of fully processed data:
             job_id              company_name  \
25246   3887899161              Webster Bank   
68202   3899524319                  Galderma   
84534   3901379413  Cornerstone Search Group   
68799   3899527533                 SAY Group   
199914  3906223667                  Circle K   

                                                    title  \
25246                     Managing Director, Loan Workout   
68202   Account Manager, Immunology - Knoxville/Chatta...   
84534   Field-based: AD, Reimbursement & Access (RAD) ...   
68799                           Human Resource Generalist   
199914                                   Lead Team Member   

                                            description_x  max_salary  \
25246   If you’re looking for a meaningful c