In [9]:
import pandas as pd
import numpy as np

In [10]:
# Load the datasets using the specified paths
job_posting = pd.read_csv('./Raw_Data/postings.csv')
job_skills = pd.read_csv('./Raw_Data/jobs/job_skills.csv')
companies = pd.read_csv('./Raw_Data/companies/companies.csv')
mapping_skills = pd.read_csv('./Raw_Data/mappings/skills.csv')

In [11]:

# Ensure columns have appropriate data types
companies['company_id'] = companies['company_id'].astype(int)
companies['company_size'] = companies['company_size'].astype(float)

In [12]:
# Check for missing values in each DataFrame
missing_values_companies = companies.isnull().sum()
missing_values_job_skills = job_skills.isnull().sum()
missing_values_skills = mapping_skills.isnull().sum()
missing_values_postings = job_posting.isnull().sum()

In [13]:
companies['company_size'].fillna(companies['company_size'].median(), inplace=True)
companies['description'].fillna("No description provided", inplace=True)
companies['state'].fillna("Unknown", inplace=True)
companies['city'].fillna("Unknown", inplace=True)
companies['zip_code'].fillna("Unknown", inplace=True)
companies['address'].fillna("Unknown", inplace=True)

In [14]:
# Merge job_skills with mapping_skills to include full skill names
job_skills_detailed = job_skills.merge(mapping_skills, on="skill_abr", how="left")

In [15]:
# Display summaries of missing values for quality assurance
print("Missing values in companies:\n", missing_values_companies)
print("\nMissing values in job_skills:\n", missing_values_job_skills)
print("\nMissing values in mapping_skills:\n", missing_values_skills)
print("\nMissing values in job_posting:\n", missing_values_postings)


Missing values in companies:
 company_id         0
name               0
description      296
company_size    2774
state             22
country            0
city               1
zip_code          28
address           22
url                0
dtype: int64

Missing values in job_skills:
 job_id       0
skill_abr    0
dtype: int64

Missing values in mapping_skills:
 skill_abr     0
skill_name    0
dtype: int64

Missing values in job_posting:
 job_id                             0
company_name                    1719
title                              0
description                        7
max_salary                     94056
pay_period                     87776
location                           0
company_id                      1717
views                           1689
med_salary                    117569
min_salary                     94056
formatted_work_type                0
applies                       100529
original_listed_time               0
remote_allowed                108603
job

In [16]:
# Display the first few rows of each transformed DataFrame
print("\nCompanies head:\n", companies.head())
print("\nJob Skills (detailed) head:\n", job_skills_detailed.head())
print("\nJob Posting head:\n", job_posting.head())


Companies head:
    company_id                        name  \
0        1009                         IBM   
1        1016               GE HealthCare   
2        1025  Hewlett Packard Enterprise   
3        1028                      Oracle   
4        1033                   Accenture   

                                         description  company_size  state  \
0  At IBM, we do more than work. We create. We cr...           7.0     NY   
1  Every day millions of people feel the impact o...           7.0      0   
2  Official LinkedIn of Hewlett Packard Enterpris...           7.0  Texas   
3  We’re a cloud technology company that provides...           7.0  Texas   
4  Accenture is a leading global professional ser...           7.0      0   

  country              city zip_code                                address  \
0      US  Armonk, New York    10504  International Business Machines Corp.   
1      US           Chicago        0                                      -   
2      US  

In [17]:
# 1. Integrimi (Integration)
# Join job_skills with mapping_skills to include the full skill name for each skill abbreviation
job_skills_detailed = job_skills.merge(mapping_skills, on="skill_abr", how="left")

# Join job_skills_detailed with job_posting on 'job_id' to have skill details with job postings
job_data_combined = job_posting.merge(job_skills_detailed, on="job_id", how="left")

# Join job_data_combined with companies on 'company_id' to include company information
full_data = job_data_combined.merge(companies, on="company_id", how="left")

In [18]:
# Example: Aggregate to get the total number of job postings per company
job_count_per_company = full_data.groupby("company_id")['job_id'].count().reset_index()
job_count_per_company.rename(columns={"job_id": "total_jobs"}, inplace=True)

In [19]:
# Aggregating to get a count of skills required across all jobs
skill_count = job_skills_detailed['skill_name'].value_counts().reset_index()
skill_count.columns = ['skill_name', 'count']

In [20]:
# Randomly sample 5% of the full_data for quick analysis or visualization (Sampling)
sampled_data = full_data.sample(frac=0.05, random_state=42)

In [21]:
# Check for and drop duplicate rows if any(Cleaning)
full_data.drop_duplicates(inplace=True)

In [25]:
# Identify missing values
missing_values = full_data.isnull().sum()
print("Missing values:\n", missing_values)

Missing values:
 job_id                             0
company_name                    2826
title                              0
description_x                     12
max_salary                    156767
pay_period                    146431
location                           0
company_id                      2822
views                           2901
med_salary                    197195
min_salary                    156767
formatted_work_type                0
applies                       167194
original_listed_time               0
remote_allowed                180754
job_posting_url                    0
application_url                62683
application_type                   0
expiry                             0
closed_time                   206094
formatted_experience_level     48040
skills_desc                   203725
listed_time                        0
posting_domain                 69610
sponsored                          0
work_type                          0
currency             

In [26]:
# Fill missing values based on context

# Textual and Categorical Fields
full_data['company_name'].fillna("Unknown Company", inplace=True)
full_data['description_x'].fillna("No description provided", inplace=True)
full_data['description_y'].fillna("No description provided", inplace=True)
full_data['pay_period'].fillna("Unknown", inplace=True)
full_data['currency'].fillna("Unknown", inplace=True)
full_data['compensation_type'].fillna("Unknown", inplace=True)
full_data['skills_desc'].fillna("No skills listed", inplace=True)
full_data['application_url'].fillna("No application URL", inplace=True)
full_data['posting_domain'].fillna("Unknown", inplace=True)

# Salary Fields (fill with median salary)
salary_median = full_data[['max_salary', 'med_salary', 'min_salary', 'normalized_salary']].median()
full_data['max_salary'].fillna(salary_median['max_salary'], inplace=True)
full_data['med_salary'].fillna(salary_median['med_salary'], inplace=True)
full_data['min_salary'].fillna(salary_median['min_salary'], inplace=True)
full_data['normalized_salary'].fillna(salary_median['normalized_salary'], inplace=True)

# Experience Level, Remote Allowed, and Work Type
full_data['formatted_experience_level'].fillna("Entry Level", inplace=True)
full_data['remote_allowed'].fillna("Not specified", inplace=True)
full_data['work_type'].fillna("Unknown", inplace=True)

# Location and Address Fields
full_data['state'].fillna("Unknown", inplace=True)
full_data['city'].fillna("Unknown", inplace=True)
full_data['zip_code_x'].fillna("Unknown", inplace=True)
full_data['zip_code_y'].fillna("Unknown", inplace=True)
full_data['fips'].fillna("Unknown", inplace=True)
full_data['address'].fillna("Unknown", inplace=True)

# Skill Fields
full_data['skill_abr'].fillna("N/A", inplace=True)
full_data['skill_name'].fillna("N/A", inplace=True)

# Fill view and apply counts with 0
full_data['views'].fillna(0, inplace=True)
full_data['applies'].fillna(0, inplace=True)

# Confirm missing values have been filled
print("Remaining missing values:\n", full_data.isnull().sum())


Remaining missing values:
 job_id                             0
company_name                       0
title                              0
description_x                      0
max_salary                         0
pay_period                         0
location                           0
company_id                      2822
views                              0
med_salary                         0
min_salary                         0
formatted_work_type                0
applies                            0
original_listed_time               0
remote_allowed                     0
job_posting_url                    0
application_url                    0
application_type                   0
expiry                             0
closed_time                   206094
formatted_experience_level         0
skills_desc                        0
listed_time                        0
posting_domain                     0
sponsored                          0
work_type                          0
currency   

In [27]:
# Handle remaining missing values

# Drop rows with missing 'company_id' as this identifier is critical for many analyses
full_data.dropna(subset=['company_id'], inplace=True)

# Fill remaining missing values in company-related fields with placeholders
full_data['name'].fillna("Unknown Company", inplace=True)
full_data['company_size'].fillna("Not Available", inplace=True)
full_data['country'].fillna("Unknown", inplace=True)
full_data['url'].fillna("No URL provided", inplace=True)

# Fill missing 'closed_time' with a placeholder
full_data['closed_time'].fillna("No closing date", inplace=True)

# Verify all missing values are addressed
print("Final check for missing values:\n", full_data.isnull().sum())


Final check for missing values:
 job_id                        0
company_name                  0
title                         0
description_x                 0
max_salary                    0
pay_period                    0
location                      0
company_id                    0
views                         0
med_salary                    0
min_salary                    0
formatted_work_type           0
applies                       0
original_listed_time          0
remote_allowed                0
job_posting_url               0
application_url               0
application_type              0
expiry                        0
closed_time                   0
formatted_experience_level    0
skills_desc                   0
listed_time                   0
posting_domain                0
sponsored                     0
work_type                     0
currency                      0
compensation_type             0
normalized_salary             0
zip_code_x                    0
fips   

In [28]:
print("\nSample of fully processed data:\n", sampled_data.head())
print("\nAggregated job counts per company:\n", job_count_per_company.head())
print("\nSkill counts:\n", skill_count.head())


Sample of fully processed data:
             job_id              company_name  \
25246   3887899161              Webster Bank   
68202   3899524319                  Galderma   
84534   3901379413  Cornerstone Search Group   
68799   3899527533                 SAY Group   
199914  3906223667                  Circle K   

                                                    title  \
25246                     Managing Director, Loan Workout   
68202   Account Manager, Immunology - Knoxville/Chatta...   
84534   Field-based: AD, Reimbursement & Access (RAD) ...   
68799                           Human Resource Generalist   
199914                                   Lead Team Member   

                                            description_x  max_salary  \
25246   If you’re looking for a meaningful career, you...    200000.0   
68202   Job Title: Account Manager, ImmunologyLocation...         NaN   
84534   Home-based AD, REIMBURSEMENT & ACCESS - SOUTHE...         NaN   
68799   Our client

In [29]:
# Convert company_id back to integer after handling missing values
full_data['company_id'] = full_data['company_id'].astype(int)

# Ensure 'company_id' is integer in the aggregated result as well
job_count_per_company['company_id'] = job_count_per_company['company_id'].astype(int)

# Check the result to confirm the data type
print("\nAggregated job counts per company:\n", job_count_per_company.head())


Aggregated job counts per company:
    company_id  total_jobs
0        1009          33
1        1016          85
2        1025          24
3        1028         127
4        1033          37
