In [9]:
import pandas as pd
import numpy as np

In [10]:
# Read each raw CSV export into its own DataFrame.
# All paths are relative to the notebook's working directory; the order of
# the target names matches the order of the paths below.
job_posting, job_skills, companies, mapping_skills = map(
    pd.read_csv,
    (
        './Raw_Data/postings.csv',
        './Raw_Data/jobs/job_skills.csv',
        './Raw_Data/companies/companies.csv',
        './Raw_Data/mappings/skills.csv',
    ),
)

In [11]:

# Normalise the dtypes of the two numeric company columns in one pass:
# company_id becomes an integer, company_size a float (float keeps NaN
# representable for rows where the size is missing).
companies = companies.astype({'company_id': int, 'company_size': float})

In [12]:
# Count null entries per column for every loaded DataFrame.
# Each result is a Series indexed by column name; the counts are a snapshot
# of the frames as they are when this cell runs and are printed further down.
missing_values_companies = companies.isna().sum()
missing_values_job_skills = job_skills.isna().sum()
missing_values_skills = mapping_skills.isna().sum()
missing_values_postings = job_posting.isna().sum()

In [13]:
# Impute missing company attributes.
#
# The fill is done with one DataFrame-level `fillna` and assigned back,
# instead of `companies[col].fillna(..., inplace=True)`. The chained in-place
# form operates on a column selection: it raises a FutureWarning on pandas 2.x
# and, with Copy-on-Write enabled (the default from pandas 3.0), never updates
# `companies` at all, silently leaving the NaNs in place.
#
# - company_size: numeric, filled with the column median (computed from the
#   not-yet-filled column; `median` skips NaN by default).
# - text/location columns: filled with an explicit placeholder string.
companies = companies.fillna(
    value={
        'company_size': companies['company_size'].median(),
        'description': "No description provided",
        'state': "Unknown",
        'city': "Unknown",
        'zip_code': "Unknown",
        'address': "Unknown",
    }
)

In [14]:
# Attach the skill lookup table to each (job, skill) row.
# Left join on the abbreviation so every job_skills row is kept even if its
# abbreviation has no match in the lookup table.
job_skills_detailed = job_skills.merge(
    right=mapping_skills,
    how="left",
    on="skill_abr",
)

In [15]:
# Report the per-column null counts for quality assurance.
# The counts were computed in an earlier cell, before the fillna step, so
# they describe the data as loaded rather than the imputed frame.
for heading, null_counts in (
    ("Missing values in companies:\n", missing_values_companies),
    ("\nMissing values in job_skills:\n", missing_values_job_skills),
    ("\nMissing values in mapping_skills:\n", missing_values_skills),
    ("\nMissing values in job_posting:\n", missing_values_postings),
):
    print(heading, null_counts)


Missing values in companies:
 company_id         0
name               0
description      296
company_size    2774
state             22
country            0
city               1
zip_code          28
address           22
url                0
dtype: int64

Missing values in job_skills:
 job_id       0
skill_abr    0
dtype: int64

Missing values in mapping_skills:
 skill_abr     0
skill_name    0
dtype: int64

Missing values in job_posting:
 job_id                             0
company_name                    1719
title                              0
description                        7
max_salary                     94056
pay_period                     87776
location                           0
company_id                      1717
views                           1689
med_salary                    117569
min_salary                     94056
formatted_work_type                0
applies                       100529
original_listed_time               0
remote_allowed                108603
job

In [16]:
# Preview the top rows of each transformed DataFrame as a quick sanity check.
for heading, preview in (
    ("\nCompanies head:\n", companies.head()),
    ("\nJob Skills (detailed) head:\n", job_skills_detailed.head()),
    ("\nJob Posting head:\n", job_posting.head()),
):
    print(heading, preview)


Companies head:
    company_id                        name  \
0        1009                         IBM   
1        1016               GE HealthCare   
2        1025  Hewlett Packard Enterprise   
3        1028                      Oracle   
4        1033                   Accenture   

                                         description  company_size  state  \
0  At IBM, we do more than work. We create. We cr...           7.0     NY   
1  Every day millions of people feel the impact o...           7.0      0   
2  Official LinkedIn of Hewlett Packard Enterpris...           7.0  Texas   
3  We’re a cloud technology company that provides...           7.0  Texas   
4  Accenture is a leading global professional ser...           7.0      0   

  country              city zip_code                                address  \
0      US  Armonk, New York    10504  International Business Machines Corp.   
1      US           Chicago        0                                      -   
2      US  