In [47]:
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [48]:
# Load the datasets using the specified paths
job_posting = pd.read_csv('./Raw_Data/postings.csv')
companies = pd.read_csv('./Raw_Data/companies/companies.csv')
job_skills = pd.read_csv('./Raw_Data/jobs/job_skills.csv')
mapping_skills = pd.read_csv('./Raw_Data/mappings/skills.csv')

In [49]:
# Companies dataset
companies = companies.astype({
    'company_id': 'Int64',
    'name': 'string',
    'company_size': 'float'
})

# Job postings dataset
job_posting = job_posting.astype({
    'job_id': 'Int64',
    'company_name': 'string',
    'min_salary': 'float'
})

# Job skills dataset
job_skills = job_skills.astype({
    'job_id': 'Int64',
    'skill_abr': 'string'
})

# Mapping skills dataset
mapping_skills = mapping_skills.astype({
    'skill_abr': 'string',
    'skill_name': 'string'
})

In [50]:
# Check for missing values in each DataFrame
missing_values_companies = companies.isnull().sum()
missing_values_job_skills = job_skills.isnull().sum()
missing_values_skills = mapping_skills.isnull().sum()
missing_values_postings = job_posting.isnull().sum()

In [51]:
# Display summaries of missing values after handling for verification
print("Missing values in companies:\n", missing_values_companies)
print("\nMissing values in job_skills:\n", missing_values_job_skills)
print("\nMissing values in mapping_skills:\n", missing_values_skills)
print("\nMissing values in job_posting:\n", missing_values_postings)


Missing values in companies:
 company_id         0
name               1
description      297
company_size    2774
state             22
country            0
city               1
zip_code          28
address           22
url                0
dtype: int64

Missing values in job_skills:
 job_id       0
skill_abr    0
dtype: int64

Missing values in mapping_skills:
 skill_abr     0
skill_name    0
dtype: int64

Missing values in job_posting:
 job_id                             0
company_name                    1719
title                              0
description                        7
max_salary                     94056
pay_period                     87776
location                           0
company_id                      1717
views                           1689
med_salary                    117569
min_salary                     94056
formatted_work_type                0
applies                       100529
original_listed_time               0
remote_allowed                108603
job

In [52]:
# Merge job_skills with mapping_skills to include full skill names
job_skills_detailed = job_skills.merge(mapping_skills, on="skill_abr", how="left")
# Display the first few rows of the merged data for verification
job_skills_detailed.head()

Unnamed: 0,job_id,skill_abr,skill_name
0,3884428798,MRKT,Marketing
1,3884428798,PR,Public Relations
2,3884428798,WRT,Writing/Editing
3,3887473071,SALE,Sales
4,3887465684,FIN,Finance


In [53]:
# Fill missing values in the companies DataFrame
fill_values = {
    'company_size': companies['company_size'].median(),
    'description': "No description provided",
    'state': "Unknown",
    'city': "Unknown",
    'zip_code': "Unknown",
    'address': "Unknown"
}

for column, value in fill_values.items():
    companies[column] = companies[column].fillna(value)

job_posting.fillna({'company_name': 'Unknown'}, inplace=True)
job_posting.fillna({'description': 'No description provided'}, inplace=True)


# Drop rows in job_posting where 'company_id' is missing
job_posting.dropna(subset=['company_id'], inplace=True)

# Convert 'company_id' to nullable integer to avoid float conversion
job_posting['company_id'] = job_posting['company_id'].astype('Int64')


In [54]:
# Print initial missing values in postings
print("Initial missing values in postings:\n", job_posting.isnull().sum())



Initial missing values in postings:
 job_id                             0
company_name                       0
title                              0
description                        0
max_salary                     92793
pay_period                     86568
location                           0
company_id                         0
views                           1627
med_salary                    115907
min_salary                     92793
formatted_work_type                0
applies                        99236
original_listed_time               0
remote_allowed                107318
job_posting_url                    0
application_url                35063
application_type                   0
expiry                             0
closed_time                   121076
formatted_experience_level     27949
skills_desc                   119696
listed_time                        0
posting_domain                 38252
sponsored                          0
work_type                          0
c

In [55]:
# Calculate median salary values for filling
salary_median = job_posting[['max_salary', 'min_salary']].median()
job_posting['max_salary'] = job_posting['max_salary'].fillna(salary_median['max_salary'])
job_posting['min_salary'] = job_posting['min_salary'].fillna(salary_median['min_salary'])

# Drop columns with too many missing values (over 80% missing)
columns_to_drop = ['med_salary', 'skills_desc', 'remote_allowed', 'closed_time','zip_code', 'fips', 'job_posting_url', 'application_url']
job_posting = job_posting.drop(columns=columns_to_drop, errors='ignore')


In [56]:
# Integration Step
job_data_combined = job_posting.merge(job_skills_detailed, on="job_id", how="left")
job_data_combined['company_id'] = job_data_combined['company_id'].astype('Int64')  # Ensure alignment with companies

# Now merge job_data_combined with companies on 'company_id' to include company information
full_data = job_data_combined.merge(companies, on="company_id", how="left")

# Check final missing values in full_data
print("Final missing values in full_data:\n", full_data.isnull().sum())


Final missing values in full_data:
 job_id                             0
company_name                       0
title                              0
description_x                      0
max_salary                         0
pay_period                    144438
location                           0
company_id                         0
views                           2802
min_salary                         0
formatted_work_type                0
applies                       165087
original_listed_time               0
application_type                   0
expiry                             0
formatted_experience_level     45677
listed_time                        0
posting_domain                 66789
sponsored                          0
work_type                          0
currency                      144438
compensation_type             144438
normalized_salary             144438
skill_abr                       1541
skill_name                      1541
name                               4
de

In [57]:
# Aggregation examples
job_count_per_company = full_data.groupby("company_id")['job_id'].count().reset_index()
job_count_per_company.rename(columns={"job_id": "total_jobs"}, inplace=True)
job_count_per_company = job_count_per_company.merge(companies[['company_id', 'name']], on='company_id', how='left')

print("Total job postings per company:\n", job_count_per_company[['name', 'total_jobs']].head())


Total job postings per company:
                          name  total_jobs
0                         IBM          33
1               GE HealthCare          85
2  Hewlett Packard Enterprise          24
3                      Oracle         127
4                   Accenture          37


In [58]:
skill_count = job_skills_detailed['skill_name'].value_counts().reset_index()
skill_count.columns = ['skill_name', 'count']
print("Top required skills across all jobs:\n", skill_count.head())

Top required skills across all jobs:
                skill_name  count
0  Information Technology  26137
1                   Sales  22475
2              Management  20861
3           Manufacturing  18185
4    Health Care Provider  17369


In [59]:
# Drop duplicates
full_data.drop_duplicates(inplace=True)

In [60]:
# Final sampling step
sampled_data = full_data.sample(frac=0.05, random_state=42).copy()
print("\nSample of fully processed data:\n", sampled_data.head())


Sample of fully processed data:
             job_id                                   company_name  \
179699  3905351309                           Signature HealthCARE   
20013   3887593646                           Keck Medicine of USC   
23240   3887872935                                        Netflix   
151463  3904577325  TMAC's Direct Hire (Executive Search) Service   
131876  3903840980                             FirstChoice Hiring   

                                                    title  \
179699                  Certified Nursing Assistant (CNA)   
20013   Licensed Vocational Nurse - Emergency Room - F...   
23240   Technical Program Manager 6 - Game SDK and Ser...   
151463     Field Medical Director, Pediatric Gene Editing   
131876       HVAC Technicians / Installers - Capital Heat   

                                            description_x  max_salary  \
179699  About Us\n\nWelcome to Signature Healthcare of...     80000.0   
20013   The Licensed Vocational Nurse 

In [63]:
# First, create the salary range and category if not already created
job_posting['salary_range'] = job_posting['max_salary'] - job_posting['min_salary']
job_posting['salary_range_category'] = pd.cut(job_posting['salary_range'], bins=[0, 20000, 50000, 100000], labels=['Low', 'Medium', 'High'])

# Select and print company name, salary details, and the salary range category
print(job_posting[['company_name', 'min_salary', 'max_salary', 'salary_range_category']].head())


                company_name  min_salary  max_salary salary_range_category
0      Corcoran Sawyer Smith        17.0        20.0                   Low
2     The National Exemplar      45000.0     65000.0                   Low
3     Abrams Fensterman, LLP    140000.0    175000.0                Medium
5  Downtown Raleigh Alliance        14.0        20.0                   Low
6                 Raw Cereal     60000.0    300000.0                   NaN


In [64]:
# Create the is_full_time column if not already created
job_posting['is_full_time'] = job_posting['work_type'].apply(lambda x: 1 if x == 'Full-Time' else 0)

# Select and print company name and the is_full_time indicator
print(job_posting[['company_name', 'is_full_time']].head())


                company_name  is_full_time
0      Corcoran Sawyer Smith             0
2     The National Exemplar              0
3     Abrams Fensterman, LLP             0
5  Downtown Raleigh Alliance             0
6                 Raw Cereal             0


In [47]:
# Zgjedhja e veçorive numerike për PCA
features = job_posting[['max_salary', 'min_salary', 'views', 'applies']]

# Imputimi i vlerave të humbura
imputer = SimpleImputer(strategy='median')
features_imputed = imputer.fit_transform(features)

# Normalizimi i të dhënave
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_imputed)

# Aplikimi i PCA
pca = PCA(n_components=2)  # Zgjedhja e dy përbërësve
principal_components = pca.fit_transform(features_scaled)

# Krijimi i DataFrame për përbërësit
pca_df = pd.DataFrame(data=principal_components, columns=['PCA1', 'PCA2'])

# Shtimi i kolonave origjinale për identifikim
pca_df['job_id'] = job_posting['job_id']
pca_df['company_name'] = job_posting['company_name']

# Shfaqja e rezultatit
print(pca_df)

            PCA1      PCA2      job_id                     company_name
0      -0.348124 -0.085349      921716            Corcoran Sawyer Smith
1      -0.349048 -0.188197     1829192                          Unknown
2      -0.084354 -0.131589    10998357           The National Exemplar 
3       0.418452 -0.067948    23221523           Abrams Fensterman, LLP
4      -0.010225 -0.173417    35982263                          Unknown
...          ...       ...         ...                              ...
123844  0.400500 -0.192176  3906267117                     Lozano Smith
123845 -0.010225 -0.173417  3906267126                        Pinterest
123846 -0.010225 -0.173417  3906267131                     EPS Learning
123847 -0.010163 -0.165130  3906267195  Trelleborg Applied Technologies
123848  0.008499 -0.181798  3906267224                        Solugenix

[123849 rows x 4 columns]


In [72]:
# Create a new folder named 'processed_data' (you can choose any name)
folder_name = 'Processed_Data'
os.makedirs(folder_name, exist_ok=True)  # Creates the folder if it doesn't exist

# Save the preprocessed data to a new CSV file in the created folder
file_path = os.path.join(folder_name, 'preprocessed_job_postings.csv')
job_posting.to_csv(file_path, index=False)

NameError: name 'os' is not defined