#### *Importing Libraries*

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
import re
import numpy as np

In [3]:
# Single LabelEncoder instance; it is fitted further down on the 'Preference' column.
le = LabelEncoder()

## ***1 - Data Loading***

In [4]:
# The path is relative to the notebook's working directory.
dataset_path = 'Job_Salary_Prediction_Dataset.csv'
job_salary_df = pd.read_csv(dataset_path)

## ***2 - Data Overview Exploration***

#### *Displaying First Fifteen Rows*

In [5]:
# Preview the first 15 rows to see the raw format of every column.
job_salary_df.head(15)

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,Work Type,Company Size,Job Posting Date,Preference,Job Title,Role,Job Description,Benefits,skills,Responsibilities,Company
0,1089840000000000.0,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,Intern,26801,4/24/2022,Female,Digital Marketing Specialist,Social Media Manager,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises
1,398454000000000.0,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,Intern,100340,12/19/2022,Female,Web Developer,Frontend Web Developer,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group
2,481640000000000.0,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",Temporary,84525,9/14/2022,Male,Operations Manager,Quality Control Manager,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.
3,688193000000000.0,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,Full-Time,129896,2/25/2023,Female,Network Engineer,Wireless Network Engineer,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess
4,117058000000000.0,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,Intern,53944,10/11/2022,Female,Event Manager,Conference Manager,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy
5,116831000000000.0,4 to 12 Years,MCA,$59K-$93K,Brussels,Belgium,Full-Time,23196,7/25/2023,Male,Software Tester,Quality Assurance Analyst,A Quality Assurance Analyst tests software and...,"{'Life and Disability Insurance, Stock Options...",Quality assurance processes Testing methodolog...,Test software applications and systems to iden...,Adani Ports and Special Economic Zone
6,1292170000000000.0,3 to 15 Years,PhD,$63K-$103K,George Town,Cayman Islands,Temporary,26119,4/10/2023,Both,Teacher,Classroom Teacher,A Classroom Teacher educates students in a spe...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Teaching pedagogy Classroom management Curricu...,"Plan and deliver engaging lessons, adapting te...",FedEx
7,1498780000000000.0,2 to 8 Years,M.Com,$65K-$102K,SÃ£o TomÃ©,Sao Tome and Principe,Contract,40558,9/20/2022,Female,UX/UI Designer,User Interface Designer,User Interface Designers focus on the visual a...,"{'Employee Assistance Programs (EAP), Tuition ...",UI design principles and best practices Graphi...,Create visually appealing user interfaces (UI)...,Ryder System
8,1680290000000000.0,2 to 9 Years,BBA,$65K-$102K,Male,Maldives,Temporary,105343,2/19/2022,Female,UX/UI Designer,Interaction Designer,Interaction Designers specialize in designing ...,"{'Transportation Benefits, Professional Develo...",Interaction design principles User behavior an...,"Work on interaction design, defining how users...",Zee Entertainment Enterprises
9,255628000000000.0,1 to 10 Years,BBA,$60K-$80K,Saint John's,Antigua and Barbuda,Full-Time,102069,5/13/2022,Both,Wedding Planner,Wedding Consultant,A Wedding Consultant assists couples in planni...,"{'Legal Assistance, Bonuses and Incentive Prog...",Wedding planning Vendor coordination Event man...,Offer expert advice and guidance to couples pl...,CSX


#### *Displaying No. of Rows and Columns*

In [6]:
# DataFrame.shape is (row count, column count).
print(f"Rows: {job_salary_df.shape[0]}, Columns: {job_salary_df.shape[1]}")

Rows: 1048575, Columns: 17


#### *Displaying Summary of Non-Missing Values and Data Types of Columns*

In [7]:
# Column names, non-null counts and dtypes in a single summary.
job_salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 17 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Job Id            1048575 non-null  float64
 1   Experience        1048575 non-null  object 
 2   Qualifications    1048575 non-null  object 
 3   Salary Range      1048575 non-null  object 
 4   location          1048575 non-null  object 
 5   Country           1048575 non-null  object 
 6   Work Type         1048575 non-null  object 
 7   Company Size      1048575 non-null  int64  
 8   Job Posting Date  1048575 non-null  object 
 9   Preference        1048575 non-null  object 
 10  Job Title         1048575 non-null  object 
 11  Role              1048575 non-null  object 
 12  Job Description   1048575 non-null  object 
 13  Benefits          1048575 non-null  object 
 14  skills            1048575 non-null  object 
 15  Responsibilities  1048575 non-null  object 
 16  

#### *Displaying Summary Statistics of Numerical Columns*

In [8]:
# describe() summarises only the numeric columns by default.
job_salary_df.describe()

Unnamed: 0,Job Id,Company Size
count,1048575.0,1048575.0
mean,1549784000000000.0,73705.91
std,894804600000000.0,35304.37
min,181795000000.0,12646.0
25%,774612500000000.0,43121.0
50%,1549470000000000.0,73641.0
75%,2325065000000000.0,104317.5
max,3099620000000000.0,134834.0


## ***3 - Data Cleaning***

### ***Feature Selection - Dropping Unnecessary Columns***

#### *Dropping Job ID*

#### *Reasons*
* It Has No Predictive Value
* It Can Mislead the Model

In [9]:
# Remove the identifier column in place; it is not used as a feature.
job_salary_df.drop(columns=['Job Id'], inplace=True)

### ***Handling Missing Values***

#### *No Missing Value*

In [10]:
# Per-column count of missing values (isna is the same check as isnull).
job_salary_df.isna().sum()

Experience          0
Qualifications      0
Salary Range        0
location            0
Country             0
Work Type           0
Company Size        0
Job Posting Date    0
Preference          0
Job Title           0
Role                0
Job Description     0
Benefits            0
skills              0
Responsibilities    0
Company             0
dtype: int64

### ***Handling Duplicate Entries / Records***

#### *No Duplicate Entries / Records*

In [11]:
# Count rows that exactly repeat an earlier row.
print(job_salary_df.duplicated().sum())

0


## ***4 - Data Transformation***

#### *Transforming Experience Column*
#### *Details*
* Transforming Experience Column into 2 Columns i.e. Min Experience and Max Experience
* 1 to 5 Years -> Min Experience : 1 , Max Experience : 5
#### *Reasons*
* Numerical Format is Required for Model Training to Identify Patterns
* Preserve Complete Information by Keeping Both Lower Bound and Upper Bound
* Allows Model to Learn How Minimum and Maximum Experience Impact Salary Prediction  

In [12]:
# Inspect the distinct values to confirm they all follow the '<min> to <max> Years' pattern.
print(job_salary_df["Experience"].unique())

['5 to 15 Years' '2 to 12 Years' '0 to 12 Years' '4 to 11 Years'
 '1 to 12 Years' '4 to 12 Years' '3 to 15 Years' '2 to 8 Years'
 '2 to 9 Years' '1 to 10 Years' '3 to 10 Years' '1 to 8 Years'
 '1 to 9 Years' '5 to 14 Years' '0 to 11 Years' '3 to 12 Years'
 '5 to 9 Years' '0 to 15 Years' '0 to 10 Years' '2 to 14 Years'
 '3 to 9 Years' '4 to 15 Years' '2 to 10 Years' '4 to 8 Years'
 '3 to 8 Years' '1 to 14 Years' '1 to 13 Years' '0 to 8 Years'
 '5 to 10 Years' '2 to 13 Years' '4 to 9 Years' '1 to 15 Years'
 '4 to 10 Years' '5 to 12 Years' '0 to 13 Years' '4 to 14 Years'
 '1 to 11 Years' '4 to 13 Years' '0 to 9 Years' '5 to 8 Years'
 '2 to 15 Years' '5 to 13 Years' '5 to 11 Years' '0 to 14 Years'
 '3 to 13 Years' '2 to 11 Years' '3 to 11 Years' '3 to 14 Years']


In [13]:
def transformExperience(exp):
    """Split an experience label such as '1 to 5 Years' into its two bounds.

    Parameters
    ----------
    exp : str
        Text containing at least two runs of digits.

    Returns
    -------
    tuple of (int, int)
        The first and second integers found in ``exp``, i.e. the minimum
        and maximum years of experience.

    Raises
    ------
    IndexError
        If ``exp`` contains fewer than two runs of digits.
    """
    bounds = [int(digits) for digits in re.findall(r'\d+', exp)]
    lower = bounds[0]
    upper = bounds[1]
    return lower, upper

In [14]:
# Parse every 'X to Y Years' label with transformExperience and build the
# two-column frame in one constructor call. This avoids creating a separate
# pd.Series object for each row, which is very slow on ~1M rows.
experience_bounds = pd.DataFrame(
    job_salary_df['Experience'].map(transformExperience).tolist(),
    index=job_salary_df.index,  # keep row alignment with the source frame
)
job_salary_df[['Min Experience', 'Max Experience']] = experience_bounds

In [15]:
# The raw text column is redundant once Min/Max Experience exist.
job_salary_df.drop(columns=['Experience'], inplace=True)

#### *Transforming Qualifications Column*
#### *Details*
* Applying One-Hot Encoding
#### *Reasons*
* No Natural Order in Qualifications
* Assuming a Strict Hierarchy might Mislead the Model
* Different Qualifications have Different Values
* Qualifications Do Not Necessarily Follow a Strict Linear Order in Terms of Salary
* The Relevance of a Qualification can depend on the Industry , the Job Role , and the Specific Company's Requirement

In [16]:
# Inspect the distinct qualification labels before encoding them.
print(job_salary_df["Qualifications"].unique())

['M.Tech' 'BCA' 'PhD' 'MBA' 'MCA' 'M.Com' 'BBA' 'B.Tech' 'B.Com' 'BA']


In [17]:
# One-hot encode Qualifications: each label becomes its own 'Qual_<label>' column
# and the original column is replaced.
job_salary_df = pd.get_dummies(
    job_salary_df,
    prefix={'Qualifications': 'Qual'},
    columns=['Qualifications'],
)


#### *Transforming Salary Range Column*
#### *Details*
* Transforming Salary Range Column into Salary Column by Computing the Mean of Minimum and Maximum Salary Range 
#### *Reasons*
* Simplifies Target Variable
* Removes Ambiguity and Uncertainty During Model Training by Giving a Specific Learnable Value
* Prevents Model Confusion by Providing a Single-Valued Target Variable
* Minimize Overfitting

In [18]:
# Number of distinct salary-range strings.
print(job_salary_df["Salary Range"].nunique())

561


In [19]:
# Pull the two numbers out of strings like '$59K-$99K' (values are in thousands),
# then scale them to plain currency units.
salary_bounds_k = job_salary_df['Salary Range'].str.extract(r'\$(\d+)K-\$(\d+)K')
job_salary_df[['min_salary', 'max_salary']] = salary_bounds_k.astype(float) * 1000

In [20]:
# The midpoint of the salary range is used as the single-valued target.
salary_midpoint = job_salary_df['min_salary'].add(job_salary_df['max_salary']).div(2)
job_salary_df['Salary'] = salary_midpoint

In [21]:
# The range text and its two helper columns are no longer needed once Salary exists.
salary_helper_columns = ['Salary Range', 'min_salary', 'max_salary']
job_salary_df.drop(salary_helper_columns, axis='columns', inplace=True)

#### *Transforming Work Type Column*
#### *Details*
* Applying Custom Label Encoding
#### *Reasons*
* Avoids Unnecessary Columns and Reduces Model Complexity
* Captures Ordinal Relationship

In [22]:
# Inspect the distinct work types before defining the custom order.
print(job_salary_df["Work Type"].unique())

['Intern' 'Temporary' 'Full-Time' 'Contract' 'Part-Time']


In [23]:
# Work types listed in the order chosen for the encoding; a label's position
# in this list is its integer code (Intern -> 0 ... Full-Time -> 4).
work_type_order = ['Intern', 'Part-Time', 'Temporary', 'Contract', 'Full-Time']
work_type_mapping = {label: code for code, label in enumerate(work_type_order)}

# Replace each label by its code; labels missing from the mapping become NaN.
job_salary_df['Work Type'] = job_salary_df['Work Type'].map(work_type_mapping)

#### *Transforming Preference Column*
#### *Details*
* Applying Label Encoding
#### *Reasons*
* Low Cardinality as Preference Column contains only 3 Unique Categories
* Avoids Unnecessary Columns and Reduces Model Complexity
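* Note: Label Encoding Assigns Codes Alphabetically (Both = 0, Female = 1, Male = 2); Preference has No Natural Order, so the Codes are Arbitrary Identifiers rather than an Ordinal Relationship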
* Faster Computation

In [24]:
# Inspect the distinct preference labels before encoding them.
print(job_salary_df["Preference"].unique())

['Female' 'Male' 'Both']


In [25]:
# LabelEncoder numbers the labels in sorted (alphabetical) order, so here
# Both -> 0, Female -> 1, Male -> 2; the codes are identifiers, not a ranking.
job_salary_df['Preference'] = le.fit_transform(job_salary_df['Preference'])

#### *Transforming Job Posting Date Column*
#### *Details*
* Job Posting Date Column is Transformed into 2 Columns : 
1.  Job Post Age (Days)
2. Job Posting Year
#### *Reasons*
* Numeric Format for Models
* Captures Market Trends and Time Relevance
* Older Job Posts might offer Less Salaries
* Job Post Age (Days) - Gives a Continuous Numeric Feature Showing How Old the Job Posting Is
* Job Posting Year - Captures Year-Wise Hiring Patterns or Trends

In [26]:
# Dates in this dataset are written month/day/year (e.g. '4/24/2022'). Stating
# the format explicitly removes any day-first / month-first ambiguity and avoids
# relying on pandas' format inference across ~1M rows.
job_salary_df['Job Posting Date'] = pd.to_datetime(
    job_salary_df['Job Posting Date'], format='%m/%d/%Y'
)

In [27]:
# Fixed reference ("snapshot") date so that Job Post Age is reproducible: with
# the wall-clock date every value would change each day the notebook is re-run.
# 2025-03-29 is the date consistent with the ages shown in this notebook's
# output (e.g. a posting dated 4/24/2022 has an age of 1070 days).
today = pd.to_datetime('2025-03-29')
job_salary_df['Job Post Age (Days)'] = (today - job_salary_df['Job Posting Date']).dt.days

In [28]:
# Calendar year of the posting, read through the datetime accessor.
job_salary_df['Job Posting Year'] = job_salary_df['Job Posting Date'].dt.year


In [29]:
# The raw date is dropped now that age and year have been derived from it.
job_salary_df.drop('Job Posting Date', axis='columns', inplace=True)

#### *Transforming Company Column*
#### *Details*
* Applying Target Mean Encoding
#### *Reasons*
* Directly Encodes Relationship with Target Variable
* Reduces High Cardinality Issues
* Does Not Falsely Impose Order in Companies
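* Improves Model Accuracy
* Caveat: The Means Should be Computed on the Training Split Only; Computing Them on the Full Dataset Leaks Target Information into Evaluation Rows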

In [30]:
# Cardinality check: number of distinct companies.
print(job_salary_df["Company"].nunique())

888


In [31]:
company_salary_mean = job_salary_df.groupby('Company')['Salary'].mean()

job_salary_df['Company Encoded'] = job_salary_df['Company'].map(company_salary_mean)

In [32]:
# The text column is replaced by its numeric encoding.
job_salary_df.drop(columns=['Company'], inplace=True)

#### *Transforming Job Title and Role Column*
#### *Details*
* Combining Job Title Column and Role Column into Job Profile Column
* Applying Target Mean Encoding on Job Profile

#### *Reasons*
* Directly Encodes Relationship with Target Variable
* Reduces High Cardinality Issues
* Does Not Falsely Impose Order in Roles
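* Improves Model Accuracy
* Caveat: The Means Should be Computed on the Training Split Only; Computing Them on the Full Dataset Leaks Target Information into Evaluation Rows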

In [33]:
# Combine title and role into one label of the form '<Job Title> - <Role>'.
title_part = job_salary_df['Job Title']
role_part = job_salary_df['Role']
job_salary_df['Job Profile'] = title_part + ' - ' + role_part

In [34]:
# Cardinality check: number of distinct title/role combinations.
print(job_salary_df["Job Profile"].nunique())

376


In [35]:
role_salary_mean = job_salary_df.groupby('Job Profile')['Salary'].mean()

job_salary_df['Job Profile Encoded'] = job_salary_df['Job Profile'].map(role_salary_mean)

In [36]:
# The text columns are dropped now that the profile has a numeric encoding.
profile_text_columns = ['Job Profile', 'Job Title', 'Role']
job_salary_df.drop(profile_text_columns, axis='columns', inplace=True)

In [37]:
# Preview the frame after the transformations applied so far.
job_salary_df.head()

Unnamed: 0,location,Country,Work Type,Company Size,Preference,Job Description,Benefits,skills,Responsibilities,Min Experience,...,Qual_M.Com,Qual_M.Tech,Qual_MBA,Qual_MCA,Qual_PhD,Salary,Job Post Age (Days),Job Posting Year,Company Encoded,Job Profile Encoded
0,Douglas,Isle of Man,0,26801,1,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",5,...,False,True,False,False,False,79000.0,1070,2022,82513.178914,82384.347345
1,Ashgabat,Turkmenistan,0,100340,1,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",2,...,False,False,False,False,False,86000.0,831,2022,82470.489039,82545.395869
2,Macao,"Macao SAR, China",2,84525,2,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,0,...,False,False,False,False,True,82500.0,927,2022,82556.078767,82675.043706
3,Porto-Novo,Benin,4,129896,1,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",4,...,False,False,False,False,True,78000.0,763,2023,82318.065068,82830.108011
4,Santiago,Chile,0,53944,1,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,1,...,False,False,True,False,False,75500.0,900,2022,82476.5625,82736.552028
