In [144]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [145]:
# Load the job-postings dataset from "job_descriptions.csv" (a path relative to the
# notebook's working directory) into the DataFrame `data` that the later cells operate on.
data = pd.read_csv("job_descriptions.csv")

In [146]:
# Print a structural summary of `data`: the index range, and for every column its
# non-null count and dtype (info() also reports the frame's memory usage).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615940 entries, 0 to 1615939
Data columns (total 23 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Job Id            1615940 non-null  int64  
 1   Experience        1615940 non-null  object 
 2   Qualifications    1615940 non-null  object 
 3   Salary Range      1615940 non-null  object 
 4   location          1615940 non-null  object 
 5   Country           1615940 non-null  object 
 6   latitude          1615940 non-null  float64
 7   longitude         1615940 non-null  float64
 8   Work Type         1615940 non-null  object 
 9   Company Size      1615940 non-null  int64  
 10  Job Posting Date  1615940 non-null  object 
 11  Preference        1615940 non-null  object 
 12  Contact Person    1615940 non-null  object 
 13  Contact           1615940 non-null  object 
 14  Job Title         1615940 non-null  object 
 15  Role              1615940 non-null  object 
 16  

In [147]:
# Count the missing values in each column (one row per column in the result).
data.isna().sum()

Job Id                 0
Experience             0
Qualifications         0
Salary Range           0
location               0
Country                0
latitude               0
longitude              0
Work Type              0
Company Size           0
Job Posting Date       0
Preference             0
Contact Person         0
Contact                0
Job Title              0
Role                   0
Job Portal             0
Job Description        0
Benefits               0
skills                 0
Responsibilities       0
Company                0
Company Profile     5478
dtype: int64

In [148]:
# Drop the rows whose "Company Profile" is missing (the only column with nulls in the
# check above: 5478 rows) so that later steps work on complete records only.
# inplace=True mutates `data` itself; the removed rows are only recoverable by reloading the CSV.
data.dropna(subset = ['Company Profile'], inplace= True)

In [149]:
# "Experience" values look like "5 to 15 Years". Splitting on spaces gives
# ["5", "to", "15", "Years"]; token 0 is the minimum and token 2 the maximum number
# of years. The unnecessary words are discarded. Both new columns still hold strings.
split_exp = data["Experience"].str.split(" ")
min_exp, max_exp = split_exp.str.get(0), split_exp.str.get(2)
data["min_exp"], data["max_exp"] = min_exp, max_exp
data

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile,min_exp,max_exp
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie...",5,15
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com...",2,12
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P...",0,12
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O...",4,11
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ...",1,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615935,134563577088850,0 to 12 Years,B.Tech,$64K-$114K,"Malabo (de jure),",Equatorial Guinea,1.6508,10.2679,Full-Time,18281,...,Mechanical Design Engineer,ZipRecruiter,Mechanical Design Engineers create and develop...,"{'Employee Assistance Programs (EAP), Tuition ...","Mechanical engineering CAD software (e.g., Sol...","Design mechanical systems, components, and pro...",The Hershey Company,"{""Sector"":""Food and Beverage/Confectionery"",""I...",0,12
1615936,618604818190827,2 to 14 Years,M.Tech,$62K-$130K,Warsaw,Poland,51.9194,19.1451,Intern,63621,...,IT Director,USAJOBS,An IT Director oversees an organizations IT de...,"{'Health Insurance, Retirement Plans, Paid Tim...",Strategic IT planning Leadership and managemen...,Provide strategic leadership for IT department...,EQT,"{""Sector"":""Energy"",""Industry"":""Energy"",""City"":...",2,14
1615937,615471367712200,4 to 15 Years,BCA,$60K-$96K,Ashgabat,Turkmenistan,38.9697,59.5563,Part-Time,114287,...,Mechanical Design Engineer,Indeed,Mechanical Design Engineers create and develop...,"{'Tuition Reimbursement, Stock Options or Equi...","Mechanical engineering CAD software (e.g., Sol...","Design mechanical systems, components, and pro...",KLA,"{""Sector"":""Technology"",""Industry"":""Semiconduct...",4,15
1615938,804137342023945,5 to 15 Years,BCA,$65K-$103K,Ouagadougou,Burkina Faso,12.2383,-1.5616,Full-Time,45009,...,Training Coordinator,Stack Overflow Jobs,Training Coordinators design and implement emp...,"{'Casual Dress Code, Social and Recreational A...",Training program coordination Training materia...,"Coordinate employee training programs, track t...",Mahindra & Mahindra,"{""Sector"":""Automotive"",""Industry"":""Automotive""...",5,15


In [150]:
# Split each "Salary Range" string (e.g. "$59K-$99K") on "-" into its lower and upper
# bound, then strip the "$" and expand the "K" suffix to "000" so "$59K" becomes "59000".
# The new columns still hold strings here; they are cast to integers in the later
# dtype-conversion cell.
split_salary = data["Salary Range"].str.split("-")

min_salary = split_salary.str[0]
max_salary = split_salary.str[1]

# regex=False is passed explicitly: "$" is the end-of-string anchor in a regular
# expression, and relying on the default `regex` setting is what made pandas emit a
# warning for these calls. A literal replacement is what is intended.
data["min_salary"] = (
    min_salary
    .str.replace("$", "", regex=False)
    .str.replace("K", "000", regex=False)
)

data["max_salary"] = (
    max_salary
    .str.replace("$", "", regex=False)
    .str.replace("K", "000", regex=False)
)

data

  data["min_salary"] = data["min_salary"].str.replace("$", "")
  data["max_salary"] = data["max_salary"].str.replace("$", "")


Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Job Description,Benefits,skills,Responsibilities,Company,Company Profile,min_exp,max_exp,min_salary,max_salary
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie...",5,15,59000,99000
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com...",2,12,56000,116000
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P...",0,12,61000,104000
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O...",4,11,65000,91000
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ...",1,12,64000,87000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615935,134563577088850,0 to 12 Years,B.Tech,$64K-$114K,"Malabo (de jure),",Equatorial Guinea,1.6508,10.2679,Full-Time,18281,...,Mechanical Design Engineers create and develop...,"{'Employee Assistance Programs (EAP), Tuition ...","Mechanical engineering CAD software (e.g., Sol...","Design mechanical systems, components, and pro...",The Hershey Company,"{""Sector"":""Food and Beverage/Confectionery"",""I...",0,12,64000,114000
1615936,618604818190827,2 to 14 Years,M.Tech,$62K-$130K,Warsaw,Poland,51.9194,19.1451,Intern,63621,...,An IT Director oversees an organizations IT de...,"{'Health Insurance, Retirement Plans, Paid Tim...",Strategic IT planning Leadership and managemen...,Provide strategic leadership for IT department...,EQT,"{""Sector"":""Energy"",""Industry"":""Energy"",""City"":...",2,14,62000,130000
1615937,615471367712200,4 to 15 Years,BCA,$60K-$96K,Ashgabat,Turkmenistan,38.9697,59.5563,Part-Time,114287,...,Mechanical Design Engineers create and develop...,"{'Tuition Reimbursement, Stock Options or Equi...","Mechanical engineering CAD software (e.g., Sol...","Design mechanical systems, components, and pro...",KLA,"{""Sector"":""Technology"",""Industry"":""Semiconduct...",4,15,60000,96000
1615938,804137342023945,5 to 15 Years,BCA,$65K-$103K,Ouagadougou,Burkina Faso,12.2383,-1.5616,Full-Time,45009,...,Training Coordinators design and implement emp...,"{'Casual Dress Code, Social and Recreational A...",Training program coordination Training materia...,"Coordinate employee training programs, track t...",Mahindra & Mahindra,"{""Sector"":""Automotive"",""Industry"":""Automotive""...",5,15,65000,103000


In [151]:
# Manually recode the Preference labels as digit strings:
#   "Male" -> "0", "Female" -> "1", "Both" -> "2".
# The replacements run in this order. The match is case-sensitive, so "Male" does not
# match inside "Female". The column stays a string column here; it is cast to an
# integer dtype in the later dtype-conversion cell.
for label, code in (("Male", "0"), ("Female", "1"), ("Both", "2")):
    data["Preference"] = data["Preference"].str.replace(label, code)

In [152]:
# Encode the categorical text columns as integer codes so they can be processed as
# numeric data. LabelEncoder numbers the distinct values 0..n-1 in sorted order, so the
# codes are arbitrary identifiers, not a meaningful ranking.
#
# A separate fitted encoder is kept per source column in `label_encoders`. Refitting a
# single encoder for every column would overwrite its classes_, and the code -> label
# mapping for the earlier columns could then no longer be recovered (for example with
# label_encoders["Work Type"].inverse_transform(...)).
label_encoders = {}
for source_col, encoded_col in (
    ("Work Type", "encoded_wt"),
    ("Qualifications", "encoded_qual"),
    ("Job Portal", "encoded_portal"),
):
    label_encoder = LabelEncoder()
    data[encoded_col] = label_encoder.fit_transform(data[source_col])
    label_encoders[source_col] = label_encoder
# After the loop `label_encoder` refers to the encoder fitted on "Job Portal".

In [153]:
# Remove the identifier, contact and free-text columns, plus the source columns that
# were already split or encoded into new columns in earlier cells.
to_drop = [
    "Job Id",
    "Job Posting Date",
    "Contact Person",
    "Contact",
    "Experience",
    "Salary Range",
    "Company Profile",
    "Responsibilities",
    "skills",
    "Benefits",
    "Job Description",
    "Job Portal",
    "Qualifications",
    "Work Type",
]
data.drop(columns=to_drop, inplace=True)
data

Unnamed: 0,location,Country,latitude,longitude,Company Size,Preference,Job Title,Role,Company,min_exp,max_exp,min_salary,max_salary,encoded_wt,encoded_qual,encoded_portal
0,Douglas,Isle of Man,54.2361,-4.5481,26801,1,Digital Marketing Specialist,Social Media Manager,Icahn Enterprises,5,15,59000,99000,2,6,11
1,Ashgabat,Turkmenistan,38.9697,59.5563,100340,1,Web Developer,Frontend Web Developer,PNC Financial Services Group,2,12,56000,116000,2,4,4
2,Macao,"Macao SAR, China",22.1987,113.5439,84525,0,Operations Manager,Quality Control Manager,United Services Automobile Assn.,0,12,61000,104000,4,9,7
3,Porto-Novo,Benin,9.3077,2.3158,129896,1,Network Engineer,Wireless Network Engineer,Hess,4,11,65000,91000,1,9,2
4,Santiago,Chile,-35.6751,-71.5429,53944,1,Event Manager,Conference Manager,Cairn Energy,1,12,64000,87000,2,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615935,"Malabo (de jure),",Equatorial Guinea,1.6508,10.2679,18281,2,Mechanical Engineer,Mechanical Design Engineer,The Hershey Company,0,12,64000,114000,1,1,15
1615936,Warsaw,Poland,51.9194,19.1451,63621,0,IT Manager,IT Director,EQT,2,14,62000,130000,2,6,14
1615937,Ashgabat,Turkmenistan,38.9697,59.5563,114287,1,Mechanical Engineer,Mechanical Design Engineer,KLA,4,15,60000,96000,3,4,5
1615938,Ouagadougou,Burkina Faso,12.2383,-1.5616,45009,1,HR Coordinator,Training Coordinator,Mahindra & Mahindra,5,15,65000,103000,1,4,12


In [154]:
# Convert columns to compact dtypes, both to reduce memory usage and to make the
# string-valued numeric columns usable as numbers.
#
# "int32" is spelled out instead of "int": the width of "int" depends on the platform
# and NumPy version, so the memory saving would not be reproducible everywhere.
int_columns = [
    "min_exp",
    "max_exp",
    "min_salary",
    "max_salary",
    "encoded_qual",
    "encoded_wt",
    "encoded_portal",
    "Preference",
]
# Text columns are stored as pandas categoricals instead of Python string objects.
category_columns = ["location", "Country", "Company", "Role", "Job Title"]

for column in int_columns:
    data[column] = data[column].astype("int32")

for column in category_columns:
    data[column] = data[column].astype("category")

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1610462 entries, 0 to 1615939
Data columns (total 16 columns):
 #   Column          Non-Null Count    Dtype   
---  ------          --------------    -----   
 0   location        1610462 non-null  category
 1   Country         1610462 non-null  category
 2   latitude        1610462 non-null  float64 
 3   longitude       1610462 non-null  float64 
 4   Company Size    1610462 non-null  int64   
 5   Preference      1610462 non-null  int32   
 6   Job Title       1610462 non-null  category
 7   Role            1610462 non-null  category
 8   Company         1610462 non-null  category
 9   min_exp         1610462 non-null  int32   
 10  max_exp         1610462 non-null  int32   
 11  min_salary      1610462 non-null  int32   
 12  max_salary      1610462 non-null  int32   
 13  encoded_wt      1610462 non-null  int32   
 14  encoded_qual    1610462 non-null  int32   
 15  encoded_portal  1610462 non-null  int32   
dtypes: category(5), fl

In [155]:
# Relative size reduction achieved by the preprocessing, as a percentage.
# NOTE(review): 283.6 and 113.7 are hard-coded; presumably they are the memory usage
# figures (MB) reported by data.info() before and after the conversion. Confirm against
# the full outputs.
size_before = 283.6
size_after = 113.7
percent_improvement = (1 - (size_after / size_before)) * 100
print(f'\nThere was a {round(percent_improvement, 2)}% reduction in file size.')


There was a 59.91% reduction in file size.
