In [88]:
import sys
sys.path.append('/Users/gscerberus/Desktop/Salary_Prediction')

In [89]:
import os
os.chdir('/Users/gscerberus/Desktop/Salary_Prediction')

In [90]:
import pandas as pd
from data_preprocessing import DataPreprocessor
import numpy as np
import pickle
from typing import Type, Dict, List
from descriptions import COMPANY_LOCATION, EMPLOYMENT_TYPE, EXPERIENCE_LEVEL, COMPANY_SIZE, EXPERIENCE_LEVEL_MAPPER, COMPANY_SIZE_MAPPER
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

# Model Performance
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error



In [91]:
df = pd.read_csv('salaries.csv')

# creating the DataPreprocessor instance
data_preprocessor = DataPreprocessor(df)
# Getting the countries with most records, removing countries that don't show us any significant information
data = data_preprocessor.top_10_countries("company_location")
# Selecting salaries between 10k and 250k to ignore potential outliers
data = data.select_salaries('salary_in_usd')
# reseting index
data = data.reset_index()

# performing some mapping (in the end company size and experience level are mapped in an ordinal way)
data.map_column('company_location', COMPANY_LOCATION)
data.map_column('employment_type', EMPLOYMENT_TYPE)
data.map_column('experience_level', EXPERIENCE_LEVEL)
data.map_column('company_size', COMPANY_SIZE)

# first getting the string values from columns, to not appear them as numbers in the app
experience_level = data.get_unique_values('experience_level')
company_size = data.get_unique_values('company_size')


data.map_column('experience_level', EXPERIENCE_LEVEL_MAPPER)
data.map_column('company_size', COMPANY_SIZE_MAPPER)

# dropping columns we don't need for our study
data = data.drop_unnecessary_columns(cols =  ['salary','salary_currency','employee_residence','work_year','remote_ratio'])


In [92]:
# Getting unique values of employment type, job title and company location to rename the columns after one hot encoding
# Instead of having company_location_USA we can just have USA or employment_type_Part-Time we can just have Part-Time

employment_type = data.get_unique_values('employment_type')
job_title = data.get_unique_values('job_title')
company_location = data.get_unique_values('company_location')


categorical = [employment_type, job_title, company_location]

In [93]:
# One hot encoding the columns

columns_to_one_hot_encode = ['employment_type','job_title','company_location']
data_one_hot_encoded = data.one_hot_encode_df(columns_to_one_hot_encode)

# renaming each column
for i, id in enumerate(categorical):
    data_one_hot_encoded.rename_columns(id, columns_to_one_hot_encode[i])

In [94]:
final_df = DataPreprocessor.concat_dataframes(data_one_hot_encoded.data,data.data)

In [95]:
final_df.print()

Unnamed: 0,Contract,Freelancer,Full-Time,Part-Time,AI Architect,AI Developer,AI Engineer,AI Programmer,AI Research Engineer,AI Scientist,...,Netherlands,Portugal,Spain,United States,experience_level,employment_type,job_title,salary_in_usd,company_location,company_size
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,Full-Time,Machine Learning Engineer,150000,United States,1
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,Full-Time,Machine Learning Engineer,50000,United States,1
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,Full-Time,Data Engineer,73824,Great Britain,1
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,Full-Time,Data Engineer,55368,Great Britain,1
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2,Full-Time,Data Science Engineer,220000,United States,1


In [96]:
print(type(final_df)) #we can convert it into a dataframe to perform the necessary changes

<class 'data_preprocessing.DataPreprocessor'>


In [97]:
# dropping columns that were used to be one hot encoded

final_df = final_df.drop_unnecessary_columns(columns_to_one_hot_encode)

In [98]:
final_df.data.head()

Unnamed: 0,Contract,Freelancer,Full-Time,Part-Time,AI Architect,AI Developer,AI Engineer,AI Programmer,AI Research Engineer,AI Scientist,...,Germany,Great Britain,India,Netherlands,Portugal,Spain,United States,experience_level,salary_in_usd,company_size
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,150000,1
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,50000,1
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,73824,1
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,55368,1
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,220000,1


In [99]:
df = final_df.data #now we are dealing with a dataframe

X = df.drop(columns='salary_in_usd', axis=1)
y = df['salary_in_usd']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb = XGBRegressor(lr=0.01, max_depth=3, n_estimators=100)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
y_pred_1 = xgb.predict(X_train)

print(mean_absolute_error(y_test, y_pred))
print(np.sqrt(mean_squared_error(y_test, y_pred)))
print(np.sqrt(mean_squared_error(y_train, y_pred_1)))  # XGB regressor will be used and no problems with overfitting

32211.31498555038
40058.96183362004
38509.11905721688


In [100]:
# Testing results in inference
X = {'company_location':'United States',
     'company_size': 'Large',
     'experience_level':'Senior-Level',
     'job_title':'AI Engineer',
     'employment_type':'Full Time'}


# Create a row with all encoded columns, setting the other country and job title columns to 0
encoded_columns = data_one_hot_encoded.data.columns
encoded_row = [1 if col == X['company_location'] or col == X['job_title'] or col == X['employment_type'] else 0 for col in encoded_columns]

# Create a new dataframe for prediction with the same column names
df_prediction = pd.DataFrame([encoded_row], columns=[encoded_columns])

# The df_prediction dataframe now contains one-hot encoding for the specified features
df_prediction['experience_level'] = X['experience_level']
df_prediction['company_size'] = X['company_size']

In [101]:
df_prediction

Unnamed: 0,Contract,Freelancer,Full-Time,Part-Time,AI Architect,AI Developer,AI Engineer,AI Programmer,AI Research Engineer,AI Scientist,...,France,Germany,Great Britain,India,Netherlands,Portugal,Spain,United States,experience_level,company_size
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,Senior-Level,Large


In [102]:
# mapping company size from Large to the number and senior level as well, so in the app will appear
# for ease of functionality Large/Medium/Small or Senior-Level/Executive-Level and so on, but to perform our
# predictions, we need them converted into numbers
df_prediction['experience_level'] = df_prediction['experience_level'].map(lambda x: EXPERIENCE_LEVEL_MAPPER.get(x, x))
df_prediction['company_size'] = df_prediction['company_size'].map(lambda x: COMPANY_SIZE_MAPPER.get(x, x))

In [103]:
df_prediction

Unnamed: 0,Contract,Freelancer,Full-Time,Part-Time,AI Architect,AI Developer,AI Engineer,AI Programmer,AI Research Engineer,AI Scientist,...,France,Germany,Great Britain,India,Netherlands,Portugal,Spain,United States,experience_level,company_size
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,2,2


In [104]:
# getting the inference prediction
y_pred = xgb.predict(df_prediction)

y_pred

array([125982.9], dtype=float32)

In [105]:
data = {"model":xgb,
        "encoded_columns":encoded_columns,
        "experience_level": experience_level,
        "company_size":company_size,
        "employment_type":employment_type,
        "job_title":job_title,
        "company_location":company_location}

with open("saved_steps.pkl", "wb") as file:
    pickle.dump(data, file)