# Prediction

In [12]:
#Data modules
import pandas as pd

In [13]:
# Load the raw jobs dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Data/data_science_jobs_dataset.csv')

In [14]:
#salary and salary_currency are both used by the regression, so the redundant
#salary_in_usd column is removed to avoid multicollinearity
df.drop(columns=['salary_in_usd'], inplace=True)

In [15]:
#Every variable except salary is treated as categorical (the author found this makes
#the decision tree more accurate), so the two numeric columns are cast to strings
df = df.astype({'work_year': "string", 'remote_ratio': "string"})

In [16]:
#There is almost no data on non full-time jobs, so the employment type column is
#removed and left out of the regression
df.drop(columns=['employment_type'], inplace=True)

In [17]:
#Job titles that appear 3 times or fewer are grouped under "other"
job_title_cat = (
    df.groupby('job_title')['job_title']
    .count()
    .sort_values(ascending=False)
)
job_title_cat_other = job_title_cat[job_title_cat.le(3)]
#The counts are indexed by job title, so the index holds the rare labels to replace
df['job_title'] = df['job_title'].mask(df['job_title'].isin(job_title_cat_other.index), 'other')

In [18]:
#Employee residences that appear 2 times or fewer are grouped under "other"
emp_res_cat = (
    df.groupby('employee_residence')['employee_residence']
    .count()
    .sort_values(ascending=False)
)
emp_res_cat_other = emp_res_cat[emp_res_cat.le(2)]
#The counts are indexed by residence, so the index holds the rare labels to replace
df['employee_residence'] = df['employee_residence'].mask(
    df['employee_residence'].isin(emp_res_cat_other.index), 'other'
)

In [19]:
#Company locations that appear 2 times or fewer are grouped under "other"
com_loc_cat = (
    df.groupby('company_location')['company_location']
    .count()
    .sort_values(ascending=False)
)
com_loc_cat_other = com_loc_cat[com_loc_cat.le(2)]
#The counts are indexed by location, so the index holds the rare labels to replace
df['company_location'] = df['company_location'].mask(
    df['company_location'].isin(com_loc_cat_other.index), 'other'
)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
#Split the data: 80% of it is used to train the model and the other 20% to test it.
#random_state is fixed so the same rows land in each split on every run, which keeps
#the reported score and the saved model reproducible.
train_data, test_data = train_test_split(df,test_size=0.2,random_state=42)

In [22]:
train = df.drop('salary',axis=1)
train_labels = df['salary']

In [23]:
#All columns are categorical, so we apply some categorical encoding to transform them into a binary representation.
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('one_hot_cat',OneHotEncoder())
])
 

train_pipelined = pipeline.fit_transform(train)

In [24]:
# Separate the test split into the salary target and the remaining feature columns.
test_labels = test_data['salary']
test = test_data.drop(columns=['salary'])

In [25]:
# Encode the test features with the already fitted pipeline; transform (rather than
# fit_transform) reuses the categories learned earlier instead of refitting.
test_pipelined = pipeline.transform(X=test)

In [26]:
# Fit a decision tree regressor on the one-hot encoded training features.
from sklearn.tree import DecisionTreeRegressor
# random_state is fixed because scikit-learn permutes the features at every split,
# so equally good splits could otherwise be chosen differently between runs.
tree_reg = DecisionTreeRegressor(random_state=42)

tree_reg.fit(train_pipelined,train_labels)

DecisionTreeRegressor()

In [27]:
# R^2 (coefficient of determination) of the tree's predictions on the test features and labels.
tree_reg.score(X=test_pipelined, y=test_labels)

0.9946181543112581

In [28]:
#Both the tree and the categorical encoding of our data are saved to files so they can be used in our Streamlit deployment
import pickle
# Context managers close each file as soon as it is written, so the handles are not
# left open for the lifetime of the kernel and the pickles are flushed to disk.
with open("Predictive_model/pipeline.p","wb") as pipeline_file:
    pickle.dump(pipeline, pipeline_file)
with open("Predictive_model/tree_regressor.p","wb") as tree_file:
    pickle.dump(tree_reg, tree_file)