## Data pre-processing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('../Dataset/SO_survey_results_19-20.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../Dataset/SO_survey_results_19-20.csv'

In [None]:
df.head()

In [None]:
df = df[["Country", "EdLevel", "YearsCodePro", "Employment", "ConvertedComp"]] ##these are the only columns required for the model
df = df.rename({"ConvertedComp": "Salary"}, axis=1) ## ConvertedComp is salary in native currency converted to UsD
df.head()

In [None]:
df = df[df["Salary"].notnull()] ##we do not want null values.
df.head()

In [None]:
df.info()

In [None]:
df = df.dropna() ##To drop any data in above columns which is not a number.
df.isnull().sum()

In [None]:
df = df[df["Employment"] == "Employed full-time"] ##Changing data in such way that only data that remains is "employed full-time"
df = df.drop("Employment", axis=1) ##Droping the Employment column as it is not required.
df.info()

In [None]:
df['Country'].value_counts()

In [None]:
## This func combines the categorical data that doesnt meet our cutoff.
def shorten_categories(categories, cutoff):
    categorical_map = {}
    for i in range(len(categories)):
        if categories.values[i] >= cutoff:
            categorical_map[categories.index[i]] = categories.index[i]
        else:
            categorical_map[categories.index[i]] = 'Other'
    return categorical_map

In [None]:
country_map = shorten_categories(df.Country.value_counts(), 400)
df['Country'] = df['Country'].map(country_map)
df.Country.value_counts()

In [None]:
fig, ax = plt.subplots(1,1, figsize=(12, 7))
df.boxplot('Salary', 'Country', ax=ax) ##This will show us the medians and the outliers.
plt.suptitle('Salary (US$) v Country')
plt.title('')
plt.ylabel('Salary')
plt.xticks(rotation=90)
plt.show()

In [None]:
df = df[df["Salary"] <= 120000] ##Above 120k dollars/anum is creating outliers.
df = df[df["Salary"] >= 10000] 
df = df[df['Country'] != 'Other']

In [None]:
fig, ax = plt.subplots(1,1, figsize=(12, 7))
df.boxplot('Salary', 'Country', ax=ax)
plt.suptitle('Salary (US$) v Country')
plt.title('')
plt.ylabel('Salary')
plt.xticks(rotation=90)
plt.show()

In [None]:
df["YearsCodePro"].unique()

In [None]:
##The values need to be in float
def clean_experience(x):
    if x ==  'More than 50 years':
        return 50
    if x == 'Less than 1 year':
        return 0.5   
    return float(x)

df['YearsCodePro'] = df['YearsCodePro'].apply(clean_experience)

In [None]:
df["YearsCodePro"].unique()

In [None]:
df["EdLevel"].unique()

In [None]:
##Edlevel catagories need to be simplified.
def clean_education(x):
    if 'Bachelor’s degree' in x:
        return 'Bachelor’s degree'
    if 'Master’s degree' in x:
        return 'Master’s degree'
    if 'Professional degree' in x or 'Other doctoral' in x:
        return 'Post grad'
    return 'Less than a Bachelors'

df['EdLevel'] = df['EdLevel'].apply(clean_education)

In [None]:
df["EdLevel"].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder as LE 

In [21]:
le_education = LE()
df['EdLevel'] = le_education.fit_transform(df['EdLevel'])
df['EdLevel'].unique() ##now model can understand the values as they are in integer format

array([0, 2, 1, 3])

In [22]:
le_country = LE()
df['Country'] = le_country.fit_transform(df['Country'])
df['Country'].unique()

array([35, 34, 29, 20, 11,  6,  3, 18,  4, 10, 12,  7,  5, 13, 26, 25, 14,
       19, 17,  9, 30,  2, 32, 24,  1,  0, 15, 22, 23, 33,  8, 16, 27, 28,
       31, 21])

In [23]:
x = df.drop('Salary', axis = 1)
y = df['Salary']
x , y

(        Country  EdLevel  YearsCodePro
 7            35        0          13.0
 9            34        2           4.0
 10           34        0           2.0
 11           29        1           7.0
 12           20        1          20.0
 ...         ...      ...           ...
 152773       35        0          11.0
 152779       24        0           2.0
 152787        9        2           7.0
 152789        2        2           9.0
 152791       12        0           5.0
 
 [50662 rows x 3 columns],
 7         116000.0
 9          32315.0
 10         40070.0
 11         14268.0
 12         38916.0
             ...   
 152773    120000.0
 152779     16032.0
 152787     82488.0
 152789     68745.0
 152791     22915.0
 Name: Salary, Length: 50662, dtype: float64)

In [24]:
from sklearn.model_selection import train_test_split as tts

In [25]:
train_X , test_X , train_Y , test_Y =tts(x,y,test_size=0.3, random_state=42)

## Model Selection & Creation

In [26]:
from sklearn.linear_model import LinearRegression as LR
linear_reg = LR()
linear_reg.fit(x, y.values)

LinearRegression()

In [27]:
y_pred = linear_reg.predict(x)

In [28]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
error = np.sqrt(mean_squared_error(y, y_pred))
error ##Error rate using linear regression

26443.01838008953

In [29]:
from sklearn.tree import DecisionTreeRegressor as DTR
dec_tree_reg = DTR(random_state = 0)
dec_tree_reg.fit(x, y.values)

DecisionTreeRegressor(random_state=0)

In [30]:
y_pred = dec_tree_reg.predict(x)

In [31]:
error = np.sqrt(mean_squared_error(y, y_pred))
error ##Error rate using Decision tree

16748.74747061543

In [32]:
from sklearn.ensemble import RandomForestRegressor as RFR
ra_forest_reg = RFR(random_state = 0)
ra_forest_reg.fit(x, y.values)

RandomForestRegressor(random_state=0)

In [33]:
y_pred = ra_forest_reg.predict(x)

In [34]:
error = np.sqrt(mean_squared_error(y, y_pred))
error ##Error rate using Random forest

16785.43238028018

In [35]:
from sklearn.model_selection import GridSearchCV

max_depth = [None, 2,4,6,8,10,12]
param = {"max_depth": max_depth}

regressor = DTR(random_state = 0)
gs = GridSearchCV(regressor, param, scoring='neg_mean_squared_error')
gs.fit(x, y.values)

GridSearchCV(estimator=DecisionTreeRegressor(random_state=0),
             param_grid={'max_depth': [None, 2, 4, 6, 8, 10, 12]},
             scoring='neg_mean_squared_error')

In [36]:
regress = gs.best_estimator_

regress.fit(x, y.values)
y_pred = regress.predict(x)
error = np.sqrt(mean_squared_error(y, y_pred))
error ##Error rate after choosing best estimator using GridSearchcv

17191.024430172416

In [37]:
x.head()

Unnamed: 0,Country,EdLevel,YearsCodePro
7,35,0,13.0
9,34,2,4.0
10,34,0,2.0
11,29,1,7.0
12,20,1,20.0


In [38]:
testX = np.array([["France", "Master’s degree", 6]])
testX

array([['France', 'Master’s degree', '6']], dtype='<U15')

In [39]:
testX[:, 0] = le_country.transform(testX[:, 0])
testX[:, 1] = le_education.transform(testX[:, 1])
X = testX.astype(float)
X

array([[10.,  2.,  6.]])

In [40]:
y = regress.predict(X)
y ## TEST RESULT

array([51087.30729167])

# Storing Model

In [41]:
import pickle

In [42]:
data = {"model": regress, "le_country": le_country, "le_education": le_education}
with open('saved_steps.pkl', 'wb') as file:  ##saving model & label-encoders in a .pkl file
    pickle.dump(data, file)

In [43]:
with open('saved_steps.pkl', 'rb') as File:
    data = pickle.load(File)

In [44]:
loadReg = data["model"]
country = data["le_country"]
Edu = data["le_education"]

In [45]:
loadY = loadReg.predict(X)
loadY

array([51087.30729167])