In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Read the salary dataset (path is relative to the notebook's working
# directory) and preview the first rows.
csv_path = 'DataScience_salaries_2024.csv'
data = pd.read_csv(csv_path)
data.head()


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2021,MI,FT,Data Scientist,30400000,CLP,40038,CL,100,CL,L
1,2021,MI,FT,BI Data Analyst,11000000,HUF,36259,HU,50,US,L
2,2020,MI,FT,Data Scientist,11000000,HUF,35735,HU,50,HU,L
3,2021,MI,FT,ML Engineer,8500000,JPY,77364,JP,50,JP,S
4,2022,SE,FT,Lead Machine Learning Engineer,7500000,INR,95386,IN,50,IN,L


In [2]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14838 entries, 0 to 14837
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           14838 non-null  int64 
 1   experience_level    14838 non-null  object
 2   employment_type     14838 non-null  object
 3   job_title           14838 non-null  object
 4   salary              14838 non-null  int64 
 5   salary_currency     14838 non-null  object
 6   salary_in_usd       14838 non-null  int64 
 7   employee_residence  14838 non-null  object
 8   remote_ratio        14838 non-null  int64 
 9   company_location    14838 non-null  object
 10  company_size        14838 non-null  object
dtypes: int64(4), object(7)
memory usage: 1.2+ MB


In [3]:
# Keep only the columns used below: two categorical features, the employment
# type (needed for filtering) and the USD salary.
kept_columns = [
    'employee_residence',
    'experience_level',
    'employment_type',
    'salary_in_usd',
]
data = data[kept_columns]
data.head()

Unnamed: 0,employee_residence,experience_level,employment_type,salary_in_usd
0,CL,MI,FT,40038
1,HU,MI,FT,36259
2,HU,MI,FT,35735
3,JP,MI,FT,77364
4,IN,SE,FT,95386


In [4]:
# Remove every row that has a missing value, then show the per-column null
# count to confirm nothing is left.
data = data.dropna()
null_counts = data.isnull().sum()
null_counts

employee_residence    0
experience_level      0
employment_type       0
salary_in_usd         0
dtype: int64

In [5]:
# Restrict to full-time rows ('FT'); the column is constant after the filter,
# so it is dropped.
full_time_mask = data['employment_type'] == 'FT'
data = data[full_time_mask].drop(columns='employment_type')
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14772 entries, 0 to 14836
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   employee_residence  14772 non-null  object
 1   experience_level    14772 non-null  object
 2   salary_in_usd       14772 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 461.6+ KB


In [7]:
# Count rows per residence code to see which residences have few samples.
data['employee_residence'].value_counts()

employee_residence
US    12899
GB      646
CA      387
ES      130
DE       87
      ...  
CZ        1
MU        1
BG        1
IQ        1
ID        1
Name: count, Length: 83, dtype: int64

In [9]:
def shorten_categories(categories, cutoff, other_label='other'):
    """Build a mapping that collapses infrequent categories into one bucket.

    Parameters
    ----------
    categories
        Object exposing parallel ``.index`` (category labels) and ``.values``
        (their counts), e.g. the pandas Series returned by
        ``Series.value_counts()``.
    cutoff
        Minimum count a category needs in order to keep its own label.
    other_label
        Replacement label for every category whose count is below ``cutoff``.
        Defaults to ``'other'``.

    Returns
    -------
    dict
        Maps each label to itself when ``count >= cutoff`` and to
        ``other_label`` otherwise. An empty input yields an empty dict.
    """
    # Walk labels and counts in lockstep instead of indexing both by position.
    return {
        label: (label if count >= cutoff else other_label)
        for label, count in zip(categories.index, categories.values)
    }
          

In [11]:
# Residences with fewer than 15 rows are pooled under the label 'other'.
residence_counts = data['employee_residence'].value_counts()
employee_residence_map = shorten_categories(residence_counts, 15)
data['employee_residence'] = data['employee_residence'].map(employee_residence_map)
data['employee_residence'].value_counts()


employee_residence
US       12899
GB         646
CA         387
other      275
ES         130
DE          87
IN          72
FR          64
AU          50
PT          30
NL          27
BR          23
IT          18
GR          17
CO          16
LT          16
ZA          15
Name: count, dtype: int64

In [12]:
# Keep salaries within [5000, 250000] USD and discard the pooled 'other'
# residences, using a single combined row mask.
salary = data['salary_in_usd']
keep_rows = (
    (salary <= 250000)
    & (salary >= 5000)
    & (data['employee_residence'] != 'other')
)
data = data[keep_rows]

In [13]:
# List the distinct experience-level codes that remain after filtering.
data['experience_level'].unique()

array(['SE', 'EX', 'MI', 'EN'], dtype=object)

In [19]:
from sklearn.preprocessing import LabelEncoder
# Fit one encoder per categorical column and keep both objects: the same
# fitted encoders are needed later to encode new samples for prediction.
# LabelEncoder assigns integer codes in sorted label order.
le_experience_level = LabelEncoder()
le_employee_residence = LabelEncoder()
# `data` comes from boolean filtering, so build a new frame with `assign`
# instead of writing columns into it in place (which can raise
# SettingWithCopyWarning).
data = data.assign(
    experience_level=le_experience_level.fit_transform(data['experience_level']),
    employee_residence=le_employee_residence.fit_transform(data['employee_residence']),
)

In [22]:
# Separate the salary target from the encoded feature columns.
target_column = 'salary_in_usd'
y = data[target_column]
x = data.drop(columns=target_column)

In [23]:
from sklearn.linear_model import LinearRegression
# Baseline model: linear regression fitted on the two integer-encoded
# categorical features. The model treats those codes as numeric values.
linear_reg = LinearRegression()
linear_reg.fit(x,y.values)

In [24]:
# Predict on the same rows the model was fitted on (no held-out split is made).
y_pred = linear_reg.predict(x)

In [26]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
# Root-mean-squared error in USD. Because y_pred was produced from the
# training rows, this is a training error, not a generalisation estimate.
error = np.sqrt(mean_squared_error(y,y_pred))

In [27]:
# Display the RMSE of the linear model.
error

47213.58589622816

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
# Candidate tree depths: unlimited (None) plus every even depth from 2 to 12.
max_depth = [None] + list(range(2, 13, 2))
parameters = dict(max_depth=max_depth)
regressor = DecisionTreeRegressor(random_state=0)
# Search over the depths, scoring each candidate by negative mean squared error.
gs = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
)
gs.fit(x, y.values)

In [33]:
# GridSearchCV refits the best parameter setting on the full data passed to
# gs.fit (refit=True is the default and was not overridden when `gs` was
# built), so `best_estimator_` is already trained and needs no second fit.
regressor = gs.best_estimator_
y_pred = regressor.predict(x)
# RMSE on the rows the tree was trained on; this is not a held-out estimate.
error = np.sqrt(mean_squared_error(y,y_pred))


In [35]:
# Display the RMSE of the tuned decision tree.
error

44244.51969465634

In [36]:
# Inspect the encoded feature frame used for fitting.
x

Unnamed: 0,employee_residence,experience_level
4,9,3
6,9,3
8,9,1
10,9,2
11,9,0
...,...,...
14816,8,2
14819,9,0
14820,13,0
14824,8,0


In [53]:
# One raw sample as [employee_residence, experience_level].
# Note: this rebinds `x`, which previously held the training feature frame.
x = np.array([['US', 'EX',]])
x

array([['US', 'EX']], dtype='<U2')

In [54]:
# Encode each raw column with its fitted encoder and assemble the numeric
# matrix directly. Writing the integer codes back into the fixed-width string
# array would silently truncate any code wider than the array's character
# width and would round-trip the numbers through strings.
residence_codes = le_employee_residence.transform(x[:, 0])
experience_codes = le_experience_level.transform(x[:, 1])
x = np.column_stack([residence_codes, experience_codes]).astype(float)
x

array([[14.,  1.]])

In [55]:
# Predict the salary for the single encoded sample.
# NOTE(review): the tree was fitted on a DataFrame but `x` is now a plain
# array, so scikit-learn may warn about missing feature names — confirm.
y_pred = regressor.predict(x)
y_pred



array([176102.76751592])

In [56]:
import pickle


In [57]:
# Bundle the fitted tree and both label encoders so that inference code can
# restore them together from a single file.
data = dict(
    model=regressor,
    le_employee_residence=le_employee_residence,
    le_experience_level=le_experience_level,
)
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [58]:
# Reload the bundle written above. pickle.load can execute arbitrary code, so
# only use it on files this notebook produced itself.
with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

# Unpack the three saved objects in one step.
regressor_loaded, le_employee_residence, le_experience_level = (
    data[key]
    for key in ('model', 'le_employee_residence', 'le_experience_level')
)


In [59]:
# Sanity check: the reloaded model should give the same prediction as the
# in-memory model for the same encoded sample.
y_pred = regressor_loaded.predict(x)
y_pred



array([176102.76751592])