# Data-Science Salaries 

### Importing Libraries

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
#from imblearn.over_sampling import SMOTE
import itertools
import warnings
warnings.filterwarnings('ignore')
import time

# Sklearn
# NOTE(review): several of the names imported below do not appear to be used in
# the cells visible in this notebook (the classification metrics, PCA, pipeline
# helpers, ...) — confirm against the rest of the notebook before pruning.
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
# NOTE(review): plot_confusion_matrix and plot_roc_curve were deprecated in
# scikit-learn 1.0 and removed in 1.2, so this line raises ImportError on current
# releases. Their replacements are ConfusionMatrixDisplay / RocCurveDisplay.
from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.utils import resample

In [51]:
# Read the salary records into a DataFrame; the path is resolved relative to
# the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='ds_salaries.csv')

### Cleaning The Data

In [52]:
# Show the distribution of experience levels before they are encoded. A bare
# expression in the middle of a cell is evaluated and thrown away, so it has to
# be passed to display() to actually appear in the output.
display(df['experience_level'].value_counts())

# Map the categorical codes to integers. The integers impose an ordering on the
# categories (EN < MI < SE < EX and FL < PT < FT < CT).
# NOTE(review): the ordering chosen for employment_type looks arbitrary —
# confirm it is intended, or one-hot encode that column instead.
#
# The explicit astype pins the integer dtype. Replacing strings with ints on an
# object column only produced an int column through pandas' silent downcasting,
# which newer pandas deprecates; without the cast the column may stay `object`
# and later be excluded from corr() and one-hot encoded by get_dummies().
# The cast also fails loudly if a value outside the mapping is present.
df['experience_level'] = (
    df['experience_level']
    .replace({'EN': 1, 'MI': 2, 'SE': 3, 'EX': 4})
    .astype('int64')
)
df['employment_type'] = (
    df['employment_type']
    .replace({'FL': 0, 'PT': 1, 'FT': 2, 'CT': 3})
    .astype('int64')
)
display(df)

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,2,2,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,3,2,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,3,2,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,2,2,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,3,2,Machine Learning Engineer,150000,USD,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...,...
602,602,2022,3,2,Data Engineer,154000,USD,154000,US,100,US,M
603,603,2022,3,2,Data Engineer,126000,USD,126000,US,100,US,M
604,604,2022,3,2,Data Analyst,129000,USD,129000,US,0,US,M
605,605,2022,3,2,Data Analyst,150000,USD,150000,US,100,US,M


In [53]:
# Drop the leftover index column from the CSV and the local-currency salary
# columns; 'salary_in_usd' is the salary column that is kept.
# Rebinding the result (instead of inplace=True) avoids mutating the frame that
# the previous cell already displayed, and `columns=` states the axis
# explicitly instead of relying on axis=1.
df = df.drop(columns=['Unnamed: 0', 'salary', 'salary_currency'])
display(df)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,2,2,Data Scientist,79833,DE,0,DE,L
1,2020,3,2,Machine Learning Scientist,260000,JP,0,JP,S
2,2020,3,2,Big Data Engineer,109024,GB,50,GB,M
3,2020,2,2,Product Data Analyst,20000,HN,0,HN,S
4,2020,3,2,Machine Learning Engineer,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...
602,2022,3,2,Data Engineer,154000,US,100,US,M
603,2022,3,2,Data Engineer,126000,US,100,US,M
604,2022,3,2,Data Analyst,129000,US,0,US,M
605,2022,3,2,Data Analyst,150000,US,100,US,M


In [54]:
# Show the category distributions before encoding. Bare expressions that are
# not the last line of a cell produce no output, so display() is required.
display(df['remote_ratio'].value_counts())
display(df['company_size'].value_counts())

# Re-code remote_ratio (0 / 50 / 100) to the ranks 1 / 2 / 3. The column is
# already numeric, so no dtype change is involved.
df['remote_ratio'] = df['remote_ratio'].replace({100: 3, 50: 2, 0: 1})

# Re-code company size S / M / L to 1 / 2 / 3. The explicit astype pins the
# integer dtype rather than relying on pandas silently downcasting the object
# column after replace(), a behaviour that newer pandas deprecates. It also
# raises if a value outside the mapping is present.
# Bracket assignment is used instead of attribute assignment for consistency
# with the rest of the notebook.
df['company_size'] = (
    df['company_size']
    .replace({'S': 1, 'M': 2, 'L': 3})
    .astype('int64')
)

In [55]:
# 97th-percentile value of every numeric column. numeric_only=True is required
# because the frame still holds string columns (job_title, employee_residence,
# company_location): since pandas 2.0 the default is numeric_only=False and
# quantile() raises on such columns. On older pandas this matches the previous
# default, so the result is unchanged.
outliers = df.quantile(.97, numeric_only=True)

# Keep only rows whose USD salary is strictly below that threshold.
# NOTE(review): this rebinds `df` to the filtered frame, so re-running the cell
# trims a further top 3% each time — run it once per fresh kernel.
df = df[df['salary_in_usd'] < outliers['salary_in_usd']]
display(df)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,2,2,Data Scientist,79833,DE,1,DE,3
2,2020,3,2,Big Data Engineer,109024,GB,2,GB,2
3,2020,2,2,Product Data Analyst,20000,HN,1,HN,1
4,2020,3,2,Machine Learning Engineer,150000,US,2,US,3
5,2020,1,2,Data Analyst,72000,US,3,US,3
...,...,...,...,...,...,...,...,...,...
602,2022,3,2,Data Engineer,154000,US,3,US,2
603,2022,3,2,Data Engineer,126000,US,3,US,2
604,2022,3,2,Data Analyst,129000,US,1,US,2
605,2022,3,2,Data Analyst,150000,US,3,US,2


In [56]:
# Pearson correlation of every numeric column with the USD salary.
# The string columns are excluded explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises instead. Selecting the
# numeric columns first gives the same result on older and newer pandas.
df.select_dtypes(include='number').corr()['salary_in_usd']

work_year           0.303301
experience_level    0.538059
employment_type     0.151219
salary_in_usd       1.000000
remote_ratio        0.153260
company_size        0.133352
Name: salary_in_usd, dtype: float64

### Encoding The Data

In [57]:
# One-hot encode the remaining string-valued columns of the cleaned frame;
# get_dummies leaves the numeric columns as they are.
df2 = pd.get_dummies(data=df)

In [58]:
from sklearn.preprocessing import MinMaxScaler

display(df2)

# Target: the USD salary. Features: every other column of the encoded frame.
y = df2['salary_in_usd']

# Rescale each feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split in the next cell, so the test rows influence the scaling
# parameters. Consider fitting on the training rows only.
scaler = MinMaxScaler()
x = scaler.fit_transform(df2.drop(columns=['salary_in_usd']))

display(x)
print(x.shape)

Unnamed: 0,work_year,experience_level,employment_type,salary_in_usd,remote_ratio,company_size,job_title_3D Computer Vision Researcher,job_title_AI Scientist,job_title_Analytics Engineer,job_title_Applied Data Scientist,...,company_location_PL,company_location_PT,company_location_RO,company_location_RU,company_location_SG,company_location_SI,company_location_TR,company_location_UA,company_location_US,company_location_VN
0,2020,2,2,79833,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020,3,2,109024,2,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020,2,2,20000,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020,3,2,150000,2,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,2020,1,2,72000,3,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602,2022,3,2,154000,3,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
603,2022,3,2,126000,3,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
604,2022,3,2,129000,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
605,2022,3,2,150000,3,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


array([[0.        , 0.33333333, 0.66666667, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.66666667, 0.66666667, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.33333333, 0.66666667, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.66666667, 0.66666667, ..., 0.        , 1.        ,
        0.        ],
       [1.        , 0.66666667, 0.66666667, ..., 0.        , 1.        ,
        0.        ],
       [1.        , 0.33333333, 0.66666667, ..., 0.        , 1.        ,
        0.        ]])

(588, 161)


### Building a Model and Training It

In [59]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [60]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

# Fit a gradient-boosted tree regressor on the training split.
# random_state is fixed so that the fitted model, and therefore the R² printed
# below, is reproducible across runs. Without it the estimator draws a fresh
# seed each time, which can change which feature wins a tied split.
model = GradientBoostingRegressor(random_state=0)
model.fit(X_train, y_train)

# Predict salaries for the held-out rows and report the coefficient of
# determination against the true values.
y_pred = model.predict(X_test)
display(y_pred)
print('The r2_score of this model is :', r2_score(y_test, y_pred))

array([151407.82534205, 147087.44761955,  80517.05905387, 154717.75740282,
        73099.61443322,  26862.76388536, 153030.05156503,  43265.09874967,
        59202.93023093, 154717.75740282, 115609.29200954,  80747.53494852,
       115609.29200954, 153030.05156503,  46462.64078751,  59202.93023093,
        94171.07156336, 151407.82534205,  84963.54177696, 154717.75740282,
       121610.73497252, 162645.95729572,  73099.61443322,  82468.27568145,
        51908.50192289,  65143.58840214, 154717.75740282,  80517.05905387,
       151531.57681501, 189159.11301751, 155134.99489291,  81800.48198021,
        50815.08485933,  98410.87599041, 100843.25057148,  53840.58462358,
       153030.05156503,  71903.86855935,  57860.82935958, 115609.29200954,
        97883.90171862, 153030.05156503,  40701.98133191,  39951.42629826,
        63988.21625067, 115609.29200954,  65379.966579  ,  95645.89976244,
       127677.09039096,  93966.74679495,  45650.39857032, 178762.90824271,
       153030.05156503, 1

The r2_score of this model is : 0.6000487098545371
