In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [26]:
df = pd.read_csv('eda_data.csv')

In [27]:
df.head()

Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,...,age,python_yn,R_yn,spark,aws,excel,job_simp,seniority,desc_len,num_comp
0,0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\r\nLocation: Albuquerque, NM\r\...",3.8,Tecolote Research\r\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,...,47,1,0,0,0,1,data scientist,na,2536,0
1,1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\r\n\r\nI. General Summary\r\...,3.4,University of Maryland Medical System\r\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,...,36,1,0,0,0,0,data scientist,na,4783,0
2,2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\r\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,...,10,1,0,1,0,1,data scientist,na,3461,0
3,3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\r\nJob ID: 310709\r...,3.8,PNNL\r\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,...,55,1,0,0,0,0,data scientist,na,3883,3
4,4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\r\nAffinity Solutions / Marketi...,2.9,Affinity Solutions\r\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,...,22,1,0,0,0,1,data scientist,na,2728,3


In [28]:
# choose relevant columns 
df.columns 

Index(['Unnamed: 0', 'Job Title', 'Salary Estimate', 'Job Description',
       'Rating', 'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'hourly', 'employer_provided', 'min_salary', 'max_salary', 'avg_salary',
       'company_txt', 'job_state', 'same_state', 'age', 'python_yn', 'R_yn',
       'spark', 'aws', 'excel', 'job_simp', 'seniority', 'desc_len',
       'num_comp'],
      dtype='object')

In [29]:
df_model = df[['avg_salary','Rating','Size','Type of ownership','Industry','Sector','Revenue','num_comp','hourly','employer_provided',
             'job_state','same_state','age','python_yn','spark','aws','excel','job_simp','seniority','desc_len']]
df_model.shape

(742, 20)

In [30]:
# Getting dummy data
df_dum = pd.get_dummies(df_model)
df_dum.shape

(742, 178)

In [31]:
# Train test split
X = df_dum.drop(columns='avg_salary')
y = df_dum.avg_salary
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [32]:
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
X_train = st.fit_transform(X_train)
X_test = st.transform(X_test)


In [33]:
# Multiple linear regression 
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.model_selection import cross_val_score

lm = LinearRegression()
lm.fit(X_train,y_train)

cross_val_score(lm,X_train,y_train,scoring='neg_mean_absolute_error')

array([-1.24160157e+15, -3.85227244e+14, -9.77275342e+14, -4.20196034e+14,
       -1.71201391e+14])

In [34]:
lm_l = Lasso()
lm_l.fit(X_train,y_train)

cross_val_score(lm_l,X_train,y_train,scoring='neg_mean_absolute_error')

array([-19.74482231, -18.32707311, -20.27473381, -16.78257531,
       -18.77332796])

In [35]:
from sklearn.ensemble import RandomForestRegressor

lm_rf = RandomForestRegressor()
lm_rf.fit(X_train,y_train)

cross_val_score(lm_rf,X_train,y_train,scoring='neg_mean_absolute_error')

array([-16.2542    , -16.26655   , -17.54575758, -13.75181818,
       -14.84015152])

In [36]:
# Tuning using gridsearch 
from sklearn.model_selection import RandomizedSearchCV
    
parameters = {'n_estimators':range(10,300,10), 'criterion':('poisson', 'friedman_mse', 'squared_error', 'absolute_error'), 'max_features':('auto','sqrt','log2')}

gs = RandomizedSearchCV(lm_rf,parameters,scoring='neg_mean_absolute_error')
gs.fit(X_train,y_train)



In [37]:
print('Best parameters are : ',gs.best_params_)
print('Best score is : ',gs.best_score_)

Best parameters are :  {'n_estimators': 80, 'max_features': 'auto', 'criterion': 'absolute_error'}
Best score is :  -15.732954861111107


In [38]:
y_pred_lm = lm.predict(X_test)
y_pred_lm_l = lm_l.predict(X_test)
y_pred_gs = gs.best_estimator_.predict(X_test)

In [39]:
from sklearn.metrics import mean_absolute_error,accuracy_score
print(mean_absolute_error(y_test,y_pred_lm)) # for linear regression
print(mean_absolute_error(y_test,y_pred_lm_l)) # for lasso regression
print(mean_absolute_error(y_test,y_pred_gs)) # for randomize cv 


49961959181141.875
19.473622246964762
12.742691326530613
