In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the cleaned dataset from a CSV file in the working directory and
# preview its first rows.
df = pd.read_csv('Cleaned_Data_For_EDA&Models.csv')
df.head()

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Columns carried into modelling: avg_salary (used as the target further down)
# followed by the candidate predictors, in the order they appear in the frame
# handed to get_dummies.
model_columns = [
    'avg_salary',
    'Rating', 'Size', 'Type of ownership', 'Industry', 'Sector', 'Revenue',
    'num_comp', 'hourly', 'employer_provided',
    'job_state', 'Same_Location_as_HQ', 'age',
    'python_jd', 'spark_jd', 'aws_jd', 'excel_jd',
    'job_simp', 'seniority', 'desc_len',
]
df_model = df[model_columns]

In [None]:
# One-hot encode the non-numeric columns of df_model.
# dtype=int forces 0/1 integer indicators: recent pandas releases emit bool
# columns by default, and a frame mixing bool and float columns is cast to
# object dtype by statsmodels' OLS (used in a later cell), which then raises.
df_dum = pd.get_dummies(df_model, dtype=int)
df_dum.shape

In [None]:
from sklearn.model_selection import train_test_split

# Predictors: every encoded column except the target.
X = df_dum.drop(columns='avg_salary')
# Target as a plain NumPy array (better practice than passing the Series).
y = df_dum['avg_salary'].values

# Hold out 20% of the rows for the final comparison; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import statsmodels.api as sm

# Ordinary least squares on the full feature matrix, used here only for its
# summary table (coefficients, p-values, R^2).
# The intercept-augmented matrix is bound to X_sm alone: the original chained
# assignment also rebound X, silently leaving X with an extra 'const' column
# that X_train / X_test (split earlier) do not have.
X_sm = sm.add_constant(X)
model = sm.OLS(y, X_sm)
model.fit().summary()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [None]:
# Baseline: plain linear regression scored with 3-fold cross-validation on the
# training split. scikit-learn reports MAE negated, so values closer to 0 are better.
lm = LinearRegression()
lm_cv_scores = cross_val_score(
    lm,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=3,
)
Neg_MAE = np.mean(lm_cv_scores)
print('The negative mean error by using a Linear Regression using its default parameters is: {}'.format(Neg_MAE))

In [None]:
from sklearn.linear_model import Lasso

# Sweep the Lasso regularisation strength over 0.01, 0.02, ..., 0.99 and record
# the mean 3-fold negative MAE for each candidate.
alpha = [step / 100 for step in range(1, 100)]
error = []
for candidate in alpha:
    lml = Lasso(alpha=candidate)
    fold_scores = cross_val_score(
        lml,
        X_train,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=3,
    )
    error.append(np.mean(fold_scores))

# Score as a function of alpha; the peak marks the best candidate.
plt.plot(alpha, error)

In [None]:
# Tabulate the sweep and show the row(s) with the highest (least negative) score.
df_err = pd.DataFrame({'alpha': alpha, 'error': error})
best_error = max(df_err['error'])
df_err[df_err['error'] == best_error]


In [None]:
# Lasso at the selected regularisation strength.
# NOTE(review): alpha is hard-coded — presumably copied from the sweep table
# above; confirm it still matches if the data or the split changes.
lm_l = Lasso(alpha=0.13).fit(X_train, y_train)
lasso_cv_scores = cross_val_score(
    lm_l,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=3,
)
Neg_MAE_Lasso = np.mean(lasso_cv_scores)
print('The negative mean error by using a Lasso Regression using its best alpha value is: {}'.format(Neg_MAE_Lasso))

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees with default parameters, scored with the same 3-fold
# negative MAE as the other models.
# The target is a continuous value scored with MAE, so the regressor is the
# right estimator: a classifier would treat every distinct target value as a
# separate class label (and recent xgboost releases reject such labels outright).
Xgb = XGBRegressor()
Neg_MAE_Xgboost = np.mean(
    cross_val_score(Xgb, X_train, y_train, scoring='neg_mean_absolute_error', cv=3)
)

In [None]:
# Report the cross-validated score (negated MAE) obtained for the XGBoost model.
print('The negative mean error by using an XGboost model using its default parameters is: {}'.format(Neg_MAE_Xgboost))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters, scored with 3-fold negative MAE.
# A fixed random_state (same seed as the train/test split) makes this score and
# the later grid search, which clones this estimator, reproducible across
# kernel restarts; an unseeded forest gives different numbers on every run.
rf = RandomForestRegressor(random_state=42)
Neg_MAE_RF = np.mean(
    cross_val_score(rf, X_train, y_train, scoring='neg_mean_absolute_error', cv=3)
)
print('The negative mean error by using a Random Forest model using its default parameters is: {}'.format(Neg_MAE_RF))

In [None]:
# Fit the linear, Lasso and XGBoost estimators on the whole training split
# (cross_val_score only fits clones, so the named objects still need fitting).
for estimator in (lm, lm_l, Xgb):
    estimator.fit(X_train, y_train)

# fit() returns the estimator itself, so echoing Xgb shows the same cell output.
Xgb

In [None]:
# Search space for the random-forest grid search: forest size, split criterion
# and number of features considered per split.
# NOTE(review): the 'mse' / 'mae' / 'auto' spellings are the ones older
# scikit-learn releases accept — confirm against the installed version.
parameters = dict(
    n_estimators=range(10, 300, 10),
    criterion=('mse', 'mae'),
    max_features=('auto', 'sqrt', 'log2'),
)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over `parameters` for the random forest, ranking each
# combination by 3-fold negative MAE on the training split.
gs = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    cv=3,
)
gs.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score (negative MAE) found by the grid search.
gs.best_score_

In [None]:
# Estimator configured with the best-scoring parameter combination.
gs.best_estimator_

In [None]:
# Take the tuned forest straight from the grid search instead of re-typing its
# repr by hand. With GridSearchCV's default refit=True, best_estimator_ has
# already been refitted on the full training split with the winning parameters,
# so no extra fit is needed, and it cannot drift out of sync with the search
# result. The hand-copied constructor also listed arguments such as
# min_impurity_split that newer scikit-learn releases no longer accept.
rf_tuned = gs.best_estimator_
rf_tuned

In [None]:
# Predict the held-out test split with each fitted model, in the order:
# linear regression, Lasso, XGBoost, tuned random forest.
tpred_lm, tpred_lml, tpred_xgboost, tpred_rf = (
    fitted.predict(X_test) for fitted in (lm, lm_l, Xgb, rf_tuned)
)

In [None]:
from sklearn.metrics import mean_absolute_error

# Test-set mean absolute error of each model.
# The four bare mean_absolute_error(...) calls that used to precede these
# prints were dropped: their results were neither displayed nor stored, so
# every metric was simply computed twice.
print('The Mean absolute error in Linear Regression Model is: {}'.format(mean_absolute_error(y_test, tpred_lm)))
print('The Mean absolute error in Lasso Regression Model is: {}'.format(mean_absolute_error(y_test, tpred_lml)))
print('The Mean absolute error in Xgboost Model is: {}'.format(mean_absolute_error(y_test, tpred_xgboost)))
print('The Mean absolute error in Random Forest Model is: {}'.format(mean_absolute_error(y_test, tpred_rf)))

In [None]:
# Test-set MAE of a simple ensemble: the element-wise average of the XGBoost
# and tuned random-forest predictions.
mean_absolute_error(y_test,(tpred_xgboost+tpred_rf)/2)

In [None]:
## Saving the best model.
## Saving the best model.
import pickle

file_name = "model_file.p"

# Persist the tuned random forest wrapped in a dict. The context manager
# guarantees the handle is flushed and closed before the file is read back;
# the original passed a bare open(...) to pickle.dump and never closed it.
pickl = {'model': rf_tuned}
with open(file_name, 'wb') as pickle_out:
    pickle.dump(pickl, pickle_out)

# Round-trip check: reload the file that was just written.
# NOTE: pickle.load can execute arbitrary code — only load files you created.
with open(file_name, 'rb') as pickled:
    data = pickle.load(pickled)
    model = data['model']

# Predict for one test row. iloc[[1]] yields a single-row DataFrame, so the
# model receives the same 2-D, column-labelled input it was trained on, with no
# manual list -> array -> reshape conversion.
print('The predicted salary is: {}'.format(model.predict(X_test.iloc[[1]])[0]))