# Game Plan:


- linear regression, tree regression, random forest regression
- we are checking each one with cross validation
- we will check them with either mae or rmse (we'll read which is preferable)
- we find which is the best based on the smallest error
- and then we can do our feature selection (forward/backward) or forest tree selection for features
- and then fine tuning of hyper parameter depending on which regression we use (grid search)
- finally, we test our model :)

In [43]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.dummy import DummyRegressor


In [44]:
df=pd.read_csv('cleaned_glassdoor_dataset.csv',index_col=0)
df.columns


Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'Salary_Per_Hour', 'Min_Salary', 'Max_Salary', 'Avg Salary', 'State',
       'Is_Headquarters', 'Age of Company', 'Python', 'Spark', 'AWS', 'Excel',
       'Job Categories', 'Seniority', 'Description_Length', 'Competitor Count',
       'Revenue_Adj'],
      dtype='object')

In [45]:
#need to know what this is
df2 = pd.read_csv('no_outlier_cleaned_data.csv',index_col=0)


In [46]:
df['Size'].value_counts()

1001 to 5000 employees     150
501 to 1000 employees      134
10000+ employees           130
201 to 500 employees       117
51 to 200 employees         94
5001 to 10000 employees     76
1 to 50 employees           31
Unknown                      9
-1                           1
Name: Size, dtype: int64

In [47]:

df_n = pd.get_dummies(df.drop(['Job Title', 'Salary Estimate', 'Job Description',
                               'Company Name','Location','Headquarters','Founded','Competitors',
                               'Min_Salary','Max_Salary','Revenue'], axis=1))


X = df_n.drop(['Avg Salary', 'Size_-1','Sector_-1','Industry_-1'], axis=1)
y = df_n['Avg Salary'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=777)

X_train

Unnamed: 0,Rating,Is_Headquarters,Age of Company,Python,Spark,AWS,Excel,Description_Length,Competitor Count,Size_1 to 50 employees,...,Job Categories_Other,Job Categories_Other Engineer,Job Categories_Software Engineer,Seniority_Junior,Seniority_Senior,Seniority_none,Revenue_Adj_Unknown / Non-Applicable,Revenue_Adj_big,Revenue_Adj_medium,Revenue_Adj_small
318,3.2,0,30,1,1,1,1,208,0,1,...,0,0,0,0,1,0,0,0,0,1
533,3.6,1,33,1,0,1,1,753,0,0,...,0,0,0,0,0,1,1,0,0,0
539,3.1,1,147,0,0,0,1,698,0,0,...,0,0,0,0,1,0,0,1,0,0
653,4.2,0,14,1,0,0,1,765,0,0,...,0,0,0,0,0,1,1,0,0,0
586,3.5,0,30,1,0,0,0,158,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,3.5,0,25,1,1,0,0,151,3,0,...,0,0,0,0,0,1,1,0,0,0
79,3.6,1,95,1,0,0,1,370,0,0,...,0,0,0,0,0,1,0,1,0,0
759,2.8,1,50,0,0,0,0,481,3,0,...,0,0,0,0,0,1,0,0,1,0
730,3.6,0,171,0,1,1,1,541,0,0,...,1,0,0,0,0,1,0,1,0,0


In [48]:
df_n2 = pd.get_dummies(df2.drop(['Job Title', 'Salary Estimate', 'Job Description',
                               'Company Name','Location','Headquarters','Founded','Competitors',
                               'Min_Salary','Max_Salary','Revenue'], axis=1))


X2 = df_n2.drop(['Avg Salary', 'Size_-1','Sector_-1','Industry_-1'], axis=1)
y2 = df_n2['Avg Salary'].values

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=777)

X2_train

Unnamed: 0,Rating,Is_Headquarters,Age of Company,Python,Spark,AWS,Excel,Description_Length,Competitor Count,Size_1 to 50 employees,...,Job Categories_Other,Job Categories_Other Engineer,Job Categories_Software Engineer,Seniority_Junior,Seniority_Senior,Seniority_none,Revenue_Adj_Unknown / Non-Applicable,Revenue_Adj_big,Revenue_Adj_medium,Revenue_Adj_small
325,2.8,1,8,1,0,0,1,777.0,0,0,...,0,0,0,0,1,0,1,0,0,0
674,4.4,1,38,1,0,0,0,345.0,0,0,...,0,0,0,0,1,0,0,1,0,0
396,3.3,1,14,0,0,0,0,232.0,0,0,...,0,0,0,0,0,1,1,0,0,0
895,3.3,1,34,0,0,0,1,1017.0,3,0,...,0,0,0,0,0,1,0,1,0,0
859,3.2,1,64,1,0,0,1,575.0,3,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878,3.5,1,17,1,1,1,1,569.0,0,0,...,0,0,0,0,0,1,1,0,0,0
80,2.7,0,44,0,0,0,1,319.0,0,0,...,1,0,0,0,0,1,0,0,0,1
782,4.2,1,11,1,0,0,1,668.0,0,0,...,0,0,0,0,0,1,1,0,0,0
745,3.9,1,13,1,0,0,1,420.0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [49]:
numerical_columns = ['Rating', 'Age of Company', 'Description_Length', 'Competitor Count']


In [50]:
# Standardization of train & test:

standard = StandardScaler()
normal = MinMaxScaler()

X_test_stand = X_test.copy()
X_train_stand = X_train.copy()
y_train_stand = y_train.copy()
y_test_stand = y_test.copy()


X_train_stand[numerical_columns] = standard.fit_transform(X_train_stand[numerical_columns])
X_test_stand[numerical_columns] = standard.transform(X_test_stand[numerical_columns])

print(X_train_stand.mean(axis=0))


Rating                                 -1.018485e-16
Is_Headquarters                         5.767285e-01
Age of Company                         -2.883210e-17
Python                                  5.278246e-01
Spark                                   2.276560e-01
                                            ...     
Seniority_none                          6.964587e-01
Revenue_Adj_Unknown / Non-Applicable    2.765599e-01
Revenue_Adj_big                         3.220911e-01
Revenue_Adj_medium                      8.094435e-02
Revenue_Adj_small                       3.204047e-01
Length: 166, dtype: float64


In [51]:
# Robust 

robust = RobustScaler()

X_test_robust = X_test.copy()
X_train_robust = X_train.copy()

X_train_robust[numerical_columns] = robust.fit_transform(X_train_robust[numerical_columns])
X_test_robust[numerical_columns] = robust.transform(X_test_robust[numerical_columns])

In [52]:
# Normalization of train & test:

normal = MinMaxScaler()

X_test_norm = X_test.copy()
X_train_norm = X_train.copy()

X_train_norm[numerical_columns] = normal.fit_transform(X_train_norm[numerical_columns])
X_test_norm[numerical_columns] = normal.transform(X_test_norm[numerical_columns])

In [53]:
# our means of checking how well our model is doing hehe

dummy_model = DummyRegressor()

dummy_model.fit(X_train, y_train)

R2 = dummy_model.score(X_test, y_test)

y_predict = dummy_model.predict(X_test)

RMSE = MSE(y_test, y_predict)**(0.5)

print('Dummy Model Scores:')
print(f'R-Squared Value: {R2}') 
print(f'Root Mean Square Error: {RMSE}')


Dummy Model Scores:
R-Squared Value: -0.007879122082713241
Root Mean Square Error: 40.74573295143369


Our dummy model is taking the mean average salary and using that to "predict" the rest.
Dummy models set the standard for what a bad model looks like.
The R2 score is showing the percentage of accuracy of a model (from 0 to 1 mostly). When it is negative,
it is showing that the model does NOT follow the trend, which would make sense for our dummy model. 
Clearly this is showing us a very "bad" model to set the standard for how our models are doing in terms
of their RMSE scores. For this model, we are deviating by $40K, which we now know is a very poor score

In [54]:
# our means of checking how well our model is doing hehe

dummy_model2 = DummyRegressor()

dummy_model2.fit(X2_train, y2_train)

R22 = dummy_model2.score(X2_test, y2_test)

y2_predict = dummy_model2.predict(X2_test)

RMSE2 = MSE(y2_test, y2_predict)**(0.5)

print('Dummy Model Scores:')
print(f'R-Squared Value: {R22}') 
print(f'Root Mean Square Error: {RMSE2}')

Dummy Model Scores:
R-Squared Value: -0.0034232925890058663
Root Mean Square Error: 35.78505290868794


# Linear Regression Model

In [76]:
# Linear Regression Model
#looked at rmse ,wondered why you switched ,im ok with it butg wanted to know

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

linear_reg1 = LinearRegression()
linear_reg2 = LinearRegression()
linear_reg3 = LinearRegression()
linear_reg5 = LinearRegression()

linear_reg1.fit(X_train,y_train)
linear_reg2.fit(X_train_norm,y_train)
linear_reg3.fit(X_train_stand,y_train)
linear_reg5.fit(X2_train,y2_train)

lrm = np.mean(cross_val_score(linear_reg1, X_train, y_train, scoring='neg_mean_absolute_error', cv=3))

norm_lrm = np.mean(cross_val_score(linear_reg2, X_train_norm, y_train, scoring='neg_mean_absolute_error', cv=3))

stand_lrm = np.mean(cross_val_score(linear_reg3, X_train_stand, y_train, scoring='neg_mean_absolute_error', cv=3))

out_lrm = np.mean(cross_val_score(linear_reg5, X2_train, y2_train, scoring='neg_mean_absolute_error', cv=3))
#
lrm_rmse = np.mean(cross_val_score(linear_reg1, X_train, y_train, scoring='neg_root_mean_squared_error', cv=3))

norm_lrm_rmse = np.mean(cross_val_score(linear_reg2, X_train_norm, y_train, scoring='neg_root_mean_squared_error', cv=3))

stand_lrm_rmse = np.mean(cross_val_score(linear_reg3, X_train_stand, y_train, scoring='neg_root_mean_squared_error', cv=3))

out_lrm_rmse = np.mean(cross_val_score(linear_reg5, X2_train, y2_train, scoring='neg_root_mean_squared_error', cv=3))

r2_lin= np.mean(cross_val_score(linear_reg1, X_train, y_train, cv=3))
r2out = np.mean(cross_val_score(linear_reg5, X2_train, y2_train, cv=3))
stand_r2 = np.mean(cross_val_score(linear_reg3, X_train_stand, y_train, cv=3))



print(f"mae with outliers: {lrm}")

print(f"mae with standarization: {stand_lrm}")

print(f"mae without outliers: {out_lrm}")
#
print(f"rmse with outliers: {lrm_rmse}")

print(f"rmse with standarization: {stand_lrm_rmse}")

print(f"rmse without outliers: {out_lrm_rmse}")


print(f"r2 with outliers: {r2_lin}")

print(f"r2 without outliers: {r2out}")

print(f"r2 with standarization: {stand_r2}")



mae with outliers: -19.574260252998112
mae with standarization: -74624701407.83623
mae without outliers: -20.9394597098857
rmse with outliers: -27.429612512945145
rmse with standarization: -497460013098.5535
rmse without outliers: -29.358972804618507
r2 with outliers: 0.4145995408707575
r2 without outliers: 0.36942922879481693
r2 with standarization: -2.492221499320103e+20


What we learned so far: 

-normalization of data for linear regression does not work! It creates a non-linear model

-removing outliers actually negatively affects the performance of linear regression

In [56]:
# trying feature selection so we can improve our score

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# feature selection
def select_features(X_train, y_train, X_test):
    
    # configure to select a subset of features
    fs = SelectKBest(score_func=f_regression, k=64)
    
    # learn relationship from training data
    fs.fit(X_train, y_train)
    
    # transform train input data
    X_train_fs = fs.transform(X_train)
    
    # transform test input data
    X_test_fs = fs.transform(X_test)
    
    return X_train_fs, X_test_fs, fs

# 64 is clearly the best rn
#why?

In [57]:
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)


  corr /= X_norms


In [58]:
# improving our linear regression score with automatic feature selection

model = LinearRegression()
model.fit(X_train_fs, y_train)

model_improved = np.mean(cross_val_score(model, X_train_fs, y_train, scoring='neg_mean_absolute_error', cv=3))
r2_improved= np.mean(cross_val_score(model, X_train_fs, y_train, cv=3))


print(model_improved)
print(r2_improved)

-18.398649554620963
0.5165190399847649


After trying several values, we found that 64 of the "best" features creates the most accurate model

Our score definitely improved, but it appears that this model may not be our best bet. Let's try others!

In [59]:
# what are the most correlated variables?

df3 = pd.get_dummies(df2.drop(['Job Title', 'Salary Estimate', 'Job Description',
                               'Company Name','Location','Headquarters','Founded','Competitors',
                               'Min_Salary','Max_Salary','Revenue'], axis=1))
corr_matrix = df3.corr()
corr_matrix['Avg Salary'].sort_values(ascending=False)

correlations = abs(corr_matrix['Avg Salary']).sort_values(ascending=False)
correlations.drop('Avg Salary', inplace=True)
correlations.head(25)


Job Categories_Data Analyst                  0.393697
Job Categories_Data Scientist                0.345837
Seniority_Senior                             0.344446
Seniority_none                               0.335689
Python                                       0.330520
State_ CA                                    0.302434
Job Categories_Director                      0.255496
Type of ownership_Nonprofit Organization     0.200887
Salary_Per_Hour_annually                     0.193550
Salary_Per_Hour_hourly                       0.193550
Sector_Information Technology                0.185860
Job Categories_Other                         0.172034
AWS                                          0.172008
Spark                                        0.164927
Sector_Health Care                           0.147159
Industry_Health Care Services & Hospitals    0.147159
Industry_Food & Beverage Manufacturing       0.136895
Rating                                       0.128693
Industry_Financial Analytics

# Lasso Regression Function

In [61]:
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=0.1)
lasso_reg2 = Lasso(alpha=0.1)
lasso_reg3 = Lasso(alpha=0.1)
lasso_reg4 = Lasso(alpha=0.1)


lasso_reg.fit(X_train,y_train)
lasso_reg2.fit(X2_train,y2_train)
lasso_reg3.fit(X_train_norm, y_train)
lasso_reg4.fit(X_train_stand, y_train)


lasso1 = np.mean(cross_val_score(lasso_reg, X_train, y_train, scoring='neg_mean_absolute_error', cv=3))

lasso2 = np.mean(cross_val_score(lasso_reg2, X2_train, y2_train, scoring='neg_mean_absolute_error', cv=3))

lasso3 = np.mean(cross_val_score(lasso_reg3, X_train_norm, y_train, scoring='neg_mean_absolute_error', cv=3))

lasso4 = np.mean(cross_val_score(lasso_reg4, X_train_stand, y_train, scoring='neg_mean_absolute_error', cv=3))




lasso1_r2 = np.mean(cross_val_score(lasso_reg, X_train, y_train, cv=3))

lasso2_r2 = np.mean(cross_val_score(lasso_reg2, X2_train, y2_train, cv=3))

lasso3_r2 = np.mean(cross_val_score(lasso_reg3, X_train_norm, y_train, cv=3))

lasso4_r2 = np.mean(cross_val_score(lasso_reg4, X_train_stand, y_train, cv=3))



print(f"mae with outliers: {lasso1}")

print(f"mae without outliers: {lasso2}")

print(f"mae norm: {lasso3}")

print(f"mae stand: {lasso4}")


print(f"r2 with outliers: {lasso1_r2}")

print(f"r2 without outliers: {lasso2_r2}")

print(f"r2 norm: {lasso3_r2}")

print(f"r2 stand: {lasso4_r2}")




mae with outliers: -19.64814501875747
mae without outliers: -20.682212851177947
mae norm: -19.644701850993766
mae stand: -19.647148700345543
r2 with outliers: 0.47435324601345724
r2 without outliers: 0.43388910630860567
r2 norm: 0.4743342479976158
r2 stand: 0.4747883632042346


It appears that the error is around the same as linear regression but the r2 score is better. 
standarization/normalization aren't making much of a difference

Let's see if feature engineering can optimize it. We will once again work with the model with outliers

In [62]:
def select_features2(X_train, y_train, X_test):
    
    # configure to select a subset of features
    fs = SelectKBest(score_func=f_regression, k=53)
    
    # learn relationship from training data
    fs.fit(X_train, y_train)
    
    # transform train input data
    X_train_fs = fs.transform(X_train)
    
    # transform test input data
    X_test_fs = fs.transform(X_test)
    
    return X_train_fs, X_test_fs, fs


In [63]:
X_train_fs2, X_test_fs2, fs2 = select_features2(X_train, y_train, X_test)


  corr /= X_norms


In [64]:
model2 = Lasso(alpha=0.1)
model2.fit(X_train_fs2, y_train)

model2_improved = np.mean(cross_val_score(model2, X_train_fs2, y_train, scoring='neg_mean_absolute_error', cv=3))
r2_improved2= np.mean(cross_val_score(model2, X_train_fs2, y_train, cv=3))


print(model2_improved)
print(r2_improved2)

-19.315385896769133
0.4995811863608683


53 is our best score in this feature selection. 

the linear model is currently doing better with the feature selection.

let's see if there is anything else we can do to improve the model.

In [66]:
#need explenation
alpha = []
error = []
r2 = []

for i in range (1,100):
    alpha.append(i/100)
    lasso = Lasso(alpha=(i/100))
    lasso.fit(X_train_fs2, y_train)
    error.append(np.mean(cross_val_score(lasso, X_train_fs2, y_train, scoring='neg_mean_absolute_error', cv=3)))
    r2.append(np.mean(cross_val_score(lasso, X_train_fs2, y_train, cv=3)))
    
score = tuple(zip(alpha,error,r2))
df_score = pd.DataFrame(score, columns = ['alpha','error','r2'])

df_score[df_score.r2 == max(df_score.r2)]

Unnamed: 0,alpha,error,r2
1,0.02,-18.780318,0.512834


this is our most optimized. linear regression is doing better

# Random Forest Regression

In [67]:
from sklearn.ensemble import RandomForestRegressor as RFR

forest_reg = RFR()
forest_reg2 = RFR()
forest_reg3 = RFR()
forest_reg4 = RFR()
forest_reg5 = RFR()


forest_reg.fit(X_train,y_train)
forest_reg2.fit(X2_train,y2_train)
forest_reg3.fit(X_train_norm, y_train)
forest_reg4.fit(X_train_stand, y_train)
forest_reg5.fit(X_train_robust, y_train)



forest1 = np.mean(cross_val_score(forest_reg, X_train, y_train, scoring='neg_mean_absolute_error', cv=3))

forest2 = np.mean(cross_val_score(forest_reg2, X2_train, y2_train, scoring='neg_mean_absolute_error', cv=3))

forest3 = np.mean(cross_val_score(forest_reg3, X_train_norm, y_train, scoring='neg_mean_absolute_error', cv=3))

forest4 = np.mean(cross_val_score(forest_reg4, X_train_stand, y_train, scoring='neg_mean_absolute_error', cv=3))

forest5 = np.mean(cross_val_score(forest_reg5, X_train_robust, y_train, scoring='neg_mean_absolute_error', cv=3))




forest1_r2 = np.mean(cross_val_score(forest_reg, X_train, y_train, cv=3))

forest2_r2 = np.mean(cross_val_score(forest_reg2, X2_train, y2_train, cv=3))

forest3_r2 = np.mean(cross_val_score(forest_reg3, X_train_norm, y_train, cv=3))

forest4_r2 = np.mean(cross_val_score(forest_reg4, X_train_stand, y_train, cv=3))

forest5_r2 = np.mean(cross_val_score(forest_reg5, X_train_robust, y_train, cv=3))



print(f"mae with outliers: {forest1}")

print(f"mae without outliers: {forest2}")

print(f"mae norm: {forest3}")

print(f"mae stand: {forest4}")

print(f"mae robust: {forest5}")



print(f"r2 with outliers: {forest1_r2}")

print(f"r2 without outliers: {forest2_r2}")

print(f"r2 norm: {forest3_r2}")

print(f"r2 stand: {forest4_r2}")

print(f"r2 robust: {forest5_r2}")


mae with outliers: -15.058208096190329
mae without outliers: -16.64090662446807
mae norm: -14.883310302688473
mae stand: -14.968049701755286
mae robust: -14.843062050282862
r2 with outliers: 0.612814839041788
r2 without outliers: 0.5247714440692182
r2 norm: 0.6037948751400836
r2 stand: 0.6029761651335028
r2 robust: 0.615002912303979


In [73]:
#i think we should keep these functions
#def choose_k(X_train, y_train, X_test):
   # for i in range (1,166):
    #    fs = SelectKBest(score_func=f_regression, k=i)
        

In [74]:
def select_features3(X_train, y_train, X_test):
    
    # configure to select a subset of features
    fs = SelectKBest(score_func=f_regression, k=163)
    
    # learn relationship from training data
    fs.fit(X_train, y_train)
    
    # transform train input data
    X_train_fs = fs.transform(X_train)
    
    # transform test input data
    X_test_fs = fs.transform(X_test)
    
    return X_train_fs, X_test_fs, fs

In [75]:
X_train_fs3, X_test_fs3, fs3 = select_features3(X_train, y_train, X_test)


  corr /= X_norms


In [71]:
model3 = RFR()
model3.fit(X_train_fs3, y_train)

model3_improved = np.mean(cross_val_score(model3, X_train_fs3, y_train, scoring='neg_mean_absolute_error', cv=3))
r2_improved3= np.mean(cross_val_score(model3, X_train_fs3, y_train, cv=3))


print(model3_improved)
print(r2_improved3)

# 156 - 14.744172520466938

# 164 - 
#-14.748348608761047
#0.6102484997748273

#163?

-14.972887995863886
0.614713641169638


# Decision Tree Regression

In [72]:
from sklearn.tree import DecisionTreeRegressor as DTR

tree_reg = RFR()
tree_reg2 = RFR()
tree_reg3 = RFR()
tree_reg4 = RFR()
tree_reg5 = RFR()

tree_reg.fit(X_train,y_train)
tree_reg2.fit(X2_train,y2_train)
tree_reg3.fit(X_train_norm, y_train)
tree_reg4.fit(X_train_stand, y_train)
tree_reg5.fit(X_train_robust, y_train)



tree1 = np.mean(cross_val_score(tree_reg, X_train, y_train, scoring='neg_mean_absolute_error', cv=3))

tree2 = np.mean(cross_val_score(tree_reg2, X2_train, y2_train, scoring='neg_mean_absolute_error', cv=3))

tree3 = np.mean(cross_val_score(tree_reg3, X_train_norm, y_train, scoring='neg_mean_absolute_error', cv=3))

tree4 = np.mean(cross_val_score(tree_reg4, X_train_stand, y_train, scoring='neg_mean_absolute_error', cv=3))

tree5 = np.mean(cross_val_score(tree_reg5, X_train_robust, y_train, scoring='neg_mean_absolute_error', cv=3))





tree1_r2 = np.mean(cross_val_score(tree_reg, X_train, y_train, cv=3))

tree2_r2 = np.mean(cross_val_score(tree_reg2, X2_train, y2_train, cv=3))

tree3_r2 = np.mean(cross_val_score(tree_reg3, X_train_norm, y_train, cv=3))

tree4_r2 = np.mean(cross_val_score(tree_reg4, X_train_stand, y_train, cv=3))

tree5_r2 = np.mean(cross_val_score(tree_reg5, X_train_robust, y_train, cv=3))




print(f"mae with outliers: {tree1}")

print(f"mae without outliers: {tree2}")

print(f"mae norm: {tree3}")

print(f"mae stand: {tree4}")

print(f"mae robust: {tree5}")



print(f"r2 with outliers: {tree1_r2}")

print(f"r2 without outliers: {tree2_r2}")

print(f"r2 norm: {tree3_r2}")

print(f"r2 stand: {tree4_r2}")

print(f"r2 robust: {tree5_r2}")

mae with outliers: -14.963443209591686
mae without outliers: -16.62662090700283
mae norm: -14.854988228306757
mae stand: -14.927261361499943
mae robust: -14.985340695448565
r2 with outliers: 0.616004612483115
r2 without outliers: 0.524252382644341
r2 norm: 0.609530272470553
r2 stand: 0.6203841978508332
r2 robust: 0.6145524276602505


let's notice something: our decision tree and random forest seem to be very neck and neck with their score,

both are clearly better than linear regression and lasso.

it seems like the normalization is actually helping slightyl, especially in decision tree.
our best model so far appears to be either decision tree with a robust or standarized scaling