In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import root_mean_squared_error
import warnings

def ignore_warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``.

    Accepts any positional/keyword arguments and returns ``None`` without
    emitting anything.
    """
    return None


# Rebind the stdlib entry point to the no-op so that warnings issued through
# `warnings.warn(...)` are dropped (intended for noisy sklearn/seaborn output).
# NOTE(review): this silences every warning routed through `warnings.warn` for
# the whole kernel session; `warnings.filterwarnings("ignore", ...)` is the
# supported, more targeted alternative.
warnings.warn = ignore_warn

## 1. Decision making: selecting the best training dataset


In [2]:
# Train data frames.
# Each variable is named after the CSV stem it is read from; the names on the
# left and the paths on the right are listed in the same order.
(
    X_train_with_outliers_sel,
    X_train_without_outliers_sel,
    X_train_with_outliers_norm_sel,
    X_train_without_outliers_norm_sel,
    X_train_with_outliers_minmax_sel,
    X_train_without_outliers_minmax_sel,
    y_train,
) = map(pd.read_csv, (
    '../data/processed/X_train_with_outliers_sel.csv',
    '../data/processed/X_train_without_outliers_sel.csv',
    '../data/processed/X_train_with_outliers_norm_sel.csv',
    '../data/processed/X_train_without_outliers_norm_sel.csv',
    '../data/processed/X_train_with_outliers_minmax_sel.csv',
    '../data/processed/X_train_without_outliers_minmax_sel.csv',
    '../data/processed/y_train.csv',
))

# Test data frames (same layout as the train block above).
(
    X_test_with_outliers_sel,
    X_test_without_outliers_sel,
    X_test_with_outliers_norm_sel,
    X_test_without_outliers_norm_sel,
    X_test_with_outliers_minmax_sel,
    X_test_without_outliers_minmax_sel,
    y_test,
) = map(pd.read_csv, (
    '../data/processed/X_test_with_outliers_sel.csv',
    '../data/processed/X_test_without_outliers_sel.csv',
    '../data/processed/X_test_with_outliers_norm_sel.csv',
    '../data/processed/X_test_without_outliers_norm_sel.csv',
    '../data/processed/X_test_with_outliers_minmax_sel.csv',
    '../data/processed/X_test_without_outliers_minmax_sel.csv',
    '../data/processed/y_test.csv',
))

In [3]:
# Candidate training feature sets keyed by name. Insertion order defines the
# positional index used by `train_dfs` and stored under "index" in `results`.
train_dicts = {
  "X_train_with_outliers_sel": X_train_with_outliers_sel,
  "X_train_without_outliers_sel": X_train_without_outliers_sel,
  "X_train_with_outliers_norm_sel": X_train_with_outliers_norm_sel,
  "X_train_without_outliers_norm_sel": X_train_without_outliers_norm_sel,
  "X_train_with_outliers_minmax_sel": X_train_with_outliers_minmax_sel,
  "X_train_without_outliers_minmax_sel": X_train_without_outliers_minmax_sel
}

# Matching test feature sets, listed in the same order as `train_dicts`.
test_dicts = {
  "X_test_with_outliers_sel": X_test_with_outliers_sel,
  "X_test_without_outliers_sel": X_test_without_outliers_sel,
  "X_test_with_outliers_norm_sel": X_test_with_outliers_norm_sel,
  "X_test_without_outliers_norm_sel": X_test_without_outliers_norm_sel,
  "X_test_with_outliers_minmax_sel": X_test_with_outliers_minmax_sel,
  "X_test_without_outliers_minmax_sel": X_test_without_outliers_minmax_sel
}

# Positional views derived from the dicts (dicts preserve insertion order), so
# the name -> frame mapping and the index -> frame mapping cannot drift apart.
train_dfs = list(train_dicts.values())
test_dfs = list(test_dicts.values())

# Train and test sets are paired by position; fail loudly if one is missing.
assert len(train_dfs) == len(test_dfs), "every train set needs a matching test set"

results = []

# Fit one LinearRegression per candidate feature set and score it on the
# corresponding test set against `y_test`.
for df_index, (train_name, train_df) in enumerate(train_dicts.items()):
  model = LinearRegression()
  model.fit(train_df, y_train)
  y_test_pred = model.predict(test_dfs[df_index])

  results.append(
    {
        "index": df_index,
        "train_df": train_name,
        "Coefficient": model.coef_,
        "MAE": round(mean_absolute_error(y_test, y_test_pred), 6),
        "RMSE": round(root_mean_squared_error(y_test, y_test_pred), 6),
        "R2_score": round(r2_score(y_test, y_test_pred), 6)
    }
  )

# Rank the candidates by test R2, best first; `resultados[0]` is the winner.
resultados = sorted(results, key = lambda x: x["R2_score"], reverse = True)
resultados

[{'index': 0,
  'train_df': 'X_train_with_outliers_sel',
  'Coefficient': array([[-1.26863070e-01, -1.14565523e-01,  5.29116846e-02,
          -3.60727093e-01,  3.58260495e-02, -2.53915172e-01,
           2.12524333e-01,  4.41728882e-01,  1.88399410e-01,
          -5.07164408e-03, -2.42455646e-02,  6.54236190e-02,
          -9.44178867e-03,  9.86828583e-02, -5.00279575e-02,
           7.53195766e-02,  6.04350669e-01,  2.61295803e-01,
           1.57215554e+01, -6.88566188e+02, -7.01048698e+02,
           6.97217800e+02,  2.72439309e-01,  2.51622944e-01,
           4.55992847e-01,  3.16251439e-01,  8.33874623e-02,
          -2.12058919e+01,  1.92308618e+01, -2.80550748e+01,
          -9.99944511e-03, -1.45074958e-01,  1.25076051e-01,
          -1.96344974e-02,  5.08414226e-02, -7.04759086e-02,
          -4.69170409e+01, -9.99944597e-03,  1.07842114e+01,
           6.13898585e+00,  1.39691807e+00, -1.03373855e+02,
           1.13600697e+02, -1.25982353e+02,  1.99382962e+02,
          -2.

In [4]:
# Summarise the top-ranked entry of `resultados` as a small text box.
best = resultados[0]

report_lines = [
    f"The best train dataframe is |{best['train_df']}|.",
    "======================================================      ",
    f"| MAE: {best['MAE']}   |",
    "----------------------",
    f"| RMSE: {best['RMSE']}    |",
    "----------------------",
    f"| R2_score: {best['R2_score']} |",
    "======================",
]

# Joined with "\n" so the emitted text has one report line per output line.
print("\n".join(report_lines))

The best train dataframe is |X_train_with_outliers_sel|.
| MAE: 1019.249559   |
----------------------
| RMSE: 2136.97424    |
----------------------
| R2_score: 0.99875 |


## Lasso

In [8]:
from sklearn.linear_model import Lasso

# Use the dataset that ranked first in `resultados` rather than a hard-coded
# position, so this cell keeps following the selection if the ranking changes.
best_index = resultados[0]["index"]

# Regularisation strength passed to Lasso.
alpha = 1.0
lasso_model = Lasso(alpha = alpha)

# Training the model
lasso_model.fit(train_dfs[best_index], y_train)

# We evaluate the performance of the model on the test data
score = lasso_model.score(test_dfs[best_index], y_test)
print("Coefficients:", lasso_model.coef_)
print("R2 score:", score)

Coefficients: [ 1.13470472e-02  3.58592206e-01  3.88112357e-01  3.22498503e-01
  7.19230128e-01  3.24047326e-01 -2.39802575e-03  4.68274054e-01
 -6.36617928e-02 -1.82920759e-01 -1.49469778e-01 -3.89087828e-02
 -1.92454411e-01  3.81512572e-01 -4.29545750e-01  4.26346113e-02
  1.28919537e-01  1.09554931e-01 -3.93326841e+01  3.81183647e+01
 -2.70807790e+01  2.21002479e+01  1.93692442e-01  8.70958840e-02
  2.43825193e-01  1.62956423e-02  2.20968588e-01 -2.65612997e+01
  4.83888364e+00 -2.40306901e+01 -3.21828930e-01 -1.09784124e-01
  2.02886483e-01 -7.59152831e-02  2.67468541e-04 -1.48906402e-01
 -1.22973441e+02  2.00620502e-01  1.05129694e+01  2.25470982e+01
 -1.93550103e+01 -1.14824281e+02  1.42433399e+02 -2.27134528e+01
  9.00337564e+01 -2.01551053e+01 -4.16440647e-02  3.08860183e+02
  2.16101659e+00 -2.67296215e+01  6.26227866e-01  1.80325786e+00
 -6.37878467e-02  6.31230272e-01  1.55352267e-01 -1.86453189e+02
  1.69197568e+02 -8.53306929e-02  5.57266808e+02 -4.00866369e+02
  7.7697597

## Ridge

In [9]:
from sklearn.linear_model import Ridge

# Use the dataset that ranked first in `resultados` rather than a hard-coded
# position, so this cell keeps following the selection if the ranking changes.
best_index = resultados[0]["index"]

# Regularisation strength passed to Ridge.
alpha = 1.0
ridge_model = Ridge(alpha = alpha)

# Training the model
ridge_model.fit(train_dfs[best_index], y_train)

# We evaluate the performance of the model on the test data
score_ridge= ridge_model.score(test_dfs[best_index], y_test)
print("Coefficients:", ridge_model.coef_)
print("R2 score:", score_ridge)

Coefficients: [[-1.29493759e-01 -1.16623980e-01  5.14141627e-02 -3.62430474e-01
   3.42779239e-02 -2.55506968e-01  2.10927052e-01  4.40160443e-01
   1.86621074e-01 -5.75197831e-03 -2.49468983e-02  6.49120370e-02
  -1.01230025e-02  9.82953016e-02 -5.08778609e-02  7.85382047e-02
   6.03875704e-01  2.61082198e-01  1.58662047e+01 -4.49872088e+02
  -4.62120617e+02  4.58313515e+02  2.72359485e-01  2.51783331e-01
   4.55963715e-01  3.16327058e-01  8.34394030e-02 -2.12192034e+01
   1.98858242e+01 -2.85130722e+01 -9.96661527e-03 -1.45000935e-01
   1.24992993e-01 -2.02664001e-02  5.14616039e-02 -7.00094408e-02
  -4.62546963e+01 -1.00586336e-02  1.07231049e+01  5.96548506e+00
   1.56394492e+00 -1.03089465e+02  1.13518686e+02 -1.22922933e+02
   1.94682885e+02 -2.79311765e+01 -4.98641509e-02  2.25872633e+02
   2.79515144e+01 -2.41604968e+01 -5.65574381e+00 -3.88179890e+00
   1.03639398e-01 -3.17756685e-01  1.62624395e-01 -1.08599228e+02
   9.81420228e+01 -3.06405728e-01  5.70459092e+02 -4.08057120e