In [7]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import root_mean_squared_error
import warnings

def ignore_warn(*args, **kwargs):
    """Do-nothing stand-in for ``warnings.warn``.

    Accepts any positional and keyword arguments, ignores them all and
    returns ``None``.
    """
    return None


# Swap the module-level ``warnings.warn`` for the no-op above so that warnings
# issued through it (the sklearn and seaborn ones, in this notebook) are dropped.
warnings.warn = ignore_warn

1. Decision making


In [8]:
# All processed splits sit in the same folder and differ only by file stem,
# so a single small reader keeps the path handling in one place.
def _read_processed(stem):
    """Read ``../data/processed/<stem>.csv`` and return it as a DataFrame."""
    return pd.read_csv('../data/processed/' + stem + '.csv')


# Train feature frames (one per preprocessing variant) and the train target
X_train_with_outliers_sel = _read_processed('X_train_with_outliers_sel')
X_train_without_outliers_sel = _read_processed('X_train_without_outliers_sel')
X_train_with_outliers_norm_sel = _read_processed('X_train_with_outliers_norm_sel')
X_train_without_outliers_norm_sel = _read_processed('X_train_without_outliers_norm_sel')
X_train_with_outliers_minmax_sel = _read_processed('X_train_with_outliers_minmax_sel')
X_train_without_outliers_minmax_sel = _read_processed('X_train_without_outliers_minmax_sel')
y_train = _read_processed('y_train')

# Test feature frames (same variants as above) and the test target
X_test_with_outliers_sel = _read_processed('X_test_with_outliers_sel')
X_test_without_outliers_sel = _read_processed('X_test_without_outliers_sel')
X_test_with_outliers_norm_sel = _read_processed('X_test_with_outliers_norm_sel')
X_test_without_outliers_norm_sel = _read_processed('X_test_without_outliers_norm_sel')
X_test_with_outliers_minmax_sel = _read_processed('X_test_with_outliers_minmax_sel')
X_test_without_outliers_minmax_sel = _read_processed('X_test_without_outliers_minmax_sel')
y_test = _read_processed('y_test')

In [12]:
# Candidate training feature sets, keyed by the name reported in the results.
train_dicts = {
  "X_train_with_outliers_sel": X_train_with_outliers_sel,
  "X_train_without_outliers_sel": X_train_without_outliers_sel,
  "X_train_with_outliers_norm_sel": X_train_with_outliers_norm_sel,
  "X_train_without_outliers_norm_sel": X_train_without_outliers_norm_sel,
  "X_train_with_outliers_minmax_sel": X_train_with_outliers_minmax_sel,
  "X_train_without_outliers_minmax_sel": X_train_without_outliers_minmax_sel
}

# Matching test feature sets, listed in the same order as `train_dicts`.
test_dicts = {
  "X_test_with_outliers_sel": X_test_with_outliers_sel,
  "X_test_without_outliers_sel": X_test_without_outliers_sel,
  "X_test_with_outliers_norm_sel": X_test_with_outliers_norm_sel,
  "X_test_without_outliers_norm_sel": X_test_without_outliers_norm_sel,
  "X_test_with_outliers_minmax_sel": X_test_with_outliers_minmax_sel,
  "X_test_without_outliers_minmax_sel": X_test_without_outliers_minmax_sel
}

# Train and test sets are paired by position below, so verify that every test
# entry is the "_test_" counterpart of the train entry at the same position.
train_names = list(train_dicts)
assert [name.replace("_train_", "_test_") for name in train_names] == list(test_dicts), \
  "train_dicts and test_dicts are not aligned"

# Positional views derived from the dicts (single source of truth) instead of
# hand-maintained copies that could drift out of sync with them.
train_dfs = list(train_dicts.values())
test_dfs = list(test_dicts.values())

# One record per candidate dataset: fitted coefficients plus test-set metrics.
results = []

for df_index, train_name in enumerate(train_names):
  # Fresh estimator per dataset so nothing carries over between fits.
  model = LinearRegression()
  train_df = train_dfs[df_index]
  model.fit(train_df, y_train)
  y_train_pred = model.predict(train_df)  # train-set predictions; not used in the metrics below
  y_test_pred = model.predict(test_dfs[df_index])

  results.append(
    {
        "index": df_index,
        "train_df": train_name,
        "Coefficient": model.coef_,
        "MAE": round(mean_absolute_error(y_test, y_test_pred), 6),
        "RMSE": round(root_mean_squared_error(y_test, y_test_pred), 6),
        "R2_score": round(r2_score(y_test, y_test_pred), 6)
    }
  )

# Rank the candidates from highest to lowest test-set R2.
# NOTE(review): the ranking uses test-set metrics, so the winner's reported
# score is an optimistic estimate; consider a validation split or
# cross-validation on the training data for this choice.
# NOTE(review): the recorded output shows a coefficient of about -5.9e15 for the
# top-ranked set, which may point to collinear features -- worth checking.
resultados = sorted(results, key = lambda x: x["R2_score"], reverse = True)
resultados

[{'index': 2,
  'train_df': 'X_train_with_outliers_norm_sel',
  'Coefficient': array([[-5.88321872e+15, -5.74400000e+03,  7.86650000e+03,
          -1.11947500e+04, -2.34465000e+04, -6.22368750e+03,
          -5.29687500e+02, -5.62593750e+02, -5.43518750e+03,
          -4.89108594e+03,  2.30251172e+03, -7.97413672e+03,
          -1.24968750e+02, -2.69718750e+02, -3.03250000e+02,
           1.44093750e+02, -6.17187500e+02, -5.73896094e+03,
          -1.54246094e+03, -9.27664062e+02,  1.05953125e+02,
           1.66234375e+02, -8.65937500e+01,  1.61656250e+02,
          -7.12803125e+03, -1.25495391e+04, -2.24588398e+04,
          -2.70297070e+04,  2.95762305e+03,  1.01695312e+02,
          -2.61546875e+02,  2.58125000e+01,  9.35437500e+02,
          -8.31734375e+02, -5.43541992e+03, -5.67971875e+03,
           3.39892578e+02,  3.59296875e+01, -7.85117188e+01,
           1.79000000e+02, -2.67406250e+02,  3.12406250e+02,
          -1.90632812e+02, -4.66562500e+01,  1.35036719e+03,
        

In [None]:
# Report the top-ranked entry of `resultados` (index 0 of the list sorted by
# descending R2_score) as a small text box.
best_run = resultados[0]
report_lines = [
    f"The best train dataframe is |{best_run['train_df']}|.",
    "======================================================      ",
    f"| MAE: {best_run['MAE']}   |",
    "----------------------",
    f"| RMSE: {best_run['RMSE']}    |",
    "----------------------",
    f"| R2_score: {best_run['R2_score']} |",
    "======================",
]
print("\n".join(report_lines))

The best train dataframe is |X_train_with_outliers_norm_sel|.
| MAE: 504.909081   |
----------------------
| RMSE: 1114.602982    |
----------------------
| R2_score: 0.995716 |
