In [37]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import root_mean_squared_error
import warnings

def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn # ignore annoying warning (from sklearn and seaborn)

1 decision making


In [38]:
# Train data frames
X_train_with_outliers_sel = pd.read_csv('../data/processed/X_train_with_outliers_sel.csv')
X_train_without_outliers_sel = pd.read_csv('../data/processed/X_train_without_outliers_sel.csv')
X_train_with_outliers_norm_sel = pd.read_csv('../data/processed/X_train_with_outliers_norm_sel.csv')
X_train_without_outliers_norm_sel = pd.read_csv('../data/processed/X_train_without_outliers_norm_sel.csv')
X_train_with_outliers_minmax_sel = pd.read_csv('../data/processed/X_train_with_outliers_minmax_sel.csv')
X_train_without_outliers_minmax_sel = pd.read_csv('../data/processed/X_train_without_outliers_minmax_sel.csv')
y_train = pd.read_csv('../data/processed/y_train.csv')

# Test data frames
X_test_with_outliers_sel = pd.read_csv('../data/processed/X_test_with_outliers_sel.csv')
X_test_without_outliers_sel = pd.read_csv('../data/processed/X_test_without_outliers_sel.csv')
X_test_with_outliers_norm_sel = pd.read_csv('../data/processed/X_test_with_outliers_norm_sel.csv')
X_test_without_outliers_norm_sel = pd.read_csv('../data/processed/X_test_without_outliers_norm_sel.csv')
X_test_with_outliers_minmax_sel = pd.read_csv('../data/processed/X_test_with_outliers_minmax_sel.csv')
X_test_without_outliers_minmax_sel = pd.read_csv('../data/processed/X_test_without_outliers_minmax_sel.csv')
y_test = pd.read_csv('../data/processed/y_test.csv')

In [39]:
train_dicts = {
  "X_train_with_outliers_sel": X_train_with_outliers_sel,
  "X_train_without_outliers_sel": X_train_without_outliers_sel,
  "X_train_with_outliers_norm_sel": X_train_with_outliers_norm_sel,
  "X_train_without_outliers_norm_sel": X_train_without_outliers_norm_sel,
  "X_train_with_outliers_minmax_sel": X_train_with_outliers_minmax_sel,
  "X_train_without_outliers_minmax_sel": X_train_without_outliers_minmax_sel
}

test_dicts = {
  "X_test_with_outliers_sel": X_test_with_outliers_sel,
  "X_test_without_outliers_sel": X_test_without_outliers_sel,
  "X_test_with_outliers_norm_sel": X_test_with_outliers_norm_sel,
  "X_test_without_outliers_norm_sel": X_test_without_outliers_norm_sel,
  "X_test_with_outliers_minmax_sel": X_test_with_outliers_minmax_sel,
  "X_test_without_outliers_minmax_sel": X_test_without_outliers_minmax_sel
}

train_dfs = [
  X_train_with_outliers_sel,
  X_train_without_outliers_sel,
  X_train_with_outliers_norm_sel,
  X_train_without_outliers_norm_sel,
  X_train_with_outliers_minmax_sel,
  X_train_without_outliers_minmax_sel
]
test_dfs = [
  X_test_with_outliers_sel,
  X_test_without_outliers_sel,
  X_test_with_outliers_norm_sel,
  X_test_without_outliers_norm_sel,
  X_test_with_outliers_minmax_sel,
  X_test_without_outliers_minmax_sel
]

results = []

for df_index in range(len(train_dfs)):
  model = LinearRegression()
  train_df = train_dfs[df_index]
  model.fit(train_df, y_train)
  y_train_pred = model.predict(train_df)
  y_test_pred = model.predict(test_dfs[df_index])

  results.append(
    {
        "index": df_index,
        "train_df": list(train_dicts.keys())[df_index],
        "Coefficient": model.coef_,
        "MAE": round(mean_absolute_error(y_test, y_test_pred), 6),
        "RMSE": round(root_mean_squared_error(y_test, y_test_pred), 6),
        "R2_score": round(r2_score(y_test, y_test_pred), 6)
    }
  )

resultados = sorted(results, key = lambda x: x["R2_score"], reverse = True)
resultados

[{'index': 0,
  'train_df': 'X_train_with_outliers_sel',
  'Coefficient': array([[-1.26863070e-01, -1.14565523e-01,  5.29116846e-02,
          -3.60727093e-01,  3.58260495e-02, -2.53915172e-01,
           2.12524333e-01,  4.41728882e-01,  1.88399410e-01,
          -5.07164408e-03, -2.42455646e-02,  6.54236190e-02,
          -9.44178867e-03,  9.86828583e-02, -5.00279575e-02,
           7.53195766e-02,  6.04350669e-01,  2.61295803e-01,
           1.57215554e+01, -6.88566188e+02, -7.01048698e+02,
           6.97217800e+02,  2.72439309e-01,  2.51622944e-01,
           4.55992847e-01,  3.16251439e-01,  8.33874623e-02,
          -2.12058919e+01,  1.92308618e+01, -2.80550748e+01,
          -9.99944511e-03, -1.45074958e-01,  1.25076051e-01,
          -1.96344974e-02,  5.08414226e-02, -7.04759086e-02,
          -4.69170409e+01, -9.99944597e-03,  1.07842114e+01,
           6.13898585e+00,  1.39691807e+00, -1.03373855e+02,
           1.13600697e+02, -1.25982353e+02,  1.99382962e+02,
          -2.

In [40]:
print (f"The best train dataframe is |{resultados[0]['train_df']}|.\n\
======================================================      \n\
| MAE: {resultados[0]['MAE']}   |\n\
----------------------\n\
| RMSE: {resultados[0]['RMSE']}    |\n\
----------------------\n\
| R2_score: {resultados[0]['R2_score']} |\n\
======================")

The best train dataframe is |X_train_with_outliers_sel|.
| MAE: 1019.249559   |
----------------------
| RMSE: 2136.97424    |
----------------------
| R2_score: 0.99875 |


## Lasso

In [41]:
from sklearn.linear_model import Lasso

# Load of train and test data
# These data must have been standardized and correctly processed in a complete EDA

lasso_model = Lasso(alpha = 0.1, max_iter = 300)

lasso_model.fit(train_dfs[0], y_train)

y_pred = lasso_model.predict(y_test)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- anycondition_number
Feature names seen at fit time, yet now missing:
- 0-9
- 19-Oct
- 20-29
- 30-39
- 40-49
- ...


## Ridge

In [None]:
from sklearn.linear_model import Ridge

# Load of train and test data
# These data must have been standardized and correctly processed in a complete EDA

ridge_model = Ridge(alpha = 0.1, max_iter = 300)

ridge_model.fit(train_dfs[0], y_train)

y_pred = ridge_model.predict(y_test)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- anycondition_number
Feature names seen at fit time, yet now missing:
- 0-9
- 19-Oct
- 20-29
- 30-39
- 40-49
- ...
