# Regularized Linear Regression

---
**Purpose:**

- A version of linear regression that includes a penalty term to prevent overfitting, making the model more generalizable to new data.

**Types of Regularization:**

- **L1 Regularization (Lasso Regression):**

    - Adds a penalty proportional to the sum of the absolute values of the coefficients, which can shrink some coefficients exactly to zero, effectively performing feature selection.

- **L2 Regularization (Ridge Regression):**

    - Adds a penalty proportional to the sum of the squared coefficients, which reduces the magnitude of the coefficients without making any of them exactly zero.

- **Elastic Net:**

    - A combination of both L1 and L2 regularization.

**Why use Regularization?**

- It helps in dealing with multicollinearity and improves the model’s ability to generalize by preventing it from fitting the noise in the training data.

**Impact on Model:**


- Regularization adds a complexity penalty to the loss function, helping to manage the trade-off between bias and variance.
---

## Imported Libraries

In [1]:
# Data processing
# ==================================================================================
import pandas as pd
import numpy as np

# Preprocessing and modeling
# ==================================================================================
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import root_mean_squared_error

# Warnings Configuration
# ==================================================================================
import warnings

def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Retained only so existing references to ``ignore_warn`` keep working; it is
    no longer installed in place of ``warnings.warn``.

    Returns:
        None, always.
    """
    return None


# Silence warnings (e.g. from sklearn and seaborn) through the standard filter
# machinery instead of overwriting ``warnings.warn``: the stdlib function stays
# intact, the setting can be undone with ``warnings.resetwarnings()`` or scoped
# with ``warnings.catch_warnings()``, and code that imported ``warn`` by name is
# covered as well.
# NOTE(review): this hides every warning category; consider narrowing the
# filter so that potentially useful warnings remain visible.
warnings.filterwarnings("ignore")

---

## Decision making: Which is the best dataset?


In [2]:
# CSV locations, listed in the exact order in which they are read.
_train_csv_paths = (
    '../data/processed/X_train_with_outliers_sel.csv',
    '../data/processed/X_train_without_outliers_sel.csv',
    '../data/processed/X_train_with_outliers_norm_sel.csv',
    '../data/processed/X_train_without_outliers_norm_sel.csv',
    '../data/processed/X_train_with_outliers_minmax_sel.csv',
    '../data/processed/X_train_without_outliers_minmax_sel.csv',
    '../data/processed/y_train.csv',
)
_test_csv_paths = (
    '../data/processed/X_test_with_outliers_sel.csv',
    '../data/processed/X_test_without_outliers_sel.csv',
    '../data/processed/X_test_with_outliers_norm_sel.csv',
    '../data/processed/X_test_without_outliers_norm_sel.csv',
    '../data/processed/X_test_with_outliers_minmax_sel.csv',
    '../data/processed/X_test_without_outliers_minmax_sel.csv',
    '../data/processed/y_test.csv',
)

# Train data frames (six feature variants followed by the target)
(
    X_train_with_outliers_sel,
    X_train_without_outliers_sel,
    X_train_with_outliers_norm_sel,
    X_train_without_outliers_norm_sel,
    X_train_with_outliers_minmax_sel,
    X_train_without_outliers_minmax_sel,
    y_train,
) = map(pd.read_csv, _train_csv_paths)

# Test data frames (same layout as the train group)
(
    X_test_with_outliers_sel,
    X_test_without_outliers_sel,
    X_test_with_outliers_norm_sel,
    X_test_without_outliers_norm_sel,
    X_test_with_outliers_minmax_sel,
    X_test_without_outliers_minmax_sel,
    y_test,
) = map(pd.read_csv, _test_csv_paths)

In [3]:
# train_dicts (dict): variable name -> train feature frame
# =====================================================================================
train_dicts = dict(
    X_train_with_outliers_sel=X_train_with_outliers_sel,
    X_train_without_outliers_sel=X_train_without_outliers_sel,
    X_train_with_outliers_norm_sel=X_train_with_outliers_norm_sel,
    X_train_without_outliers_norm_sel=X_train_without_outliers_norm_sel,
    X_train_with_outliers_minmax_sel=X_train_with_outliers_minmax_sel,
    X_train_without_outliers_minmax_sel=X_train_without_outliers_minmax_sel,
)

# test_dicts (dict): variable name -> test feature frame
# =====================================================================================
test_dicts = dict(
    X_test_with_outliers_sel=X_test_with_outliers_sel,
    X_test_without_outliers_sel=X_test_without_outliers_sel,
    X_test_with_outliers_norm_sel=X_test_with_outliers_norm_sel,
    X_test_without_outliers_norm_sel=X_test_without_outliers_norm_sel,
    X_test_with_outliers_minmax_sel=X_test_with_outliers_minmax_sel,
    X_test_without_outliers_minmax_sel=X_test_without_outliers_minmax_sel,
)

# -.-.--.-.-.-.-.-.-.-.--.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.--.-.-.-.-.-.-

# train_dfs (list): the same frames as train_dicts, in insertion order
# =====================================================================================
train_dfs = list(train_dicts.values())

# test_dfs (list): the same frames as test_dicts, in insertion order
# =====================================================================================
test_dfs = list(test_dicts.values())

# -.-.--.-.-.-.-.-.-.-.--.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.--.-.-.-.-.-.-

# Print .shape
# =====================================================================================
# Every report line is collected first and then emitted one per print() call,
# in the same order as before.
banner_rule = "================================================================="

report_lines = (
    "|X_train|",
    banner_rule,
    f"X_train_with_outliers_sel shape: {X_train_with_outliers_sel.shape} ",
    f"X_train_without_outliers_sel shape: {X_train_without_outliers_sel.shape}",
    f"X_train_with_outliers_norm_sel shape: {X_train_with_outliers_norm_sel.shape}",
    f"X_train_without_outliers_norm_sel shape: {X_train_without_outliers_norm_sel.shape}",
    f"X_train_with_outliers_minmax_sel shape: {X_train_with_outliers_minmax_sel.shape}",
    f"X_train_without_outliers_minmax_sel shape: {X_train_without_outliers_minmax_sel.shape}\n",
    "|X_test|",
    banner_rule,
    f"X_test_with_outliers_sel shape: {X_test_with_outliers_sel.shape} ",
    f"X_test_without_outliers_sel shape: {X_test_without_outliers_sel.shape}",
    f"X_test_with_outliers_norm_sel shape: {X_test_with_outliers_norm_sel.shape}",
    f"X_test_without_outliers_norm_sel shape: {X_test_without_outliers_norm_sel.shape}",
    f"X_test_with_outliers_minmax_sel shape: {X_test_with_outliers_minmax_sel.shape}",
    f"X_test_without_outliers_minmax_sel shape: {X_test_without_outliers_minmax_sel.shape}\n",
    "|Y_train|",
    banner_rule,
    f"y_train shape: {y_train.shape}\n ",
    "|Y_test|",
    banner_rule,
    f"y_test shape: {y_test.shape} ",
)

for report_line in report_lines:
    print(report_line)

|X_train|
X_train_with_outliers_sel shape: (2512, 65) 
X_train_without_outliers_sel shape: (2512, 65)
X_train_with_outliers_norm_sel shape: (2512, 65)
X_train_without_outliers_norm_sel shape: (2512, 65)
X_train_with_outliers_minmax_sel shape: (2512, 65)
X_train_without_outliers_minmax_sel shape: (2512, 65)

|X_test|
X_test_with_outliers_sel shape: (628, 65) 
X_test_without_outliers_sel shape: (628, 65)
X_test_with_outliers_norm_sel shape: (628, 65)
X_test_without_outliers_norm_sel shape: (628, 65)
X_test_with_outliers_minmax_sel shape: (628, 65)
X_test_without_outliers_minmax_sel shape: (628, 65)

|Y_train|
y_train shape: (2512, 1)
 
|Y_test|
y_test shape: (628, 1) 


In [4]:
# Fit one plain LinearRegression per dataset variant and score each on its
# matching test frame, then rank the variants by R2 (best first).
results = []

# Train and test frames are paired by position, so both lists must line up.
assert len(train_dfs) == len(test_dfs), "train_dfs and test_dfs must have the same length"

# Dataset names in the same order as train_dfs; built once, not per iteration.
dataset_names = list(train_dicts)

for df_index, (train_df, test_df) in enumerate(zip(train_dfs, test_dfs)):
    model = LinearRegression()
    model.fit(train_df, y_train)
    y_test_pred = model.predict(test_df)

    results.append(
        {
            "index": df_index,
            "train_df": dataset_names[df_index],
            "Coefficient": model.coef_,
            "MAE": round(mean_absolute_error(y_test, y_test_pred), 6),
            "RMSE": round(root_mean_squared_error(y_test, y_test_pred), 6),
            "R2_score": round(r2_score(y_test, y_test_pred), 6)
        }
    )

# NOTE(review): the ranking is based on test-set metrics, so the held-out data
# also drives the dataset choice; consider a validation split or
# cross-validation for this decision.
resultados = sorted(results, key = lambda x: x["R2_score"], reverse = True)
resultados

[{'index': 0,
  'train_df': 'X_train_with_outliers_sel',
  'Coefficient': array([[ 1.43681900e+11,  1.43681900e+11,  1.43681900e+11,
           1.43681900e+11,  1.43681900e+11,  1.43681900e+11,
           1.43681900e+11,  1.43681900e+11,  1.43681900e+11,
          -2.05259857e+10, -2.05259857e+10, -2.05259857e+10,
          -2.05259857e+10, -2.05259857e+10, -2.05259857e+10,
          -1.23155914e+11,  6.09233856e-01,  2.60856628e-01,
          -7.46812094e+02, -7.57728906e+02,  7.53545944e+02,
           2.72506714e-01,  2.53040314e-01,  4.54620361e-01,
           3.15917969e-01,  8.38317871e-02, -2.82269641e+01,
          -8.57543945e-03, -1.44496918e-01,  1.23657227e-01,
          -2.48718262e-02,  5.69992065e-02, -6.66685104e-02,
          -5.20984752e+01, -1.02119446e-02,  1.05564502e+01,
           4.86564088e+00,  1.06042163e+00, -1.19950738e+02,
           1.35361221e+02,  5.79697059e+01, -2.78891273e+01,
          -2.42450625e-01,  2.24665668e+02,  2.77494503e+01,
          -2.

In [7]:
# Summary of the top-ranked entry in `resultados`, assembled from adjacent
# string literals and printed in a single call.
best_summary = (
    f"The best train dataframe is |{resultados[0]['train_df']}|.\n"
    "======================================================      \n"
    f"| MAE: {resultados[0]['MAE']}   |\n"
    "----------------------\n"
    f"| RMSE: {resultados[0]['RMSE']}  |\n"
    "----------------------\n"
    f"| R2_score: {resultados[0]['R2_score']}  |\n"
    "======================"
)
print(best_summary)

The best train dataframe is |X_train_with_outliers_sel|.
| MAE: 1013.684615   |
----------------------
| RMSE: 2129.041321  |
----------------------
| R2_score: 0.99876  |


---

## Lasso

In [11]:
from sklearn.linear_model import Lasso

# Strength of the L1 penalty passed to Lasso.
alpha = 1.0
lasso_model = Lasso(alpha = alpha)

# Use the dataset ranked first in `resultados` instead of a hard-coded list
# position, so this cell follows the ranking if it ever changes.
best_index = resultados[0]["index"]

# Training the model
lasso_model.fit(train_dfs[best_index], y_train)

# We evaluate the performance of the model on the matching test data
score = lasso_model.score(test_dfs[best_index], y_test)
print("Coefficients:", lasso_model.coef_)
print("R2 score:", score)

Coefficients: [ 4.41679273e-03  3.64135798e-01  3.87879709e-01  3.21991157e-01
  7.22714734e-01  3.19642710e-01 -1.49131163e-03  4.61556051e-01
 -5.96267446e-02 -1.82623628e-01 -1.49177878e-01 -3.58067154e-02
 -1.92375445e-01  3.86404282e-01 -4.29968892e-01  4.25672848e-02
  1.29611895e-01  1.07906236e-01  3.81579871e+01 -3.06200820e+01
  2.51735294e+01  1.94280577e-01  8.58236205e-02  2.44346126e-01
  1.59993916e-02  2.21007680e-01 -4.02186480e+01 -3.20350917e-01
 -1.15351464e-01  2.13175873e-01 -7.54716180e-02  2.78646435e-04
 -1.55128996e-01 -1.23954623e+02  1.94522183e-01  1.11422400e+01
  1.46969431e+01 -1.00220134e+01 -1.00208743e+02  1.26979587e+02
  4.49099069e+01 -2.01752728e+01 -1.28778033e-01  3.11252639e+02
  2.55707895e+00 -2.64840909e+01  5.46344676e-01  1.54690589e+00
 -6.38924173e-02  6.30409804e-01 -1.76909067e+02  1.63845655e+02
 -8.49314901e-02  5.52169888e+02 -3.96969689e+02  7.76032066e-01
 -3.48015828e+02  1.16382069e-01 -3.00834952e+02  5.96896139e-01
 -4.0444570

---

## Ridge

In [12]:
from sklearn.linear_model import Ridge

# Strength of the L2 penalty passed to Ridge.
alpha = 1.0
ridge_model = Ridge(alpha = alpha)

# Use the dataset ranked first in `resultados` instead of a hard-coded list
# position, so this cell follows the ranking if it ever changes.
best_index = resultados[0]["index"]

# Training the model
ridge_model.fit(train_dfs[best_index], y_train)

# We evaluate the performance of the model on the matching test data
score_ridge= ridge_model.score(test_dfs[best_index], y_test)
print("Coefficients:", ridge_model.coef_)
print("R2 score:", score_ridge)

Coefficients: [[-1.38350015e-01 -1.12894826e-01  5.54619149e-02 -3.61764917e-01
   3.66128215e-02 -2.57123743e-01  2.18499625e-01  4.35634684e-01
   1.89938605e-01 -4.96319066e-03 -2.40959887e-02  6.70996978e-02
  -9.49942722e-03  9.12404952e-02 -4.60112495e-02  8.04048087e-02
   6.05884976e-01  2.59935096e-01 -4.88804674e+02 -4.99501186e+02
   4.95397031e+02  2.73685921e-01  2.55219785e-01  4.56464034e-01
   3.18060867e-01  8.39110121e-02 -2.80115615e+01 -9.68860610e-03
  -1.44431520e-01  1.24603740e-01 -2.17269715e-02  5.35329706e-02
  -7.00772743e-02 -5.16644414e+01 -1.01780206e-02  1.04829219e+01
   4.72131343e+00  1.23318147e+00 -1.19233681e+02  1.34754450e+02
   5.67119675e+01 -2.78834575e+01 -2.35542425e-01  2.24626946e+02
   2.78486485e+01 -2.41909792e+01 -5.67501412e+00 -3.97571289e+00
   1.03413518e-01 -3.16323211e-01 -1.20290829e+02  1.12840951e+02
  -3.12545726e-01  5.72910527e+02 -4.06883372e+02  7.89207845e-01
  -3.17097019e+02 -5.15118407e-01 -3.23847680e+02  1.12321770e