### Imported libraries

In [10]:
# Data manipulation

import pandas as pd
import numpy as np

# Modeling

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### Selection of the best dataset

In [2]:
# Load the six candidate training feature matrices (with / without outliers,
# each in a plain, "_norm" and "_scal" variant) in the order they are listed.
(
    X_train_WITH_outliers,
    X_train_WITH_outliers_norm,
    X_train_WITH_outliers_scal,
    X_train_WITHOUT_outliers,
    X_train_WITHOUT_outliers_norm,
    X_train_WITHOUT_outliers_scal,
) = (
    pd.read_excel(path)
    for path in (
        "../data/processed/X_train_WITH_outliers.xlsx",
        "../data/processed/X_train_WITH_outliers_norm.xlsx",
        "../data/processed/X_train_WITH_outliers_scal.xlsx",
        "../data/processed/X_train_WITHOUT_outliers.xlsx",
        "../data/processed/X_train_WITHOUT_outliers_norm.xlsx",
        "../data/processed/X_train_WITHOUT_outliers_scal.xlsx",
    )
)

# Target values for the train and test splits.
y_train, y_test = (
    pd.read_excel(path)
    for path in (
        "../data/processed/y_train.xlsx",
        "../data/processed/y_test.xlsx",
    )
)

In [12]:
# Candidate training matrices, keyed by the stem of the file each one was loaded from.
train_datasets = dict(
    X_train_WITH_outliers=X_train_WITH_outliers,
    X_train_WITH_outliers_norm=X_train_WITH_outliers_norm,
    X_train_WITH_outliers_scal=X_train_WITH_outliers_scal,
    X_train_WITHOUT_outliers=X_train_WITHOUT_outliers,
    X_train_WITHOUT_outliers_norm=X_train_WITHOUT_outliers_norm,
    X_train_WITHOUT_outliers_scal=X_train_WITHOUT_outliers_scal,
)

# Fitted estimator and its scores per candidate, keyed like `train_datasets`.
models, metrics = {}, {}

# Fit one LinearRegression per candidate and score it on the same rows it was
# fitted on (training-set MSE / R2, not a held-out evaluation).
for name, dataset in train_datasets.items():
    model = LinearRegression().fit(dataset, y_train)
    y_pred = model.predict(dataset)

    mse, r2 = mean_squared_error(y_train, y_pred), r2_score(y_train, y_pred)

    models[name], metrics[name] = model, {"MSE": mse, "R2": r2}
    print(f"{name}: MSE={mse:.2f}, R2={r2:.4f}")

# The candidate with the highest training R2 wins; on a tie max() returns the
# first maximal entry in insertion order.
best_dataset_name = max(metrics.items(), key=lambda entry: entry[1]["R2"])[0]
best_X_train = train_datasets[best_dataset_name]

# The matching test matrix is read from the file with the same stem, with
# "X_train" swapped for "X_test".
best_test_name = best_dataset_name.replace("X_train", "X_test")
best_X_test = pd.read_excel(f"../data/processed/{best_test_name}.xlsx")

print(
    "\n================ CONCLUSION ================",
    f"Best dataset: {best_dataset_name}",
    f"MSE: {metrics[best_dataset_name]['MSE']:.2f}",
    f"R2: {metrics[best_dataset_name]['R2']:.4f}",
    sep="\n",
)

X_train_WITH_outliers: MSE=36755158.39, R2=0.7539
X_train_WITH_outliers_norm: MSE=36755158.39, R2=0.7539
X_train_WITH_outliers_scal: MSE=36755158.39, R2=0.7539
X_train_WITHOUT_outliers: MSE=36736235.95, R2=0.7540
X_train_WITHOUT_outliers_norm: MSE=36736235.95, R2=0.7540
X_train_WITHOUT_outliers_scal: MSE=36736235.95, R2=0.7540

Best dataset: X_train_WITHOUT_outliers
MSE: 36736235.95
R2: 0.7540


### Hyperparameter optimization

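# **The linear regression model cannot be optimized: `LinearRegression` exposes no regularization hyperparameters (such as `C` or `penalty`) to search over, so the search cells below are disabled (wrapped in string literals).**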

In [4]:
# Search space referenced by the RandomizedSearchCV cell further down, where it
# is passed together with a LogisticRegression estimator.
params = dict(
    penalty=["l1", "l2"],
    C=[0.1, 1.0, 10.0],
    solver=["liblinear", "saga"],
)

# Smaller grid ("prueba" = trial) referenced by the GridSearchCV cell further down.
params_prueba = dict(
    C=[0.1, 1.0],
    solver=["liblinear"],
    penalty=["l2"],
)

#### Grid Search

In [None]:
# Disabled cell: the entire body is one triple-quoted string literal, so none of
# the statements inside it are executed.
# The text is a GridSearchCV over LogisticRegression using `params_prueba` and
# accuracy scoring, fitted on `best_X_train` / `y_train`.
# NOTE(review): LogisticRegression is not imported in the imports cell visible in
# this notebook; it would have to be imported before this cell could be
# re-enabled. Confirm whether a classifier search belongs in this regression
# notebook at all.
"""grid_search = GridSearchCV(LogisticRegression(random_state=10, max_iter=500), params_prueba, scoring = "accuracy", n_jobs = -1)
grid_search.fit(best_X_train, y_train) # Entreno el optimizador con el dataset GANADOR
print("\n===== GridSearchCV RESULTS =====")
print("Best params:", grid_search.best_params_)
best_model_grid = grid_search.best_estimator_
print("The best model is: ", best_model_grid)
print("The score for this model is: ", grid_search.best_score_)"""

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



===== GridSearchCV RESULTS =====
Best params: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
The best model is:  LogisticRegression(max_iter=500, penalty='l2', random_state=10,
                   solver='liblinear')
The score for this model is:  0.8974195506982392


  y = column_or_1d(y, warn=True)


#### Random Search

In [None]:
# Disabled cell: the entire body is one triple-quoted string literal, so none of
# the statements inside it are executed.
# The text is a RandomizedSearchCV (n_iter = 5) over LogisticRegression using
# `params` and accuracy scoring, fitted on `best_X_train` / `y_train`.
# NOTE(review): LogisticRegression is not imported in the imports cell visible in
# this notebook, and no random_state is passed to RandomizedSearchCV itself, so
# the sampled candidates would presumably vary between runs — confirm before
# re-enabling.
"""random_search = RandomizedSearchCV(LogisticRegression(random_state=10, max_iter=500), params, scoring = "accuracy", n_iter = 5, n_jobs = -1)
random_search.fit(best_X_train, y_train) # Entreno el optimizador con el dataset GANADOR
print("RandomizedSearchCV best params:", random_search.best_params_)
best_model_random = random_search.best_estimator_
print("The best model is: ",best_model_random)
print("The score for this model is: ",random_search.best_score_)"""

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


RandomizedSearchCV best params: {'solver': 'saga', 'penalty': 'l1', 'C': 1.0}
The best model is:  LogisticRegression(max_iter=500, penalty='l1', random_state=10, solver='saga')
The score for this model is:  0.89802671523983




In [None]:
# Disabled cell: the entire body is one triple-quoted string literal, so none of
# the statements inside it are executed.
# The text fits a LogisticRegression on `best_X_train` / `y_train` and reports
# accuracy_score on both the training data and `best_X_test` / `y_test`.
# NOTE(review): neither LogisticRegression nor accuracy_score is imported in the
# imports cell visible in this notebook. Also, l1_ratio=1 is passed without an
# explicit penalty; check how the installed scikit-learn version interprets that
# before re-enabling.
"""final_model = LogisticRegression(l1_ratio=1, random_state=10, solver='saga')
final_model.fit(best_X_train, y_train)
y_pred_train = final_model.predict(best_X_train)
metric_train = accuracy_score(y_train, y_pred_train)
print(f"The metric for X_train is: {metric_train:.4f}")

y_pred_test = final_model.predict(best_X_test)
metric_test = accuracy_score(y_test, y_pred_test)
print(f"The metric for X_test is: {metric_test:.4f}")"""

  y = column_or_1d(y, warn=True)


The metric for X_train is: 0.8965
The metric for X_test is: 0.9017


