In [53]:
import os
import pandas as pd

# Extraer datos

BASE_PATH = "../data/processed"
TRAIN_PATHS = [
    "X_train_con_outliers.xlsx",
    "X_train_sin_outliers.xlsx",
    "X_train_con_outliers_norm_standard.xlsx",
    "X_train_sin_outliers_norm_standard.xlsx",
    "X_train_con_outliers_norm_minmax.xlsx",
    "X_train_sin_outliers_norm_minmax.xlsx"
]

TRAIN_DATASETS = []
for path in TRAIN_PATHS:
    TRAIN_DATASETS.append(
        # pd.read_excel(BASE_PATH + "/" + path)
        pd.read_excel(f"{BASE_PATH}/{path}")
        # pd.read_excel(os.path.join(BASE_PATH, path))
    )

TEST_PATHS = [
    "X_test_con_outliers.xlsx",
    "X_test_sin_outliers.xlsx",
    "X_test_con_outliers_norm_standard.xlsx",
    "X_test_sin_outliers_norm_standard.xlsx",
    "X_test_con_outliers_norm_minmax.xlsx",
    "X_test_sin_outliers_norm_minmax.xlsx"
]

TEST_DATASETS = []
for path in TEST_PATHS:
    TEST_DATASETS.append(
        pd.read_excel(f"{BASE_PATH}/{path}")
    )

y_train = pd.read_excel(f"{BASE_PATH}/y_train.xlsx")
y_test = pd.read_excel(f"{BASE_PATH}/y_test.xlsx")

In [54]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# crea el modelo, entrena el modelo, guardas el target de train y de test. Sacar la precisión para comparar cual es el mejor de todos
results_mae = []
results_r2_score = []
for index, dataset in enumerate(TRAIN_DATASETS):
    print(index)
    model = LinearRegression()
    model.fit(dataset, y_train)
    print(f"Intercepto (a): {model.intercept_}")
    print(f"Coeficientes (b): {model.coef_}")
    y_pred_train = model.predict(dataset)   # x_train
    y_pred_test = model.predict(TEST_DATASETS[index])   # x_text

    results_mae.append(
        {
            "train": mean_absolute_error(y_train, y_pred_train),
            "test": mean_absolute_error(y_test, y_pred_test)
        }
    )
    results_r2_score.append(
        {
            "train": r2_score(y_train, y_pred_train),
            "test": r2_score(y_test, y_pred_test)
        }
        )
    # print tiempo que ha tardado
results_mae


0
Intercepto (a): [-55876.314956]
Coeficientes (b): [[ 7.07092522e-02 -1.99382137e-01  9.42200224e+01 -1.40666153e-01
   4.60542910e+01  6.47543591e-02  9.36758910e+01 -2.92046203e-01
  -7.33803720e+01  9.80222715e-02 -7.00308557e+01 -2.49385507e-01
   1.33340768e+02  1.77489527e-01  4.43965372e+01  4.59525274e-01
  -1.17817541e+02  1.52397808e-01 -1.50458748e+02 -1.60123661e-02
   2.37443368e+01 -3.60628530e-02  3.21438167e+01  8.00893062e-02
  -4.65087572e+00 -1.40826242e-02 -8.71870055e+01  1.22412001e-01
   3.27281427e+01 -6.56342744e-02  3.22158203e+00  7.07092853e-02
   5.87497975e-01  2.86688831e-01 -8.99951811e+02  9.12780790e+02
   8.93828072e+02  2.62718330e-01  2.32608842e-01  4.68073204e-01
   3.10307414e-01  4.34508993e+02  4.59492602e+02  3.83316518e+02
   4.56662903e+02  8.20437048e-02 -2.95057909e+01  3.54529291e+01
  -6.02080035e+01 -6.37819329e-03 -1.19359556e-01  1.06603093e-01
  -3.92691317e-02  7.37851003e-02 -1.13054249e-01  1.40523238e+01
  -6.37819329e-03  3.159

[{'train': 944.783558302352, 'test': 1011.5766258958581},
 {'train': 33616.45861320671, 'test': 31569.727515476345},
 {'train': 944.9161183505691, 'test': 1014.5280218796536},
 {'train': 33616.45861319524, 'test': 31569.727515469764},
 {'train': 944.9161077608728, 'test': 1014.5280085278165},
 {'train': 33616.45861319543, 'test': 31569.72751546982}]

In [55]:
results_r2_score

[{'train': 0.9997093648861551, 'test': 0.9987077858524684},
 {'train': 0.44396236191263805, 'test': 0.4696487146626598},
 {'train': 0.999709550504631, 'test': 0.9987023194872439},
 {'train': 0.44396236191263805, 'test': 0.4696487146627659},
 {'train': 0.9997095505022927, 'test': 0.9987023194937983},
 {'train': 0.4439623619126378, 'test': 0.4696487146627697}]

Se observa que se han obtenido valores casi perfectos cuando se trata de con outliers, sin importar si está con los datos en bruto, si se encuentra normalizado estándar o si se encuentra con el escalado min-max. Se escoge el que corresponde a la 2a posición, es decir, con_outliers_norm_standard.

In [56]:
from sklearn import linear_model
from sklearn.model_selection import ParameterGrid, GridSearchCV, RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore")
path_train = "X_train_con_outliers_norm_standard.xlsx"
path_test = "X_test_con_outliers_norm_standard.xlsx"
winner_train = pd.read_excel(f"{BASE_PATH}/{path_train}")
winner_test = pd.read_excel(f"{BASE_PATH}/{path_test}")

# Regularized Linear Regression - Ridge Regression

In [None]:
ridge = linear_model.Ridge(random_state=42)
# Definir el grid de hiperparámetros
param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
    'solver': ['auto', 'saga', 'lsqr'],
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'max_iter': [100, 500, 1000, 5000],
    'tol': [1e-4, 1e-3, 1e-2]
}

# GridSearchCV para encontrar los mejores parámetros
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='r2')
train_regularized_linear_regression_ridge = grid_search.fit(winner_train, y_train)

final_model_ridge = train_regularized_linear_regression_ridge.best_estimator_
final_model_ridge
# Documentación: Se obtienen los coeficientes y el intercept_


In [58]:
y_pred_train_ridge = final_model_ridge.predict(winner_train)
r2_score(y_pred_train_ridge, y_train)

0.9997012717744266

In [59]:
y_pred_test_ridge = final_model_ridge.predict(winner_test)
r2_score(y_pred_test_ridge, y_test)

0.998811760900234

# Regularized Linear Regression - Lasso Regression

In [60]:
lasso = linear_model.Lasso(random_state = 42)
# Definir el grid de hiperparámetros
param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0, 100.0],  # Regularización
    'fit_intercept': [True, False],          # Incluir o no el intercepto
    'copy_X': [True, False],                 # Copiar datos o modificarlos en memoria
    'max_iter': [100, 500, 1000, 5000],      # Número de iteraciones
    'tol': [1e-4, 1e-3, 1e-2]               # Tolerancia de convergencia
}

# GridSearchCV para encontrar los mejores parámetros
grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='r2')
train_regularized_linear_regression_lasso = grid_search.fit(winner_train, y_train)

final_model_lasso = train_regularized_linear_regression_lasso.best_estimator_
final_model_lasso


In [61]:
y_pred_train_lasso = final_model_lasso.predict(winner_train)
r2_score(y_pred_train_lasso, y_train)

0.9996638646805973

In [62]:
y_pred_test_lasso = final_model_lasso.predict(winner_test)
r2_score(y_pred_test_lasso, y_test)

0.9989295554786901

# Regularized Linear Regression - Elastic-Net

In [65]:
elasticnet = linear_model.ElasticNet(random_state = 42)
param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0, 100.0],  # Regularización (más alto = más penalización)
    'l1_ratio': [0.1, 0.5, 0.9, 1.0],        # Proporción L1 (Lasso) vs. L2 (Ridge)
    'fit_intercept': [True, False],          # Incluir o no el intercepto
    'copy_X': [True, False],                 # Copiar datos o modificarlos en memoria
    'max_iter': [100, 500, 1000, 5000],      # Número de iteraciones
    'tol': [1e-4, 1e-3, 1e-2]               # Tolerancia de convergencia
}
# GridSearchCV para encontrar los mejores parámetros
grid_search = GridSearchCV(elasticnet, param_grid, cv=5, scoring='r2')
train_regularized_linear_regression_elasticnet = grid_search.fit(winner_train, y_train)

final_model_elasticnet = train_regularized_linear_regression_elasticnet.best_estimator_
final_model_elasticnet



In [66]:
y_pred_train_elasticnet = final_model_elasticnet.predict(winner_train)
r2_score(y_pred_train_elasticnet, y_train)

0.9996638646805973

In [67]:
y_pred_test_elasticnet = final_model_elasticnet.predict(winner_test)
r2_score(y_pred_test_elasticnet, y_test)

0.9989295554786901