# Proceso de Machine Learning

## Selección del mejor Dataset

In [18]:
import pandas as pd

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

import warnings
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning

# Silence noisy warnings for the whole notebook session.
# NOTE(review): these filters are global. Ignoring FitFailedWarning and every
# UserWarning also hides hyper-parameter combinations that fail to fit during
# the searches below -- confirm that this is intended.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Candidate training feature sets: with (CON) and without (SIN) outliers, each
# one in three variants (plain, "_norm" and "_scal").
X_train_CON_outliers = pd.read_excel("../data/processed/X_train_CON_outliers.xlsx")
X_train_CON_outliers_norm = pd.read_excel("../data/processed/X_train_CON_outliers_norm.xlsx")
X_train_CON_outliers_scal = pd.read_excel("../data/processed/X_train_CON_outliers_scal.xlsx")
X_train_SIN_outliers = pd.read_excel("../data/processed/X_train_SIN_outliers.xlsx")
X_train_SIN_outliers_norm = pd.read_excel("../data/processed/X_train_SIN_outliers_norm.xlsx")
X_train_SIN_outliers_scal = pd.read_excel("../data/processed/X_train_SIN_outliers_scal.xlsx")

y_train = pd.read_excel("../data/processed/y_train.xlsx")

# read_excel returns a DataFrame; squeeze it so the estimators receive a 1-D
# target (presumably the file holds a single column -- TODO confirm).
y_train = y_train.squeeze()

# A plain linear regression (not a logistic one) is fitted on every candidate
# dataset; the dataset with the highest training R^2 is kept.
datasets = [X_train_CON_outliers,
    X_train_CON_outliers_norm,
    X_train_CON_outliers_scal,
    X_train_SIN_outliers,
    X_train_SIN_outliers_norm,
    X_train_SIN_outliers_scal
    ]

models = []
metrics = []

for dataset in datasets:

    model = LinearRegression()
    model.fit(dataset, y_train)
    metric = model.score(dataset, y_train)  # coefficient of determination (R^2)
    metrics.append(metric)
    models.append(model)


best_metric = max(metrics)
best_index = metrics.index(best_metric)

# Report the error and R^2 of the *selected* dataset. Reusing the variables
# left over from the last loop iteration would describe the last dataset in
# the list instead of the best one.
y_pred = models[best_index].predict(datasets[best_index])

print(f"El mejor dataset es: \n{datasets[best_index]}")
print(f"Error cuadrático medio: {mean_squared_error(y_train, y_pred)}")
print(f"Coeficiente de determinación: {best_metric}")

El mejor dataset es: 
      age  sex_n     bmi  children  smoker_n  region_n
0      19      1  35.530         0         1         2
1      50      1  27.455         1         1         3
2      18      0  30.115         0         1         3
3      18      1  34.100         0         1         1
4      53      1  29.480         0         1         1
...   ...    ...     ...       ...       ...       ...
1065   42      0  41.325         1         1         3
1066   20      0  31.920         0         1         2
1067   51      0  25.800         1         1         0
1068   42      1  34.100         0         1         0
1069   44      1  34.320         1         1         1

[1070 rows x 6 columns]
Error cuadrático medio: 35017902.935242556
Coeficiente de determinación: 0.7635424927625618


## Regularización

In [19]:
# Search space for the regularised linear models.
# Maps a label to the base estimator ("model") and to the hyper-parameter grid
# ("params") that the search cells hand to GridSearchCV / RandomizedSearchCV.
diccionario_modelos = {
    "lasso_model": {
        "model": Lasso(random_state=10),
        "params": {
                       "fit_intercept": [True, False],
                       "alpha": [0.001, 0.01, 0.1, 1, 10, 100],
                       "max_iter": [100, 300, 400, 500],
                       "selection": ["cyclic", "random"],
                       "tol": [0.1, 0.0001, 0.000001]
                  }
    },

    "ridge_model": {
        "model": Ridge(random_state=10),
        "params": {
                       "fit_intercept": [True, False],
                       "alpha": [0.001, 0.01, 0.1, 1, 10, 100],
                       "max_iter": [100, 300, 400, 500],
                       "tol": [0.1, 0.0001, 0.000001],
                       # "lbfgs" is left out on purpose: Ridge only accepts it
                       # together with positive=True, so every candidate using
                       # it would fail to fit and be scored as NaN.
                       "solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
                  }
    },

    "elasticNet_model": {
        "model": ElasticNet(random_state=10),
        "params": {
                       "fit_intercept": [True, False],
                       "alpha": [0.001, 0.01, 0.1, 1, 10, 100],
                       "max_iter": [100, 300, 400, 500],
                       "l1_ratio": [0, 0.5, 1],
                       "selection": ["cyclic", "random"],
                       "tol": [0.1, 0.0001, 0.000001]
                  }
    }
}

### Grid search

In [20]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over every grid in `diccionario_modelos`, keeping the
# estimator with the highest cross-validated R^2.
#
# The running best starts at -inf / None rather than 0 / "": R^2 can be zero
# or negative, and with a 0 threshold such a run would select nothing and
# print an empty string as if it were a model.
best_scored_model = float("-inf")
best_model_gr = None
name_model = None

for name, info in diccionario_modelos.items():
    grid = GridSearchCV(info["model"], info["params"], scoring="r2", n_jobs = -1)
    grid.fit(datasets[best_index], y_train)

    # best_score_ is the mean cross-validated score of the best candidate.
    if grid.best_score_ > best_scored_model:
        best_scored_model = grid.best_score_
        best_model_gr = grid.best_estimator_
        name_model = name

print("El mejor modelo es: ", name_model)
print("El mejor estimador es: ", best_model_gr)
print("La mejor puntuación es: ", best_scored_model)

El mejor modelo es:  lasso_model
El mejor estimador es:  Lasso(alpha=100, max_iter=100, random_state=10, tol=0.1)
La mejor puntuación es:  0.7566790388544054


### Random search

In [21]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised search: samples 30 candidates per model from the same grids and
# keeps the estimator with the highest cross-validated R^2.
#
# Changes with respect to a naive copy of the grid-search cell:
#   * the winner is stored in `best_model_rnd`, which is initialised here, and
#     `best_model_gr` (the grid-search winner) is no longer overwritten;
#   * the running best starts at -inf so non-positive R^2 scores still select
#     a model instead of leaving `best_model_rnd` undefined;
#   * `random_state` is fixed so the sampled candidates are reproducible.
best_scored_model = float("-inf")
best_model_rnd = None
name_model = None

for name, info in diccionario_modelos.items():
    grid = RandomizedSearchCV(
        info["model"],
        info["params"],
        scoring="r2",
        n_iter = 30,
        n_jobs = -1,
        random_state = 10,  # same seed the base estimators use
    )
    grid.fit(datasets[best_index], y_train)

    if grid.best_score_ > best_scored_model:
        best_scored_model = grid.best_score_
        best_model_rnd = grid.best_estimator_
        name_model = name

print("El mejor modelo es: ", name_model)
print("El mejor estimador es: ", best_model_rnd)
print("La mejor puntuación es: ", best_scored_model)

El mejor modelo es:  lasso_model
El mejor estimador es:  Lasso(alpha=10, max_iter=500, random_state=10, tol=1e-06)
La mejor puntuación es:  0.7566392301080163


## Métricas finales

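Volvemos a entrenar el modelo, con los hiperparámetros optimizados, sobre el mejor dataset de X_train y lo evaluamos tanto en ese conjunto como en el X_test correspondiente.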

In [22]:
# Test-split counterparts of the six training variants, listed in the same
# order as `datasets` so that `best_index` selects the matching test set.
X_test_CON_outliers = pd.read_excel("../data/processed/X_test_CON_outliers.xlsx")
X_test_CON_outliers_norm = pd.read_excel("../data/processed/X_test_CON_outliers_norm.xlsx")
X_test_CON_outliers_scal = pd.read_excel("../data/processed/X_test_CON_outliers_scal.xlsx")
X_test_SIN_outliers = pd.read_excel("../data/processed/X_test_SIN_outliers.xlsx")
X_test_SIN_outliers_norm = pd.read_excel("../data/processed/X_test_SIN_outliers_norm.xlsx")
X_test_SIN_outliers_scal = pd.read_excel("../data/processed/X_test_SIN_outliers_scal.xlsx")

# Load the test target and collapse it to one dimension in a single step.
y_test = pd.read_excel("../data/processed/y_test.xlsx").squeeze()

datasets_test = [
    X_test_CON_outliers,
    X_test_CON_outliers_norm,
    X_test_CON_outliers_scal,
    X_test_SIN_outliers,
    X_test_SIN_outliers_norm,
    X_test_SIN_outliers_scal,
]

# Feature matrices of the selected variant, for training and for evaluation.
X_best_train = datasets[best_index]
X_best_test = datasets_test[best_index]

# Final model. The hyper-parameters are hard-coded copies of the estimator
# reported by the grid search output.
model_f = Lasso(alpha=100, max_iter=100, random_state=10, tol=0.1)
model_f.fit(X_best_train, y_train)

# Training-set metrics: mean squared error and R^2.
y_pred = model_f.predict(X_best_train)
metric_train, metric_R2_train = mean_squared_error(y_train, y_pred), r2_score(y_train, y_pred)

# Test-set metrics; `y_pred` ends up holding the test predictions.
y_pred = model_f.predict(X_best_test)
metric_test, metric_R2_test = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"La mejor métrica de nuestros datasets x_train son: {metric_train} y los x_test son: {metric_test}")
print(f"El mejor coeficiente de determinacion de nuestros datasets x_train son: {metric_R2_train} y los x_test son: {metric_R2_test}")


La mejor métrica de nuestros datasets x_train son: 35113063.67484413 y los x_test son: 42504079.520892166
El mejor coeficiente de determinacion de nuestros datasets x_train son: 0.7628999222661312 y los x_test son: 0.6969421172519427
