# Proceso de Machine Learning

## Selección del mejor Dataset

In [19]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
from sklearn.exceptions import FitFailedWarning

warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category=UserWarning)

X_train_CON_outliers = pd.read_excel("../data/processed/X_train_CON_outliers.xlsx")
X_train_CON_outliers_norm = pd.read_excel("../data/processed/X_train_CON_outliers_norm.xlsx")
X_train_CON_outliers_scal = pd.read_excel("../data/processed/X_train_CON_outliers_scal.xlsx")
X_train_SIN_outliers = pd.read_excel("../data/processed/X_train_SIN_outliers.xlsx")
X_train_SIN_outliers_norm = pd.read_excel("../data/processed/X_train_SIN_outliers_norm.xlsx")
X_train_SIN_outliers_scal = pd.read_excel("../data/processed/X_train_SIN_outliers_scal.xlsx")

y_train = pd.read_excel("../data/processed/y_train.xlsx")

y_train = y_train.squeeze()

#Utilizaremos el modelo de regresion logistica
datasets = [X_train_CON_outliers,
    X_train_CON_outliers_norm,
    X_train_CON_outliers_scal,
    X_train_SIN_outliers,
    X_train_SIN_outliers_norm,
    X_train_SIN_outliers_scal
    ]

models = []
metrics = []

for dataset in datasets:

    model = LogisticRegression(random_state=10, solver='lbfgs', max_iter=400)
    model.fit(dataset, y_train)
    y_pred = model.predict(dataset)
    metric = accuracy_score(y_train, y_pred)
    metrics.append(metric)
    models.append(model)

best_metric = max(metrics)
best_index = metrics.index(best_metric)
print(f"El mejor dataset es: \n{datasets[best_index]}")

print(datasets[best_index])
print(datasets[0])

El mejor dataset es: 
            age  campaign     pdays  previous  cons.price.idx  cons.conf.idx  \
0      1.052220 -0.561472  0.196750 -0.349154        0.721203       0.887461   
1      0.956361 -0.207022  0.196750 -0.349154        0.721203       0.887461   
2     -1.056659 -0.207022  0.196750  1.662495       -1.180467      -1.232666   
3      0.189497  1.210776  0.196750 -0.349154       -0.866113      -1.427372   
4     -1.440092 -0.207022  0.196750 -0.349154        0.589934      -0.475478   
...         ...       ...       ...       ...             ...            ...   
32945  1.723226  0.501877  0.196750  3.674143        1.102918       0.043737   
32946 -0.960801  0.147427  0.196750 -0.349154       -0.866113      -1.427372   
32947  0.093638 -0.561472  0.196750  1.662495       -0.866113      -1.427372   
32948 -1.056659 -0.561472 -5.082733  1.662495        1.102918       0.043737   
32949  0.764645  5.109722  0.196750 -0.349154        0.589934      -0.475478   

       euribor3m 

## Optimización de hiperparámetros

In [20]:
params = {
    "l1_ratio": [0, 0.5, 1],
    "C": [0.1, 0.5, 1.0],
    "dual": [True,False],
    "tol":[00.1, 0.0001, 0.000001],
    "fit_intercept": [True, False],
#    "intercept_scaling":[1, 10, 20, 50],
    "solver": ["liblinear", "saga", "lbfgs"],
    "max_iter": [100, 200, 400]
}

### Grid search

In [21]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(LogisticRegression(random_state=10), params, scoring="accuracy", n_jobs = -1)
grid.fit(datasets[best_index], y_train)
print("Los mejores parámetros son: ", grid.best_params_)
best_model = grid.best_estimator_
print("El mejor estimador es: ", best_model)
print("La mejor puntuación es: ", grid.best_score_)

Los mejores parámetros son:  {'C': 0.1, 'dual': False, 'fit_intercept': True, 'l1_ratio': 1, 'max_iter': 100, 'solver': 'liblinear', 'tol': 0.1}
El mejor estimador es:  LogisticRegression(C=0.1, l1_ratio=1, random_state=10, solver='liblinear',
                   tol=0.1)
La mejor puntuación es:  0.9008194233687405


### Random search

In [22]:
from sklearn.model_selection import RandomizedSearchCV

grid = RandomizedSearchCV(LogisticRegression(random_state=10), params, scoring="accuracy", n_iter = 30, n_jobs = -1)
grid.fit(datasets[best_index], y_train)
print("Los mejores parámetros son: ", grid.best_params_)
best_model = grid.best_estimator_
print("El mejor estimador es: ", best_model)
print("La mejor puntuación es: ", grid.best_score_)

Los mejores parámetros son:  {'tol': 1e-06, 'solver': 'liblinear', 'max_iter': 400, 'l1_ratio': 1, 'fit_intercept': True, 'dual': False, 'C': 0.1}
El mejor estimador es:  LogisticRegression(C=0.1, l1_ratio=1, max_iter=400, random_state=10,
                   solver='liblinear', tol=1e-06)
La mejor puntuación es:  0.900546282245827


## Métricas con hiperparametros optimizados

Volvemos a entrenar el modelo con los datos hiperparametrizados de x_train y x_test

In [23]:
X_test_CON_outliers = pd.read_excel("../data/processed/X_test_CON_outliers.xlsx")
X_test_CON_outliers_norm = pd.read_excel("../data/processed/X_test_CON_outliers_norm.xlsx")
X_test_CON_outliers_scal = pd.read_excel("../data/processed/X_test_CON_outliers_scal.xlsx")
X_test_SIN_outliers = pd.read_excel("../data/processed/X_test_SIN_outliers.xlsx")
X_test_SIN_outliers_norm = pd.read_excel("../data/processed/X_test_SIN_outliers_norm.xlsx")
X_test_SIN_outliers_scal = pd.read_excel("../data/processed/X_test_SIN_outliers_scal.xlsx")

y_test = pd.read_excel("../data/processed/y_test.xlsx")
y_test = y_test.squeeze()

datasets_test = [X_test_CON_outliers,
    X_test_CON_outliers_norm,
    X_test_CON_outliers_scal,
    X_test_SIN_outliers,
    X_test_SIN_outliers_norm,
    X_test_SIN_outliers_scal
    ]


model_f = LogisticRegression(C=0.1, l1_ratio=1, random_state=10, solver='liblinear',
                   tol=0.1)
model_f.fit(datasets[best_index], y_train) #en vez de model_f se podria utilizar best_model ya que esa variable es la que contiene el modelo entrenado de gridSearch con lo hiperparametros optimizados

y_pred = model_f.predict(datasets[best_index])
metric_train = accuracy_score(y_train, y_pred)

y_pred = model_f.predict(datasets_test[best_index])
metric_test = accuracy_score(y_test, y_pred)


print(f"La mejor métrica de nuestros datasets x_train son: {metric_train} y los x_test son: {metric_test}")

La mejor métrica de nuestros datasets x_train son: 0.9007890743550835 y los x_test son: 0.896698227725176
