# GridSearch & Pipelines
GridSearch es una herramienta de optimización que usamos cuando ajustamos hiperparámetros. Definimos la cuadrícula (grid) de parámetros que queremos explorar y seleccionamos la mejor combinación de parámetros para nuestros datos.


## Método 1
Prueba un único algoritmo con cada combinación de un conjunto de hiperparámetros. Cada combinación se evalúa mediante validación cruzada: el dataset se divide repetidamente en train y val, se recoge la métrica de cada partición y se elige la combinación con la mejor métrica media.

In [None]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()

# One sub-grid per kernel, so only hyperparameters the kernel actually reads
# are searched: `degree` is used by 'poly' only and `gamma` is ignored by
# 'linear'. A single flat grid would refit equivalent models many times
# (392 candidates instead of the 133 distinct ones below).
c_values = [0.001, 0.1, 0.5, 1, 5, 10, 100]
gamma_values = ['scale', 'auto']

parameters = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf', 'sigmoid'], 'C': c_values, 'gamma': gamma_values},
    {
        'kernel': ['poly'],
        'C': c_values,
        'degree': [1, 2, 3, 4, 5, 6, 7],
        'gamma': gamma_values,
    },
]

svc = svm.SVC()

# Exhaustive search scored by accuracy with 10-fold cross-validation,
# using every available core.
clf = GridSearchCV(estimator=svc,
                   param_grid=parameters,
                   n_jobs=-1,
                   cv=10,
                   scoring="accuracy")

clf.fit(iris.data, iris.target)

In [None]:
clf.best_estimator_

In [None]:
print(clf.best_params_)
print(clf.best_score_)

In [None]:
from sklearn.model_selection import cross_val_score
# Evaluate one fixed SVC configuration with 10-fold cross-validation.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from an
# earlier run's best_params_ — confirm they still match the search result.
# This rebinds `clf`, so the fitted GridSearchCV object is no longer
# reachable under that name.
clf = svm.SVC(C=0.1, degree=2, gamma='auto', kernel='poly')
scores = cross_val_score(clf, iris.data, iris.target, cv=10)
scores

In [None]:
import numpy as np
# Mean and (population) standard deviation of the per-fold scores.
print(scores.mean())
print(scores.std())

## Método 2

Una forma más avanzada es montar un único GridSearch que itere sobre varios modelos, cada uno con sus propios hiperparámetros, usando validación cruzada.

In [None]:
import pickle

In [None]:
# Load libraries
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split 
from sklearn import svm
# Set random seed
np.random.seed(0)

In [None]:
# Load data
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=2)

In [None]:
# Base pipeline. Each sub-grid below swaps in its own 'classifier' (and, for
# the forest, its own 'scaler'), so the estimator given here is a placeholder.
#
# random_state is pinned on the forests: with n_jobs=-1 the candidates are
# fitted in worker processes, where a np.random.seed() call made in the
# notebook process is not guaranteed to apply, so results would otherwise
# change from run to run.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("classifier", RandomForestClassifier(random_state=0)),
])

# Two logistic regressions (long and deliberately short max_iter), each tried
# with L1 and L2 penalties; liblinear supports both.
logistic_params = {
    'classifier': [
        LogisticRegression(max_iter=1000, solver='liblinear'),
        LogisticRegression(max_iter=10, solver='liblinear'),
    ],
    'classifier__penalty': ['l1', 'l2'],
}

# Random forest, tried with both scalers and two tree depths.
random_forest_params = {
    'scaler': [StandardScaler(), MinMaxScaler()],
    'classifier': [RandomForestClassifier(random_state=0)],
    'classifier__max_depth': [2, 3],
}

svm_param = {
    'classifier': [svm.SVC()],  # default kernel is 'rbf' (Gaussian)
    'classifier__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
}

# A list of dicts: each dict is searched as an independent grid.
search_space = [
    logistic_params,
    random_forest_params,
    svm_param,
]

clf = GridSearchCV(estimator=pipe,
                   param_grid=search_space,
                   cv=5,
                   n_jobs=-1)

clf.fit(X_train, y_train)

In [None]:
print(clf.best_estimator_)
print(clf.best_score_)
print(clf.best_params_)

In [None]:
clf.best_estimator_.predict(X_test)

In [None]:
clf.best_estimator_.score(X_test,y_test)

## Método 3

Otro uso puede ser la construcción de pipelines (tuberías) específicos para cada tipo de modelo.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest # basado en p-values
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Logistic-regression pipeline: impute, standardise, then classify.
reg_log = Pipeline([
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("reglog", LogisticRegression()),
])

# Search space, keyed as "<step name>__<parameter>".
# Only the 'l2' penalty is searched.
# NOTE(review): 'l1' was presumably dropped because the default solver of
# LogisticRegression does not support it — confirm before re-adding it.
reg_log_param = dict(
    imputer__strategy=['mean', 'median'],
    reglog__penalty=['l2'],
    reglog__C=np.logspace(0, 4, 10),
)

In [None]:
# Plain random forest (no pipeline), so its grid keys carry no step prefix.
rand_forest = RandomForestClassifier()
rand_forest_param = {
    "n_estimators": [50, 100, 150],
    "max_features": [1, 2, 3],
}

# SVM pipeline: standardise, keep the k best features, then classify.
# Named `svm_pipe` rather than `svm` so it does not shadow the `sklearn.svm`
# module that earlier cells use as `svm.SVC()`; with the old name, re-running
# those cells after this one failed with AttributeError.
svm_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("svm", SVC()),
])

# Keys are "<step name>__<parameter>"; 'svm' here is the pipeline step name.
svm_param = {
    'selectkbest__k': [2, 3, 4],
    'svm__kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'svm__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'svm__degree': [1, 2, 3, 4],
    'svm__gamma': ['scale', 'auto'],
}

# One grid search per model family, all with the same evaluation protocol
# (10-fold CV, accuracy) so their best scores are comparable.
gs_reg_log = GridSearchCV(reg_log,
                          reg_log_param,
                          cv=10,
                          scoring='accuracy',
                          verbose=1,
                          n_jobs=-1)

gs_rand_forest = GridSearchCV(rand_forest,
                              rand_forest_param,
                              cv=10,
                              scoring='accuracy',
                              verbose=1,
                              n_jobs=-1)

gs_svm = GridSearchCV(svm_pipe,
                      svm_param,
                      cv=10,
                      scoring='accuracy',
                      verbose=1,
                      n_jobs=-1)

# Registry used later to fit and compare all searches in a loop.
grids = {"gs_reg_log": gs_reg_log,
         "gs_rand_forest": gs_rand_forest,
         "gs_svm": gs_svm}

In [None]:
from sklearn.model_selection import train_test_split 
# Features and labels of the iris dataset.
# NOTE(review): relies on `iris` having been loaded by an earlier cell.
X = iris.data
y = iris.target
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit every registered grid search on the training split.
for grid_search in grids.values():
    grid_search.fit(X_train, y_train)

In [None]:
print(gs_reg_log.best_score_)
print(gs_reg_log.best_params_)
print(gs_reg_log.best_estimator_)
print(gs_reg_log.best_estimator_['reglog'])

In [None]:
print(gs_rand_forest.best_score_)
print(gs_rand_forest.best_params_)
print(gs_rand_forest.best_estimator_)

In [None]:
print(gs_svm.best_score_)
print(gs_svm.best_params_)
print(gs_svm.best_estimator_)
print(gs_svm.best_estimator_['svm'])

In [None]:
# Ranking table: one row per grid search with its best cross-validated score,
# highest score first.
best_grids = pd.DataFrame({
    "Grid": list(grids),
    "Best score": [search.best_score_ for search in grids.values()],
})
best_grids = best_grids.sort_values(by="Best score", ascending=False)
best_grids

In [None]:
gs_svm.best_estimator_

In [None]:
preds = gs_svm.best_estimator_.predict(X_test)
accuracy_score(y_test, preds)

In [None]:
gs_reg_log.best_estimator_

In [None]:
preds = gs_reg_log.best_estimator_.predict(X_test)
accuracy_score(y_test, preds)

In [None]:
preds = gs_rand_forest.best_estimator_.predict(X_test)
accuracy_score(y_test, preds)

Tanto la regresión logística (pipeline) como el random forest son los modelos que mejor generalizan.

In [None]:
gs_svm.best_estimator_

In [None]:
gs_svm.best_estimator_['svm']

In [None]:
preds = gs_svm.best_estimator_.predict(X_test)
accuracy_score(y_test, preds)

In [None]:
gs_svm.best_estimator_['svm']

In [None]:
# El mejor modelo ha sido
best_model = gs_reg_log.best_estimator_
best_model.score(X_test, y_test)

In [None]:
gs_reg_log.best_params_

In [None]:
# El mejor modelo ha sido
best_model = gs_reg_log.best_estimator_
best_model.score(X_test, y_test)

In [None]:
gs_reg_log.best_estimator_

En la validación cruzada sobre train, reg_log obtiene mejor resultado que random forest. Además, es el modelo más sencillo.

In [None]:
# El mejor modelo ha sido
best_model = gs_reg_log.best_estimator_
best_model.score(X_test, y_test)

In [None]:
gs_reg_log.best_estimator_

In [None]:
import pickle

# Path (relative to the working directory) where the model is stored.
filename = 'finished_model'

# Serialise the selected model to disk; binary mode is required by pickle.
with open(filename, 'wb') as archivo_salida:
    pickle.dump(best_model, archivo_salida)

In [None]:
# Load the model back from disk.
# SECURITY: pickle.load can execute arbitrary code while deserialising, so
# only load files that you created yourself or that come from a trusted source.
with open(filename, 'rb') as archivo_entrada:
    modelo_importado = pickle.load(archivo_entrada)

In [None]:
modelo_importado.score(X_test, y_test)*100

In [None]:
modelo_importado.predict(X_test)

In [None]:
modelo_importado

In [None]:
# modelo_importado.predict(X_new)

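Ya hemos escogido modelo gracias a los datos de validación. Ahora habría que entrenar el modelo con TODOS los datos de train. Nota: con `refit=True` (el valor por defecto), GridSearchCV ya reentrena `best_estimator_` con todo el conjunto de train, por lo que el modelo guardado arriba ya está entrenado de esa forma.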

## RandomSearch
El problema que tiene GridSearchCV es que resulta computacionalmente muy costoso cuando el espacio de hiperparámetros es grande.

Mediante RandomSearch no se prueban todas las combinaciones, sino unas cuantas elegidas de manera aleatoria. Funciona bien con datasets con pocas features. Incluso [hay papers](https://www.jmlr.org/papers/v13/bergstra12a.html) que aseguran que RandomSearch es más eficiente que GridSearch.


In [None]:
np.logspace(0, 4, 10)

In [None]:
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

# Same logistic-regression pipeline as before: impute, standardise, classify.
reg_log = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("reglog", LogisticRegression()),
])

reg_log_param = {
    "imputer__strategy": ['mean', 'median', 'most_frequent'],
    "reglog__penalty": ["l2"],
    # Continuous log-uniform distribution over [1, 1e4] instead of a fixed
    # list of 10 values. With lists only, the space had just 30 combinations,
    # fewer than n_iter=50, so scikit-learn warned and fell back to an
    # exhaustive search, i.e. no random sampling took place at all.
    "reglog__C": loguniform(1e0, 1e4),
}

# Draw 50 random candidates and score each with 10-fold CV accuracy.
# random_state makes the sampled candidates repeatable between runs.
search = RandomizedSearchCV(reg_log,
                            reg_log_param,
                            n_iter=50,
                            scoring='accuracy',
                            n_jobs=-1,
                            cv=10,
                            random_state=42)

# execute search
result = search.fit(X_train, y_train)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
print('Best Estimator: %s' % result.best_estimator_)

In [None]:
# Report the outcome of the randomized search: best CV score, the
# hyperparameters that produced it, and the refitted estimator.
print('Best Score:', result.best_score_)
print('Best Hyperparameters:', result.best_params_)
print('Best Estimator:', result.best_estimator_)

### Otros Steps

In [None]:
df = pd.read_csv('data/titanic.csv')

In [None]:
df.info()

In [None]:
df = df[['survived', 'age', 'fare', 'class', 'embark_town']]
df.head()

In [None]:
X = df.drop('survived', axis=1)
y = df['survived']

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Apply a different transformer to each group of columns: standardise the
# numeric ones and one-hot encode the categorical ones. Columns not listed
# here are not named in any transformer.
column_transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'fare']),
        ('cat', OneHotEncoder(), ['class', 'embark_town'])
    ]
)

# Fit and transform in one step, on the whole of X (this demo cell runs
# before any train/test split of the Titanic data).
X_transformado = column_transformer.fit_transform(X)

In [None]:
X_transformado

In [None]:
# Wrap the transformed matrix in a DataFrame labelled with the output feature
# names generated by the ColumnTransformer.
X_new = pd.DataFrame(
    X_transformado,
    columns=column_transformer.get_feature_names_out(),
)
X_new.head()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Illustrative example only: wraps an arbitrary function (here np.log1p) as a
# transformer applied to one column. 'columna_a_transformar' is a placeholder
# column name, and this object is not fitted in this cell.
preprocessor = ColumnTransformer(
    transformers=[
        ('log', FunctionTransformer(np.log1p), ['columna_a_transformar']),

    ]
)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: fill missing values with the median, then standardise.
# Without imputation any NaN reaches the classifier, which scikit-learn
# versions before 1.4 reject for random forests.
# NOTE(review): 'age' and 'embark_town' presumably contain missing values, as
# in the usual Titanic data — check the df.info() output.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' encodes a category that was
# absent from the training split as all zeros instead of raising at
# predict time.
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Define the preprocessing applied to each group of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, ['age', 'fare']),
        ('cat', categorical_steps, ['class', 'embark_town']),
    ]
)

# Build the full pipeline: preprocessing followed by the classifier, so the
# transformers are fitted on the training data only.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier()),
])

# Train the pipeline.
pipeline.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = pipeline.predict(X_test)

In [None]:
y_pred

# Análisis exploratorio automatizado

In [None]:
import pandas as pd

In [None]:
from sklearn import datasets

In [None]:
iris = datasets.load_iris()

In [None]:
# Build a DataFrame from the iris feature matrix, using the dataset's own
# feature names as column labels, and append the class label as 'target'.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
df.head()

In [None]:
# %pip install ydata-profiling setuptools

In [None]:
from ydata_profiling import ProfileReport

In [None]:
# Generate an automated exploratory report for df and write it as an HTML
# file in the current working directory.
profile = ProfileReport(df, title="Profiling Report")
profile.to_file("your_report.html")