<a href="https://colab.research.google.com/github/ArturoSbr/Nonlinear_Classifiers_Python/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importar herramientas a ambiente local

In [None]:
# Importar librerías
import numpy as np
import pandas as pd

# Cargar datos
from sklearn.datasets import load_breast_cancer

# Importar modelo
from sklearn.ensemble import GradientBoostingClassifier

# Importar función para partir datos
from sklearn.model_selection import train_test_split

# Importar algoritmo para reducir variables
from sklearn.feature_selection import RFECV

# Importar algoritmo para optimizar híper parámetros
from sklearn.model_selection import GridSearchCV

# Importar funciones de métricas
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Librería de visualización
from matplotlib import pyplot as plt

Cargar datos

In [None]:
# Load the breast-cancer dataset once and unpack the feature matrix and the
# target vector together (the loader used to be called twice, once per array).
features, y = load_breast_cancer(return_X_y=True)

# Wrap the features in a pandas DataFrame with generic positional column
# names (`col_0`, `col_1`, ...); later cells refer to columns by these names.
X = pd.DataFrame(
    features,
    columns=['col_{}'.format(i) for i in range(features.shape[1])],
)

Modificar datos

In [None]:
# Add `col_new`: the element-wise ratio of `col_0` to `col_1`
X['col_new'] = X['col_0'] / X['col_1']
# Disabled example of another derived column:
# X['otro'] = X['col_2'] * X['col_11'] + 5

Visualizar variables independientes

In [None]:
# Preview the first five rows of the feature frame
X.head(n=5)

Partir datos en sets de entrenamiento y prueba

In [None]:
# Hold out 25% of the rows as the test split; the fixed seed makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

Declarar modelo y ajustarlo a los datos

In [None]:
# Declare the gradient-boosting classifier in `clf`.
# `loss` is intentionally left at its default. This cell used to pass
# loss='deviance', a name scikit-learn deprecated in 1.1 and removed in 1.3
# (it was renamed to 'log_loss'), so it raises on current releases. The
# default loss is that same objective under either name, so omitting the
# argument keeps the behaviour and runs on both old and new versions.
clf = GradientBoostingClassifier(learning_rate=0.001,
                                 n_estimators=300, max_depth=4,
                                 max_features='sqrt', random_state=123)

# Fit the model on the training split
clf.fit(X_train, y_train)

Predecir probabilidades

In [None]:
# Score both splits: keep the second column of `predict_proba`, i.e. the
# probability of the class listed second in `clf.classes_`.
scores_train, scores_test = (
    clf.predict_proba(split)[:, 1] for split in (X_train, X_test)
)

Métricas de performance

In [None]:
# ROC AUC on the training split
roc_auc_score(y_train, scores_train)

In [None]:
# ROC AUC on the test split
roc_auc_score(y_test, scores_test)

Visualizar ROC Curve

In [None]:
# ROC curve points: training split first, then test split
fpr_train, tpr_train, thr_train = roc_curve(y_train, scores_train)
fpr_test, tpr_test, thr_test = roc_curve(y_test, scores_test)

# Draw both curves on one set of axes
fig, ax = plt.subplots()
ax.plot(fpr_train, tpr_train, color='blue', label='Train')
ax.plot(fpr_test, tpr_test, color='green', label='Test')

# Legend, axis labels and title
ax.legend()
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC Curve')
plt.show()

Importancia de atributos

In [None]:
# Pair every feature name with its importance from the fitted model.
# The names are taken from `X_train` (the frame `clf` was fitted on) rather
# than from `X`, so the labels stay aligned with `feature_importances_` even
# if `X` gains or loses columns after the split.
imp = pd.DataFrame({'variable': X_train.columns,
                    'importancia': clf.feature_importances_})

# Show the ten most important features, highest importance first
imp.sort_values('importancia', ascending=False).head(10)

Reducción del número de atributos

In [None]:
# Set up recursive feature elimination with 5-fold cross-validation:
# one feature is removed per step, candidates are scored by ROC AUC, and the
# search may go down to a single feature.
rfe = RFECV(
    estimator=clf,
    step=1,
    min_features_to_select=1,
    cv=5,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
)

# Run the elimination on the training split
rfe.fit(X_train, y_train)

In [None]:
# Report how many features were kept, then list the keep/drop flag per column
n_kept = rfe.support_.sum()
print('El algoritmo conservó', n_kept, 'variables independientes.')

sel = pd.DataFrame({'variable': X_train.columns,
                    'seleccionada': rfe.support_})
sel

Buscar exhaustivamente los mejores hiperparámetros

Nota: Solo usaremos las variables finalistas



In [None]:
# Names of the features that the recursive elimination kept, stored in `vars`
# NOTE(review): `vars` shadows the Python builtin of the same name; the name
# is kept as-is because it is a notebook-level variable — consider renaming.
vars = X_train.columns[rfe.support_]

# Hyper-parameter grid to try (3 x 3 = 9 combinations)
grid = dict(
    max_depth=[3, 4, 5],
    n_estimators=[100, 150, 200],
)

# Exhaustive search over the grid, scored by ROC AUC with 5-fold CV
search = GridSearchCV(
    estimator=clf,
    param_grid=grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# Run the search using only the selected features
search.fit(X_train[vars], y_train)

In [None]:
# Collect the cross-validation results in a pandas DataFrame
res = pd.DataFrame(search.cv_results_)

# Show the results best-first: `rank_test_score` is 1 for the best
# combination, so the sort must be ascending (a descending sort would put the
# worst-ranked combination at the top).
res.sort_values('rank_test_score', ascending=True)