## Tutorial sobre ensambles

### Configuración gráfica inicial para visualizaciones

In [None]:
%matplotlib inline
# pyplot is the plotting interface used for the figures in this notebook.
import matplotlib.pyplot as plt
# Import seaborn and immediately call sns.set() so its default theme is active.
import seaborn as sns; sns.set()

In [None]:
# Notebook-wide matplotlib defaults: figure size [12, 8] and base font size 14.
plt.rcParams.update({
    'figure.figsize': [12, 8],
    'font.size': 14,
})

### Generación de conjuntos de datos

In [None]:
from sklearn import datasets

# Synthetic dataset: 1000 samples spread over 5 blob centers, seeded for reproducibility.
X, Y = datasets.make_blobs(n_samples=1000, centers=5, cluster_std=1.5, random_state=0)

# Explicit figure/axes with a title and axis labels so the plot stands on its own.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='rainbow')
ax.set(title='Conjunto de datos completo', xlabel='$x_1$', ylabel='$x_2$');

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Pin the colour scale to the full label range so every class keeps the same
# colour in the train and test figures, and title the plot so the two can be
# told apart.
fig, ax = plt.subplots()
ax.scatter(X_train[:, 0], X_train[:, 1], c=Y_train, s=50, cmap='rainbow',
           vmin=Y.min(), vmax=Y.max())
ax.set(title='Conjunto de entrenamiento', xlabel='$x_1$', ylabel='$x_2$');

In [None]:
# Same colour scale as the full dataset (label range of Y) and an explicit
# title, so this figure is directly comparable with the training-set plot.
fig, ax = plt.subplots()
ax.scatter(X_test[:, 0], X_test[:, 1], c=Y_test, s=50, cmap='rainbow',
           vmin=Y.min(), vmax=Y.max())
ax.set(title='Conjunto de prueba', xlabel='$x_1$', ylabel='$x_2$');

### Visualizador de superficies de decisión

In [None]:
import numpy as np

def visualize_classifier(model, X, Y, ax=None, cmap='rainbow'):
    """Plot labelled 2-D points on top of the model's decision regions.

    Args:
        model: Fitted classifier exposing ``predict``; it is called with an
            array of shape ``(n_points, 2)``.
        X: Array whose first two columns are the point coordinates.
        Y: Array of class labels for ``X``. Region boundaries are placed at
            ``k - 0.5`` for ``k = 0..n_classes``, so labels are assumed to be
            the integers ``0..n_classes - 1``.
        ax: Matplotlib axes to draw on; the current axes are used when omitted.
        cmap: Colormap shared by the points and the regions.

    Returns:
        None. Everything is drawn on ``ax``.
    """
    if ax is None:
        ax = plt.gca()

    # A single colour scale for both artists, so each region is tinted with
    # the same colour as the points of the class it predicts.
    vmin, vmax = Y.min(), Y.max()

    ax.scatter(X[:, 0], X[:, 1], c=Y, s=30, cmap=cmap, vmin=vmin, vmax=vmax, zorder=3)
    ax.axis('tight')
    ax.axis('off')
    # Remember the limits fitted to the points; they are restored after the
    # regions have been drawn.
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Evaluate the model on a 200x200 grid spanning the visible area.
    xx, yy = np.meshgrid(np.linspace(*xlim, num=200), np.linspace(*ylim, num=200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    n_classes = len(np.unique(Y))
    ax.contourf(xx, yy, Z, alpha=0.3, levels=np.arange(n_classes + 1) - 0.5,
                cmap=cmap, vmin=vmin, vmax=vmax, zorder=1)
    ax.set(xlim=xlim, ylim=ylim)

### Entrenamiento y evaluación de clasificadores

In [None]:
from sklearn import metrics

def train_and_eval(model, X_train, Y_train, X_test, Y_test):
    """Fit ``model`` on the training split and print its accuracy on the test split.

    The model is fitted in place and nothing is returned; the accuracy is
    printed with two decimals.
    """
    model.fit(X_train, Y_train)
    accuracy = metrics.accuracy_score(Y_test, model.predict(X_test))
    print(f'Accuracy: {accuracy:.2f}')

**1. Árbol de decisión**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded like every other model in this notebook so reruns build the same tree.
tree = DecisionTreeClassifier(random_state=0)
train_and_eval(tree, X_train, Y_train, X_test, Y_test)
visualize_classifier(tree, X_train, Y_train)

**2. Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library defaults and a fixed seed.
forest = RandomForestClassifier(random_state=0)
train_and_eval(
    model=forest,
    X_train=X_train, Y_train=Y_train,
    X_test=X_test, Y_test=Y_test,
)
# Decision regions are drawn over the training points.
visualize_classifier(model=forest, X=X_train, Y=Y_train);

**3. Gradient Boosting**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with library defaults and a fixed seed.
grad_boosting = GradientBoostingClassifier(random_state=0)
train_and_eval(
    model=grad_boosting,
    X_train=X_train, Y_train=Y_train,
    X_test=X_test, Y_test=Y_test,
)
# Decision regions are drawn over the training points.
visualize_classifier(model=grad_boosting, X=X_train, Y=Y_train)

**4. ¿Bagging con k-NN? Por supuesto**

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Bagging ensemble whose base learner is a k-NN classifier with default settings.
knn = KNeighborsClassifier()
bagging = BaggingClassifier(knn, random_state=0)
train_and_eval(
    model=bagging,
    X_train=X_train, Y_train=Y_train,
    X_test=X_test, Y_test=Y_test,
)
# Decision regions are drawn over the training points.
visualize_classifier(model=bagging, X=X_train, Y=Y_train)

### ¿Cómo podemos encontrar la mejor combinación de parámetros para estos ensambles?

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values explored by the cross-validated grid search.
forest_hyperparameters = [{'n_estimators': [10, 50, 100], 'max_depth': [1, 2, 5]}]
forest = GridSearchCV(RandomForestClassifier(random_state=0), forest_hyperparameters)
train_and_eval(forest, X_train, Y_train, X_test, Y_test)
# Show which combination won the search: that is the question this section asks.
print(f'Best parameters: {forest.best_params_}')
visualize_classifier(forest, X_train, Y_train)

In [None]:
# Candidate values explored by the cross-validated grid search.
grad_boosting_hyperparameters = [{'n_estimators': [10, 50, 100], 'max_depth': [1, 2, 5]}]
grad_boosting = GridSearchCV(GradientBoostingClassifier(random_state=0), grad_boosting_hyperparameters)
train_and_eval(grad_boosting, X_train, Y_train, X_test, Y_test)
# Show which combination won the search: that is the question this section asks.
print(f'Best parameters: {grad_boosting.best_params_}')
visualize_classifier(grad_boosting, X_train, Y_train)

In [None]:
knn = KNeighborsClassifier()
bagging_model = BaggingClassifier(knn, max_samples=0.1, random_state=0)

# The wrapped-estimator parameter is named `estimator` from scikit-learn 1.2 on
# (`base_estimator` before that, and removed in 1.4). Use whichever name the
# installed version exposes so the nested k-NN parameter can be resolved.
knn_prefix = 'estimator' if 'estimator' in bagging_model.get_params(deep=False) else 'base_estimator'

bagging_hyperparameters = [{
    'n_estimators': [10, 50, 100],
    f'{knn_prefix}__n_neighbors': [1, 2, 3, 5],
}]
bagging = GridSearchCV(bagging_model, bagging_hyperparameters)
train_and_eval(bagging, X_train, Y_train, X_test, Y_test)
# Show which combination won the search: that is the question this section asks.
print(f'Best parameters: {bagging.best_params_}')
visualize_classifier(bagging, X_train, Y_train)

### Algunas conclusiones
* Implementar y entrenar ensambles en scikit-learn es fácil y rápido
* En la gran mayoría de los casos, los ensambles mejoran el rendimiento de los clasificadores simples
* Nunca descartar k-NN