## Ejercicio breast cancer de sklearn

1. Carga el dataset [breast_cancer de `sklearn`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html)
2. Prueba todos los métodos de clasificación vistos hasta ahora mediante GridSearchCV. Utiliza pipeline si es necesario.

In [2]:
import pandas as pd
import numpy as np
from sklearn import datasets
cancer = datasets.load_breast_cancer()


from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold

In [3]:
df = pd.DataFrame(data= np.c_[cancer.data, cancer.target],
                     columns= list(cancer.feature_names) + ['target'])

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1],
                                                    df.iloc[:, -1], test_size=0.2)

In [3]:
reg_log = Pipeline(steps=[
                          ("scaler",StandardScaler()),
                          ("reglog",LogisticRegression())
                         ])

rand_forest = RandomForestClassifier()

svm = Pipeline(steps=[("scaler",StandardScaler()),
                      ("svm",SVC())])

gbc = GradientBoostingClassifier()

knn = KNeighborsClassifier()

knn_scal = Pipeline(steps=[
                          ("scaler",StandardScaler()),
                          ("knn",KNeighborsClassifier())
                         ])


# Definimos sus hiperparametros
reg_log_param = {    
                 "reglog__penalty": ["l1","l2"], 
                 "reglog__C": np.logspace(0, 4, 10) # Coeficiente de la regularizacion elegida
                }

rand_forest_param = {
    'n_estimators': [10, 100, 300], # Numero alto de estimadores no tiene por qué dar overfitting. Mas de 500 ya es mucho.
    'max_features': [1, 2, 3] # Cuantas features contempla cada split de cada arbol. Cuanto mas bajo, mejor generalizará
    # el max_depth no sobreajusta tanto como si fuese un decission tree
    }


svm_param = {                    
            'svm__C': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], 
            'svm__kernel': ["linear","poly","rbf"],
            'svm__coef0': [-10.,-1., 0., 0.1, 0.5, 1, 10, 100],
            'svm__gamma': ('scale', 'auto')
            }


gbc_param = {
    'learning_rate': [0.05, 0.1, 0.5], # cuanto mas alto el learning rate, mas aporta cada arbol. Pero cuidado con combinarlo con muchos estimadores
    # que puede producir sobreajuste
    'n_estimators': [20,50,100,200], # numero alto de estimadores puede producir overfitting
    'max_depth': [1,2,3,4,5]
    }

knn_param = {
    'n_neighbors': [1,2,3,4,5,6]
}

knn_param_scal = {
    'knn__n_neighbors': [1,2,3,4,5,6]
}

In [39]:
gs_reg_log = GridSearchCV(reg_log,
                            reg_log_param,
                            cv=10,
                            scoring="accuracy",
                            verbose=1,
                            n_jobs=-1)

gs_rand_forest = GridSearchCV(rand_forest,
                            rand_forest_param,
                            cv=10,
                            scoring="accuracy",
                            verbose=1,
                            n_jobs=-1)

gs_svm = GridSearchCV(svm,
                        svm_param,
                        cv=10,
                        scoring="accuracy",
                        verbose=1,
                        n_jobs=-1)

gs_gbc = GridSearchCV(gbc,
                        gbc_param,
                        cv=10,
                        scoring="accuracy",
                        verbose=1,
                        n_jobs=-1)

gs_knn = GridSearchCV(knn,
                        knn_param,
                        cv=10,
                        scoring="accuracy",
                        verbose=1,
                        n_jobs=-1)

gs_knn_scal = GridSearchCV(knn_scal,
                        knn_param_scal,
                        cv=10,
                        scoring="accuracy",
                        verbose=1,
                        n_jobs=-1)

grids = {"gs_reg_log":gs_reg_log,
         "gs_rand_forest":gs_rand_forest,
         "gs_svm":gs_svm,
         "gs_gbc":gs_gbc,
        'gs_knn': gs_knn,
        'gs_knn_scal': gs_knn_scal}

In [40]:
%%time

for nombre, grid_search in grids.items():
    grid_search.fit(X_train, y_train)
    

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    5.0s finished


Fitting 10 folds for each of 384 candidates, totalling 3840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2160 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 3840 out of 3840 | elapsed:    8.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 498 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   44.7s finished


Fitting 10 folds for each of 6 candidates, totalling 60 fits
Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Wall time: 1min


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.1s finished


In [41]:
best_grids = [(i, j.best_score_) for i, j in grids.items()]

best_grids = pd.DataFrame(best_grids, columns=["Grid", "Best score"]).sort_values(by="Best score", ascending=False)
best_grids

Unnamed: 0,Grid,Best score
2,gs_svm,0.986812
3,gs_gbc,0.978019
0,gs_reg_log,0.97343
5,gs_knn_scal,0.969227
1,gs_rand_forest,0.960386
4,gs_knn,0.936232


In [11]:
df.iloc[:, -1]

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
564    0.0
565    0.0
566    0.0
567    0.0
568    1.0
Name: target, Length: 569, dtype: float64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1],
                                                    df.iloc[:, -1], test_size=0.2)

In [16]:
pipe = Pipeline(steps=[('classifier', LogisticRegression())])


logistic_params = {
    'classifier': [LogisticRegression()],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': np.logspace(0, 4, 10)
    }

kneighbors_params = {
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': [3,5,11,19],
    'classifier__weights': ["uniform", "distance"],
    'classifier__metric': ["euclidean", "manhattan"]
    }

svm_params = {
    'classifier': [SVC()],
    'classifier__kernel':('linear', 'rbf', 'sigmoid'), 
    'classifier__C':[0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], 
    'classifier__coef0': [-10.,-1., 0., 0.1, 0.5, 1, 10, 100],
    'classifier__gamma': ('scale', 'auto')
    }


# Create space of candidate learning algorithms and their hyperparameters
search_space = [
    logistic_params,
    kneighbors_params,
    svm_params
    ]

In [17]:
%%time
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
# Create grid search 
clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv = cv, verbose=4, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X_train, y_train)
# View best model
separator = "\n############################\n"
print(separator)
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print(separator)
print("clf.best_params_", clf.best_params_)
print(separator)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
#SAVE MODEL
# save the model to disk
filename = 'finished_model.sav'
pickle.dump(best_model, open(filename, 'wb'))

Fitting 10 folds for each of 420 candidates, totalling 4200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 982 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 1272 tasks      | elapsed:   25.4s


KeyboardInterrupt: 

In [4]:
# Ejemplo de que se puede iterar los hiperparametros del baggingclassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

param_grid = {
    'base_estimator__max_depth' : [1, 2, 3, 4, 5],
    'max_samples' : [0.05, 0.1, 0.2, 0.5]
}

clf = GridSearchCV(BaggingClassifier(DecisionTreeClassifier(),
                                     n_estimators = 100, max_features = 0.5),
                   param_grid)
clf.fit(X_train, y_train)

GridSearchCV(estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                         max_features=0.5, n_estimators=100),
             param_grid={'base_estimator__max_depth': [1, 2, 3, 4, 5],
                         'max_samples': [0.05, 0.1, 0.2, 0.5]})