# Maestría en Maestría en Ciencia de Datos e Inteligencia Artificial
#### 8. Machine Learning and Deep Learning
#### Docente: Msc. Renzo Claure Aracena.

## SELECCION DE MODELOS
### Validacion cruzada

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [4]:
base = pd.read_csv('../datasets/cancer.csv', delimiter = ';', decimal=',')

In [5]:
base.head()

Unnamed: 0,ID,Tipo,Radio,Textura,Perimetro,Area,Suavidad,Compactavidad,Concavidad,Puntos_concavos,Simetria,Fractal
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [6]:
y = base['Tipo']
y.value_counts()

Tipo
B    357
M    212
Name: count, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y) 

In [8]:
X = base.drop(['ID', 'Tipo'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, random_state=0)

In [9]:
clasif = SVC(kernel='linear', C=1)
clasif.fit(X_train, y_train)
clasif.score(X_test, y_test)

0.9090909090909091

In [10]:
print('Validacion Cruzada Entrenamiento (exactitud/acierto):', cross_val_score(clasif, X_train, y_train, cv=5))

Validacion Cruzada Entrenamiento (exactitud/acierto): [0.93023256 0.90588235 0.84705882 0.95294118 0.94117647]


In [11]:
cv_score = cross_val_score(clasif, X_train, y_train, cv=5)
cv_score.mean()

np.float64(0.9154582763337894)

In [12]:
from sklearn.metrics import roc_auc_score, f1_score

In [13]:
y_pred = clasif.predict(X_test)
auc_score = roc_auc_score(y_test, y_pred)
print(f'El indicador AUC sobre la base de comprobación = {auc_score:.4f}')

El indicador AUC sobre la base de comprobación = 0.9084


In [14]:
#Obtener distintos Cross Val con métricas distintas
print('Validacion Cruzada ENTRENAMIENTO (AUC):',    cross_val_score(clasif, X_train, y_train, cv=5, scoring='roc_auc'))
print('Validacion Cruzada ENTRENAMIENTO (Recall):', cross_val_score(clasif, X_train, y_train, cv=5, scoring='recall'))
print('Validacion Cruzada ENTRENAMIENTO (Precision):', cross_val_score(clasif, X_train, y_train, cv=5, scoring='precision'))
print('Validacion Cruzada ENTRENAMIENTO (F1):', cross_val_score(clasif, X_train, y_train, cv=5, scoring='f1'))

Validacion Cruzada ENTRENAMIENTO (AUC): [0.98206019 0.97670251 0.93573113 0.98584906 0.98349057]
Validacion Cruzada ENTRENAMIENTO (Recall): [0.84375    0.77419355 0.78125    0.90625    0.96875   ]
Validacion Cruzada ENTRENAMIENTO (Precision): [0.96428571 0.96       0.80645161 0.96666667 0.88571429]
Validacion Cruzada ENTRENAMIENTO (F1): [0.9        0.85714286 0.79365079 0.93548387 0.92537313]


In [15]:
f1_score(y_test, clasif.predict(X_test))

0.8807339449541285

In [16]:
#Con balanceo de clases de la variable objetivo
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, random_state=0)
clasif = SVC(kernel='linear', C=1, class_weight='balanced')
clasif.fit(X_train, y_train)
clasif.score(X_test, y_test)

0.916083916083916

In [17]:
print('Validacion Cruzada (exactitud/acierto):', cross_val_score(clasif, X_train, y_train, cv=5))
print('Validacion Cruzada (AUC):',               cross_val_score(clasif, X_train, y_train, cv=5, scoring='roc_auc'))
print('Validacion Cruzada (Recall):',            cross_val_score(clasif, X_train, y_train, cv=5, scoring='recall'))

Validacion Cruzada (exactitud/acierto): [0.94186047 0.90588235 0.84705882 0.95294118 0.88235294]
Validacion Cruzada (AUC): [0.98032407 0.97491039 0.93396226 0.98290094 0.98231132]
Validacion Cruzada (Recall): [0.875      0.80645161 0.8125     0.96875    0.96875   ]


### Búsqueda GRID con CrossVal

In [19]:
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, recall_score, precision_score, accuracy_score
import time

In [20]:
clasif = SVC(kernel='rbf')  # Modelo a entrenar, ya configurado
grid_val = {'gamma': [0.001, 0.0015, 0.01, 0.1, 1, 5, 10, 50]} #El hyperparametro elegido es gamma

grid_clas_ex = GridSearchCV(clasif, param_grid = grid_val, cv=3)

#Medir el tiempo
start_time = time.time()
grid_clas_ex.fit(X_train, y_train)
end_time = time.time()

print('Mejor parametro Grid (Max Exactitud): ', grid_clas_ex.best_params_)
print('Mejor score Grid (Max Exactitud): ', grid_clas_ex.best_score_)
print('Tiempo total :', end_time - start_time)

Mejor parametro Grid (Max Exactitud):  {'gamma': 0.001}
Mejor score Grid (Max Exactitud):  0.8661971830985915
Tiempo total : 1.0679945945739746


In [21]:
pd.DataFrame(grid_clas_ex.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.015709,0.004653,0.013312,0.002854,0.001,{'gamma': 0.001},0.894366,0.866197,0.838028,0.866197,0.023,1
1,0.016435,0.001208,0.013108,0.000435,0.0015,{'gamma': 0.0015},0.880282,0.852113,0.838028,0.856808,0.017566,2
2,0.022467,0.001139,0.017441,0.000608,0.01,{'gamma': 0.01},0.866197,0.859155,0.84507,0.856808,0.008783,2
3,0.024244,0.002598,0.021888,0.002294,0.1,{'gamma': 0.1},0.690141,0.683099,0.647887,0.673709,0.018484,4
4,0.026581,0.00469,0.023628,0.002068,1.0,{'gamma': 1},0.626761,0.633803,0.626761,0.629108,0.00332,5
5,0.025444,0.002669,0.020802,0.00088,5.0,{'gamma': 5},0.626761,0.626761,0.626761,0.626761,0.0,6
6,0.025123,0.001464,0.022399,0.000814,10.0,{'gamma': 10},0.626761,0.626761,0.626761,0.626761,0.0,6
7,0.024821,0.001412,0.020859,0.000856,50.0,{'gamma': 50},0.626761,0.626761,0.626761,0.626761,0.0,6


In [24]:
clasif = SVC(kernel='rbf')
grid_val = {'gamma': [0.001, 0.0015, 0.01,  0.1, 1, 5, 10, 50]} # 

grid_clas_auc = GridSearchCV(clasif, param_grid = grid_val, scoring='roc_auc', cv=5 )
grid_clas_auc.fit(X_train, y_train)
y_decision_fn_scores_auc = grid_clas_auc.decision_function(X_test)

print('AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))
print('Mejor parametro Grid (Max AUC): ', grid_clas_auc.best_params_)
print('Mejor score Grid (Max Exactitud): ', grid_clas_auc.best_score_)

AUC:  0.9528301886792453
Mejor parametro Grid (Max AUC):  {'gamma': 0.001}
Mejor score Grid (Max Exactitud):  0.9040902341020265


In [25]:
pd.DataFrame(grid_clas_auc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01752,0.003169,0.014391,0.002992,0.001,{'gamma': 0.001},0.914931,0.907407,0.840802,0.896816,0.960495,0.90409,0.038388,1
1,0.020389,0.001782,0.0164,0.000395,0.0015,{'gamma': 0.0015},0.92419,0.898447,0.82842,0.90566,0.954009,0.902145,0.041563,2
2,0.024461,0.00136,0.020641,0.002688,0.01,{'gamma': 0.01},0.941551,0.89307,0.786557,0.932193,0.915094,0.893693,0.056052,3
3,0.032976,0.002218,0.022703,0.001992,0.1,{'gamma': 0.1},0.923032,0.870968,0.807783,0.87559,0.89033,0.873541,0.037593,4
4,0.03142,0.003167,0.022002,0.002106,1.0,{'gamma': 1},0.899306,0.844683,0.791274,0.848467,0.856132,0.847972,0.034431,5
5,0.031011,0.001488,0.023901,0.001313,5.0,{'gamma': 5},0.722801,0.707885,0.673054,0.68809,0.721403,0.702647,0.019355,6
6,0.029438,0.002692,0.020991,0.000568,10.0,{'gamma': 10},0.634259,0.587216,0.604068,0.597288,0.602005,0.604967,0.015759,7
7,0.030135,0.002103,0.024404,0.001743,50.0,{'gamma': 50},0.518519,0.493429,0.524764,0.490566,0.515625,0.508581,0.013888,8


In [26]:
from sklearn.metrics import get_scorer_names
print(get_scorer_names())

['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'd2_absolute_error_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'matthews_corrcoef', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_max_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_negative_likelihood_ratio', 'neg_root_mean_squared_error', 'neg_root_mean_squared_log_error', 'normalized_mutual_info_score', 'positive_likelihood_ratio', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
base.columns

In [27]:
from sklearn.model_selection import GridSearchCV

X = base[['Radio', 'Simetria']]
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, random_state=0)
clasif = SVC(kernel = 'linear').fit(X_train, y_train)
grid_val = {'class_weight':['balanced', {1:1},{1:3},{1:4}], 'gamma': [0.001, 0.01, 0.1, 1, 5, 10, 50]}

for i, eval_metric in enumerate(('precision', 'recall', 'f1', 'roc_auc')):
    grid_clas_p = GridSearchCV(clasif, param_grid=grid_val, scoring=eval_metric, cv=3)
    grid_clas_p.fit(X_train, y_train)
    print('Mejor Parametro Grid (max, {0}): {1}'.format(eval_metric, grid_clas_p.best_params_))
    print('Mejor Score Grid (max, {0}): {1}'.format(eval_metric, grid_clas_p.best_score_))

Mejor Parametro Grid (max, precision): {'class_weight': {1: 1}, 'gamma': 0.001}
Mejor Score Grid (max, precision): 0.9230404823428079
Mejor Parametro Grid (max, recall): {'class_weight': {1: 4}, 'gamma': 0.001}
Mejor Score Grid (max, recall): 0.8930817610062892
Mejor Parametro Grid (max, f1): {'class_weight': {1: 1}, 'gamma': 0.001}
Mejor Score Grid (max, f1): 0.822288055006949
Mejor Parametro Grid (max, roc_auc): {'class_weight': {1: 4}, 'gamma': 0.001}
Mejor Score Grid (max, roc_auc): 0.9369655854709915


### Implementación de Pipelines

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import time

In [30]:
modelo = Pipeline([('scaler', MinMaxScaler()), ('model', SVC())])
param_grid = {'model__gamma': [0.001, 0.01, 0.1, 1, 5, 10, 50], 'model__kernel':  ['rbf', 'linear'] }
cv = GridSearchCV(modelo, param_grid, scoring='f1', cv=5)
cv.fit(X_train, y_train)

In [31]:
cv.best_params_

{'model__gamma': 50, 'model__kernel': 'rbf'}

In [32]:
cv.score(X_test, y_test)

0.8155339805825242

In [33]:
cv.predict(X_test)

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1])

### EJERCICIO
REALICE UN GRID SEARCH CON CROSS VALIDATION, UTILICE KERNEL LINEAL Y RBF, PRUEBE LOS PARAMETROS C{0.1 , 10} y un cv=5, use PRECISSION como metrica