1. Importación de Librerías 

In [123]:
# Librerías para manejo de datos
import pandas as pd
pd.set_option('display.max_columns', 25) # Número máximo de columnas a mostrar
pd.set_option('display.max_rows', 50) # Número máximo de filas a mostrar
import numpy as np
np.random.seed(3301)
import pandas as pd
# Para preparar los datos
from sklearn.preprocessing import LabelEncoder
# Para crear el arbol de decisión 
from sklearn.tree import DecisionTreeClassifier 
# Para usar KNN como clasificador
from sklearn.neighbors import KNeighborsClassifier
# Para realizar la separación del conjunto de aprendizaje en entrenamiento y test.
from sklearn.model_selection import train_test_split
# Para evaluar el modelo
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import plot_confusion_matrix
# Para búsqueda de hiperparámetros
from sklearn.model_selection import GridSearchCV
# Para la validación cruzada
from sklearn.model_selection import KFold 
#Librerías para la visualización
import matplotlib.pyplot as plt
# Seaborn
import seaborn as sns 
from sklearn import tree

2. Carga de los datos

In [124]:
# Load the raw data set from the semicolon-separated CSV file.
df_tracks = pd.read_csv(
    '202210_Laboratorio1_data_Datos_Clasificacion_2022.csv',
    sep=';',
    encoding='utf-8',
    index_col=None,
    low_memory=False,
)

In [125]:
# Number of rows and number of columns in the raw frame
df_tracks.shape

(100000, 27)

In [126]:
# Show the first rows of the raw data
df_tracks.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,0,1,1,1,40,1,0,0,0,0,1,0,...,18,15,1,0,9,4,3,,,,,
1,0,0,0,0,25,1,0,0,1,0,0,0,...,0,0,0,0,7,6,1,,,,,
2,0,1,1,1,28,0,0,0,0,1,0,0,...,30,30,1,0,9,4,8,,,,,
3,0,1,0,1,27,0,0,0,1,1,1,0,...,0,0,0,0,11,3,6,,,,,
4,0,1,1,1,24,0,0,0,1,1,1,0,...,3,0,0,0,11,5,4,,,,,


In [127]:
# Inspect the dtype pandas inferred for every column.
df_tracks.dtypes

Diabetes_012             object
HighBP                   object
HighChol                 object
CholCheck                object
BMI                      object
Smoker                   object
Stroke                   object
HeartDiseaseorAttack     object
PhysActivity             object
Fruits                   object
Veggies                  object
HvyAlcoholConsump        object
AnyHealthcare            object
NoDocbcCost              object
GenHlth                  object
MentHlth                 object
PhysHlth                 object
DiffWalk                 object
Sex                      object
Age                      object
Education                object
Income                   object
Unnamed: 22             float64
Unnamed: 23             float64
Unnamed: 24             float64
Unnamed: 25             float64
Unnamed: 26             float64
dtype: object

In [128]:
# Summary statistics of the raw data
df_tracks.describe()

Unnamed: 0,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
count,0.0,0.0,0.0,0.0,0.0
mean,,,,,
std,,,,,
min,,,,,
25%,,,,,
50%,,,,,
75%,,,,,
max,,,,,


In [129]:
# Count the missing values in every column of the raw frame.
df_tracks.isna().sum()

Diabetes_012                16
HighBP                      18
HighChol                     9
CholCheck                   16
BMI                         19
Smoker                      21
Stroke                      13
HeartDiseaseorAttack        13
PhysActivity                15
Fruits                      24
Veggies                     11
HvyAlcoholConsump           12
AnyHealthcare               16
NoDocbcCost                 22
GenHlth                     18
MentHlth                    14
PhysHlth                    22
DiffWalk                    13
Sex                          9
Age                         12
Education                   19
Income                       1
Unnamed: 22             100000
Unnamed: 23             100000
Unnamed: 24             100000
Unnamed: 25             100000
Unnamed: 26             100000
dtype: int64

3. Limpieza y preparación de datos

In [130]:
# Work on an independent copy so the cleaning and preparation steps cannot
# alter the raw frame (plain assignment would only bind a second name to the
# same DataFrame object).
df_tracks_t = df_tracks.copy()

In [131]:
# Remove the five unnamed trailing columns.
df_tracks_t = df_tracks_t.drop(
    columns=['Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26']
)

In [132]:
# Number of rows and number of columns after dropping columns
df_tracks_t.shape

(100000, 22)

In [133]:
# Count the missing values in every remaining column.
df_tracks_t.isna().sum()

Diabetes_012            16
HighBP                  18
HighChol                 9
CholCheck               16
BMI                     19
Smoker                  21
Stroke                  13
HeartDiseaseorAttack    13
PhysActivity            15
Fruits                  24
Veggies                 11
HvyAlcoholConsump       12
AnyHealthcare           16
NoDocbcCost             22
GenHlth                 18
MentHlth                14
PhysHlth                22
DiffWalk                13
Sex                      9
Age                     12
Education               19
Income                   1
dtype: int64

In [134]:
# Drop every row that has at least one missing value, then drop exact
# duplicate rows.
df_tracks_t = df_tracks_t.dropna().drop_duplicates()

In [135]:
# Summary statistics after removing missing values and duplicates.
df_tracks_t.describe()


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825,93825
unique,6,5,5,5,95,5,5,5,5,5,5,5,5,5,8,49,54,5,5,16,9,11
top,0,0,0,1,27,0,0,0,1,1,1,0,1,0,2,0,0,0,0,9,6,8
freq,78022,51469,52199,90210,8871,50549,89676,84530,70134,58408,75476,88164,88867,85637,32328,62601,56605,77223,52759,12219,38640,31883


In [136]:
# Print the summary of every column, one block per column.
colNames = df_tracks_t.columns
for col in colNames:
    print(df_tracks_t[col].describe())
    print("_______________________________________")

count     93825
unique        6
top           0
freq      78022
Name: Diabetes_012, dtype: object
_______________________________________
count     93825
unique        5
top           0
freq      51469
Name: HighBP, dtype: object
_______________________________________
count     93825
unique        5
top           0
freq      52199
Name: HighChol, dtype: object
_______________________________________
count     93825
unique        5
top           1
freq      90210
Name: CholCheck, dtype: object
_______________________________________
count     93825
unique       95
top          27
freq       8871
Name: BMI, dtype: object
_______________________________________
count     93825
unique        5
top           0
freq      50549
Name: Smoker, dtype: object
_______________________________________
count     93825
unique        5
top           0
freq      89676
Name: Stroke, dtype: object
_______________________________________
count     93825
unique        5
top           0
freq      84530
Name

In [137]:
# Print the frequency of every distinct value, column by column.
for col in colNames:
    print(df_tracks_t[col].value_counts())
    print("_______________________________________")

0     78022
2     13875
1      1888
Xx       17
-        15
?         8
Name: Diabetes_012, dtype: int64
_______________________________________
0     51469
1     42312
-        19
Xx       18
?         7
Name: HighBP, dtype: int64
_______________________________________
0     52199
1     41601
Xx       10
-         8
?         7
Name: HighChol, dtype: int64
_______________________________________
1     90210
0      3585
-        15
Xx       10
?         5
Name: CholCheck, dtype: int64
_______________________________________
27    8871
26    7286
24    6929
25    6079
28    6078
      ... 
88       1
85       1
91       1
86       1
83       1
Name: BMI, Length: 95, dtype: int64
_______________________________________
0     50549
1     43230
Xx       20
-        15
?        11
Name: Smoker, dtype: int64
_______________________________________
0     89676
1      4117
Xx       16
-        13
?         3
Name: Stroke, dtype: int64
_______________________________________
0     84530
1     

In [138]:
# Map the placeholder tokens "-", "Xx" and "?" to the sentinel -1 in every
# column. The frame is replaced as a whole instead of calling
# `df[col].replace(..., inplace=True)`, which mutates a column selection and
# does not update the parent frame when pandas Copy-on-Write is active.
# NOTE(review): the target column Diabetes_012 is affected too, so rows whose
# class is unknown stay in the data with value -1 — confirm this is intended.
df_tracks_t = df_tracks_t.replace({"-": -1, "Xx": -1, "?": -1})

In [139]:
# Print the value frequencies again to check the placeholder replacement.
for col in colNames:
    print(df_tracks_t[col].value_counts())
    print("_______________________________________")

0     78022
2     13875
1      1888
-1       40
Name: Diabetes_012, dtype: int64
_______________________________________
0     51469
1     42312
-1       44
Name: HighBP, dtype: int64
_______________________________________
0     52199
1     41601
-1       25
Name: HighChol, dtype: int64
_______________________________________
1     90210
0      3585
-1       30
Name: CholCheck, dtype: int64
_______________________________________
27    8871
26    7286
24    6929
25    6079
28    6078
      ... 
86       1
96       1
88       1
85       1
83       1
Name: BMI, Length: 93, dtype: int64
_______________________________________
0     50549
1     43230
-1       46
Name: Smoker, dtype: int64
_______________________________________
0     89676
1      4117
-1       32
Name: Stroke, dtype: int64
_______________________________________
0     84530
1      9263
-1       32
Name: HeartDiseaseorAttack, dtype: int64
_______________________________________
1     70134
0     23658
-1       33
Name: Phy

In [140]:
# Number of rows in each class of the target column.
# Uses the Series method because the top-level `pd.value_counts` function is
# deprecated in recent pandas releases.
df_tracks_t['Diabetes_012'].value_counts()

0     78022
2     13875
1      1888
-1       40
Name: Diabetes_012, dtype: int64

In [141]:
# Bar chart with the number of rows per class of the target column.
ax = sns.countplot(data=df_tracks_t, x='Diabetes_012')

In [142]:
def is_true(row, val):
    """Return 1 when the row's 'Diabetes_012' entry equals `val`, else 0.

    Parameters:
        row: mapping-like object indexable by the key 'Diabetes_012'.
        val: value to compare the entry against.
    """
    return 1 if row['Diabetes_012'] == val else 0
# Binary indicator columns for class '0' and class '2' of Diabetes_012.
# A vectorized comparison replaces the row-wise `apply(..., axis=1)`, which
# calls a Python function once per row; the resulting 0/1 values are the same.
df_tracks_t['Diabetes0_label'] = (df_tracks_t['Diabetes_012'] == '0').astype('int64')
df_tracks_t['Diabetes2_label'] = (df_tracks_t['Diabetes_012'] == '2').astype('int64')

In [143]:
# Show the first rows, now including the two label columns.
df_tracks_t.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes0_label,Diabetes2_label
0,0,1,1,1,40,1,0,0,0,0,1,0,1,0,5,18,15,1,0,9,4,3,1,0
1,0,0,0,0,25,1,0,0,1,0,0,0,0,1,3,0,0,0,0,7,6,1,1,0
2,0,1,1,1,28,0,0,0,0,1,0,0,1,1,5,30,30,1,0,9,4,8,1,0
3,0,1,0,1,27,0,0,0,1,1,1,0,1,0,2,0,0,0,0,11,3,6,1,0
4,0,1,1,1,24,0,0,0,1,1,1,0,1,0,2,3,0,0,0,11,5,4,1,0


In [144]:
# Class balance of the 'Diabetes0_label' indicator (Series method instead of
# the deprecated top-level `pd.value_counts`).
df_tracks_t['Diabetes0_label'].value_counts()

1    78022
0    15803
Name: Diabetes0_label, dtype: int64

In [145]:
# Class balance of the 'Diabetes2_label' indicator (Series method instead of
# the deprecated top-level `pd.value_counts`).
df_tracks_t['Diabetes2_label'].value_counts()

0    79950
1    13875
Name: Diabetes2_label, dtype: int64

In [146]:
# Cast every column to int64, going through str first so that values stored
# as text and values stored as numbers are converted the same way.
lt = list(df_tracks_t.columns)
df_tracks_t = df_tracks_t[lt].astype(str).astype('int64')
df_tracks_t.dtypes

Diabetes_012            int64
HighBP                  int64
HighChol                int64
CholCheck               int64
BMI                     int64
Smoker                  int64
Stroke                  int64
HeartDiseaseorAttack    int64
PhysActivity            int64
Fruits                  int64
Veggies                 int64
HvyAlcoholConsump       int64
AnyHealthcare           int64
NoDocbcCost             int64
GenHlth                 int64
MentHlth                int64
PhysHlth                int64
DiffWalk                int64
Sex                     int64
Age                     int64
Education               int64
Income                  int64
Diabetes0_label         int64
Diabetes2_label         int64
dtype: object

In [147]:
# Keep only the numeric attributes (int64 / float columns).
# The earlier boolean-mask version of this selection was dead code: its result
# was overwritten immediately by this equivalent `select_dtypes` call.
number_cols = df_tracks_t.select_dtypes(include=['int64', 'float']).columns
number_cols

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income',
       'Diabetes0_label', 'Diabetes2_label'],
      dtype='object')

3. Construcción del modelo con árboles de decisión

In [148]:
# Target variable: the "Diabetes2_label" column.
Y = df_tracks_t['Diabetes2_label']
# Features: every column except the two label columns and "Diabetes_012".
X = df_tracks_t.drop(columns=['Diabetes2_label', 'Diabetes0_label', 'Diabetes_012'])

In [149]:
# Hold out 20% of the rows as the test set; the seed keeps the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [150]:
# Fix the number of cross-validation folds: K = 10, shuffled with a fixed seed.
particiones = KFold(n_splits=10, shuffle=True, random_state = 0)

In [151]:
# Search space for the decision-tree hyperparameters to tune.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 6, 8, 10, 20],
    'min_samples_split': [2, 3, 4, 5],
}

In [152]:
# Define the base model without setting any of the hyperparameters being searched.
arbolDecision = DecisionTreeClassifier(random_state=0)

In [153]:
# Exhaustive grid search over `param_grid`, scored with the 10-fold
# cross-validation splitter defined in `particiones`.
mejor_modelo = GridSearchCV(
    estimator=arbolDecision,
    param_grid=param_grid,
    cv=particiones,
)
# Run the search on the training split.
mejor_modelo.fit(X_train, Y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=0, shuffle=True),
             estimator=DecisionTreeClassifier(random_state=0),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 6, 8, 10, 20],
                         'min_samples_split': [2, 3, 4, 5]})

In [154]:
# Result of the search: the best hyperparameter values found.
mejor_modelo.best_params_

{'criterion': 'gini', 'max_depth': 6, 'min_samples_split': 2}

In [155]:
# Retrieve the best model found by the grid search.
arbol_final = mejor_modelo.best_estimator_
# Predict on both splits and report the accuracy on each.
y_pred_train = arbol_final.predict(X_train)
y_pred_test = arbol_final.predict(X_test)
print('Exactitud sobre entrenamiento: %.2f' % accuracy_score(Y_train, y_pred_train))
print('Exactitud sobre test: %.2f' % accuracy_score(Y_test, y_pred_test))

Exactitud sobre entrenamiento: 0.86
Exactitud sobre test: 0.86


In [156]:
# Fit the decision tree on the training data.
# NOTE(review): arbol_final comes from GridSearchCV.best_estimator_; with the
# default refit behaviour it has presumably already been fitted on this same
# training split, so this call looks redundant — confirm before removing.
arbolDecision = arbol_final.fit(X_train,Y_train)

In [157]:
# Build the confusion matrix from the test labels and the test predictions.
confusion_matrix(Y_test, y_pred_test)

array([[15745,   241],
       [ 2456,   323]], dtype=int64)

In [None]:
# Plot the confusion matrix of the fitted tree on the test split.
# NOTE(review): plot_confusion_matrix is deprecated/removed in newer
# scikit-learn releases in favour of ConfusionMatrixDisplay.from_estimator —
# confirm against the installed version before upgrading.
plot_confusion_matrix(arbolDecision, X_test, Y_test)  
plt.show()

In [None]:
# Accuracy, recall, precision and F1 computed from the test labels and
# the current test predictions.
print('Exactitud: %.2f' % accuracy_score(Y_test, y_pred_test))
print("Recall: {}".format(recall_score(Y_test,y_pred_test)))
print("Precisión: {}".format(precision_score(Y_test,y_pred_test)))
print("Puntuación F1: {}".format(f1_score(Y_test,y_pred_test)))

In [None]:
# Show the classification report for the test split.
print(classification_report(Y_test, y_pred_test))

In [None]:
# Feature importances reported by the fitted tree.
importancia = arbol_final.feature_importances_
importancia

In [None]:
# Pair each training column with its importance and list them from most to
# least important.
importancia_atributo = (
    pd.DataFrame({"Atributo": X_train.columns, "Importancia": importancia})
    .sort_values(by='Importancia', ascending=False)
    .reset_index(drop=True)
)
importancia_atributo

4. Construcción del modelo con KNN.

In [108]:
# Data used for KNN: the columns listed in `number_cols`.
df_tracks_Knn = df_tracks_t.loc[:, number_cols]

# Show the first rows of the result
df_tracks_Knn.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes0_label,Diabetes2_label
0,0,1,1,1,40,1,0,0,0,0,1,0,1,0,5,18,15,1,0,9,4,3,1,0
1,0,0,0,0,25,1,0,0,1,0,0,0,0,1,3,0,0,0,0,7,6,1,1,0
2,0,1,1,1,28,0,0,0,0,1,0,0,1,1,5,30,30,1,0,9,4,8,1,0
3,0,1,0,1,27,0,0,0,1,1,1,0,1,0,2,0,0,0,0,11,3,6,1,0
4,0,1,1,1,24,0,0,0,1,1,1,0,1,0,2,3,0,0,0,11,5,4,1,0


In [109]:
# Target variable: the "Diabetes2_label" column.
Y_Knn = df_tracks_Knn['Diabetes2_label']
# Features: every column except the two label columns and "Diabetes_012".
X_Knn = df_tracks_Knn.drop(columns=['Diabetes2_label', 'Diabetes0_label', 'Diabetes_012'])

In [110]:
# Hold out 20% of the rows as the test set; the seed keeps the split
# reproducible.
X_trainKnn, X_testKnn, Y_trainKnn, Y_testKnn = train_test_split(
    X_Knn,
    Y_Knn,
    test_size=0.2,
    random_state=0,
)

In [111]:
# KNN classifier with 3 neighbours, fitted on the training split
# (`fit` returns the estimator itself, so the call can be chained).
modelo_knn = KNeighborsClassifier(n_neighbors=3).fit(X_trainKnn, Y_trainKnn)

In [112]:
# Predict with the KNN model on both splits and report the accuracy on each.
y_pred_train = modelo_knn.predict(X_trainKnn)
y_pred_test = modelo_knn.predict(X_testKnn)
print('Exactitud sobre entrenamiento: %.2f' % accuracy_score(Y_trainKnn, y_pred_train))
print('Exactitud sobre test: %.2f' % accuracy_score(Y_testKnn, y_pred_test))

Exactitud sobre entrenamiento: 0.89
Exactitud sobre test: 0.83


In [113]:
# Show the classification report for the test split.
print(classification_report(Y_testKnn, y_pred_test))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90     15986
           1       0.36      0.22      0.28      2779

    accuracy                           0.83     18765
   macro avg       0.62      0.58      0.59     18765
weighted avg       0.80      0.83      0.81     18765



In [114]:
# Show the classification report for the training split.
print(classification_report(Y_trainKnn, y_pred_train))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94     63964
           1       0.73      0.45      0.56     11096

    accuracy                           0.89     75060
   macro avg       0.82      0.71      0.75     75060
weighted avg       0.88      0.89      0.88     75060



In [115]:
# Min-max normalization: rescale every feature column to the [0, 1] range.
# NOTE(review): the minima and maxima are taken over all rows of X_Knn; if the
# train/test split is made after this step, test rows influence the scaling —
# confirm whether that is acceptable.
X_n_Knn = X_Knn.sub(X_Knn.min()).div(X_Knn.max() - X_Knn.min())

In [116]:
# Summary statistics of the normalized features.
X_n_Knn.describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0,93825.0
mean,0.725249,0.721561,0.980576,0.477506,0.730131,0.521769,0.549193,0.873573,0.811084,0.902004,0.529912,0.973435,0.543293,0.593371,0.552544,0.558285,0.5881,0.718439,0.651707,0.861117,0.777901
std,0.249244,0.248647,0.097442,0.059082,0.249702,0.102866,0.14949,0.217729,0.242786,0.199019,0.119119,0.112787,0.14122,0.17778,0.106124,0.107561,0.190985,0.248409,0.219641,0.142457,0.230665
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.5,0.5,1.0,0.443609,0.5,0.5,0.5,0.5,0.5,1.0,0.5,1.0,0.5,0.5,0.506849,0.505882,0.5,0.5,0.5,0.714286,0.666667
50%,0.5,0.5,1.0,0.466165,0.5,0.5,0.5,1.0,1.0,1.0,0.5,1.0,0.5,0.5,0.506849,0.505882,0.5,0.5,0.642857,0.857143,0.888889
75%,1.0,1.0,1.0,0.503759,1.0,0.5,0.5,1.0,1.0,1.0,0.5,1.0,0.5,0.666667,0.534247,0.541176,0.5,1.0,0.785714,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [117]:
# Split the normalized features into training and test sets (20% test,
# fixed seed).
X_trainKnn, X_testKnn, Y_trainKnn, Y_testKnn = train_test_split(
    X_n_Knn,
    Y_Knn,
    test_size=0.2,
    random_state=0,
)

In [118]:
# KNN classifier with 3 neighbours, fitted on the current training split
# (`fit` returns the estimator itself, so the call can be chained).
modelo_knn = KNeighborsClassifier(n_neighbors=3).fit(X_trainKnn, Y_trainKnn)

In [119]:
# Predict with the KNN model on both splits and report the accuracy on each.
y_pred_train = modelo_knn.predict(X_trainKnn)
y_pred_test = modelo_knn.predict(X_testKnn)
print('Exactitud sobre entrenamiento: %.2f' % accuracy_score(Y_trainKnn, y_pred_train))
print('Exactitud sobre test: %.2f' % accuracy_score(Y_testKnn, y_pred_test))

KeyboardInterrupt: 

In [38]:
# Show the classification report for the test split.
print(classification_report(Y_testKnn, y_pred_test))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90     15986
           1       0.35      0.22      0.27      2779

    accuracy                           0.82     18765
   macro avg       0.61      0.57      0.58     18765
weighted avg       0.79      0.82      0.81     18765



In [40]:
# Show the classification report for the training split.
print(classification_report(Y_trainKnn, y_pred_train))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94     63964
           1       0.72      0.47      0.57     11096

    accuracy                           0.89     75060
   macro avg       0.82      0.72      0.75     75060
weighted avg       0.88      0.89      0.89     75060



4.1. Construcción del modelo con búsqueda de hiperparámetros

In [120]:
# Candidate values for the number of neighbours: 1 to 10.
n_vecinos = list(range(1, 11))
# Fix the number of cross-validation folds: K = 10, shuffled with a fixed seed.
particionesKnn = KFold(n_splits=10, shuffle=True, random_state=0)
# The decision-tree grid (criterion / max_depth / min_samples_split) that was
# pasted here has been removed: those are not KNeighborsClassifier parameters.

In [121]:
# KNN search space: number of neighbours and the Minkowski power parameter p.
param_grid = {
    'n_neighbors': n_vecinos,
    'p': [1, 2],
}

In [122]:
# Grid search over the KNN search space with the 10-fold splitter.
# n_jobs=-1 evaluates the candidate/fold combinations on all available cores;
# it does not change which parameters are selected, only the wall-clock time.
clasificadorKNN = KNeighborsClassifier()
modelo_Knn = GridSearchCV(clasificadorKNN, param_grid, cv=particionesKnn, n_jobs=-1)
modelo_Knn.fit(X_trainKnn, Y_trainKnn)
print("Mejor parámetro: {}".format(modelo_Knn.best_params_))
print("Mejor cross-validation score: {:.2f}".format(modelo_Knn.best_score_))

KeyboardInterrupt: 

In [None]:
# Retrieve the best model found by the KNN grid search.
modelo_final = modelo_Knn.best_estimator_
# Evaluate that model. The predictions must come from `modelo_final`, not from
# `modelo_knn`, which is the earlier 3-neighbour classifier.
y_pred_train = modelo_final.predict(X_trainKnn)
y_pred_test = modelo_final.predict(X_testKnn)
print('Exactitud sobre entrenamiento: %.2f' % accuracy_score(Y_trainKnn, y_pred_train))
print('Exactitud sobre test: %.2f' % accuracy_score(Y_testKnn, y_pred_test))


In [None]:
# Show the classification report for the KNN test split. The labels are taken
# from the KNN split (Y_testKnn), which is the split the predictions were
# computed on.
print(classification_report(Y_testKnn, y_pred_test))

In [None]:
# Show the classification report for the KNN training split. The labels are
# taken from the KNN split (Y_trainKnn), which is the split the predictions
# were computed on.
print(classification_report(Y_trainKnn, y_pred_train))