# Clasificadores K-NN-A en conjunto de diabetes
### Realizado por: Bogdan Rivera

Recolección de los datos

In [1]:
pip install ucimlrepo




In [2]:
from ucimlrepo import fetch_ucirepo

# Download the "CDC Diabetes Health Indicators" dataset (UCI repository id 891).
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Split the payload into the feature table and the target table (pandas DataFrames).
datos = cdc_diabetes_health_indicators.data
X, y = datos.features, datos.targets

# Show the dataset-level metadata first, then the per-variable description.
for seccion in (
    cdc_diabetes_health_indicators.metadata,
    cdc_diabetes_health_indicators.variables,
):
    print(seccion)


{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

Conversión a dataframe


In [3]:
import pandas as pd
# Make sure both parts are DataFrames, then join them column-wise so the
# target ends up as the last column of a single table.
X, y = pd.DataFrame(X), pd.DataFrame(y)
df = pd.concat([X, y], axis=1)

# Display the first rows as a quick sanity check.
df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


## Clasificador 1
1) Codificación original

Normalizando los datos:

In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas as pd

# Columns that get min-max scaled; every other column keeps its original coding.
columnas_a_escalar = ['BMI', 'MentHlth', 'PhysHlth']

# The fitted scaler stays in `scaler_min_max` because it is reused further down
# to transform the new sample before prediction.
scaler_min_max = MinMaxScaler()
valores_escalados = scaler_min_max.fit_transform(df[columnas_a_escalar])

# Write the scaled values into a copy so `df` keeps the raw values.
df_normalizado = df.copy()
df_normalizado[columnas_a_escalar] = valores_escalados

Datos originales: 

In [5]:
# Preview the first rows of the raw frame (before scaling) for comparison.
df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


Datos normalizados:

In [6]:
# Preview the first rows after min-max scaling BMI, MentHlth and PhysHlth.
df_normalizado.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,0.325581,1,0,0,0,0,1,...,0,5,0.6,0.5,1,0,9,4,3,0
1,0,0,0,0.151163,1,0,0,1,0,0,...,1,3,0.0,0.0,0,0,7,6,1,0
2,1,1,1,0.186047,0,0,0,0,1,0,...,1,5,1.0,1.0,1,0,9,4,8,0
3,1,0,1,0.174419,0,0,0,1,1,1,...,0,2,0.0,0.0,0,0,11,3,6,0
4,1,1,1,0.139535,0,0,0,1,1,1,...,0,2,0.1,0.0,0,0,11,5,4,0


b) Aplicando la distancia euclidiana y K-fold cross validation (K=5), realizar la clasificación KNN.





In [7]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np


def clasificador_knn(df, k, cv=5, columna_objetivo='Diabetes_binary'):
    """Evaluate a Euclidean K-NN classifier on ``df`` with k-fold cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus the target column.
    k : int
        Number of neighbours used by the classifier.
    cv : int, default 5
        Number of cross-validation folds, forwarded to ``cross_val_score``.
    columna_objetivo : str, default 'Diabetes_binary'
        Name of the target column; every other column is used as a feature.

    Returns
    -------
    dict
        ``'k'``: the value of ``k`` that was evaluated.
        ``'scores'``: accuracy obtained on each fold.
        ``'exactitud_promedio'``: mean of ``'scores'``.
        ``'desviacion_estandar'``: standard deviation of ``'scores'``.
        ``'modelo'``: the classifier object that was handed to
        ``cross_val_score``. This function never calls ``fit`` on it
        directly, so callers must fit it before predicting.

    Raises
    ------
    KeyError
        If ``columna_objetivo`` is not a column of ``df``.
    """
    X = df.drop(columns=[columna_objetivo])
    y = df[columna_objetivo]
    knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')

    # One accuracy value per fold.
    scores = cross_val_score(knn, X, y, cv=cv, scoring='accuracy')

    return {
        'k': k,
        'scores': scores,
        'exactitud_promedio': scores.mean(),
        'desviacion_estandar': scores.std(),
        'modelo': knn
    }


Realizamos iteraciones de K desde 3 hasta 6 y guardamos los resultados en una lista:

In [8]:
# Number of rows in the data set, followed by a description of the 5-fold split.
num_datos = len(df_normalizado)
mensajes = (
    f"Se utilizaron {num_datos} de datos",
    "Para el crossvalidation se uso K = 5",
    "En cada iteración, el conjunto de datos se divide en 80% para entrenamiento y 20% para prueba (ya que 4 de los 5 pliegues se usan para entrenar y el quinto se usa para prueba).",
)
print(*mensajes, sep="\n")


Se utilizaron 253680 de datos
Para el crossvalidation se uso K = 5
En cada iteración, el conjunto de datos se divide en 80% para entrenamiento y 20% para prueba (ya que 4 de los 5 pliegues se usan para entrenar y el quinto se usa para prueba).


In [None]:
# Cross-validate one K-NN model for every K in 3..6 and keep each result dict.
knn_modelos = [clasificador_knn(df_normalizado, k) for k in range(3, 7)]

# Report the per-fold scores and the summary statistics of each model.
for resultado in knn_modelos:
    print(f"Modelo KNN para K = {resultado['k']}:")
    for etiqueta, clave in (
        ("Scores de cada k-fold:", 'scores'),
        ("Exactitud promedio:", 'exactitud_promedio'),
        ("Desviación estándar:", 'desviacion_estandar'),
    ):
        print(etiqueta, resultado[clave])
    print("-----")




In [None]:
# Ejemplo de dato nuevo a evaluar
nuevo_dato = {
    'HighBP': 1,
    'HighChol': 0,
    'CholCheck': 1,
    'BMI': 30,
    'Smoker': 0,
    'Stroke': 0,
    'HeartDiseaseorAttack': 0,
    'PhysActivity': 1,
    'Fruits': 1,
    'Veggies': 0,
    'HvyAlcoholConsump': 0,
    'AnyHealthcare': 1,
    'NoDocbcCost': 0,
    'GenHlth': 3,
    'MentHlth': 5,
    'PhysHlth': 7,
    'DiffWalk': 0,
    'Sex': 1,
    'Age': 10,
    'Education': 4,
    'Income': 3
}


A partir del dato creado anteriormente, se evalúa qué clase predice cada uno de los modelos:

In [None]:
# Training features/target are identical for every model, so build them once
# instead of re-dropping the target column on each loop iteration.
X = df_normalizado.drop(columns=['Diabetes_binary'])
y = df_normalizado['Diabetes_binary']

# One-row frame for the new sample, with its columns forced into the training
# order (a missing feature raises KeyError here instead of silently shifting columns).
nuevo_dato_df = pd.DataFrame([nuevo_dato])[list(X.columns)].copy()

# Apply the scaler that was fitted on the training data to the same three columns.
# NOTE: `scaler_min_max` is rebound to a different scaler in the "Clasificador 2"
# section, so this cell must run before that one.
columnas_escaladas = ['BMI', 'MentHlth', 'PhysHlth']
nuevo_dato_df[columnas_escaladas] = scaler_min_max.transform(nuevo_dato_df[columnas_escaladas])

for modelo in knn_modelos:
    k = modelo['k']
    knn = modelo['modelo']

    # The stored estimator has not been fitted yet, so fit it on the full data first.
    knn.fit(X, y)

    prediccion = knn.predict(nuevo_dato_df)
    print(f"Predicción del modelo KNN para K = {k}: {prediccion[0]}")


Predicción del modelo KNN para K = 3: 0
Predicción del modelo KNN para K = 4: 0
Predicción del modelo KNN para K = 5: 0
Predicción del modelo KNN para K = 6: 0


## Clasificador 2
Discretización-codificación de características numéricas

In [None]:
# Discretize on a copy so the raw frame `df` is left untouched.
df_discretizado = df.copy()

# Replace each numeric column with the index of its quantile bin (q=4).
# duplicates='drop' merges repeated quantile edges, so a column may end up
# with fewer than four distinct bins.
for columna in ('BMI', 'MentHlth', 'PhysHlth'):
    df_discretizado[columna] = pd.qcut(
        df_discretizado[columna], q=4, labels=False, duplicates='drop'
    )


In [None]:
from sklearn.preprocessing import MinMaxScaler


# NOTE: this rebinds `scaler_min_max`. From here on it is the scaler fitted on
# ALL discretized feature columns, not the three-column scaler of Clasificador 1.
scaler_min_max = MinMaxScaler()
X_discretizado = df_discretizado.drop(columns=['Diabetes_binary'])

# fit_transform returns a bare array, so restore both the column names and the
# original index. Without the index, the concat below would align on a fresh
# RangeIndex and misalign rows whenever `df_discretizado` has any other index.
X_discretizado = pd.DataFrame(
    scaler_min_max.fit_transform(X_discretizado),
    columns=X_discretizado.columns,
    index=X_discretizado.index,
)

# Re-attach the (unscaled) target column next to the scaled features.
df_discretizado_normalizado = pd.concat([X_discretizado, df_discretizado[['Diabetes_binary']]], axis=1)

Datos discretizados y normalizados:

In [None]:
# Preview the first rows of the discretized and min-max scaled frame.
df_discretizado_normalizado.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,1.0,1.0,0.0,0.666667,0.6,0.285714,0
1,0.0,0.0,0.0,0.333333,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.5,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0
2,1.0,1.0,1.0,0.666667,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.666667,0.6,1.0,0
3,1.0,0.0,1.0,0.333333,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.833333,0.4,0.714286,0
4,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.25,1.0,0.0,0.0,0.0,0.833333,0.8,0.428571,0


In [None]:
# Cross-validate one K-NN model per K in 3..6 on the discretized, scaled data.
knn_modelos_discretizados = [
    clasificador_knn(df_discretizado_normalizado, k) for k in range(3, 7)
]

# Report the per-fold scores and the summary statistics of each model.
for resultado in knn_modelos_discretizados:
    print(f"Modelo KNN con discretización para K = {resultado['k']}:")
    for etiqueta, clave in (
        ("Scores de cada k-fold:", 'scores'),
        ("Exactitud promedio:", 'exactitud_promedio'),
        ("Desviación estándar:", 'desviacion_estandar'),
    ):
        print(etiqueta, resultado[clave])
    print("-----")



Modelo KNN con discretización para K = 3:
Exactitud promedio: 0.8363095238095237
Desviación estándar: 0.0025140092676569706
-----
Modelo KNN con discretización para K = 4:
Exactitud promedio: 0.8538789025543994
Desviación estándar: 0.0009415131058684912
-----
Modelo KNN con discretización para K = 5:
Exactitud promedio: 0.8454194260485652
Desviación estándar: 0.001851276295513618
-----
Modelo KNN con discretización para K = 6:
Exactitud promedio: 0.8560391043834754
Desviación estándar: 0.000812658787767572
-----


In [None]:

# Training data for Clasificador 2: the discretized AND min-max scaled frame,
# i.e. the same representation that was cross-validated above and the same
# representation the new sample is converted to below.
X = df_discretizado_normalizado.drop(columns=['Diabetes_binary'])
y = df_discretizado_normalizado['Diabetes_binary']

# One-row frame for the new sample, with columns in the training order.
nuevo_dato_df = pd.DataFrame([nuevo_dato])[list(X.columns)].copy()

# Discretize the sample's OWN values with the quantile edges of the training data.
for columna in ('BMI', 'MentHlth', 'PhysHlth'):
    # Same qcut call as used for the training frame; retbins=True also returns
    # the (de-duplicated) bin edges so they can be applied to unseen values.
    _, bordes = pd.qcut(df[columna], q=4, labels=False, duplicates='drop', retbins=True)

    # Values outside the training range are clipped into the first/last bin;
    # otherwise pd.cut would return NaN for them.
    valores = nuevo_dato_df[columna].clip(lower=bordes[0], upper=bordes[-1])

    # include_lowest=True mirrors qcut, which also closes the first interval on the left.
    nuevo_dato_df[columna] = pd.cut(valores, bins=bordes, labels=False, include_lowest=True)

# Scale every feature with the scaler fitted on the discretized training features.
nuevo_dato_df[nuevo_dato_df.columns] = scaler_min_max.transform(nuevo_dato_df)

for modelo in knn_modelos_discretizados:
    k = modelo['k']
    knn = modelo['modelo']

    # The stored estimator has not been fitted yet, so fit it on the full data first.
    knn.fit(X, y)

    prediccion = knn.predict(nuevo_dato_df)
    print(f"Predicción del modelo KNN con discretización para K = {k}: {prediccion[0]}")


Predicción del modelo KNN con discretización para K = 3: 0
Predicción del modelo KNN con discretización para K = 4: 0
Predicción del modelo KNN con discretización para K = 5: 0
Predicción del modelo KNN con discretización para K = 6: 0
