# Clasificadores K-NN-A en conjunto de diabetes parte 2
### Realizado por: Bogdan Rivera

Recolección de los datos

In [1]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
cdc_diabetes_health_indicators = fetch_ucirepo(id=891) 
  
# data (as pandas dataframes) 
X = cdc_diabetes_health_indicators.data.features 
y = cdc_diabetes_health_indicators.data.targets 
  
# metadata 
print(cdc_diabetes_health_indicators.metadata) 
  
# variable information 
print(cdc_diabetes_health_indicators.variables) 


{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

Conversión a Dataframe

In [3]:
import pandas as pd
X = pd.DataFrame(X)
y = pd.DataFrame(y)
df = pd.concat([X, y], axis=1)
df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


In [4]:
#Valores únicos
for column in df.columns:
    print(f'Columna: {column}, Valores únicos: {df[column].unique()}')

Columna: HighBP, Valores únicos: [1 0]
Columna: HighChol, Valores únicos: [1 0]
Columna: CholCheck, Valores únicos: [1 0]
Columna: BMI, Valores únicos: [40 25 28 27 24 30 34 26 33 21 23 22 38 32 37 31 29 20 35 45 39 19 47 18
 36 43 55 49 42 17 16 41 44 50 59 48 52 46 54 57 53 14 15 51 58 63 61 56
 74 62 64 66 73 85 60 67 65 70 82 79 92 68 72 88 96 13 81 71 75 12 77 69
 76 87 89 84 95 98 91 86 83 80 90 78]
Columna: Smoker, Valores únicos: [1 0]
Columna: Stroke, Valores únicos: [0 1]
Columna: HeartDiseaseorAttack, Valores únicos: [0 1]
Columna: PhysActivity, Valores únicos: [0 1]
Columna: Fruits, Valores únicos: [0 1]
Columna: Veggies, Valores únicos: [1 0]
Columna: HvyAlcoholConsump, Valores únicos: [0 1]
Columna: AnyHealthcare, Valores únicos: [1 0]
Columna: NoDocbcCost, Valores únicos: [0 1]
Columna: GenHlth, Valores únicos: [5 3 2 4 1]
Columna: MentHlth, Valores únicos: [18  0 30  3  5 15 10  6 20  2 25  1  4  7  8 21 14 26 29 16 28 11 12 24
 17 13 27 19 22  9 23]
Columna: PhysHlth, 

In [5]:
columnas_numericas = []
for column in df.columns:
    if len(df[column].value_counts()) > 8:
        columnas_numericas.append(column)

columnas_categoricas = df.columns.difference(columnas_numericas + ['Diabetes_binary'])
print("Columnas numéricas: ",columnas_numericas)
print("Columnas categóricas: ",columnas_categoricas)

Columnas numéricas:  ['BMI', 'MentHlth', 'PhysHlth', 'Age']
Columnas categóricas:  Index(['AnyHealthcare', 'CholCheck', 'DiffWalk', 'Education', 'Fruits',
       'GenHlth', 'HeartDiseaseorAttack', 'HighBP', 'HighChol',
       'HvyAlcoholConsump', 'Income', 'NoDocbcCost', 'PhysActivity', 'Sex',
       'Smoker', 'Stroke', 'Veggies'],
      dtype='object')


Clasificador 3: Codificación original-distancia de Gover

a) Normalizar solo las características numéricas

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_normalizado = df.copy()

df_normalizado[columnas_numericas] = scaler.fit_transform(df_normalizado[columnas_numericas])



Datos originales: 

In [7]:
df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


Datos normalizados: 


In [8]:
df_normalizado.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,0.325581,1,0,0,0,0,1,...,0,5,0.6,0.5,1,0,0.666667,4,3,0
1,0,0,0,0.151163,1,0,0,1,0,0,...,1,3,0.0,0.0,0,0,0.5,6,1,0
2,1,1,1,0.186047,0,0,0,0,1,0,...,1,5,1.0,1.0,1,0,0.666667,4,8,0
3,1,0,1,0.174419,0,0,0,1,1,1,...,0,2,0.0,0.0,0,0,0.833333,3,6,0
4,1,1,1,0.139535,0,0,0,1,1,1,...,0,2,0.1,0.0,0,0,0.833333,5,4,0


b) Aplicando la distancia de Gover  y  K-fold cross validation, realizar la clasificación KNN

In [None]:
from sklearn.model_selection import train_test_split

df_diabetes = df_normalizado[df_normalizado['Diabetes_binary'] == 1]   
df_no_diabetes = df_normalizado[df_normalizado['Diabetes_binary'] == 0] 

# Muestreo aleatorio 
df_no_diabetes_balanced = df_no_diabetes.sample(n=len(df_diabetes), random_state=42)

# Combinar ambas clases balanceadas
df_balanceado = pd.concat([df_diabetes, df_no_diabetes_balanced])


X = df_balanceado.drop(columns='Diabetes_binary')
y = df_balanceado['Diabetes_binary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


print("Tamaño de entrenamiento:", X_train.shape[0])
print("Tamaño de prueba:", X_test.shape[0])



Tamaño de entrenamiento: 56553
Tamaño de prueba: 14139


In [None]:
import numpy as np
from tqdm import tqdm
def calcular_knn_gower(X_train, X_test, feature_ranges, tipos_caracteristicas):

    distancias = np.zeros((X_test.shape[0], X_train.shape[0]))
    

    for i, test_row in enumerate(tqdm(X_test, desc="Calculando distancias de Gower")):
        diff = np.abs(X_train - test_row)
        

        for j, (ftype, frange) in enumerate(zip(tipos_caracteristicas, feature_ranges)):
            if ftype == 'categorical':
                diff[:, j] = (X_train[:, j] != test_row[j]).astype(int)
            elif ftype == 'numerical' and frange != 0:
                diff[:, j] = diff[:, j] / frange  
        
        distancias[i, :] = np.mean(diff, axis=1) 
    
    return distancias

Realizando las predicciones con el conjunto balanceado: 

In [None]:
X = df_balanceado.drop(columns='Diabetes_binary').to_numpy()
y = df_balanceado['Diabetes_binary'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


tipos_caracteristicas = ['numerical', 'categorical'] 
feature_ranges = (X_train.max(axis=0) - X_train.min(axis=0)).astype(float)  


distancias_gower = calcular_knn_gower(X_train, X_test, feature_ranges, tipos_caracteristicas)


k = 3
y_pred = []
for dist in tqdm(distancias_gower, desc="Realizando predicciones KNN"):
    knn_indices = np.argsort(dist)[:k]
    knn_labels = y_train[knn_indices]
    y_pred.append(np.bincount(knn_labels).argmax())


precision = np.mean(y_pred == y_test)
print("Exactitud del modelo:", precision)




Calculando distancias de Gower: 100%|██████████| 14139/14139 [05:11<00:00, 45.39it/s]
Realizando predicciones KNN: 100%|██████████| 14139/14139 [03:50<00:00, 61.34it/s] 


Exactitud del modelo: 0.6793266850555202


Para utilizar todos los datos se puede usar la técnica de mini-split: 

In [12]:
def calcular_distancia_gower(X_train, test_row, feature_ranges, tipos_caracteristicas):
    diff = np.abs(X_train - test_row)  
    
    for j, (ftype, frange) in enumerate(zip(tipos_caracteristicas, feature_ranges)):
        if ftype == 'categorical':
            diff[:, j] = (X_train[:, j] != test_row[j]).astype(int)
        elif ftype == 'numerical' and frange != 0:
            diff[:, j] = diff[:, j] / frange  

    distancia = np.mean(diff, axis=1)  
    return distancia


X = df_normalizado.drop(columns='Diabetes_binary').to_numpy()
y = df_normalizado['Diabetes_binary'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


tipos_caracteristicas = ['numerical', 'categorical']  
feature_ranges = (X_train.max(axis=0) - X_train.min(axis=0)).astype(float)  

k = 3
y_pred = []


for i in tqdm(range(X_test.shape[0]), desc="Calculando y prediciendo distancias de Gower"):
    test_row = X_test[i]
    distancias_gower = calcular_distancia_gower(X_train, test_row, feature_ranges, tipos_caracteristicas)
    

    knn_indices = np.argsort(distancias_gower)[:k]
    knn_labels = y_train[knn_indices]
    
    y_pred.append(np.bincount(knn_labels).argmax())


accuracy = np.mean(y_pred == y_test)
print("Exactitud del modelo:", accuracy)


Calculando y prediciendo distancias de Gower: 100%|██████████| 50736/50736 [1:18:41<00:00, 10.75it/s] 


Exactitud del modelo: 0.8333727530747398


b) Diseñar una función de  distancia que trabaje con datos categóricos y numéricos. Se utiliza la mezcla de funciones de Canberra y Hamming


## Distancia de Hamming para Categóricas:

La distancia entre dos valores categóricos se mide con Hamming, donde el valor es 1 si las categorías difieren y 0 si coinciden.

## Distancia de Canberra: 
Para características numéricas, utilizamos la distancia de Canberra, calculada como:


\frac{|x - y|}{|x| + |y|}

​
 


In [13]:
def distancia_mixta(X_train, fila_test, tipos_caracteristicas, feature_ranges):
    # Se crea una matriz de diferencias 
    diff = np.zeros(X_train.shape)
    
    for j, (ftype, frange) in enumerate(zip(tipos_caracteristicas, feature_ranges)):
        if ftype == 'categorical':
            # Distancia de Hamming: 1 si no coinciden, 0 si coinciden
            diff[:, j] = (X_train[:, j] != fila_test[j]).astype(float)
        elif ftype == 'numerical' and frange != 0:
            # Distancia de Canberra 
            diff[:, j] = np.abs(X_train[:, j] - fila_test[j]) / (np.abs(X_train[:, j]) + np.abs(fila_test[j]))

    distancia = np.mean(diff, axis=1)
    return distancia

In [15]:
tipos_caracteristicas = ['numerical', 'categorical']  
feature_ranges = (X_train.max(axis=0) - X_train.min(axis=0)).astype(float)

k = 3
y_pred = []

for i in tqdm(range(X_test.shape[0]), desc="Calculando y prediciendo distancias mixtas"):
    fila_test = X_test[i]
    distancias = distancia_mixta(X_train, fila_test, tipos_caracteristicas, feature_ranges)
    

    knn_indices = np.argsort(distancias)[:k]
    knn_labels = y_train[knn_indices]
    

    y_pred.append(np.bincount(knn_labels).argmax())


accuracy = np.mean(y_pred == y_test)
print("Exactitud del modelo:", accuracy)

  diff[:, j] = np.abs(X_train[:, j] - fila_test[j]) / (np.abs(X_train[:, j]) + np.abs(fila_test[j]))
Calculando y prediciendo distancias mixtas: 100%|██████████| 50736/50736 [37:24<00:00, 22.61it/s]  

Exactitud del modelo: 0.8606709239987386



