## Autor:
Andrea Campillo Piqueras.
## Estudios:
Universidad Internacional de La Rioja.
Escuela Superior de Ingeniería y Tecnología.<br>
Trabajo Fin de Máster Universitario en Análisis y Visualización de Datos Masivos/ Visual Analytics and Big Data.
## Título:
PrediDia: Un Enfoque Predictivo para la Evaluación de la Diabetes.
## Repositorio:
https://github.com/AndreaCampillo/TFM_PrediDia
## Licencia:
MIT License Copyright (c) 2024 Andrea Campillo Piqueras.

# <center><H1>Redes Neuronales Artificiales Densas</H1></center>

In [1]:
# Librerías utilizadas
import time
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from imblearn.metrics import specificity_score
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import metrics

# Dataset 2021

In [2]:
url = 'https://github.com/AndreaCampillo/TFM_PrediDia/raw/Datasets/2021DataSet_Diabeticos_NoDiabeticos_Depurado.csv'
dfDiabetes_load = pd.read_csv(url, sep=';')

In [3]:
dfDiabetes_load.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 45 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   Year                229655 non-null  int64
 1   CatBMI              229655 non-null  int64
 2   Stroke              229655 non-null  int64
 3   HeartDis            229655 non-null  int64
 4   PhysExer            229655 non-null  int64
 5   HealthIns           229655 non-null  int64
 6   NoMedCost           229655 non-null  int64
 7   GenHealth           229655 non-null  int64
 8   CogDiff             229655 non-null  int64
 9   Depression          229655 non-null  int64
 10  MentalHlth          229655 non-null  int64
 11  MentalState         229655 non-null  int64
 12  PhysHlth            229655 non-null  int64
 13  WalkDiff            229655 non-null  int64
 14  Gender              229655 non-null  int64
 15  AgeRange            229655 non-null  int64
 16  EdLevel             

In [4]:
dfDiabetes_load.head()

Unnamed: 0,Year,CatBMI,Stroke,HeartDis,PhysExer,HealthIns,NoMedCost,GenHealth,CogDiff,Depression,...,FruitCons,VegCons,FruitOrVegCon,FruitAndVegCon,MarijuanaCon,SleepHours,BrDiabetes,GrDiabetes,SupGrPreDiabetes,SupGrNoPreDiabetes
0,2021,1,2,2,2,1,2,2,2,2,...,1,1,1,1,9,99,3,3,2,2
1,2021,3,2,1,2,1,2,1,2,2,...,1,2,1,2,9,99,1,1,1,1
2,2021,4,2,2,1,1,2,1,2,2,...,1,1,1,1,9,99,1,1,1,1
3,2021,3,1,1,1,1,2,2,2,2,...,1,1,1,1,9,99,1,1,1,1
4,2021,2,2,2,2,1,2,1,2,2,...,2,2,2,2,9,99,3,3,2,2


In [5]:
# Incluye 25 características sin eliminar LungDiseases, VisionDiff, UrologyDZ,Stroke. Se comprobará el rendimiento con estás
# últimas y se tomará una decisión.
columns = ['CatBMI', 'Stroke', 'HeartDis', 'PhysExer', 'GenHealth', 
           'CogDiff', 'Depression', 'PhysHlth', 'WalkDiff','Gender', 
           'AgeRange', 'EdLevel', 'SocClass', 'UrologyDz','VisionDiff',
           'Asthma', 'LungDiseases','Arthritis', 'SmokerTrad', 'AlcDrinker', 
           'Race', 'LastMedChk', 'HighBP','HighChol', 'FruitAndVegCon',
           'SupGrPreDiabetes']
dfDiabetes = dfDiabetes_load[columns].copy()

In [6]:
dfDiabetes['SupGrPreDiabetes'] = dfDiabetes['SupGrPreDiabetes'].replace({2: 0, 1: 1})

In [7]:
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 26 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            229655 non-null  int64
 1   Stroke            229655 non-null  int64
 2   HeartDis          229655 non-null  int64
 3   PhysExer          229655 non-null  int64
 4   GenHealth         229655 non-null  int64
 5   CogDiff           229655 non-null  int64
 6   Depression        229655 non-null  int64
 7   PhysHlth          229655 non-null  int64
 8   WalkDiff          229655 non-null  int64
 9   Gender            229655 non-null  int64
 10  AgeRange          229655 non-null  int64
 11  EdLevel           229655 non-null  int64
 12  SocClass          229655 non-null  int64
 13  UrologyDz         229655 non-null  int64
 14  VisionDiff        229655 non-null  int64
 15  Asthma            229655 non-null  int64
 16  LungDiseases      229655 non-null  int64
 17  Arthritis 

In [8]:
dfDiabetes.head()

Unnamed: 0,CatBMI,Stroke,HeartDis,PhysExer,GenHealth,CogDiff,Depression,PhysHlth,WalkDiff,Gender,...,LungDiseases,Arthritis,SmokerTrad,AlcDrinker,Race,LastMedChk,HighBP,HighChol,FruitAndVegCon,SupGrPreDiabetes
0,1,2,2,2,2,2,2,3,2,2,...,1,1,3,1,1,2,2,1,1,0
1,3,2,1,2,1,2,2,1,2,2,...,2,2,4,1,2,1,1,2,2,1
2,4,2,2,1,1,2,2,1,2,2,...,2,2,4,2,1,1,1,1,1,1
3,3,1,1,1,2,2,2,3,1,1,...,2,2,4,1,4,1,1,1,1,1
4,2,2,2,2,1,2,2,1,1,1,...,1,2,3,1,1,1,2,2,2,0


In [9]:
numClases = dfDiabetes['SupGrPreDiabetes'].value_counts()
numDiabeticos = numClases[1]
numNoDiabeticos = numClases[0]
print("Número de registros: ", len(dfDiabetes)) 
print("Número de dibéticos: ", numDiabeticos)
print("Número de no diabéticos: ", numNoDiabeticos)

Número de registros:  229655
Número de dibéticos:  37793
Número de no diabéticos:  191862


# Partición de datos

In [10]:
# Se obtienen los datasets de entrenamiento (train), validación (val) y pruebas (test)
train_set, test_set = train_test_split(dfDiabetes, test_size=0.4, random_state=14, shuffle=True, stratify=dfDiabetes['SupGrPreDiabetes'])
val_set, test_set = train_test_split(test_set, test_size=0.5, random_state=14, shuffle=True, stratify=test_set['SupGrPreDiabetes'])

X_train = train_set.drop('SupGrPreDiabetes', axis=1)
y_train = train_set['SupGrPreDiabetes'].copy()

X_val = val_set.drop('SupGrPreDiabetes', axis=1)
y_val = val_set['SupGrPreDiabetes'].copy()

X_test = test_set.drop('SupGrPreDiabetes', axis=1)
y_test = test_set['SupGrPreDiabetes'].copy()

In [11]:
print("Logintud del Training set:", len(train_set))
print("Logintud del Validation set:", len(val_set))
print("Logintud del Test set:", len(test_set))

Logintud del Training set: 137793
Logintud del Validation set: 45931
Logintud del Test set: 45931


In [12]:
# Obtención del modelo
start_time = time.time()
model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['acc'])

# Entrenamos el algoritmo
history = model.fit(X_train,
                    y_train,
                    epochs=20, 
                    batch_size=512,
                    validation_data=(X_val, y_val))

print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Tiempo en generación del modelo: 13.602  sg.


In [13]:
# Predecimos con el el conjunto de validación
y_pred_proba = model.predict(X_val)
y_pred = (y_pred_proba > 0.5).astype(int)

In [14]:
print("F1 score: {:.3f}".format(f1_score(y_val, y_pred, average='weighted')))

F1 score: 0.812


In [15]:
# Se comprueba con el dataset de prueba
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))

F1 score: 0.810


In [16]:
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))
print("Precisión (Precision): {:.3f}".format(precision_score(y_test, y_pred, average='weighted')))
print("Exactitud (Accuracy): {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Especificidad (Specificity): {:.3f}".format(specificity_score(y_test, y_pred)))
print("AUC-ROC: {:.3f}".format(roc_auc_score(y_test, y_pred)))

F1 score: 0.810
Precisión (Precision): 0.818
Exactitud (Accuracy): 0.846
Especificidad (Specificity): 0.976
AUC-ROC: 0.581


In [17]:
# Obtención de métricas
# Se reproduce el proceso pero eliminando las caracteristcas las características LungDiseases, VisionDiff, UrologyDZ,Stroke
columns = ['CatBMI','HeartDis','PhysExer','GenHealth','CogDiff',
           'Depression', 'PhysHlth', 'WalkDiff','Gender','AgeRange',
           'EdLevel', 'SocClass','Asthma', 'Arthritis', 'SmokerTrad',
           'AlcDrinker','Race', 'LastMedChk', 'HighBP','HighChol', 'FruitAndVegCon',
           'SupGrPreDiabetes']
dfDiabetes = dfDiabetes_load[columns].copy()

In [18]:
dfDiabetes['SupGrPreDiabetes'] = dfDiabetes['SupGrPreDiabetes'].replace({2: 0, 1: 1})

In [19]:
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            229655 non-null  int64
 1   HeartDis          229655 non-null  int64
 2   PhysExer          229655 non-null  int64
 3   GenHealth         229655 non-null  int64
 4   CogDiff           229655 non-null  int64
 5   Depression        229655 non-null  int64
 6   PhysHlth          229655 non-null  int64
 7   WalkDiff          229655 non-null  int64
 8   Gender            229655 non-null  int64
 9   AgeRange          229655 non-null  int64
 10  EdLevel           229655 non-null  int64
 11  SocClass          229655 non-null  int64
 12  Asthma            229655 non-null  int64
 13  Arthritis         229655 non-null  int64
 14  SmokerTrad        229655 non-null  int64
 15  AlcDrinker        229655 non-null  int64
 16  Race              229655 non-null  int64
 17  LastMedChk

In [20]:
dfDiabetes.head()

Unnamed: 0,CatBMI,HeartDis,PhysExer,GenHealth,CogDiff,Depression,PhysHlth,WalkDiff,Gender,AgeRange,...,Asthma,Arthritis,SmokerTrad,AlcDrinker,Race,LastMedChk,HighBP,HighChol,FruitAndVegCon,SupGrPreDiabetes
0,1,2,2,2,2,2,3,2,2,6,...,2,1,3,1,1,2,2,1,1,0
1,3,1,2,1,2,2,1,2,2,6,...,1,2,4,1,2,1,1,2,2,1
2,4,2,1,1,2,2,1,2,2,5,...,1,2,4,2,1,1,1,1,1,1
3,3,1,1,2,2,2,3,1,1,6,...,1,2,4,1,4,1,1,1,1,1
4,2,2,2,1,2,2,1,1,1,6,...,1,2,3,1,1,1,2,2,2,0


In [21]:
# Se obtienen los datasets de entrenamiento (train), validación (val) y pruebas (test)
train_set, test_set = train_test_split(dfDiabetes, test_size=0.4, random_state=14, shuffle=True, stratify=dfDiabetes['SupGrPreDiabetes'])
val_set, test_set = train_test_split(test_set, test_size=0.5, random_state=14, shuffle=True, stratify=test_set['SupGrPreDiabetes'])

X_train = train_set.drop('SupGrPreDiabetes', axis=1)
y_train = train_set['SupGrPreDiabetes'].copy()

X_val = val_set.drop('SupGrPreDiabetes', axis=1)
y_val = val_set['SupGrPreDiabetes'].copy()

X_test = test_set.drop('SupGrPreDiabetes', axis=1)
y_test = test_set['SupGrPreDiabetes'].copy()

In [22]:
numClases = dfDiabetes['SupGrPreDiabetes'].value_counts()
numDiabeticos = numClases[1]
numNoDiabeticos = numClases[0]
print("Número de registros: ", len(dfDiabetes)) 
print("Número de dibéticos: ", numDiabeticos)
print("Número de no diabéticos: ", numNoDiabeticos)

Número de registros:  229655
Número de dibéticos:  37793
Número de no diabéticos:  191862


In [23]:
# Obtenión del modelo
start_time = time.time()
model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['acc'])

# Entrenamos el algoritmo
history = model.fit(X_train,
                    y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(X_val, y_val))

print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Tiempo en generación del modelo: 13.13  sg.


In [24]:
#Predecimos con el el conjunto de validación
y_pred_proba = model.predict(X_val)
y_pred = (y_pred_proba > 0.5).astype(int)
print("F1 score: {:.3f}".format(f1_score(y_val, y_pred, average='weighted')))

F1 score: 0.812


In [25]:
# Se comprueba con el dataset de prueba
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))

F1 score: 0.811


In [26]:
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))
print("Precisión (Precision): {:.3f}".format(precision_score(y_test, y_pred, average='weighted')))
print("Exactitud (Accuracy): {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Especificidad (Specificity): {:.3f}".format(specificity_score(y_test, y_pred)))
print("AUC-ROC: {:.3f}".format(roc_auc_score(y_test, y_pred)))

F1 score: 0.811
Precisión (Precision): 0.814
Exactitud (Accuracy): 0.844
Especificidad (Specificity): 0.972
AUC-ROC: 0.585


# SMOTE

In [27]:
url = 'https://github.com/AndreaCampillo/TFM_PrediDia/raw/Datasets/2021DataSet_Diabeticos_NoDiabeticos_Depurado.csv'
dfDiabetes_load = pd.read_csv(url, sep=';')

In [28]:
dfDiabetes_load.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 45 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   Year                229655 non-null  int64
 1   CatBMI              229655 non-null  int64
 2   Stroke              229655 non-null  int64
 3   HeartDis            229655 non-null  int64
 4   PhysExer            229655 non-null  int64
 5   HealthIns           229655 non-null  int64
 6   NoMedCost           229655 non-null  int64
 7   GenHealth           229655 non-null  int64
 8   CogDiff             229655 non-null  int64
 9   Depression          229655 non-null  int64
 10  MentalHlth          229655 non-null  int64
 11  MentalState         229655 non-null  int64
 12  PhysHlth            229655 non-null  int64
 13  WalkDiff            229655 non-null  int64
 14  Gender              229655 non-null  int64
 15  AgeRange            229655 non-null  int64
 16  EdLevel             

In [29]:
print(dfDiabetes_load.columns)

Index(['Year', 'CatBMI', 'Stroke', 'HeartDis', 'PhysExer', 'HealthIns',
       'NoMedCost', 'GenHealth', 'CogDiff', 'Depression', 'MentalHlth',
       'MentalState', 'PhysHlth', 'WalkDiff', 'Gender', 'AgeRange', 'EdLevel',
       'AnnIncome', 'SocClass', 'UrologyDz', 'VisionDiff', 'Asthma',
       'LungDiseases', 'Arthritis', 'SmokerTrad', 'ECigSmok', 'AlcDrinker',
       'Race', 'MaritalSt', 'LastMedChk', 'Awareness', 'FootIrrita',
       'FecFootIrrita', 'HighBP', 'HighChol', 'FruitCons', 'VegCons',
       'FruitOrVegCon', 'FruitAndVegCon', 'MarijuanaCon', 'SleepHours',
       'BrDiabetes', 'GrDiabetes', 'SupGrPreDiabetes', 'SupGrNoPreDiabetes'],
      dtype='object')


In [30]:
# Incluye 25 características sin eliminar LungDiseases, VisionDiff, UrologyDZ,Stroke. Se comprobará el rendimiento con estás
# últimas y se tomará una decisión.
columns = ['CatBMI', 'Stroke', 'HeartDis', 'PhysExer', 'GenHealth', 
           'CogDiff', 'Depression', 'PhysHlth', 'WalkDiff','Gender', 
           'AgeRange', 'EdLevel', 'SocClass', 'UrologyDz','VisionDiff',
           'Asthma', 'LungDiseases','Arthritis', 'SmokerTrad', 'AlcDrinker', 
           'Race', 'LastMedChk', 'HighBP','HighChol', 'FruitAndVegCon',
           'SupGrPreDiabetes']
dfDiabetes = dfDiabetes_load[columns].copy()

In [31]:
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 26 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            229655 non-null  int64
 1   Stroke            229655 non-null  int64
 2   HeartDis          229655 non-null  int64
 3   PhysExer          229655 non-null  int64
 4   GenHealth         229655 non-null  int64
 5   CogDiff           229655 non-null  int64
 6   Depression        229655 non-null  int64
 7   PhysHlth          229655 non-null  int64
 8   WalkDiff          229655 non-null  int64
 9   Gender            229655 non-null  int64
 10  AgeRange          229655 non-null  int64
 11  EdLevel           229655 non-null  int64
 12  SocClass          229655 non-null  int64
 13  UrologyDz         229655 non-null  int64
 14  VisionDiff        229655 non-null  int64
 15  Asthma            229655 non-null  int64
 16  LungDiseases      229655 non-null  int64
 17  Arthritis 

In [32]:
#Clases antes de la normalización
dfDiabetes['SupGrPreDiabetes'].value_counts()

2    191862
1     37793
Name: SupGrPreDiabetes, dtype: int64

In [33]:
dfDiabetes['SupGrPreDiabetes'] = dfDiabetes['SupGrPreDiabetes'].replace({2: 0, 1: 1})

In [34]:
numClases = dfDiabetes['SupGrPreDiabetes'].value_counts()
numDiabeticos = numClases[1]
numNoDiabeticos = numClases[0]
print("Número de registros: ", len(dfDiabetes)) 
print("Número de dibéticos: ", numDiabeticos)
print("Número de no diabéticos: ", numNoDiabeticos)

Número de registros:  229655
Número de dibéticos:  37793
Número de no diabéticos:  191862


In [35]:
#Clases despues de la normalización
dfDiabetes['SupGrPreDiabetes'].value_counts()

0    191862
1     37793
Name: SupGrPreDiabetes, dtype: int64

In [36]:
# Se obtienen los datasets de entrenamiento (train), validación (val) y pruebas (test)
train_set, test_set = train_test_split(dfDiabetes, test_size=0.4, random_state=14, shuffle=True, stratify=dfDiabetes['SupGrPreDiabetes'])
val_set, test_set = train_test_split(test_set, test_size=0.5, random_state=14, shuffle=True, stratify=test_set['SupGrPreDiabetes'])

X_train = train_set.drop('SupGrPreDiabetes', axis=1)
y_train = train_set['SupGrPreDiabetes'].copy()

X_val = val_set.drop('SupGrPreDiabetes', axis=1)
y_val = val_set['SupGrPreDiabetes'].copy()

X_test = test_set.drop('SupGrPreDiabetes', axis=1)
y_test = test_set['SupGrPreDiabetes'].copy()

In [37]:
y_val.value_counts()

0    38372
1     7559
Name: SupGrPreDiabetes, dtype: int64

In [38]:
print("Logintud del Training set:", len(train_set))
print("Logintud del Validation set:", len(val_set))
print("Logintud del Test set:", len(test_set))

Logintud del Training set: 137793
Logintud del Validation set: 45931
Logintud del Test set: 45931


In [39]:
# Obtención de datos de entrenamiento con SMOTE para balanceamiento de los datos
smote = SMOTE(sampling_strategy='auto', random_state=14)  
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [40]:
y_train_smote.value_counts()

1    115117
0    115117
Name: SupGrPreDiabetes, dtype: int64

In [41]:
# Se obtiene el modelo
start_time = time.time()
model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['acc'])

# Entrenamos el algoritmo
history = model.fit(X_train_smote,
                    y_train_smote,
                    epochs=20,
                    batch_size=512,
                    validation_data=(X_val, y_val))

print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Tiempo en generación del modelo: 20.324  sg.


In [42]:
#Predecimos con el el conjunto de validación
y_pred_proba = model.predict(X_val)
y_pred = (y_pred_proba > 0.5).astype(int)
print("F1 score: {:.3f}".format(f1_score(y_val, y_pred, average='weighted')))

F1 score: 0.756


In [43]:
#Predecimos con el el conjunto de validación
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)

In [44]:
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))
print("Precisión (Precision): {:.3f}".format(precision_score(y_test, y_pred, average='weighted')))
print("Exactitud (Accuracy): {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Especificidad (Specificity): {:.3f}".format(specificity_score(y_test, y_pred)))
print("AUC-ROC: {:.3f}".format(roc_auc_score(y_test, y_pred)))

F1 score: 0.751
Precisión (Precision): 0.826
Exactitud (Accuracy): 0.717
Especificidad (Specificity): 0.721
AUC-ROC: 0.710


In [45]:
# Se aplicac SMOTE con las 21 características que fueron óptimas
# Se reproduce el proceso pero eliminando las caracteristcas las características LungDiseases, VisionDiff, UrologyDZ,Stroke
columns = ['CatBMI','HeartDis','PhysExer','GenHealth','CogDiff',
           'Depression', 'PhysHlth', 'WalkDiff','Gender','AgeRange',
           'EdLevel', 'SocClass','Asthma', 'Arthritis', 'SmokerTrad',
           'AlcDrinker','Race', 'LastMedChk', 'HighBP','HighChol', 'FruitAndVegCon',
           'SupGrPreDiabetes']
dfDiabetes = dfDiabetes_load[columns].copy()

In [46]:
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            229655 non-null  int64
 1   HeartDis          229655 non-null  int64
 2   PhysExer          229655 non-null  int64
 3   GenHealth         229655 non-null  int64
 4   CogDiff           229655 non-null  int64
 5   Depression        229655 non-null  int64
 6   PhysHlth          229655 non-null  int64
 7   WalkDiff          229655 non-null  int64
 8   Gender            229655 non-null  int64
 9   AgeRange          229655 non-null  int64
 10  EdLevel           229655 non-null  int64
 11  SocClass          229655 non-null  int64
 12  Asthma            229655 non-null  int64
 13  Arthritis         229655 non-null  int64
 14  SmokerTrad        229655 non-null  int64
 15  AlcDrinker        229655 non-null  int64
 16  Race              229655 non-null  int64
 17  LastMedChk

In [47]:
#Clases antes de la normalización
dfDiabetes['SupGrPreDiabetes'].value_counts()

2    191862
1     37793
Name: SupGrPreDiabetes, dtype: int64

In [48]:
dfDiabetes['SupGrPreDiabetes'] = dfDiabetes['SupGrPreDiabetes'].replace({2: 0, 1: 1})

In [49]:
numClases = dfDiabetes['SupGrPreDiabetes'].value_counts()
numDiabeticos = numClases[1]
numNoDiabeticos = numClases[0]
print("Número de registros: ", len(dfDiabetes)) 
print("Número de dibéticos: ", numDiabeticos)
print("Número de no diabéticos: ", numNoDiabeticos)

Número de registros:  229655
Número de dibéticos:  37793
Número de no diabéticos:  191862


In [50]:
#Clases despues de la normalización
dfDiabetes['SupGrPreDiabetes'].value_counts()

0    191862
1     37793
Name: SupGrPreDiabetes, dtype: int64

In [51]:
# Se obtienen los datasets de entrenamiento (train), validación (val) y pruebas (test)
train_set, test_set = train_test_split(dfDiabetes, test_size=0.4, random_state=14, shuffle=True, stratify=dfDiabetes['SupGrPreDiabetes'])
val_set, test_set = train_test_split(test_set, test_size=0.5, random_state=14, shuffle=True, stratify=test_set['SupGrPreDiabetes'])

X_train = train_set.drop('SupGrPreDiabetes', axis=1)
y_train = train_set['SupGrPreDiabetes'].copy()

X_val = val_set.drop('SupGrPreDiabetes', axis=1)
y_val = val_set['SupGrPreDiabetes'].copy()

X_test = test_set.drop('SupGrPreDiabetes', axis=1)
y_test = test_set['SupGrPreDiabetes'].copy()

In [52]:
y_val.value_counts()

0    38372
1     7559
Name: SupGrPreDiabetes, dtype: int64

In [53]:
print("Logintud del Training set:", len(train_set))
print("Logintud del Validation set:", len(val_set))
print("Logintud del Test set:", len(test_set))

Logintud del Training set: 137793
Logintud del Validation set: 45931
Logintud del Test set: 45931


In [54]:
# Obtención de datos de entrenamiento con SMOTE para balanceamiento de los datos
smote = SMOTE(sampling_strategy='auto', random_state=14)  
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [55]:
y_train_smote.value_counts()

1    115117
0    115117
Name: SupGrPreDiabetes, dtype: int64

In [56]:
# Obtención del modelo
start_time = time.time()
model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['acc'])

# Entrenamos el algoritmo
history = model.fit(X_train_smote,
                    y_train_smote,
                    epochs=20,
                    batch_size=512,
                    validation_data=(X_val, y_val))

print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Tiempo en generación del modelo: 20.738  sg.


In [57]:
#Predecimos con el el conjunto de validación
y_pred_proba = model.predict(X_val)
y_pred = (y_pred_proba > 0.5).astype(int)
print("F1 score: {:.3f}".format(f1_score(y_val, y_pred, average='weighted')))

F1 score: 0.739


In [58]:
#Predecimos con el el conjunto de validación
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)

In [59]:
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))
print("Precisión (Precision): {:.3f}".format(precision_score(y_test, y_pred, average='weighted')))
print("Exactitud (Accuracy): {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Especificidad (Specificity): {:.3f}".format(specificity_score(y_test, y_pred)))
print("AUC-ROC: {:.3f}".format(roc_auc_score(y_test, y_pred)))

F1 score: 0.735
Precisión (Precision): 0.828
Exactitud (Accuracy): 0.698
Especificidad (Specificity): 0.692
AUC-ROC: 0.710


# Dataset 2021_22
Se realiza el mismo test con los datos unificados de 2021 y 2022, por ese motivo se eliminan las caracteríticas HighBP, HighChol, FruitAndVegCon por no estar presentes en el dataset del 2022 

In [60]:
url = 'https://github.com/AndreaCampillo/TFM_PrediDia/raw/Datasets/2021_22DataSet_Diabeticos_NoDiabeticos_Depurado.csv'
dfDiabetes_load = pd.read_csv(url, sep=';')

In [61]:
dfDiabetes_load.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503592 entries, 0 to 503591
Data columns (total 45 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   Year                503592 non-null  int64
 1   CatBMI              503592 non-null  int64
 2   Stroke              503592 non-null  int64
 3   HeartDis            503592 non-null  int64
 4   PhysExer            503592 non-null  int64
 5   HealthIns           503592 non-null  int64
 6   NoMedCost           503592 non-null  int64
 7   GenHealth           503592 non-null  int64
 8   CogDiff             503592 non-null  int64
 9   Depression          503592 non-null  int64
 10  MentalHlth          503592 non-null  int64
 11  MentalState         503592 non-null  int64
 12  PhysHlth            503592 non-null  int64
 13  WalkDiff            503592 non-null  int64
 14  Gender              503592 non-null  int64
 15  AgeRange            503592 non-null  int64
 16  EdLevel             

In [62]:
print(dfDiabetes_load.columns)

Index(['Year', 'CatBMI', 'Stroke', 'HeartDis', 'PhysExer', 'HealthIns',
       'NoMedCost', 'GenHealth', 'CogDiff', 'Depression', 'MentalHlth',
       'MentalState', 'PhysHlth', 'WalkDiff', 'Gender', 'AgeRange', 'EdLevel',
       'AnnIncome', 'SocClass', 'UrologyDz', 'VisionDiff', 'Asthma',
       'LungDiseases', 'Arthritis', 'SmokerTrad', 'ECigSmok', 'AlcDrinker',
       'Race', 'MaritalSt', 'LastMedChk', 'Awareness', 'FootIrrita',
       'FecFootIrrita', 'HighBP', 'HighChol', 'FruitCons', 'VegCons',
       'FruitOrVegCon', 'FruitAndVegCon', 'MarijuanaCon', 'SleepHours',
       'BrDiabetes', 'GrDiabetes', 'SupGrPreDiabetes', 'SupGrNoPreDiabetes'],
      dtype='object')


In [63]:
# Se reproduce el proceso con las características seleccionadas y comunes al 2021 y 2022
columns = ['CatBMI','HeartDis','PhysExer','GenHealth','CogDiff',
           'Depression', 'PhysHlth', 'WalkDiff','Gender','AgeRange',
           'EdLevel', 'SocClass','Asthma', 'Arthritis', 'SmokerTrad',
           'AlcDrinker','Race', 'LastMedChk','SupGrPreDiabetes']
dfDiabetes = dfDiabetes_load[columns].copy()

In [64]:
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503592 entries, 0 to 503591
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            503592 non-null  int64
 1   HeartDis          503592 non-null  int64
 2   PhysExer          503592 non-null  int64
 3   GenHealth         503592 non-null  int64
 4   CogDiff           503592 non-null  int64
 5   Depression        503592 non-null  int64
 6   PhysHlth          503592 non-null  int64
 7   WalkDiff          503592 non-null  int64
 8   Gender            503592 non-null  int64
 9   AgeRange          503592 non-null  int64
 10  EdLevel           503592 non-null  int64
 11  SocClass          503592 non-null  int64
 12  Asthma            503592 non-null  int64
 13  Arthritis         503592 non-null  int64
 14  SmokerTrad        503592 non-null  int64
 15  AlcDrinker        503592 non-null  int64
 16  Race              503592 non-null  int64
 17  LastMedChk

In [65]:
#Clases antes de la normalización
dfDiabetes['SupGrPreDiabetes'].value_counts()

2    422532
1     81060
Name: SupGrPreDiabetes, dtype: int64

In [66]:
dfDiabetes['SupGrPreDiabetes'] = dfDiabetes['SupGrPreDiabetes'].replace({2: 0, 1: 1})

In [67]:
#Clases despues de la normalización
dfDiabetes['SupGrPreDiabetes'].value_counts()

0    422532
1     81060
Name: SupGrPreDiabetes, dtype: int64

In [68]:
# Se obtienen los datasets de entrenamiento (train), validación (val) y pruebas (test)
train_set, test_set = train_test_split(dfDiabetes, test_size=0.4, random_state=14, shuffle=True, stratify=dfDiabetes['SupGrPreDiabetes'])
val_set, test_set = train_test_split(test_set, test_size=0.5, random_state=14, shuffle=True, stratify=test_set['SupGrPreDiabetes'])

X_train = train_set.drop('SupGrPreDiabetes', axis=1)
y_train = train_set['SupGrPreDiabetes'].copy()

X_val = val_set.drop('SupGrPreDiabetes', axis=1)
y_val = val_set['SupGrPreDiabetes'].copy()

X_test = test_set.drop('SupGrPreDiabetes', axis=1)
y_test = test_set['SupGrPreDiabetes'].copy()

In [69]:
print("Logintud del Training set:", len(train_set))
print("Logintud del Validation set:", len(val_set))
print("Logintud del Test set:", len(test_set))

Logintud del Training set: 302155
Logintud del Validation set: 100718
Logintud del Test set: 100719


In [70]:
# Obtención del modelo
start_time = time.time()
model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['acc'])

# Entrenamos el algoritmo
history = model.fit(X_train,
                    y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(X_val, y_val))

print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Tiempo en generación del modelo: 28.039  sg.


In [71]:
#Predecimos con el el conjunto de validación
y_pred_proba = model.predict(X_val)
y_pred = (y_pred_proba > 0.5).astype(int)
print("F1 score: {:.3f}".format(f1_score(y_val, y_pred, average='weighted')))

F1 score: 0.795


In [72]:
#Predecimos con el el conjunto de validación
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)

In [73]:
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))
print("Precisión (Precision): {:.3f}".format(precision_score(y_test, y_pred, average='weighted')))
print("Exactitud (Accuracy): {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Especificidad (Specificity): {:.3f}".format(specificity_score(y_test, y_pred)))
print("AUC-ROC: {:.3f}".format(roc_auc_score(y_test, y_pred)))

F1 score: 0.797
Precisión (Precision): 0.808
Exactitud (Accuracy): 0.844
Especificidad (Specificity): 0.984
AUC-ROC: 0.548
