## Autor:
Andrea Campillo Piqueras.
## Estudios:
Universidad Internacional de La Rioja.
Escuela Superior de Ingeniería y Tecnología.<br>
Trabajo Fin de Máster Universitario en Análisis y Visualización de Datos Masivos/ Visual Analytics and Big Data.
## Título:
PrediDia: Un Enfoque Predictivo para la Evaluación de la Diabetes.
## Repositorio:
https://github.com/AndreaCampillo/TFM_PrediDia
## Licencia:
MIT License Copyright (c) 2024 Andrea Campillo Piqueras.

# <center><H1>Aplicación de los Mejores Modelos a Nuevos Datos de Pacientes</H1></center>

In [1]:
# Librerias utilizadas
import time
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from tensorflow.keras import models
from tensorflow.keras import layers
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from imblearn.metrics import specificity_score

# Obtención de los mejores modelos Voting Classifier (GBC+GBN+RF) y Red Neuronal Artificial Densa

In [36]:
# Dataset 2021 con individiduos que no contestaron respecto a la Diabetes
url = 'https://github.com/AndreaCampillo/TFM_PrediDia/raw/Datasets/2021DataSet_NoDefinidosDiabetes_Depurado.csv'
dfDiabetes_load = pd.read_csv(url, sep=';')

In [37]:
dfDiabetes_load.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 45 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   Year                126 non-null    int64
 1   CatBMI              126 non-null    int64
 2   Stroke              126 non-null    int64
 3   HeartDis            126 non-null    int64
 4   PhysExer            126 non-null    int64
 5   HealthIns           126 non-null    int64
 6   NoMedCost           126 non-null    int64
 7   GenHealth           126 non-null    int64
 8   CogDiff             126 non-null    int64
 9   Depression          126 non-null    int64
 10  MentalHlth          126 non-null    int64
 11  MentalState         126 non-null    int64
 12  PhysHlth            126 non-null    int64
 13  WalkDiff            126 non-null    int64
 14  Gender              126 non-null    int64
 15  AgeRange            126 non-null    int64
 16  EdLevel             126 non-null    int64
 1

In [38]:
dfDiabetes_load.head()

Unnamed: 0,Year,CatBMI,Stroke,HeartDis,PhysExer,HealthIns,NoMedCost,GenHealth,CogDiff,Depression,...,FruitCons,VegCons,FruitOrVegCon,FruitAndVegCon,MarijuanaCon,SleepHours,BrDiabetes,GrDiabetes,SupGrPreDiabetes,SupGrNoPreDiabetes
0,2021,4,2,2,1,1,2,1,2,2,...,1,1,1,1,9,99,7,9,9,9
1,2021,4,2,1,2,1,2,1,2,2,...,1,1,1,1,9,99,7,9,9,9
2,2021,4,2,2,2,1,2,2,1,1,...,2,2,2,2,9,99,7,9,9,9
3,2021,4,2,1,2,1,2,2,2,2,...,1,2,1,2,9,99,9,9,9,9
4,2021,4,2,2,1,1,1,2,2,2,...,2,2,2,2,9,99,7,9,9,9


In [39]:
# Se obtiene un nuevo dataSet sin varible objetivo con los aquellos individuos que no se ha definido diabetes y
# las 21 caracaterísticas obtenidas en las fases anteriores del trabajo
columns = ['CatBMI','HeartDis','PhysExer','GenHealth','CogDiff',
           'Depression', 'PhysHlth', 'WalkDiff','Gender','AgeRange',
           'EdLevel', 'SocClass','Asthma', 'Arthritis', 'SmokerTrad',
           'AlcDrinker','Race', 'LastMedChk', 'HighBP','HighChol', 'FruitAndVegCon']
dfDiabetesNoDefinidos = dfDiabetes_load[columns].copy()

In [40]:
dfDiabetesNoDefinidos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   CatBMI          126 non-null    int64
 1   HeartDis        126 non-null    int64
 2   PhysExer        126 non-null    int64
 3   GenHealth       126 non-null    int64
 4   CogDiff         126 non-null    int64
 5   Depression      126 non-null    int64
 6   PhysHlth        126 non-null    int64
 7   WalkDiff        126 non-null    int64
 8   Gender          126 non-null    int64
 9   AgeRange        126 non-null    int64
 10  EdLevel         126 non-null    int64
 11  SocClass        126 non-null    int64
 12  Asthma          126 non-null    int64
 13  Arthritis       126 non-null    int64
 14  SmokerTrad      126 non-null    int64
 15  AlcDrinker      126 non-null    int64
 16  Race            126 non-null    int64
 17  LastMedChk      126 non-null    int64
 18  HighBP          126 non-null  

In [41]:
dfDiabetesNoDefinidos.head()

Unnamed: 0,CatBMI,HeartDis,PhysExer,GenHealth,CogDiff,Depression,PhysHlth,WalkDiff,Gender,AgeRange,...,SocClass,Asthma,Arthritis,SmokerTrad,AlcDrinker,Race,LastMedChk,HighBP,HighChol,FruitAndVegCon
0,4,2,1,1,2,2,1,2,2,5,...,1,2,2,1,1,1,1,2,2,1
1,4,1,2,1,2,2,1,1,1,6,...,1,2,1,3,1,4,1,1,1,1
2,4,2,2,2,1,1,3,1,2,5,...,1,1,1,1,1,1,3,2,1,2
3,4,1,2,2,2,2,3,1,2,6,...,1,1,2,4,1,1,1,2,1,2
4,4,2,1,2,2,2,3,1,2,5,...,1,1,1,3,1,1,1,1,1,2


In [8]:
# Dataset 2021
url = 'https://github.com/AndreaCampillo/TFM_PrediDia/raw/Datasets/2021DataSet_Diabeticos_NoDiabeticos_Depurado.csv'
dfDiabetes_load = pd.read_csv(url, sep=';')

In [9]:
dfDiabetes_load.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 45 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   Year                229655 non-null  int64
 1   CatBMI              229655 non-null  int64
 2   Stroke              229655 non-null  int64
 3   HeartDis            229655 non-null  int64
 4   PhysExer            229655 non-null  int64
 5   HealthIns           229655 non-null  int64
 6   NoMedCost           229655 non-null  int64
 7   GenHealth           229655 non-null  int64
 8   CogDiff             229655 non-null  int64
 9   Depression          229655 non-null  int64
 10  MentalHlth          229655 non-null  int64
 11  MentalState         229655 non-null  int64
 12  PhysHlth            229655 non-null  int64
 13  WalkDiff            229655 non-null  int64
 14  Gender              229655 non-null  int64
 15  AgeRange            229655 non-null  int64
 16  EdLevel             

In [10]:
dfDiabetes_load.head()

Unnamed: 0,Year,CatBMI,Stroke,HeartDis,PhysExer,HealthIns,NoMedCost,GenHealth,CogDiff,Depression,...,FruitCons,VegCons,FruitOrVegCon,FruitAndVegCon,MarijuanaCon,SleepHours,BrDiabetes,GrDiabetes,SupGrPreDiabetes,SupGrNoPreDiabetes
0,2021,1,2,2,2,1,2,2,2,2,...,1,1,1,1,9,99,3,3,2,2
1,2021,3,2,1,2,1,2,1,2,2,...,1,2,1,2,9,99,1,1,1,1
2,2021,4,2,2,1,1,2,1,2,2,...,1,1,1,1,9,99,1,1,1,1
3,2021,3,1,1,1,1,2,2,2,2,...,1,1,1,1,9,99,1,1,1,1
4,2021,2,2,2,2,1,2,1,2,2,...,2,2,2,2,9,99,3,3,2,2


In [11]:
# Se eliminan las caracteristcas las características LungDiseases, VisionDiff, UrologyDZ,Stroke
columns = ['CatBMI','HeartDis','PhysExer','GenHealth','CogDiff',
           'Depression', 'PhysHlth', 'WalkDiff','Gender','AgeRange',
           'EdLevel', 'SocClass','Asthma', 'Arthritis', 'SmokerTrad',
           'AlcDrinker','Race', 'LastMedChk', 'HighBP','HighChol', 'FruitAndVegCon',
           'SupGrPreDiabetes']
dfDiabetes = dfDiabetes_load[columns].copy()

In [12]:
dfDiabetes['SupGrPreDiabetes'] = dfDiabetes['SupGrPreDiabetes'].replace({2: 0, 1: 1})

In [13]:
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            229655 non-null  int64
 1   HeartDis          229655 non-null  int64
 2   PhysExer          229655 non-null  int64
 3   GenHealth         229655 non-null  int64
 4   CogDiff           229655 non-null  int64
 5   Depression        229655 non-null  int64
 6   PhysHlth          229655 non-null  int64
 7   WalkDiff          229655 non-null  int64
 8   Gender            229655 non-null  int64
 9   AgeRange          229655 non-null  int64
 10  EdLevel           229655 non-null  int64
 11  SocClass          229655 non-null  int64
 12  Asthma            229655 non-null  int64
 13  Arthritis         229655 non-null  int64
 14  SmokerTrad        229655 non-null  int64
 15  AlcDrinker        229655 non-null  int64
 16  Race              229655 non-null  int64
 17  LastMedChk

In [14]:
dfDiabetes.head()

Unnamed: 0,CatBMI,HeartDis,PhysExer,GenHealth,CogDiff,Depression,PhysHlth,WalkDiff,Gender,AgeRange,...,Asthma,Arthritis,SmokerTrad,AlcDrinker,Race,LastMedChk,HighBP,HighChol,FruitAndVegCon,SupGrPreDiabetes
0,1,2,2,2,2,2,3,2,2,6,...,2,1,3,1,1,2,2,1,1,0
1,3,1,2,1,2,2,1,2,2,6,...,1,2,4,1,2,1,1,2,2,1
2,4,2,1,1,2,2,1,2,2,5,...,1,2,4,2,1,1,1,1,1,1
3,3,1,1,2,2,2,3,1,1,6,...,1,2,4,1,4,1,1,1,1,1
4,2,2,2,1,2,2,1,1,1,6,...,1,2,3,1,1,1,2,2,2,0


In [15]:
numClases = dfDiabetes['SupGrPreDiabetes'].value_counts()
numDiabeticos = numClases[1]
numNoDiabeticos = numClases[0]
print("Número de registros: ", len(dfDiabetes)) 
print("Número de dibéticos: ", numDiabeticos)
print("Número de no diabéticos: ", numNoDiabeticos)

Número de registros:  229655
Número de dibéticos:  37793
Número de no diabéticos:  191862


In [16]:
# Se obtienen los datasets de entrenamiento (train), validación (val) y pruebas (test)
train_set, test_set = train_test_split(dfDiabetes, test_size=0.4, random_state=14, shuffle=True, stratify=dfDiabetes['SupGrPreDiabetes'])
val_set, test_set = train_test_split(test_set, test_size=0.5, random_state=14, shuffle=True, stratify=test_set['SupGrPreDiabetes'])

X_train = train_set.drop('SupGrPreDiabetes', axis=1)
y_train = train_set['SupGrPreDiabetes'].copy()

X_val = val_set.drop('SupGrPreDiabetes', axis=1)
y_val = val_set['SupGrPreDiabetes'].copy()

X_test = test_set.drop('SupGrPreDiabetes', axis=1)
y_test = test_set['SupGrPreDiabetes'].copy()

In [17]:
print("Logintud del Training set:", len(train_set))
print("Logintud del Validation set:", len(val_set))
print("Logintud del Test set:", len(test_set))

Logintud del Training set: 137793
Logintud del Validation set: 45931
Logintud del Test set: 45931


In [18]:
# Modelo Voting Classifier(GBC+GBN+RL+RF)
start_time = time.time()
modelGbc = GradientBoostingClassifier(max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=200, random_state=14)
modelGnb = GaussianNB(priors=[0.999, 0.001])
modelLr = LogisticRegression(C=10, class_weight='balanced', random_state=14, solver='sag', tol=0.01)

modelGbc.fit(X_train, y_train)
modelGnb.fit(X_train, y_train)
modelLr.fit(X_train, y_train)

modelVoting = VotingClassifier(estimators=[('gbc',modelGbc),('gnb', modelGnb),('lr',modelLr)], voting='hard')

modelVoting.fit(X_train, y_train)
print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Tiempo en generación del modelo: 63.174  sg.


In [19]:
# Se realiza la predicción con individuos con la característica Diabetes no definida
y_pred_modelVoting = modelVoting.predict(dfDiabetesNoDefinidos)

In [20]:
y_pred_modelVoting

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

In [21]:
type(y_pred_modelVoting)

numpy.ndarray

In [22]:
y_pred_modelVoting.shape

(126,)

In [23]:
# De la muestra de 126 individuos sin haber definido la característica Diabetes
# 107: 85% no diabéticos
# 19: 15% diabéticos
serie = pd.Series(y_pred_modelVoting)
agrupado = serie.groupby(serie).size()
print(agrupado)

0    107
1     19
dtype: int64


In [53]:
start_time = time.time()
modelRNA = models.Sequential()
modelRNA.add(layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
modelRNA.add(layers.Dense(64, activation='relu'))
modelRNA.add(layers.Dense(32, activation='relu'))
modelRNA.add(layers.Dense(16, activation='relu'))
modelRNA.add(layers.Dense(1, activation='sigmoid'))

modelRNA.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['acc'])

history = modelRNA.fit(X_train,
                    y_train,
                    epochs=20, 
                    batch_size=512,
                    validation_data=(X_val, y_val))
print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Tiempo en generación del modelo: 12.968  sg.


In [54]:
# Se realiza la predicción con individuos con la característica Diabetes no definida
y_pred_proba = modelRNA.predict(dfDiabetesNoDefinidos)
y_pred_modelRNA = (y_pred_proba > 0.5).astype(int)

In [55]:
y_pred_modelRNA

array([[0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
    

In [56]:
type(y_pred_modelRNA)

numpy.ndarray

In [57]:
y_pred_modelRNA.shape

(126, 1)

In [58]:
# De la muestra de 126 individuos sin haber definido la característica Diabetes
# 114: 90% no diabéticos
# 12: 10% diabéticos
valores_distintos, cuenta = np.unique(y_pred_modelRNA, return_counts=True)
print(valores_distintos)
print(cuenta)

[0 1]
[114  12]


In [59]:
ultima_columna = len(dfDiabetesNoDefinidos.columns)
dfDiabetesNoDefinidos.insert(ultima_columna, 'y_pred_modelVoting', y_pred_modelVoting)

ultima_columna = len(dfDiabetesNoDefinidos.columns)
dfDiabetesNoDefinidos.insert(ultima_columna, 'y_pred_modelRNA', y_pred_modelRNA)

In [60]:
dfDiabetesNoDefinidos.head()

Unnamed: 0,CatBMI,HeartDis,PhysExer,GenHealth,CogDiff,Depression,PhysHlth,WalkDiff,Gender,AgeRange,...,Arthritis,SmokerTrad,AlcDrinker,Race,LastMedChk,HighBP,HighChol,FruitAndVegCon,y_pred_modelVoting,y_pred_modelRNA
0,4,2,1,1,2,2,1,2,2,5,...,2,1,1,1,1,2,2,1,0,0
1,4,1,2,1,2,2,1,1,1,6,...,1,3,1,4,1,1,1,1,1,1
2,4,2,2,2,1,1,3,1,2,5,...,1,1,1,1,3,2,1,2,0,0
3,4,1,2,2,2,2,3,1,2,6,...,2,4,1,1,1,2,1,2,1,1
4,4,2,1,2,2,2,3,1,2,5,...,1,3,1,1,1,1,1,2,1,1


In [61]:
dfDiabetesNoDefinidos.to_csv('2021DiabetesNoDefinidos_Voting_RNA.csv', index=False, sep=';') 

In [62]:
print("F1 score: {:.3f}".format(f1_score(y_pred_modelVoting, y_pred_modelRNA, average='weighted')))
print("Precisión (Precision): {:.3f}".format(precision_score(y_pred_modelVoting, y_pred_modelRNA, average='weighted')))
print("Exactitud (Accuracy): {:.3f}".format(accuracy_score(y_pred_modelVoting, y_pred_modelRNA)))
print("Especificidad (Specificity): {:.3f}".format(specificity_score(y_pred_modelVoting, y_pred_modelRNA)))
print("AUC-ROC: {:.3f}".format(roc_auc_score(y_pred_modelVoting, y_pred_modelRNA)))

F1 score: 0.922
Precisión (Precision): 0.928
Exactitud (Accuracy): 0.929
Especificidad (Specificity): 0.991
AUC-ROC: 0.785
