## Autor:
Andrea Campillo Piqueras.
## Estudios:
Universidad Internacional de La Rioja.
Escuela Superior de Ingeniería y Tecnología.<br>
Trabajo Fin de Máster Universitario en Análisis y Visualización de Datos Masivos/ Visual Analytics and Big Data.
## Título:
PrediDia: Un Enfoque Predictivo para la Evaluación de la Diabetes.
## Repositorio:
https://github.com/AndreaCampillo/TFM_PrediDia
## Licencia:
MIT License Copyright (c) 2024 Andrea Campillo Piqueras.

## <center><H1>Voting Classifier</H1></center> 

In [1]:
# Librerías utilizadas
import time
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from imblearn.metrics import specificity_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Dataset 2021

In [2]:
url = 'https://github.com/AndreaCampillo/TFM_PrediDia/raw/Datasets/2021DataSet_Diabeticos_NoDiabeticos_Depurado.csv'
dfDiabetes_load = pd.read_csv(url, sep=';')

In [3]:
dfDiabetes_load.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 45 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   Year                229655 non-null  int64
 1   CatBMI              229655 non-null  int64
 2   Stroke              229655 non-null  int64
 3   HeartDis            229655 non-null  int64
 4   PhysExer            229655 non-null  int64
 5   HealthIns           229655 non-null  int64
 6   NoMedCost           229655 non-null  int64
 7   GenHealth           229655 non-null  int64
 8   CogDiff             229655 non-null  int64
 9   Depression          229655 non-null  int64
 10  MentalHlth          229655 non-null  int64
 11  MentalState         229655 non-null  int64
 12  PhysHlth            229655 non-null  int64
 13  WalkDiff            229655 non-null  int64
 14  Gender              229655 non-null  int64
 15  AgeRange            229655 non-null  int64
 16  EdLevel             

In [4]:
dfDiabetes_load.head()

Unnamed: 0,Year,CatBMI,Stroke,HeartDis,PhysExer,HealthIns,NoMedCost,GenHealth,CogDiff,Depression,...,FruitCons,VegCons,FruitOrVegCon,FruitAndVegCon,MarijuanaCon,SleepHours,BrDiabetes,GrDiabetes,SupGrPreDiabetes,SupGrNoPreDiabetes
0,2021,1,2,2,2,1,2,2,2,2,...,1,1,1,1,9,99,3,3,2,2
1,2021,3,2,1,2,1,2,1,2,2,...,1,2,1,2,9,99,1,1,1,1
2,2021,4,2,2,1,1,2,1,2,2,...,1,1,1,1,9,99,1,1,1,1
3,2021,3,1,1,1,1,2,2,2,2,...,1,1,1,1,9,99,1,1,1,1
4,2021,2,2,2,2,1,2,1,2,2,...,2,2,2,2,9,99,3,3,2,2


In [5]:
# Se reproduce el proceso pero eliminando las caracteristcas las características LungDiseases, VisionDiff, UrologyDZ,Stroke
columns = ['CatBMI','HeartDis','PhysExer','GenHealth','CogDiff',
           'Depression', 'PhysHlth', 'WalkDiff','Gender','AgeRange',
           'EdLevel', 'SocClass','Asthma', 'Arthritis', 'SmokerTrad',
           'AlcDrinker','Race', 'LastMedChk', 'HighBP','HighChol', 'FruitAndVegCon',
           'SupGrPreDiabetes']
dfDiabetes = dfDiabetes_load[columns].copy()

In [6]:
dfDiabetes['SupGrPreDiabetes'] = dfDiabetes['SupGrPreDiabetes'].replace({2: 0, 1: 1})

In [7]:
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            229655 non-null  int64
 1   HeartDis          229655 non-null  int64
 2   PhysExer          229655 non-null  int64
 3   GenHealth         229655 non-null  int64
 4   CogDiff           229655 non-null  int64
 5   Depression        229655 non-null  int64
 6   PhysHlth          229655 non-null  int64
 7   WalkDiff          229655 non-null  int64
 8   Gender            229655 non-null  int64
 9   AgeRange          229655 non-null  int64
 10  EdLevel           229655 non-null  int64
 11  SocClass          229655 non-null  int64
 12  Asthma            229655 non-null  int64
 13  Arthritis         229655 non-null  int64
 14  SmokerTrad        229655 non-null  int64
 15  AlcDrinker        229655 non-null  int64
 16  Race              229655 non-null  int64
 17  LastMedChk

In [8]:
dfDiabetes.head()

Unnamed: 0,CatBMI,HeartDis,PhysExer,GenHealth,CogDiff,Depression,PhysHlth,WalkDiff,Gender,AgeRange,...,Asthma,Arthritis,SmokerTrad,AlcDrinker,Race,LastMedChk,HighBP,HighChol,FruitAndVegCon,SupGrPreDiabetes
0,1,2,2,2,2,2,3,2,2,6,...,2,1,3,1,1,2,2,1,1,0
1,3,1,2,1,2,2,1,2,2,6,...,1,2,4,1,2,1,1,2,2,1
2,4,2,1,1,2,2,1,2,2,5,...,1,2,4,2,1,1,1,1,1,1
3,3,1,1,2,2,2,3,1,1,6,...,1,2,4,1,4,1,1,1,1,1
4,2,2,2,1,2,2,1,1,1,6,...,1,2,3,1,1,1,2,2,2,0


In [9]:
numClases = dfDiabetes['SupGrPreDiabetes'].value_counts()
numDiabeticos = numClases[1]
numNoDiabeticos = numClases[0]
print("Número de registros: ", len(dfDiabetes)) 
print("Número de dibéticos: ", numDiabeticos)
print("Número de no diabéticos: ", numNoDiabeticos)

Número de registros:  229655
Número de dibéticos:  37793
Número de no diabéticos:  191862


In [10]:
# Se obtienen los datasets de entrenamiento (train), validación (val) y pruebas (test)
train_set, test_set = train_test_split(dfDiabetes, test_size=0.4, random_state=14, shuffle=True, stratify=dfDiabetes['SupGrPreDiabetes'])
val_set, test_set = train_test_split(test_set, test_size=0.5, random_state=14, shuffle=True, stratify=test_set['SupGrPreDiabetes'])

X_train = train_set.drop('SupGrPreDiabetes', axis=1)
y_train = train_set['SupGrPreDiabetes'].copy()

X_val = val_set.drop('SupGrPreDiabetes', axis=1)
y_val = val_set['SupGrPreDiabetes'].copy()

X_test = test_set.drop('SupGrPreDiabetes', axis=1)
y_test = test_set['SupGrPreDiabetes'].copy()

In [11]:
print("Logintud del Training set:", len(train_set))
print("Logintud del Validation set:", len(val_set))
print("Logintud del Test set:", len(test_set))

Logintud del Training set: 137793
Logintud del Validation set: 45931
Logintud del Test set: 45931


# Búsqueda de mejor combinación de algorítmos

# Gradient Boosting Classifier + Gaussian NB + Logistic Regression + Random Forest

In [12]:
# Obtención del modelo
start_time = time.time()
modelGbc = GradientBoostingClassifier(max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=200, random_state=14)
modelGnb = GaussianNB(priors=[0.999, 0.001])
modelLr = LogisticRegression(C=10, class_weight='balanced', random_state=14, solver='sag', tol=0.01)
modelRf = RandomForestClassifier(class_weight='balanced', min_samples_split=5, n_jobs=-1, random_state=14)

modelGbc.fit(X_train, y_train)
modelGnb.fit(X_train, y_train)
modelLr.fit(X_train, y_train)
modelRf.fit(X_train, y_train)

modelVoting = VotingClassifier(estimators=[('gbc',modelGbc),('gnb', modelGnb),('lr',modelLr),('rf',modelRf)], voting='hard')

modelVoting.fit(X_train, y_train)
print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Tiempo en generación del modelo: 81.222  sg.


In [13]:
# Obtención de métricas
y_pred = modelVoting.predict(X_val)
print("F1 score: {:.3f}".format(f1_score(y_val, y_pred, average='weighted')))

F1 score: 0.818


In [14]:
# Se comprueba con el dataset de prueba
y_pred = modelVoting.predict(X_test)

In [15]:
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))
print("Precisión (Precision): {:.3f}".format(precision_score(y_test, y_pred, average='weighted')))
print("Exactitud (Accuracy): {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Especificidad (Specificity): {:.3f}".format(specificity_score(y_test, y_pred)))
print("AUC-ROC: {:.3f}".format(roc_auc_score(y_test, y_pred)))

F1 score: 0.817
Precisión (Precision): 0.812
Exactitud (Accuracy): 0.841
Especificidad (Specificity): 0.956
AUC-ROC: 0.606


# Gradient Boosting Classifier + Gaussian NB + Logistic Regression 

In [16]:
# Obtención del modelo
start_time = time.time()
modelGbc = GradientBoostingClassifier(max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=200, random_state=14)
modelGnb = GaussianNB(priors=[0.999, 0.001])
modelLr = LogisticRegression(C=10, class_weight='balanced', random_state=14, solver='sag', tol=0.01)

modelGbc.fit(X_train, y_train)
modelGnb.fit(X_train, y_train)
modelLr.fit(X_train, y_train)

modelVoting = VotingClassifier(estimators=[('gbc',modelGbc),('gnb', modelGnb),('lr',modelLr)], voting='hard')

modelVoting.fit(X_train, y_train)
print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Tiempo en generación del modelo: 67.858  sg.


In [17]:
# Obtención de métricas
y_pred = modelVoting.predict(X_val)
print("F1 score: {:.3f}".format(f1_score(y_val, y_pred, average='weighted')))

F1 score: 0.817


In [18]:
# Se comprueba con el dataset de prueba
y_pred = modelVoting.predict(X_test)

In [19]:
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))
print("Precisión (Precision): {:.3f}".format(precision_score(y_test, y_pred, average='weighted')))
print("Exactitud (Accuracy): {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Especificidad (Specificity): {:.3f}".format(specificity_score(y_test, y_pred)))
print("AUC-ROC: {:.3f}".format(roc_auc_score(y_test, y_pred)))

F1 score: 0.816
Precisión (Precision): 0.808
Exactitud (Accuracy): 0.831
Especificidad (Specificity): 0.933
AUC-ROC: 0.622


# Gradient Boosting Classifier + Gaussian NB + Random Forest

In [20]:
# Obtención del modelo
start_time = time.time()
modelGbc = GradientBoostingClassifier(max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=200, random_state=14)
modelGnb = GaussianNB(priors=[0.999, 0.001])
modelRf = RandomForestClassifier(class_weight='balanced', min_samples_split=5, n_jobs=-1, random_state=14)

modelGbc.fit(X_train, y_train)
modelGnb.fit(X_train, y_train)
modelRf.fit(X_train, y_train)

modelVoting = VotingClassifier(estimators=[('gbc',modelGbc),('gnb', modelGnb),('rf',modelRf)], voting='hard')

modelVoting.fit(X_train, y_train)
print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Tiempo en generación del modelo: 76.189  sg.


In [21]:
# Obtención de métricas
y_pred = modelVoting.predict(X_val)
print("F1 score: {:.3f}".format(f1_score(y_val, y_pred, average='weighted')))

F1 score: 0.818


In [22]:
# Se comprueba con el dataset de prueba
y_pred = modelVoting.predict(X_test)

In [23]:
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))
print("Precisión (Precision): {:.3f}".format(precision_score(y_test, y_pred, average='weighted')))
print("Exactitud (Accuracy): {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Especificidad (Specificity): {:.3f}".format(specificity_score(y_test, y_pred)))
print("AUC-ROC: {:.3f}".format(roc_auc_score(y_test, y_pred)))

F1 score: 0.816
Precisión (Precision): 0.812
Exactitud (Accuracy): 0.841
Especificidad (Specificity): 0.956
AUC-ROC: 0.606


# SMOTE con Voting Classifier: Gradient Boosting Classifier + Gaussian NB + Logistic Regression

In [24]:
url = 'https://github.com/AndreaCampillo/TFM_PrediDia/raw/Datasets/2021DataSet_Diabeticos_NoDiabeticos_Depurado.csv'
dfDiabetes_load = pd.read_csv(url, sep=';')

In [25]:
dfDiabetes_load.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 45 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   Year                229655 non-null  int64
 1   CatBMI              229655 non-null  int64
 2   Stroke              229655 non-null  int64
 3   HeartDis            229655 non-null  int64
 4   PhysExer            229655 non-null  int64
 5   HealthIns           229655 non-null  int64
 6   NoMedCost           229655 non-null  int64
 7   GenHealth           229655 non-null  int64
 8   CogDiff             229655 non-null  int64
 9   Depression          229655 non-null  int64
 10  MentalHlth          229655 non-null  int64
 11  MentalState         229655 non-null  int64
 12  PhysHlth            229655 non-null  int64
 13  WalkDiff            229655 non-null  int64
 14  Gender              229655 non-null  int64
 15  AgeRange            229655 non-null  int64
 16  EdLevel             

In [26]:
print(dfDiabetes_load.columns)

Index(['Year', 'CatBMI', 'Stroke', 'HeartDis', 'PhysExer', 'HealthIns',
       'NoMedCost', 'GenHealth', 'CogDiff', 'Depression', 'MentalHlth',
       'MentalState', 'PhysHlth', 'WalkDiff', 'Gender', 'AgeRange', 'EdLevel',
       'AnnIncome', 'SocClass', 'UrologyDz', 'VisionDiff', 'Asthma',
       'LungDiseases', 'Arthritis', 'SmokerTrad', 'ECigSmok', 'AlcDrinker',
       'Race', 'MaritalSt', 'LastMedChk', 'Awareness', 'FootIrrita',
       'FecFootIrrita', 'HighBP', 'HighChol', 'FruitCons', 'VegCons',
       'FruitOrVegCon', 'FruitAndVegCon', 'MarijuanaCon', 'SleepHours',
       'BrDiabetes', 'GrDiabetes', 'SupGrPreDiabetes', 'SupGrNoPreDiabetes'],
      dtype='object')


In [27]:
# Se seleccionan la 21 características que fueron óptimas
# Se reproduce el proceso pero eliminando las caracteristcas las características LungDiseases, VisionDiff, UrologyDZ,Stroke
columns = ['CatBMI','HeartDis','PhysExer','GenHealth','CogDiff',
           'Depression', 'PhysHlth', 'WalkDiff','Gender','AgeRange',
           'EdLevel', 'SocClass','Asthma', 'Arthritis', 'SmokerTrad',
           'AlcDrinker','Race', 'LastMedChk', 'HighBP','HighChol', 'FruitAndVegCon',
           'SupGrPreDiabetes']
dfDiabetes = dfDiabetes_load[columns].copy()

In [28]:
dfDiabetes['SupGrPreDiabetes'] = dfDiabetes['SupGrPreDiabetes'].replace({2: 0, 1: 1})

In [29]:
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            229655 non-null  int64
 1   HeartDis          229655 non-null  int64
 2   PhysExer          229655 non-null  int64
 3   GenHealth         229655 non-null  int64
 4   CogDiff           229655 non-null  int64
 5   Depression        229655 non-null  int64
 6   PhysHlth          229655 non-null  int64
 7   WalkDiff          229655 non-null  int64
 8   Gender            229655 non-null  int64
 9   AgeRange          229655 non-null  int64
 10  EdLevel           229655 non-null  int64
 11  SocClass          229655 non-null  int64
 12  Asthma            229655 non-null  int64
 13  Arthritis         229655 non-null  int64
 14  SmokerTrad        229655 non-null  int64
 15  AlcDrinker        229655 non-null  int64
 16  Race              229655 non-null  int64
 17  LastMedChk

In [30]:
dfDiabetes.head()

Unnamed: 0,CatBMI,HeartDis,PhysExer,GenHealth,CogDiff,Depression,PhysHlth,WalkDiff,Gender,AgeRange,...,Asthma,Arthritis,SmokerTrad,AlcDrinker,Race,LastMedChk,HighBP,HighChol,FruitAndVegCon,SupGrPreDiabetes
0,1,2,2,2,2,2,3,2,2,6,...,2,1,3,1,1,2,2,1,1,0
1,3,1,2,1,2,2,1,2,2,6,...,1,2,4,1,2,1,1,2,2,1
2,4,2,1,1,2,2,1,2,2,5,...,1,2,4,2,1,1,1,1,1,1
3,3,1,1,2,2,2,3,1,1,6,...,1,2,4,1,4,1,1,1,1,1
4,2,2,2,1,2,2,1,1,1,6,...,1,2,3,1,1,1,2,2,2,0


In [31]:
numClases = dfDiabetes['SupGrPreDiabetes'].value_counts()
numDiabeticos = numClases[1]
numNoDiabeticos = numClases[0]
print("Número de registros: ", len(dfDiabetes)) 
print("Número de dibéticos: ", numDiabeticos)
print("Número de no diabéticos: ", numNoDiabeticos)

Número de registros:  229655
Número de dibéticos:  37793
Número de no diabéticos:  191862


In [32]:
# Se obtienen los datasets de entrenamiento (train), validación (val) y pruebas (test)
train_set, test_set = train_test_split(dfDiabetes, test_size=0.4, random_state=14, shuffle=True, stratify=dfDiabetes['SupGrPreDiabetes'])
val_set, test_set = train_test_split(test_set, test_size=0.5, random_state=14, shuffle=True, stratify=test_set['SupGrPreDiabetes'])

X_train = train_set.drop('SupGrPreDiabetes', axis=1)
y_train = train_set['SupGrPreDiabetes'].copy()

X_val = val_set.drop('SupGrPreDiabetes', axis=1)
y_val = val_set['SupGrPreDiabetes'].copy()

X_test = test_set.drop('SupGrPreDiabetes', axis=1)
y_test = test_set['SupGrPreDiabetes'].copy()

In [33]:
y_val.value_counts()

0    38372
1     7559
Name: SupGrPreDiabetes, dtype: int64

In [34]:
print("Logintud del Training set:", len(train_set))
print("Logintud del Validation set:", len(val_set))
print("Logintud del Test set:", len(test_set))

Logintud del Training set: 137793
Logintud del Validation set: 45931
Logintud del Test set: 45931


In [35]:
# Obtención de datos de entrenamiento con SMOTE para balanceamiento de los datos
start_time = time.time()
smote = SMOTE(sampling_strategy='auto', random_state=14)  
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print("Tiempo en generación del SMOTE:", round(time.time()-start_time,3), " sg.")

Tiempo en generación del SMOTE: 12.211  sg.


In [36]:
y_train_smote.value_counts()

1    115117
0    115117
Name: SupGrPreDiabetes, dtype: int64

In [37]:
# Obtenión del modelo. Se elimina el hiperparámetro class_weight='balanced', por haber utilizado SMOTE
start_time = time.time()
modelGbc = GradientBoostingClassifier(max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=200, random_state=14)
modelGnb = GaussianNB(priors=[0.999, 0.001])
modelLr = LogisticRegression(C=10, random_state=14, solver='sag', tol=0.01)

modelGbc.fit(X_train, y_train)
modelGnb.fit(X_train, y_train)
modelLr.fit(X_train, y_train)

modelVoting = VotingClassifier(estimators=[('gbc',modelGbc),('gnb', modelGnb),('lr',modelLr)], voting='hard')

modelVoting.fit(X_train, y_train)
print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Tiempo en generación del modelo: 67.336  sg.


In [38]:
# Obtención de métricas
y_pred = modelVoting.predict(X_val)
print("F1 score: {:.3f}".format(f1_score(y_val, y_pred, average='weighted')))

F1 score: 0.810


In [39]:
# Se comprueba con el dataset de prueba
y_pred = modelVoting.predict(X_test)

In [40]:
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))
print("Precisión (Precision): {:.3f}".format(precision_score(y_test, y_pred, average='weighted')))
print("Exactitud (Accuracy): {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Especificidad (Specificity): {:.3f}".format(specificity_score(y_test, y_pred)))
print("AUC-ROC: {:.3f}".format(roc_auc_score(y_test, y_pred)))

F1 score: 0.809
Precisión (Precision): 0.812
Exactitud (Accuracy): 0.843
Especificidad (Specificity): 0.971
AUC-ROC: 0.583


# Dataset 2021_22 con Voting Classifier: Gradient Boosting Classifier + Gaussian NB + Logistic Regression
Se realiza el mismo test con los datos unificados de 2021 y 2022, por ese motivo se eliminan las caracteríticas HighBP, HighChol, FruitAndVegCon por no estar presentes en el dataset del 2022 

In [41]:
url = 'https://github.com/AndreaCampillo/TFM_PrediDia/raw/Datasets/2021_22DataSet_Diabeticos_NoDiabeticos_Depurado.csv'
dfDiabetes_load = pd.read_csv(url, sep=';')

In [42]:
dfDiabetes_load.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503592 entries, 0 to 503591
Data columns (total 45 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   Year                503592 non-null  int64
 1   CatBMI              503592 non-null  int64
 2   Stroke              503592 non-null  int64
 3   HeartDis            503592 non-null  int64
 4   PhysExer            503592 non-null  int64
 5   HealthIns           503592 non-null  int64
 6   NoMedCost           503592 non-null  int64
 7   GenHealth           503592 non-null  int64
 8   CogDiff             503592 non-null  int64
 9   Depression          503592 non-null  int64
 10  MentalHlth          503592 non-null  int64
 11  MentalState         503592 non-null  int64
 12  PhysHlth            503592 non-null  int64
 13  WalkDiff            503592 non-null  int64
 14  Gender              503592 non-null  int64
 15  AgeRange            503592 non-null  int64
 16  EdLevel             

In [43]:
print(dfDiabetes_load.columns)

Index(['Year', 'CatBMI', 'Stroke', 'HeartDis', 'PhysExer', 'HealthIns',
       'NoMedCost', 'GenHealth', 'CogDiff', 'Depression', 'MentalHlth',
       'MentalState', 'PhysHlth', 'WalkDiff', 'Gender', 'AgeRange', 'EdLevel',
       'AnnIncome', 'SocClass', 'UrologyDz', 'VisionDiff', 'Asthma',
       'LungDiseases', 'Arthritis', 'SmokerTrad', 'ECigSmok', 'AlcDrinker',
       'Race', 'MaritalSt', 'LastMedChk', 'Awareness', 'FootIrrita',
       'FecFootIrrita', 'HighBP', 'HighChol', 'FruitCons', 'VegCons',
       'FruitOrVegCon', 'FruitAndVegCon', 'MarijuanaCon', 'SleepHours',
       'BrDiabetes', 'GrDiabetes', 'SupGrPreDiabetes', 'SupGrNoPreDiabetes'],
      dtype='object')


In [44]:
# Se reproduce el proceso con las características seleccionadas y comunes al 2021 y 2022
columns = ['CatBMI','HeartDis','PhysExer','GenHealth','CogDiff',
           'Depression', 'PhysHlth', 'WalkDiff','Gender','AgeRange',
           'EdLevel', 'SocClass','Asthma', 'Arthritis', 'SmokerTrad',
           'AlcDrinker','Race', 'LastMedChk','SupGrPreDiabetes']
dfDiabetes = dfDiabetes_load[columns].copy()

In [45]:
dfDiabetes['SupGrPreDiabetes'] = dfDiabetes['SupGrPreDiabetes'].replace({2: 0, 1: 1})

In [46]:
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503592 entries, 0 to 503591
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            503592 non-null  int64
 1   HeartDis          503592 non-null  int64
 2   PhysExer          503592 non-null  int64
 3   GenHealth         503592 non-null  int64
 4   CogDiff           503592 non-null  int64
 5   Depression        503592 non-null  int64
 6   PhysHlth          503592 non-null  int64
 7   WalkDiff          503592 non-null  int64
 8   Gender            503592 non-null  int64
 9   AgeRange          503592 non-null  int64
 10  EdLevel           503592 non-null  int64
 11  SocClass          503592 non-null  int64
 12  Asthma            503592 non-null  int64
 13  Arthritis         503592 non-null  int64
 14  SmokerTrad        503592 non-null  int64
 15  AlcDrinker        503592 non-null  int64
 16  Race              503592 non-null  int64
 17  LastMedChk

In [47]:
dfDiabetes.head()

Unnamed: 0,CatBMI,HeartDis,PhysExer,GenHealth,CogDiff,Depression,PhysHlth,WalkDiff,Gender,AgeRange,EdLevel,SocClass,Asthma,Arthritis,SmokerTrad,AlcDrinker,Race,LastMedChk,SupGrPreDiabetes
0,1,2,2,2,2,2,3,2,2,6,2,1,2,1,3,1,1,2,0
1,3,1,2,1,2,2,1,2,2,6,2,1,1,2,4,1,2,1,1
2,4,2,1,1,2,2,1,2,2,5,2,2,1,2,4,2,1,1,1
3,3,1,1,2,2,2,3,1,1,6,1,1,1,2,4,1,4,1,1
4,2,2,2,1,2,2,1,1,1,6,3,1,1,2,3,1,1,1,0


In [48]:
numClases = dfDiabetes['SupGrPreDiabetes'].value_counts()
numDiabeticos = numClases[1]
numNoDiabeticos = numClases[0]
print("Número de registros: ", len(dfDiabetes)) 
print("Número de dibéticos: ", numDiabeticos)
print("Número de no diabéticos: ", numNoDiabeticos)

Número de registros:  503592
Número de dibéticos:  81060
Número de no diabéticos:  422532


In [49]:
# Se obtienen los datasets de entrenamiento (train), validación (val) y pruebas (test)
train_set, test_set = train_test_split(dfDiabetes, test_size=0.4, random_state=14, shuffle=True, stratify=dfDiabetes['SupGrPreDiabetes'])
val_set, test_set = train_test_split(test_set, test_size=0.5, random_state=14, shuffle=True, stratify=test_set['SupGrPreDiabetes'])

X_train = train_set.drop('SupGrPreDiabetes', axis=1)
y_train = train_set['SupGrPreDiabetes'].copy()

X_val = val_set.drop('SupGrPreDiabetes', axis=1)
y_val = val_set['SupGrPreDiabetes'].copy()

X_test = test_set.drop('SupGrPreDiabetes', axis=1)
y_test = test_set['SupGrPreDiabetes'].copy()

In [50]:
print("Logintud del Training set:", len(train_set))
print("Logintud del Validation set:", len(val_set))
print("Logintud del Test set:", len(test_set))

Logintud del Training set: 302155
Logintud del Validation set: 100718
Logintud del Test set: 100719


In [51]:
# Obtención del modelo
start_time = time.time()
modelGbc = GradientBoostingClassifier(max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=200, random_state=14)
modelGnb = GaussianNB(priors=[0.999, 0.001])
modelLr = LogisticRegression(C=10, class_weight='balanced', random_state=14, solver='sag', tol=0.01)

modelGbc.fit(X_train, y_train)
modelGnb.fit(X_train, y_train)
modelLr.fit(X_train, y_train)

modelVoting = VotingClassifier(estimators=[('gbc',modelGbc),('gnb', modelGnb),('lr',modelLr)], voting='hard')

modelVoting.fit(X_train, y_train)
print("Tiempo en generación del modelo:", round(time.time()-start_time,3), " sg.")

Tiempo en generación del modelo: 154.342  sg.


In [52]:
# Obtención de métricas
y_pred = modelVoting.predict(X_val)
print("F1 score: {:.3f}".format(f1_score(y_val, y_pred, average='weighted')))

F1 score: 0.812


In [53]:
# Se comprueba con el dataset de prueba
y_pred = modelVoting.predict(X_test)

In [54]:
print("F1 score: {:.3f}".format(f1_score(y_test, y_pred, average='weighted')))
print("Precisión (Precision): {:.3f}".format(precision_score(y_test, y_pred, average='weighted')))
print("Exactitud (Accuracy): {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Especificidad (Specificity): {:.3f}".format(specificity_score(y_test, y_pred)))
print("AUC-ROC: {:.3f}".format(roc_auc_score(y_test, y_pred)))

F1 score: 0.813
Precisión (Precision): 0.805
Exactitud (Accuracy): 0.832
Especificidad (Specificity): 0.940
AUC-ROC: 0.606
