## Autor:
Andrea Campillo Piqueras.
## Estudios:
Universidad Internacional de La Rioja.
Escuela Superior de Ingeniería y Tecnología.<br>
Trabajo Fin de Máster Universitario en Análisis y Visualización de Datos Masivos/ Visual Analytics and Big Data.
## Título:
PrediDia: Un Enfoque Predictivo para la Evaluación de la Diabetes.
## Repositorio:
https://github.com/AndreaCampillo/TFM_PrediDia
## Licencia:
MIT License Copyright (c) 2024 Andrea Campillo Piqueras.

## <center><H1>Estudio de Correlaciones</H1></center>

In [1]:
# Librerías utilizadas
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

# Dataset 2021

In [2]:
# Read the 2021 dataset straight from the project's GitHub repository.
# The CSV is semicolon-delimited, hence sep=';'.
url = 'https://github.com/AndreaCampillo/TFM_PrediDia/raw/Datasets/2021DataSet_Diabeticos_NoDiabeticos_Depurado.csv'
dfDiabetes_load = pd.read_csv(url, sep=';')

In [3]:
# Column names, non-null counts and dtypes of the freshly loaded frame.
dfDiabetes_load.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 45 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   Year                229655 non-null  int64
 1   CatBMI              229655 non-null  int64
 2   Stroke              229655 non-null  int64
 3   HeartDis            229655 non-null  int64
 4   PhysExer            229655 non-null  int64
 5   HealthIns           229655 non-null  int64
 6   NoMedCost           229655 non-null  int64
 7   GenHealth           229655 non-null  int64
 8   CogDiff             229655 non-null  int64
 9   Depression          229655 non-null  int64
 10  MentalHlth          229655 non-null  int64
 11  MentalState         229655 non-null  int64
 12  PhysHlth            229655 non-null  int64
 13  WalkDiff            229655 non-null  int64
 14  Gender              229655 non-null  int64
 15  AgeRange            229655 non-null  int64
 16  EdLevel             

In [4]:
# Preview the first rows of the loaded frame.
dfDiabetes_load.head()

Unnamed: 0,Year,CatBMI,Stroke,HeartDis,PhysExer,HealthIns,NoMedCost,GenHealth,CogDiff,Depression,...,FruitCons,VegCons,FruitOrVegCon,FruitAndVegCon,MarijuanaCon,SleepHours,BrDiabetes,GrDiabetes,SupGrPreDiabetes,SupGrNoPreDiabetes
0,2021,1,2,2,2,1,2,2,2,2,...,1,1,1,1,9,99,3,3,2,2
1,2021,3,2,1,2,1,2,1,2,2,...,1,2,1,2,9,99,1,1,1,1
2,2021,4,2,2,1,1,2,1,2,2,...,1,1,1,1,9,99,1,1,1,1
3,2021,3,1,1,1,1,2,2,2,2,...,1,1,1,1,9,99,1,1,1,1
4,2021,2,2,2,2,1,2,1,2,2,...,2,2,2,2,9,99,3,3,2,2


In [5]:
# Remove the columns that were ruled out after the exploratory data analysis (EDA).
# The loaded frame is left untouched; the working copy lives in dfDiabetes.
columns_to_drop = [
    'Year', 'MarijuanaCon', 'SleepHours', 'AnnIncome', 'Awareness',
    'FootIrrita', 'FecFootIrrita', 'BrDiabetes', 'GrDiabetes', 'SupGrNoPreDiabetes',
]
dfDiabetes = dfDiabetes_load.drop(columns=columns_to_drop)

In [6]:
# Confirm which columns remain after dropping the EDA discards.
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 35 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            229655 non-null  int64
 1   Stroke            229655 non-null  int64
 2   HeartDis          229655 non-null  int64
 3   PhysExer          229655 non-null  int64
 4   HealthIns         229655 non-null  int64
 5   NoMedCost         229655 non-null  int64
 6   GenHealth         229655 non-null  int64
 7   CogDiff           229655 non-null  int64
 8   Depression        229655 non-null  int64
 9   MentalHlth        229655 non-null  int64
 10  MentalState       229655 non-null  int64
 11  PhysHlth          229655 non-null  int64
 12  WalkDiff          229655 non-null  int64
 13  Gender            229655 non-null  int64
 14  AgeRange          229655 non-null  int64
 15  EdLevel           229655 non-null  int64
 16  SocClass          229655 non-null  int64
 17  UrologyDz 

In [7]:
# Preview the first rows of the working frame.
dfDiabetes.head()

Unnamed: 0,CatBMI,Stroke,HeartDis,PhysExer,HealthIns,NoMedCost,GenHealth,CogDiff,Depression,MentalHlth,...,Race,MaritalSt,LastMedChk,HighBP,HighChol,FruitCons,VegCons,FruitOrVegCon,FruitAndVegCon,SupGrPreDiabetes
0,1,2,2,2,1,2,2,2,2,2,...,1,1,2,2,1,1,1,1,1,2
1,3,2,1,2,1,2,1,2,2,1,...,2,3,1,1,2,1,2,1,2,1
2,4,2,2,1,1,2,1,2,2,2,...,1,1,1,1,1,1,1,1,1,1
3,3,1,1,1,1,2,2,2,2,1,...,4,1,1,1,1,1,1,1,1,1
4,2,2,2,2,1,2,1,2,2,1,...,1,1,1,2,2,2,2,2,2,2


# Correlaciones

In [8]:
# Pairwise correlation between every column of the working frame (target included).
# DataFrame.corr() is called with its defaults.
correlation_matrix = dfDiabetes.corr()
print(correlation_matrix)

                    CatBMI    Stroke  HeartDis  PhysExer  HealthIns  \
CatBMI            1.000000 -0.012663 -0.044291  0.129881   0.003335   
Stroke           -0.012663  1.000000  0.174513 -0.078169   0.014495   
HeartDis         -0.044291  0.174513  1.000000 -0.096495   0.024383   
PhysExer          0.129881 -0.078169 -0.096495  1.000000   0.025023   
HealthIns         0.003335  0.014495  0.024383  0.025023   1.000000   
NoMedCost        -0.030049  0.022113  0.014401 -0.050247  -0.253153   
GenHealth         0.109430 -0.153287 -0.223415  0.249767   0.024789   
CogDiff          -0.045780  0.094140  0.069455 -0.118310  -0.031117   
Depression       -0.077076  0.043924  0.034277 -0.092074  -0.007482   
MentalHlth        0.038958 -0.028587 -0.007548  0.074218   0.035387   
MentalState       0.075071 -0.045656 -0.034544  0.105039   0.022294   
PhysHlth          0.087768 -0.116631 -0.148441  0.211767   0.002094   
WalkDiff         -0.141873  0.160877  0.193087 -0.282233   0.020845   
Gender

In [9]:
# Keep only the fields the analyst derived while extracting the data, plus the target
# SupGrPreDiabetes, so their mutual correlation can be inspected in isolation.
subset_dfDiabetes = dfDiabetes[[
    'CogDiff', 'Depression', 'MentalHlth', 'MentalState',
    'FruitCons', 'VegCons', 'FruitOrVegCon', 'FruitAndVegCon',
    'SupGrPreDiabetes',
]]

In [10]:
# Check the columns selected for the analyst-derived subset.
subset_dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CogDiff           229655 non-null  int64
 1   Depression        229655 non-null  int64
 2   MentalHlth        229655 non-null  int64
 3   MentalState       229655 non-null  int64
 4   FruitCons         229655 non-null  int64
 5   VegCons           229655 non-null  int64
 6   FruitOrVegCon     229655 non-null  int64
 7   FruitAndVegCon    229655 non-null  int64
 8   SupGrPreDiabetes  229655 non-null  int64
dtypes: int64(9)
memory usage: 15.8 MB


In [11]:
# Correlation restricted to the analyst-derived fields and the target (subset_dfDiabetes),
# not the whole frame. Note that this rebinds correlation_matrix from the earlier cell.
correlation_matrix = subset_dfDiabetes.corr()
print(correlation_matrix)

                   CogDiff  Depression  MentalHlth  MentalState  FruitCons  \
CogDiff           1.000000    0.326701   -0.335294    -0.338276  -0.048196   
Depression        0.326701    1.000000   -0.426838    -0.872581  -0.050572   
MentalHlth       -0.335294   -0.426838    1.000000     0.633652   0.058220   
MentalState      -0.338276   -0.872581    0.633652     1.000000   0.054963   
FruitCons        -0.048196   -0.050572    0.058220     0.054963   1.000000   
VegCons          -0.055755   -0.035636    0.033674     0.041336   0.215359   
FruitOrVegCon    -0.050927   -0.042194    0.044804     0.047365   0.437279   
FruitAndVegCon   -0.057998   -0.050431    0.054802     0.055858   0.870212   
SupGrPreDiabetes  0.075709    0.052008   -0.015420    -0.051008  -0.031038   

                   VegCons  FruitOrVegCon  FruitAndVegCon  SupGrPreDiabetes  
CogDiff          -0.055755      -0.050927       -0.057998          0.075709  
Depression       -0.035636      -0.042194       -0.050431      

In [12]:
# Discard the analyst-derived features that are not needed:
# MentalHlth, MentalState, FruitCons, VegCons and FruitOrVegCon.
# CogDiff, Depression and FruitAndVegCon are kept.
columns_to_drop = [
    'MentalHlth',
    'MentalState',
    'FruitCons',
    'VegCons',
    'FruitOrVegCon',
]
dfDiabetes = dfDiabetes.drop(columns=columns_to_drop)

In [13]:
# Confirm the columns left after removing the redundant derived features.
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 30 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            229655 non-null  int64
 1   Stroke            229655 non-null  int64
 2   HeartDis          229655 non-null  int64
 3   PhysExer          229655 non-null  int64
 4   HealthIns         229655 non-null  int64
 5   NoMedCost         229655 non-null  int64
 6   GenHealth         229655 non-null  int64
 7   CogDiff           229655 non-null  int64
 8   Depression        229655 non-null  int64
 9   PhysHlth          229655 non-null  int64
 10  WalkDiff          229655 non-null  int64
 11  Gender            229655 non-null  int64
 12  AgeRange          229655 non-null  int64
 13  EdLevel           229655 non-null  int64
 14  SocClass          229655 non-null  int64
 15  UrologyDz         229655 non-null  int64
 16  VisionDiff        229655 non-null  int64
 17  Asthma    

In [14]:
# Correlation of each remaining column with the target SupGrPreDiabetes,
# listed from the most positive to the most negative value.
dfDiabetes.corrwith(dfDiabetes['SupGrPreDiabetes']).sort_values(ascending=False)

SupGrPreDiabetes    1.000000
HighBP              0.263572
WalkDiff            0.212205
HighChol            0.207535
HeartDis            0.171213
UrologyDz           0.153237
SocClass            0.142661
Arthritis           0.132432
LastMedChk          0.127680
EdLevel             0.108239
LungDiseases        0.101177
Stroke              0.098698
VisionDiff          0.094550
AlcDrinker          0.086821
CogDiff             0.075709
Depression          0.052008
SmokerTrad          0.038260
Gender              0.032266
HealthIns           0.023001
ECigSmok            0.021186
NoMedCost           0.018210
MaritalSt           0.007762
Race               -0.034126
FruitAndVegCon     -0.046565
Asthma             -0.049569
PhysHlth           -0.146570
PhysExer           -0.147603
CatBMI             -0.190952
AgeRange           -0.198577
GenHealth          -0.235988
dtype: float64

In [15]:
# Discard the features whose absolute correlation with the target is below 0.03:
# HealthIns, ECigSmok, NoMedCost and MaritalSt.
columns_to_drop = [
    'HealthIns',
    'ECigSmok',
    'NoMedCost',
    'MaritalSt',
]
dfDiabetes = dfDiabetes.drop(columns=columns_to_drop)

In [16]:
# Confirm the columns left after the low-correlation discard.
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 26 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            229655 non-null  int64
 1   Stroke            229655 non-null  int64
 2   HeartDis          229655 non-null  int64
 3   PhysExer          229655 non-null  int64
 4   GenHealth         229655 non-null  int64
 5   CogDiff           229655 non-null  int64
 6   Depression        229655 non-null  int64
 7   PhysHlth          229655 non-null  int64
 8   WalkDiff          229655 non-null  int64
 9   Gender            229655 non-null  int64
 10  AgeRange          229655 non-null  int64
 11  EdLevel           229655 non-null  int64
 12  SocClass          229655 non-null  int64
 13  UrologyDz         229655 non-null  int64
 14  VisionDiff        229655 non-null  int64
 15  Asthma            229655 non-null  int64
 16  LungDiseases      229655 non-null  int64
 17  Arthritis 

In [17]:
# Names of the remaining columns (predictors plus the SupGrPreDiabetes target).
dfDiabetes.columns

Index(['CatBMI', 'Stroke', 'HeartDis', 'PhysExer', 'GenHealth', 'CogDiff',
       'Depression', 'PhysHlth', 'WalkDiff', 'Gender', 'AgeRange', 'EdLevel',
       'SocClass', 'UrologyDz', 'VisionDiff', 'Asthma', 'LungDiseases',
       'Arthritis', 'SmokerTrad', 'AlcDrinker', 'Race', 'LastMedChk', 'HighBP',
       'HighChol', 'FruitAndVegCon', 'SupGrPreDiabetes'],
      dtype='object')

# Selección de mejores características restantes con Random Forest, con el hiperparámetro class_weight='balanced'

In [18]:
# Build the training (60%), validation (20%) and test (20%) sets, stratified on the target.
train_set, test_set = train_test_split(
    dfDiabetes,
    test_size=0.4,
    random_state=14,
    shuffle=True,
    stratify=dfDiabetes['SupGrPreDiabetes'],
)
# The 40% hold-out is halved into validation and test; test_set is rebound to its final value here.
val_set, test_set = train_test_split(
    test_set,
    test_size=0.5,
    random_state=14,
    shuffle=True,
    stratify=test_set['SupGrPreDiabetes'],
)

# Separate the predictors (X_*) from the target (y_*) for each split.
X_train, y_train = train_set.drop(columns='SupGrPreDiabetes'), train_set['SupGrPreDiabetes'].copy()
X_val, y_val = val_set.drop(columns='SupGrPreDiabetes'), val_set['SupGrPreDiabetes'].copy()
X_test, y_test = test_set.drop(columns='SupGrPreDiabetes'), test_set['SupGrPreDiabetes'].copy()

In [19]:
# Report the number of rows in each split.
# NOTE(review): "Logintud" in the printed labels looks like a typo for "Longitud"; the strings are left as they are.
print("Logintud del Training set:", len(train_set))
print("Logintud del Validation set:", len(val_set))
print("Logintud del Test set:", len(test_set))

Logintud del Training set: 137793
Logintud del Validation set: 45931
Logintud del Test set: 45931


In [20]:
# Fit a Random Forest on all remaining predictors.
# class_weight='balanced' is the imbalance-handling option studied in this section;
# random_state is fixed so the run can be reproduced.
model = RandomForestClassifier(n_estimators=50, class_weight='balanced', random_state=14, n_jobs=-1)
model.fit(X_train, y_train)

In [21]:
# Predict on the validation set and report the weighted F1 score.
y_pred = model.predict(X_val)
val_f1 = f1_score(y_val, y_pred, average='weighted')
print("F1 score: {:.3f}".format(val_f1))

F1 score: 0.793


In [22]:
# Raw feature importances reported by the fitted Random Forest.
model.feature_importances_

array([0.06541853, 0.01283187, 0.01940388, 0.02822424, 0.03986888,
       0.01767451, 0.0305416 , 0.0541982 , 0.03216266, 0.03429124,
       0.09663935, 0.07703859, 0.0485551 , 0.01292974, 0.01328406,
       0.02651281, 0.01752341, 0.03714703, 0.06761441, 0.01846766,
       0.04928131, 0.03242232, 0.07941959, 0.04605243, 0.04249657])

In [23]:
# Rank the features by the importance the fitted Random Forest assigned to them.
# 1. Pair every predictor name with its importance.
#    The names come from X_train, the frame the model was fitted on. dfDiabetes also
#    contains the target column, and zip() only hid that extra name because it truncates
#    silently to the shorter input.
feature_names = list(X_train.columns)
# Fail loudly if `model` was refitted on a different set of columns (e.g. the cell is
# re-run out of order) instead of attaching importances to the wrong names.
assert len(feature_names) == len(model.feature_importances_), \
    "model was not fitted on X_train: feature names and importances do not line up"
diccImportances = dict(zip(feature_names, model.feature_importances_))
# 2. Turn the dictionary into a pandas Series sorted in descending order.
serieImportancesSorted = pd.Series(diccImportances).sort_values(ascending=False)
# 3. Show the 25 features in order of importance.
serieImportancesSorted.head(25)

AgeRange          0.096639
HighBP            0.079420
EdLevel           0.077039
SmokerTrad        0.067614
CatBMI            0.065419
PhysHlth          0.054198
Race              0.049281
SocClass          0.048555
HighChol          0.046052
FruitAndVegCon    0.042497
GenHealth         0.039869
Arthritis         0.037147
Gender            0.034291
LastMedChk        0.032422
WalkDiff          0.032163
Depression        0.030542
PhysExer          0.028224
Asthma            0.026513
HeartDis          0.019404
AlcDrinker        0.018468
CogDiff           0.017675
LungDiseases      0.017523
VisionDiff        0.013284
UrologyDz         0.012930
Stroke            0.012832
dtype: float64

In [24]:
# Study of the most relevant features for the algorithm, based on the F1 score metric
# 25 features: F1 score: 0.795
# 24 features: F1 score: 0.793
# 23 features: F1 score: 0.792
# 22 features: F1 score: 0.792
# 21 features: F1 score: 0.790 (winner; the four dropped features are LungDiseases, VisionDiff, UrologyDz, Stroke)
# NOTE(review): the outputs recorded in this notebook show 0.793 on validation for both the
# 25-feature and the 21-feature model, so the figures above seem to come from another run - confirm.
columns = list(serieImportancesSorted.head(21).index)

In [25]:
# Display the 21 selected feature names.
columns

['AgeRange',
 'HighBP',
 'EdLevel',
 'SmokerTrad',
 'CatBMI',
 'PhysHlth',
 'Race',
 'SocClass',
 'HighChol',
 'FruitAndVegCon',
 'GenHealth',
 'Arthritis',
 'Gender',
 'LastMedChk',
 'WalkDiff',
 'Depression',
 'PhysExer',
 'Asthma',
 'HeartDis',
 'AlcDrinker',
 'CogDiff']

In [26]:
# Restrict the training and validation predictors to the selected features.
X_train_reduced, X_val_reduced = (
    split[columns].copy() for split in (X_train, X_val)
)

In [27]:
# Preview the reduced training predictors.
X_train_reduced.head()

Unnamed: 0,AgeRange,HighBP,EdLevel,SmokerTrad,CatBMI,PhysHlth,Race,SocClass,HighChol,FruitAndVegCon,...,Arthritis,Gender,LastMedChk,WalkDiff,Depression,PhysExer,Asthma,HeartDis,AlcDrinker,CogDiff
223462,5,1,4,4,4,2,1,1,1,1,...,1,2,1,1,2,2,1,2,1,2
41003,6,1,4,4,2,1,1,2,1,1,...,2,1,1,2,2,1,1,2,1,2
111129,2,2,3,4,4,2,1,2,2,2,...,2,1,3,2,2,1,1,2,2,2
80678,6,1,4,4,2,1,3,2,1,1,...,2,1,1,2,2,1,1,2,1,2
137425,6,1,4,3,2,2,1,2,2,1,...,1,2,1,2,2,1,2,2,2,2


In [28]:
# Refit the Random Forest with the same hyperparameters, now on the reduced feature set.
# This rebinds `model`, so the full-feature model is no longer available afterwards.
model = RandomForestClassifier(n_estimators=50, class_weight='balanced', random_state=14, n_jobs=-1)
model.fit(X_train_reduced, y_train)

In [29]:
# Predict on the reduced validation set and report the weighted F1 score.
y_pred = model.predict(X_val_reduced)
val_f1 = f1_score(y_val, y_pred, average='weighted')
print("F1 score: {:.3f}".format(val_f1))

F1 score: 0.793


In [30]:
# Restrict the test predictors to the same selected features.
X_test_reduced = X_test[columns].copy()

In [31]:
# Check the columns and row count of the reduced test set.
X_test_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45931 entries, 214261 to 227011
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   AgeRange        45931 non-null  int64
 1   HighBP          45931 non-null  int64
 2   EdLevel         45931 non-null  int64
 3   SmokerTrad      45931 non-null  int64
 4   CatBMI          45931 non-null  int64
 5   PhysHlth        45931 non-null  int64
 6   Race            45931 non-null  int64
 7   SocClass        45931 non-null  int64
 8   HighChol        45931 non-null  int64
 9   FruitAndVegCon  45931 non-null  int64
 10  GenHealth       45931 non-null  int64
 11  Arthritis       45931 non-null  int64
 12  Gender          45931 non-null  int64
 13  LastMedChk      45931 non-null  int64
 14  WalkDiff        45931 non-null  int64
 15  Depression      45931 non-null  int64
 16  PhysExer        45931 non-null  int64
 17  Asthma          45931 non-null  int64
 18  HeartDis        4593

In [32]:
# Predict on the test set to check how the model behaves on data it has not seen so far.
y_pred = model.predict(X_test_reduced)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score: {:.3f}".format(test_f1))

F1 score: 0.788


In [33]:
# Según el F1 score calculado, se puede dar por bueno el descarte de las características:
# LungDiseases, VisionDiff, UrologyDz, Stroke

# Selección de mejores características con Random Forest y SMOTE

In [34]:
# Creación de un dataset en el que se han eliminado las columnas descartadas por el EDA y por las correlaciones,
# pero no las descartadas con el método Random Forest, de modo que se conservan las 25 características.

In [35]:
# Load the same 2021 CSV again (semicolon-delimited) so that the SMOTE study starts from the raw file.
url = 'https://github.com/AndreaCampillo/TFM_PrediDia/raw/Datasets/2021DataSet_Diabeticos_NoDiabeticos_Depurado.csv'
dfDiabetes_load = pd.read_csv(url, sep=';')

In [36]:
# Column names, non-null counts and dtypes of the reloaded frame.
dfDiabetes_load.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 45 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   Year                229655 non-null  int64
 1   CatBMI              229655 non-null  int64
 2   Stroke              229655 non-null  int64
 3   HeartDis            229655 non-null  int64
 4   PhysExer            229655 non-null  int64
 5   HealthIns           229655 non-null  int64
 6   NoMedCost           229655 non-null  int64
 7   GenHealth           229655 non-null  int64
 8   CogDiff             229655 non-null  int64
 9   Depression          229655 non-null  int64
 10  MentalHlth          229655 non-null  int64
 11  MentalState         229655 non-null  int64
 12  PhysHlth            229655 non-null  int64
 13  WalkDiff            229655 non-null  int64
 14  Gender              229655 non-null  int64
 15  AgeRange            229655 non-null  int64
 16  EdLevel             

In [37]:
# List every column available in the raw file before selecting the ones to keep.
print(dfDiabetes_load.columns)

Index(['Year', 'CatBMI', 'Stroke', 'HeartDis', 'PhysExer', 'HealthIns',
       'NoMedCost', 'GenHealth', 'CogDiff', 'Depression', 'MentalHlth',
       'MentalState', 'PhysHlth', 'WalkDiff', 'Gender', 'AgeRange', 'EdLevel',
       'AnnIncome', 'SocClass', 'UrologyDz', 'VisionDiff', 'Asthma',
       'LungDiseases', 'Arthritis', 'SmokerTrad', 'ECigSmok', 'AlcDrinker',
       'Race', 'MaritalSt', 'LastMedChk', 'Awareness', 'FootIrrita',
       'FecFootIrrita', 'HighBP', 'HighChol', 'FruitCons', 'VegCons',
       'FruitOrVegCon', 'FruitAndVegCon', 'MarijuanaCon', 'SleepHours',
       'BrDiabetes', 'GrDiabetes', 'SupGrPreDiabetes', 'SupGrNoPreDiabetes'],
      dtype='object')


In [38]:
# Keep the 25 predictors that survived the EDA and correlation screening, plus the
# SupGrPreDiabetes target (last), and copy them into a fresh working frame.
columns = [
    'CatBMI', 'Stroke', 'HeartDis', 'PhysExer', 'GenHealth',
    'CogDiff', 'Depression', 'PhysHlth', 'WalkDiff', 'Gender',
    'AgeRange', 'EdLevel', 'SocClass', 'UrologyDz', 'VisionDiff',
    'Asthma', 'LungDiseases', 'Arthritis', 'SmokerTrad', 'AlcDrinker',
    'Race', 'LastMedChk', 'HighBP', 'HighChol', 'FruitAndVegCon',
    'SupGrPreDiabetes',
]
dfDiabetes = dfDiabetes_load.loc[:, columns].copy()

In [39]:
# Confirm the rebuilt working frame holds the expected columns.
dfDiabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229655 entries, 0 to 229654
Data columns (total 26 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   CatBMI            229655 non-null  int64
 1   Stroke            229655 non-null  int64
 2   HeartDis          229655 non-null  int64
 3   PhysExer          229655 non-null  int64
 4   GenHealth         229655 non-null  int64
 5   CogDiff           229655 non-null  int64
 6   Depression        229655 non-null  int64
 7   PhysHlth          229655 non-null  int64
 8   WalkDiff          229655 non-null  int64
 9   Gender            229655 non-null  int64
 10  AgeRange          229655 non-null  int64
 11  EdLevel           229655 non-null  int64
 12  SocClass          229655 non-null  int64
 13  UrologyDz         229655 non-null  int64
 14  VisionDiff        229655 non-null  int64
 15  Asthma            229655 non-null  int64
 16  LungDiseases      229655 non-null  int64
 17  Arthritis 

In [40]:
# Build the training (60%), validation (20%) and test (20%) sets, stratified on the target.
# Same settings and seed as the split used in the class_weight='balanced' section.
train_set, test_set = train_test_split(
    dfDiabetes,
    test_size=0.4,
    random_state=14,
    shuffle=True,
    stratify=dfDiabetes['SupGrPreDiabetes'],
)
# The 40% hold-out is halved into validation and test; test_set is rebound to its final value here.
val_set, test_set = train_test_split(
    test_set,
    test_size=0.5,
    random_state=14,
    shuffle=True,
    stratify=test_set['SupGrPreDiabetes'],
)

# Separate the predictors (X_*) from the target (y_*) for each split.
X_train, y_train = train_set.drop(columns='SupGrPreDiabetes'), train_set['SupGrPreDiabetes'].copy()
X_val, y_val = val_set.drop(columns='SupGrPreDiabetes'), val_set['SupGrPreDiabetes'].copy()
X_test, y_test = test_set.drop(columns='SupGrPreDiabetes'), test_set['SupGrPreDiabetes'].copy()

In [41]:
# Class distribution of the target in the validation set (it is not resampled).
y_val.value_counts()

2    38372
1     7559
Name: SupGrPreDiabetes, dtype: int64

In [42]:
# Report the number of rows in each split.
# NOTE(review): "Logintud" in the printed labels looks like a typo for "Longitud"; the strings are left as they are.
print("Logintud del Training set:", len(train_set))
print("Logintud del Validation set:", len(val_set))
print("Logintud del Test set:", len(test_set))

Logintud del Training set: 137793
Logintud del Validation set: 45931
Logintud del Test set: 45931


In [43]:
# Balance the training set with SMOTE. Only the training split is resampled;
# the validation and test splits keep their original class distribution.
# NOTE(review): SMOTE builds synthetic rows by interpolating between neighbours, so the
# integer-coded columns may receive non-integer values - confirm this is acceptable
# (imbalanced-learn also offers SMOTEN / SMOTENC for categorical features).
smote = SMOTE(sampling_strategy='auto', random_state=14)  # author's note: F1 score 0.776 (the recorded validation output shows 0.775)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [44]:
# Check that both classes have the same number of rows after resampling.
y_train_smote.value_counts()

2    115117
1    115117
Name: SupGrPreDiabetes, dtype: int64

In [45]:
# Fit the Random Forest on the SMOTE-balanced training data.
# class_weight='balanced' is left out here on purpose, since the data is already balanced.
model = RandomForestClassifier(n_estimators=50, random_state=14, n_jobs=-1) # no class_weight='balanced' in this case
model.fit(X_train_smote, y_train_smote)

In [46]:
# Predict on the (non-resampled) validation set and report the weighted F1 score.
y_pred = model.predict(X_val)
val_f1 = f1_score(y_val, y_pred, average='weighted')
print("F1 score: {:.3f}".format(val_f1))

F1 score: 0.775


In [47]:
# Raw feature importances reported by the Random Forest fitted on the SMOTE data.
model.feature_importances_

array([0.0550544 , 0.01257973, 0.01990708, 0.03139476, 0.02299668,
       0.01747165, 0.02820638, 0.05013167, 0.03688013, 0.02847403,
       0.08857522, 0.06890694, 0.04332364, 0.01300839, 0.01278936,
       0.01948143, 0.016977  , 0.03388591, 0.06217048, 0.02421806,
       0.04991405, 0.04244685, 0.11338056, 0.06799869, 0.0398269 ])

In [48]:
# Rank the features by the importance the fitted Random Forest assigned to them.
# 1. Pair every predictor name with its importance.
#    The names come from X_train_smote, the frame the model was fitted on. dfDiabetes
#    also contains the target column, and zip() only hid that extra name because it
#    truncates silently to the shorter input.
feature_names = list(X_train_smote.columns)
# Fail loudly if `model` was refitted on a different set of columns (e.g. the cell is
# re-run out of order) instead of attaching importances to the wrong names.
assert len(feature_names) == len(model.feature_importances_), \
    "model was not fitted on X_train_smote: feature names and importances do not line up"
diccImportances = dict(zip(feature_names, model.feature_importances_))
# 2. Turn the dictionary into a pandas Series sorted in descending order.
serieImportancesSorted = pd.Series(diccImportances).sort_values(ascending=False)
# 3. Show the 25 features in order of importance.
serieImportancesSorted.head(25)

HighBP            0.113381
AgeRange          0.088575
EdLevel           0.068907
HighChol          0.067999
SmokerTrad        0.062170
CatBMI            0.055054
PhysHlth          0.050132
Race              0.049914
SocClass          0.043324
LastMedChk        0.042447
FruitAndVegCon    0.039827
WalkDiff          0.036880
Arthritis         0.033886
PhysExer          0.031395
Gender            0.028474
Depression        0.028206
AlcDrinker        0.024218
GenHealth         0.022997
HeartDis          0.019907
Asthma            0.019481
CogDiff           0.017472
LungDiseases      0.016977
UrologyDz         0.013008
VisionDiff        0.012789
Stroke            0.012580
dtype: float64

In [49]:
# Extract the features that are most relevant for the algorithm
# 25 features: F1 score: 0.775
# 21 features: F1 score: 0.771 (winner; the four dropped features are LungDiseases, VisionDiff, UrologyDz, Stroke)
columns = list(serieImportancesSorted.head(21).index)

In [50]:
# Display the 21 selected feature names.
columns

['HighBP',
 'AgeRange',
 'EdLevel',
 'HighChol',
 'SmokerTrad',
 'CatBMI',
 'PhysHlth',
 'Race',
 'SocClass',
 'LastMedChk',
 'FruitAndVegCon',
 'WalkDiff',
 'Arthritis',
 'PhysExer',
 'Gender',
 'Depression',
 'AlcDrinker',
 'GenHealth',
 'HeartDis',
 'Asthma',
 'CogDiff']

In [51]:
# Restrict the SMOTE-balanced training predictors and the validation predictors
# to the selected features.
X_train_reduced, X_val_reduced = (
    split[columns].copy() for split in (X_train_smote, X_val)
)

In [52]:
# Preview the reduced, SMOTE-balanced training predictors.
X_train_reduced.head()

Unnamed: 0,HighBP,AgeRange,EdLevel,HighChol,SmokerTrad,CatBMI,PhysHlth,Race,SocClass,LastMedChk,...,WalkDiff,Arthritis,PhysExer,Gender,Depression,AlcDrinker,GenHealth,HeartDis,Asthma,CogDiff
0,1,5,4,1,4,4,2,1,1,1,...,1,1,2,2,2,1,2,2,1,2
1,1,6,4,1,4,2,1,1,2,1,...,2,2,1,1,2,1,1,2,1,2
2,2,2,3,2,4,4,2,1,2,3,...,2,2,1,1,2,2,1,2,1,2
3,1,6,4,1,4,2,1,3,2,1,...,2,2,1,1,2,1,1,2,1,2
4,1,6,4,2,3,2,2,1,2,1,...,2,1,1,2,2,2,1,2,2,2


In [53]:
# Refit the Random Forest on the 21 selected columns of the SMOTE-balanced data,
# again without class_weight='balanced'. This rebinds `model`.
model = RandomForestClassifier(n_estimators=50, random_state=14, n_jobs=-1)
model.fit(X_train_reduced, y_train_smote)

In [54]:
# Predict on the reduced validation set and report the weighted F1 score.
y_pred = model.predict(X_val_reduced)
val_f1 = f1_score(y_val, y_pred, average='weighted')
print("F1 score: {:.3f}".format(val_f1))

F1 score: 0.771


In [55]:
# Restrict the test predictors to the same selected features.
X_test_reduced = X_test[columns].copy()

In [56]:
# Check the columns and row count of the reduced test set.
X_test_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45931 entries, 214261 to 227011
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   HighBP          45931 non-null  int64
 1   AgeRange        45931 non-null  int64
 2   EdLevel         45931 non-null  int64
 3   HighChol        45931 non-null  int64
 4   SmokerTrad      45931 non-null  int64
 5   CatBMI          45931 non-null  int64
 6   PhysHlth        45931 non-null  int64
 7   Race            45931 non-null  int64
 8   SocClass        45931 non-null  int64
 9   LastMedChk      45931 non-null  int64
 10  FruitAndVegCon  45931 non-null  int64
 11  WalkDiff        45931 non-null  int64
 12  Arthritis       45931 non-null  int64
 13  PhysExer        45931 non-null  int64
 14  Gender          45931 non-null  int64
 15  Depression      45931 non-null  int64
 16  AlcDrinker      45931 non-null  int64
 17  GenHealth       45931 non-null  int64
 18  HeartDis        4593

In [57]:
# Predict on the test set to check how the model behaves on data it has not seen so far.
y_pred = model.predict(X_test_reduced)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score: {:.3f}".format(test_f1))

F1 score: 0.767


In [58]:
# Según el F1 score calculado, se puede dar por bueno el descarte de las características:
# LungDiseases, VisionDiff, UrologyDz, Stroke