Alejandro Rafael Vega Saavedra

# Proyecto Final Aprendizaje de Máquina

Primero se debe instalar la siguiente librería para importar la base de datos:

In [115]:
!pip install ucimlrepo



In [116]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

## Importacion del dataset y preprocesamiento de datos

Se utiliza la librería instalada anteriormente para importar la base de datos y se le hace get_dummies a las variables binarias.

In [117]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# data
x = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

# metadata
print(cdc_diabetes_health_indicators.metadata)

# variable information
print(cdc_diabetes_health_indicators.variables)

x = pd.get_dummies(x, columns = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity',
                                 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex'], drop_first = True)

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

In [118]:
x.head()

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,Age,Education,Income,HighBP_1,HighChol_1,CholCheck_1,...,Stroke_1,HeartDiseaseorAttack_1,PhysActivity_1,Fruits_1,Veggies_1,HvyAlcoholConsump_1,AnyHealthcare_1,NoDocbcCost_1,DiffWalk_1,Sex_1
0,40,5,18,15,9,4,3,1,1,1,...,0,0,0,0,1,0,1,0,1,0
1,25,3,0,0,7,6,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,28,5,30,30,9,4,8,1,1,1,...,0,0,0,1,0,0,1,1,1,0
3,27,2,0,0,11,3,6,1,0,1,...,0,0,1,1,1,0,1,0,0,0
4,24,2,3,0,11,5,4,1,1,1,...,0,0,1,1,1,0,1,0,0,0


In [119]:
y.head()

Unnamed: 0,Diabetes_binary
0,0
1,0
2,0
3,0
4,0


## Análisis inicial de los datos

### Algoritmos de ensamble

In [120]:
col = x.columns
y = np.ravel(y)

xtr, xte, ytr, yte = train_test_split(x, y, test_size = 0.2, random_state = 42)

pd.DataFrame(xtr, columns = col).head()

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,Age,Education,Income,HighBP_1,HighChol_1,CholCheck_1,...,Stroke_1,HeartDiseaseorAttack_1,PhysActivity_1,Fruits_1,Veggies_1,HvyAlcoholConsump_1,AnyHealthcare_1,NoDocbcCost_1,DiffWalk_1,Sex_1
31141,20,2,0,0,12,6,8,0,1,1,...,0,0,1,1,1,0,1,0,0,1
98230,34,3,0,0,8,5,8,0,0,1,...,0,0,1,0,1,0,1,0,0,1
89662,24,2,0,5,12,5,6,1,1,1,...,0,0,1,1,1,0,1,0,0,1
208255,27,1,0,0,5,6,7,0,1,1,...,0,0,1,1,1,0,1,0,0,1
233415,24,3,0,0,12,4,6,0,1,1,...,0,0,1,1,1,0,1,0,1,0


In [121]:
# Entrenar un modelo de Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(xtr, ytr)
rf_predictions = rf_classifier.predict(xte)

# Entrenar un modelo de AdaBoost
ada_classifier = AdaBoostClassifier(random_state=42)
ada_classifier.fit(xtr, ytr)
ada_predictions = ada_classifier.predict(xte)

# Entrenar un modelo de Gradient Boosting
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(xtr, ytr)
gb_predictions = gb_classifier.predict(xte)

# Entrenar un modelo de XGBoost
xgb_classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(xtr, ytr)
xgb_predictions = xgb_classifier.predict(xte)

In [122]:
# Evaluar el rendimiento de cada modelo
rf_accuracy = accuracy_score(yte, rf_predictions)
ada_accuracy = accuracy_score(yte, ada_predictions)
gb_accuracy = accuracy_score(yte, gb_predictions)
xgb_accuracy = accuracy_score(yte, xgb_predictions)

# Mostrar la precisión de cada modelo
{'Random Forest Accuracy': rf_accuracy, 'AdaBoost Accuracy': ada_accuracy, 'Gradient Boosting Accuracy': gb_accuracy, 'XGBoost Accuracy': xgb_accuracy}

{'Random Forest Accuracy': 0.8597445600756859,
 'AdaBoost Accuracy': 0.8664262062440871,
 'Gradient Boosting Accuracy': 0.8675102491327656,
 'XGBoost Accuracy': 0.8667612740460423}

### Árbol de clasificación

In [123]:
xtr, xte, ytr, yte = train_test_split(x, y, test_size = 0.2, random_state = 42)

model = DecisionTreeClassifier()
model.fit(xtr, ytr)
ypr = model.predict(xte)

accuracy = accuracy_score(yte, ypr)
print(" DecisionTreeClassifier Accuracy:", accuracy)

 DecisionTreeClassifier Accuracy: 0.7967321034374014


### Clasificación con k-vecinos

In [124]:
xtr, xte, ytr, yte = train_test_split(x, y, test_size = 0.2, random_state = 42)

xtr = np.ascontiguousarray(xtr)
xte = np.ascontiguousarray(xte)

model = KNeighborsClassifier(n_neighbors=1)
model.fit(xtr, ytr)
ypr = model.predict(xte)

accuracy = accuracy_score(yte, ypr)
print("KNeighborsClassifier Accuracy:", accuracy)

KNeighborsClassifier Accuracy: 0.8052073478397982


### Redes neuronales

#### Conjuntos de entrenamiento, validación y test

A continuación se dividirá la base de datos en los conjuntos de entrenamiento, validación y test para el modelo. Para entrenamiento se dejará el 80% de los datos, y para validación y test 10% para cada uno.

In [125]:
d = x.copy()
d['Diabetes_binary'] = y.tolist()
d

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,Age,Education,Income,HighBP_1,HighChol_1,CholCheck_1,...,HeartDiseaseorAttack_1,PhysActivity_1,Fruits_1,Veggies_1,HvyAlcoholConsump_1,AnyHealthcare_1,NoDocbcCost_1,DiffWalk_1,Sex_1,Diabetes_binary
0,40,5,18,15,9,4,3,1,1,1,...,0,0,0,1,0,1,0,1,0,0
1,25,3,0,0,7,6,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,28,5,30,30,9,4,8,1,1,1,...,0,0,1,0,0,1,1,1,0,0
3,27,2,0,0,11,3,6,1,0,1,...,0,1,1,1,0,1,0,0,0,0
4,24,2,3,0,11,5,4,1,1,1,...,0,1,1,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,45,3,0,5,5,6,7,1,1,1,...,0,0,1,1,0,1,0,0,1,0
253676,18,4,0,0,11,2,4,1,1,1,...,0,0,0,0,0,1,0,1,0,1
253677,28,1,0,0,2,5,2,0,0,1,...,0,1,1,0,0,1,0,0,0,0
253678,23,3,0,0,7,5,1,1,0,1,...,0,0,1,1,0,1,0,0,1,0


In [126]:
d = d.sample(n = 10000, random_state = 42)

train = d.iloc[:8000,:]   #Equivale al 80% de los datos.
val = d.iloc[8000:9000,:] #Equivale al 10% de los datos.
test = d.iloc[9000:,:]    #Equivale al 10% de los datos.

#### Creación de tensores y DataLoaders

In [127]:
class MyDataset():

  def __init__(self,df,target_column):

    y = df[target_column].values
    X = df.drop(target_column, axis = 1).values
    self.X = torch.tensor(X, dtype = torch.float32)
    self.y = torch.tensor(y, dtype = torch.float32)

  def __len__(self):
    return len(self.y)

  def __getitem__(self,idx):
    return self.X[idx],self.y[idx]

Convertir los conjuntos en tensores.

In [128]:
traint = MyDataset(train, 'Diabetes_binary')
testt = MyDataset(test, 'Diabetes_binary')
valt = MyDataset(val, 'Diabetes_binary')

Crear los DataLoaders.

In [129]:
traind = DataLoader(traint, batch_size = 2,
                            shuffle = False,
                            num_workers = 0,
                            collate_fn = None,
                            pin_memory = False,)

testd = DataLoader(testt, batch_size = 3,
                          shuffle = False,
                          num_workers = 0,
                          collate_fn = None,
                          pin_memory = False,)

vald = DataLoader(valt, batch_size = 3,
                        shuffle = False,
                        num_workers = 0,
                        collate_fn = None,
                        pin_memory = False,)

#### Definición de la clase Net

In [130]:
class Net(nn.Module):

    def __init__(self, num_inputs, num_hidden, num_outputs):
        super().__init__()

        self.fc1 = nn.Linear(num_inputs, num_hidden)
        self.act_fn = nn.Sigmoid()
        self.fc2 = nn.Linear(num_hidden, num_outputs)

    def forward(self, x):

        x = self.fc1(x)
        x = self.act_fn(x)
        x = self.fc2(x)
        return x

Revisar el dispositivo que se está usando.

In [131]:
gpu_avail = torch.cuda.is_available()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"Is the GPU available? {gpu_avail}")
print("Device:", device)

Is the GPU available? False
Device: cpu


Creación del modelo, el optimizador y la función de costo.

In [132]:
model = Net(num_inputs = 21, num_hidden = 3, num_outputs = 1)

optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
criterion = nn.MSELoss()

#### Entrenamiento

In [133]:
model.to(device)
def train_model(model, optimizer, loss_module, train_loader, valid_loader, num_epochs):

  valid_loss_min = np.inf

  for i in range(num_epochs):
    model.train()
    train_loss = 0.0
    valid_loss = 0.0

    for data, target in train_loader:
        atributes = data.to(device)
        labels = target.to(device)

        optimizer.zero_grad()

        pred = model(atributes)
        pred = pred.squeeze(dim = 1)

        loss = loss_module(pred, labels.float())
        loss.backward()

        optimizer.step()

        train_loss += loss.item() * data.size(0)

    train_loss = train_loss/len(train_loader.dataset)

    model.eval()

    for data,target in valid_loader:
      data = data.to(device)
      target = target.to(device)
      output = model(data)
      loss = criterion(output, target)
      valid_loss += loss.item()*data.size(0)
    valid_loss = valid_loss/len(valid_loader.dataset)

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        i, train_loss, valid_loss))

    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'diabetesmodel.pt')
        valid_loss_min = valid_loss

In [134]:
train_model(model, optimizer, criterion, traind, vald, num_epochs = 100)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 0 	Training Loss: 0.112558 	Validation Loss: 0.124215
Validation loss decreased (inf --> 0.124215).  Saving model ...
Epoch: 1 	Training Loss: 0.109761 	Validation Loss: 0.123138
Validation loss decreased (0.124215 --> 0.123138).  Saving model ...
Epoch: 2 	Training Loss: 0.108777 	Validation Loss: 0.122769
Validation loss decreased (0.123138 --> 0.122769).  Saving model ...
Epoch: 3 	Training Loss: 0.108016 	Validation Loss: 0.123044
Epoch: 4 	Training Loss: 0.107302 	Validation Loss: 0.123150
Epoch: 5 	Training Loss: 0.106695 	Validation Loss: 0.122861
Epoch: 6 	Training Loss: 0.106180 	Validation Loss: 0.122609
Validation loss decreased (0.122769 --> 0.122609).  Saving model ...
Epoch: 7 	Training Loss: 0.105731 	Validation Loss: 0.122510
Validation loss decreased (0.122609 --> 0.122510).  Saving model ...
Epoch: 8 	Training Loss: 0.105337 	Validation Loss: 0.122489
Validation loss decreased (0.122510 --> 0.122489).  Saving model ...
Epoch: 9 	Training Loss: 0.104993 	Validat

In [135]:
model.load_state_dict(torch.load('diabetesmodel.pt'))

<All keys matched successfully>

#### Evaluar la precisión del modelo

In [136]:
def eval_model(model, data_loader):
    model.eval()
    true_preds, num_preds = 0., 0.

    with torch.no_grad():
        for data_inputs, data_labels in data_loader:

            data_inputs, data_labels = data_inputs.to(device), data_labels.to(device)
            preds = model(data_inputs)
            preds = preds.squeeze(dim=1)
            preds = torch.sigmoid(preds)
            pred_labels = (preds >= 0.5).long()

            true_preds += (pred_labels == data_labels).sum()
            num_preds += data_labels.shape[0]

    acc = true_preds / num_preds
    return acc

In [137]:
print(f"Accuracy of the model: {100.0*eval_model(model, testd):4.2f}%")

Accuracy of the model: 13.70%


## Atributos con mayor ganancia

Ahora se calculan los atributos con mayor ganancia para evaluar los modelos con esos atributos.

In [138]:
gm = pd.DataFrame(x, columns = x.columns.values)

clf = DecisionTreeClassifier(random_state= 42, criterion = 'entropy')
clf.fit(x,y)

fi = clf.feature_importances_
si = np.argsort(fi)[::-1]

for i in si:
  fn = gm.columns.values[i]
  imp = fi[i]
  print("Ganancia para el atributo", fn, ": ", imp)

Ganancia para el atributo BMI :  0.13758470960727265
Ganancia para el atributo Age :  0.10308230727058107
Ganancia para el atributo Income :  0.09952461219887272
Ganancia para el atributo HighBP_1 :  0.08920612209111388
Ganancia para el atributo PhysHlth :  0.0870992176829414
Ganancia para el atributo GenHlth :  0.07442832513723022
Ganancia para el atributo Education :  0.07289078570113422
Ganancia para el atributo MentHlth :  0.06687087282045902
Ganancia para el atributo Smoker_1 :  0.03742594196038871
Ganancia para el atributo Fruits_1 :  0.03703998409050273
Ganancia para el atributo PhysActivity_1 :  0.03239549832170855
Ganancia para el atributo Veggies_1 :  0.028470315593243475
Ganancia para el atributo Sex_1 :  0.02619135808351302
Ganancia para el atributo DiffWalk_1 :  0.021510643879186806
Ganancia para el atributo HeartDiseaseorAttack_1 :  0.018222225544362893
Ganancia para el atributo HighChol_1 :  0.01741249747187778
Ganancia para el atributo NoDocbcCost_1 :  0.014536518787410

### Ganancia mayor o igual a 10%

Se seleccionan todos los atributos con ganancia mayor o igual a 0.1 o 10%, los cuales son BMI y Age.

#### Algoritmos de ensamble

In [139]:
xg1 = x[['BMI', 'Age']]

In [140]:
xg1tr, xg1te, yg1tr, yg1te = train_test_split(xg1, y, test_size = 0.2, random_state = 42)

# Entrenar un modelo de Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(xg1tr, yg1tr)
rf_predictions = rf_classifier.predict(xg1te)

# Entrenar un modelo de AdaBoost
ada_classifier = AdaBoostClassifier(random_state=42)
ada_classifier.fit(xg1tr, yg1tr)
ada_predictions = ada_classifier.predict(xg1te)

# Entrenar un modelo de Gradient Boosting
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(xg1tr, yg1tr)
gb_predictions = gb_classifier.predict(xg1te)

# Entrenar un modelo de XGBoost
xgb_classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(xg1tr, yg1tr)
xgb_predictions = xgb_classifier.predict(xg1te)

In [141]:
# Evaluar el rendimiento de cada modelo
rf_accuracy = accuracy_score(yg1te, rf_predictions)
ada_accuracy = accuracy_score(yg1te, ada_predictions)
gb_accuracy = accuracy_score(yg1te, gb_predictions)
xgb_accuracy = accuracy_score(yg1te, xgb_predictions)

# Mostrar la precisión de cada modelo
{'Random Forest Accuracy': rf_accuracy, 'AdaBoost Accuracy': ada_accuracy, 'Gradient Boosting Accuracy': gb_accuracy, 'XGBoost Accuracy': xgb_accuracy}

{'Random Forest Accuracy': 0.8613410596026491,
 'AdaBoost Accuracy': 0.8617549668874173,
 'Gradient Boosting Accuracy': 0.8621885840428887,
 'XGBoost Accuracy': 0.8613804793440555}

Aquí se puede ver que, comparado con el modelo inicial, Random Forest aumento su accuracy de 85.9% a 86.1%, mientras que los otros disminuyeron 0.5%.

#### Árbol de clasificación

In [142]:
xg1tr, xg1te, yg1tr, yg1te = train_test_split(xg1, y, test_size = 0.2, random_state = 42)

model = DecisionTreeClassifier()
model.fit(xg1tr, yg1tr)
yg1pr = model.predict(xg1te)

accuracy = accuracy_score(yg1te, yg1pr)
print(" DecisionTreeClassifier Accuracy:", accuracy)

 DecisionTreeClassifier Accuracy: 0.8614593188268685


Comparándolo con el modelo inicial, se puede ver que aumento de 79.% a 86.1%.

#### Clasificación con k-vecinos

In [143]:
xg1tr, xg1te, yg1tr, yg1te = train_test_split(xg1, y, test_size = 0.2, random_state = 42)

xg1tr = np.ascontiguousarray(xg1tr)
xg1te = np.ascontiguousarray(xg1te)

model = KNeighborsClassifier(n_neighbors=1)
model.fit(xg1tr, yg1tr)
yg1pr = model.predict(xg1te)

accuracy = accuracy_score(yg1te, yg1pr)
print("KNeighborsClassifier Accuracy:", accuracy)

KNeighborsClassifier Accuracy: 0.8099180069378745


Ahora, se puede ver que aumento de 80.5% a 80.9% a comparación con el modelo inicial.

#### Redes neuronales

In [144]:
dg1 = xg1.copy()
dg1['Diabetes_binary'] = y.tolist()
dg1

Unnamed: 0,BMI,Age,Diabetes_binary
0,40,9,0
1,25,7,0
2,28,9,0
3,27,11,0
4,24,11,0
...,...,...,...
253675,45,5,0
253676,18,11,1
253677,28,2,0
253678,23,7,0


In [145]:
dg1 = dg1.sample(n = 10000, random_state = 42)

traing1 = dg1.iloc[:8000,:]   #Equivale al 80% de los datos.
valg1 = dg1.iloc[8000:9000,:] #Equivale al 10% de los datos.
testg1 = dg1.iloc[9000:,:]    #Equivale al 10% de los datos.

In [146]:
traintg1 = MyDataset(traing1, 'Diabetes_binary')
testtg1 = MyDataset(testg1, 'Diabetes_binary')
valtg1 = MyDataset(valg1, 'Diabetes_binary')

In [147]:
traindg1 = DataLoader(traintg1, batch_size = 2,
                            shuffle = False,
                            num_workers = 0,
                            collate_fn = None,
                            pin_memory = False,)

testdg1 = DataLoader(testtg1, batch_size = 3,
                          shuffle = False,
                          num_workers = 0,
                          collate_fn = None,
                          pin_memory = False,)

valdg1 = DataLoader(valtg1, batch_size = 3,
                        shuffle = False,
                        num_workers = 0,
                        collate_fn = None,
                        pin_memory = False,)

In [148]:
modelg1 = Net(num_inputs = 2, num_hidden = 3, num_outputs = 1)

optimizerg1 = torch.optim.SGD(modelg1.parameters(), lr = 0.01)
criteriong1 = nn.MSELoss()

In [149]:
modelg1.to(device)

Net(
  (fc1): Linear(in_features=2, out_features=3, bias=True)
  (act_fn): Sigmoid()
  (fc2): Linear(in_features=3, out_features=1, bias=True)
)

In [150]:
train_model(modelg1, optimizerg1, criteriong1, traindg1, valdg1, num_epochs = 100)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 0 	Training Loss: 0.121213 	Validation Loss: 0.123882
Validation loss decreased (inf --> 0.123882).  Saving model ...
Epoch: 1 	Training Loss: 0.119395 	Validation Loss: 0.123777
Validation loss decreased (0.123882 --> 0.123777).  Saving model ...
Epoch: 2 	Training Loss: 0.118782 	Validation Loss: 0.123835
Epoch: 3 	Training Loss: 0.117961 	Validation Loss: 0.124578
Epoch: 4 	Training Loss: 0.117479 	Validation Loss: 0.124922
Epoch: 5 	Training Loss: 0.117210 	Validation Loss: 0.125165
Epoch: 6 	Training Loss: 0.117015 	Validation Loss: 0.125404
Epoch: 7 	Training Loss: 0.116852 	Validation Loss: 0.125640
Epoch: 8 	Training Loss: 0.116714 	Validation Loss: 0.125856
Epoch: 9 	Training Loss: 0.116596 	Validation Loss: 0.126039
Epoch: 10 	Training Loss: 0.116493 	Validation Loss: 0.126191
Epoch: 11 	Training Loss: 0.116402 	Validation Loss: 0.126314
Epoch: 12 	Training Loss: 0.116319 	Validation Loss: 0.126415
Epoch: 13 	Training Loss: 0.116244 	Validation Loss: 0.126498
Epoch: 14

In [151]:
modelg1.load_state_dict(torch.load('diabetesmodel.pt'))

<All keys matched successfully>

In [152]:
print(f"Accuracy of the model: {100.0*eval_model(modelg1, testdg1):4.2f}%")

Accuracy of the model: 13.70%


En este caso se mantuvo igual que el modelo inicial.

### Ganancia mayor o igual a 5%

Ahora se seleccionan todos los atributos con ganancia mayor o igual a 0.05 o 5%, los cuales son BMI, Age, Income, HighBP_1, PhysHlth, GenHlth, Education, MentHlth.

#### Algoritmos de ensamble

In [153]:
xg2 = x[['BMI', 'Age', 'Income', 'HighBP_1', 'PhysHlth', 'GenHlth', 'Education', 'MentHlth']]

In [154]:
xg2tr, xg2te, yg2tr, yg2te = train_test_split(xg2, y, test_size = 0.2, random_state = 42)

# Entrenar un modelo de Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(xg2tr, yg2tr)
rf_predictions = rf_classifier.predict(xg2te)

# Entrenar un modelo de AdaBoost
ada_classifier = AdaBoostClassifier(random_state=42)
ada_classifier.fit(xg2tr, yg2tr)
ada_predictions = ada_classifier.predict(xg2te)

# Entrenar un modelo de Gradient Boosting
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(xg2tr, yg2tr)
gb_predictions = gb_classifier.predict(xg2te)

# Entrenar un modelo de XGBoost
xgb_classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(xg2tr, yg2tr)
xgb_predictions = xgb_classifier.predict(xg2te)

In [155]:
# Evaluar el rendimiento de cada modelo
rf_accuracy = accuracy_score(yg2te, rf_predictions)
ada_accuracy = accuracy_score(yg2te, ada_predictions)
gb_accuracy = accuracy_score(yg2te, gb_predictions)
xgb_accuracy = accuracy_score(yg2te, xgb_predictions)

# Mostrar la precisión de cada modelo
{'Random Forest Accuracy': rf_accuracy, 'AdaBoost Accuracy': ada_accuracy, 'Gradient Boosting Accuracy': gb_accuracy, 'XGBoost Accuracy': xgb_accuracy}

{'Random Forest Accuracy': 0.847524440239672,
 'AdaBoost Accuracy': 0.8649676758120467,
 'Gradient Boosting Accuracy': 0.8650268054241564,
 'XGBoost Accuracy': 0.864435509303059}

Aquí se puede ver que comparándolo con los atributos con 10% de ganancia, todos aumentaron un poco, excepto Random Forest, pero comparándolo con el modelo inicial, todos disminuyeron un poco.

#### Árbol de clasificación

In [156]:
xg2tr, xg2te, yg2tr, yg2te = train_test_split(xg2, y, test_size = 0.2, random_state = 42)

model = DecisionTreeClassifier()
model.fit(xg2tr, yg2tr)
yg2pr = model.predict(xg2te)

accuracy = accuracy_score(yg2te, yg2pr)
print(" DecisionTreeClassifier Accuracy:", accuracy)

 DecisionTreeClassifier Accuracy: 0.8163434247871334


Se obtuvo un resultado mayor al modelo inicial pero menor al anterior.

#### Clasificación con k-vecinos

In [157]:
xg2tr, xg2te, yg2tr, yg2te = train_test_split(xg2, y, test_size = 0.2, random_state = 42)

xg2tr = np.ascontiguousarray(xg2tr)
xg2te = np.ascontiguousarray(xg2te)

model = KNeighborsClassifier(n_neighbors=1)
model.fit(xg2tr, yg2tr)
yg2pr = model.predict(xg2te)

accuracy = accuracy_score(yg2te, yg2pr)
print("KNeighborsClassifier Accuracy:", accuracy)

KNeighborsClassifier Accuracy: 0.8008908861557869


En este caso se mantuvo prácticamente igual, pero aun así es el menor de los anteriores.

#### Redes neuronales

In [158]:
dg2 = xg2.copy()
dg2['Diabetes_binary'] = y.tolist()
dg2

Unnamed: 0,BMI,Age,Income,HighBP_1,PhysHlth,GenHlth,Education,MentHlth,Diabetes_binary
0,40,9,3,1,15,5,4,18,0
1,25,7,1,0,0,3,6,0,0
2,28,9,8,1,30,5,4,30,0
3,27,11,6,1,0,2,3,0,0
4,24,11,4,1,0,2,5,3,0
...,...,...,...,...,...,...,...,...,...
253675,45,5,7,1,5,3,6,0,0
253676,18,11,4,1,0,4,2,0,1
253677,28,2,2,0,0,1,5,0,0
253678,23,7,1,1,0,3,5,0,0


In [159]:
dg2 = dg2.sample(n = 10000, random_state = 42)

traing2 = dg2.iloc[:8000,:]   #Equivale al 80% de los datos.
valg2 = dg2.iloc[8000:9000,:] #Equivale al 10% de los datos.
testg2 = dg2.iloc[9000:,:]    #Equivale al 10% de los datos.

In [160]:
traintg2 = MyDataset(traing2, 'Diabetes_binary')
testtg2 = MyDataset(testg2, 'Diabetes_binary')
valtg2 = MyDataset(valg2, 'Diabetes_binary')

In [161]:
traindg2 = DataLoader(traintg2, batch_size = 2,
                            shuffle = False,
                            num_workers = 0,
                            collate_fn = None,
                            pin_memory = False,)

testdg2 = DataLoader(testtg2, batch_size = 3,
                          shuffle = False,
                          num_workers = 0,
                          collate_fn = None,
                          pin_memory = False,)

valdg2 = DataLoader(valtg2, batch_size = 3,
                        shuffle = False,
                        num_workers = 0,
                        collate_fn = None,
                        pin_memory = False,)

In [162]:
modelg2 = Net(num_inputs = 8, num_hidden = 3, num_outputs = 1)

optimizerg2 = torch.optim.SGD(modelg2.parameters(), lr = 0.01)
criteriong2 = nn.MSELoss()

In [163]:
modelg2.to(device)

Net(
  (fc1): Linear(in_features=8, out_features=3, bias=True)
  (act_fn): Sigmoid()
  (fc2): Linear(in_features=3, out_features=1, bias=True)
)

In [164]:
train_model(modelg2, optimizerg2, criteriong2, traindg2, valdg2, num_epochs = 100)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 0 	Training Loss: 0.115271 	Validation Loss: 0.121261
Validation loss decreased (inf --> 0.121261).  Saving model ...
Epoch: 1 	Training Loss: 0.110120 	Validation Loss: 0.119707
Validation loss decreased (0.121261 --> 0.119707).  Saving model ...
Epoch: 2 	Training Loss: 0.109118 	Validation Loss: 0.119591
Validation loss decreased (0.119707 --> 0.119591).  Saving model ...
Epoch: 3 	Training Loss: 0.108676 	Validation Loss: 0.119644
Epoch: 4 	Training Loss: 0.108346 	Validation Loss: 0.119738
Epoch: 5 	Training Loss: 0.108077 	Validation Loss: 0.119857
Epoch: 6 	Training Loss: 0.107849 	Validation Loss: 0.119988
Epoch: 7 	Training Loss: 0.107649 	Validation Loss: 0.120118
Epoch: 8 	Training Loss: 0.107470 	Validation Loss: 0.120238
Epoch: 9 	Training Loss: 0.107308 	Validation Loss: 0.120347
Epoch: 10 	Training Loss: 0.107161 	Validation Loss: 0.120445
Epoch: 11 	Training Loss: 0.107026 	Validation Loss: 0.120533
Epoch: 12 	Training Loss: 0.106900 	Validation Loss: 0.120614
Ep

In [165]:
modelg2.load_state_dict(torch.load('diabetesmodel.pt'))

<All keys matched successfully>

In [166]:
print(f"Accuracy of the model: {100.0*eval_model(modelg2, testdg2):4.2f}%")

Accuracy of the model: 13.70%


Se mantuvo igual que en los anteriores.

## Normalización

Ahora, se normalizan los datos del dataset inicial, excepto la variable objetivo.

In [167]:
xn = x.copy()
columns = xn.columns.values

for i in columns:
    mean = np.mean(xn[i])           #Normalizar los datos cogiendo cada columna, restando
    std = np.std(xn[i])             #la media de esa columna y diviviendo por su
    xn[i] = (xn[i] - mean) / std    #desviacion estandar

xn

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,Age,Education,Income,HighBP_1,HighChol_1,CholCheck_1,...,Stroke_1,HeartDiseaseorAttack_1,PhysActivity_1,Fruits_1,Veggies_1,HvyAlcoholConsump_1,AnyHealthcare_1,NoDocbcCost_1,DiffWalk_1,Sex_1
0,1.757936,2.329121,1.998592,1.233999,0.316900,-1.065595,-1.474487,1.153688,1.165254,0.196922,...,-0.205637,-0.322458,-1.762814,-1.316872,0.482087,-0.244014,0.226863,-0.303173,2.223615,-0.887021
1,-0.511806,0.457294,-0.429630,-0.486592,-0.337933,0.963272,-2.440138,-0.866785,-0.858182,-5.078164,...,-0.205637,-0.322458,0.567275,-1.316872,-2.074316,-0.244014,-4.407954,3.298445,-0.449718,-0.887021
2,-0.057858,2.329121,3.617407,2.954590,0.316900,-1.065595,0.939638,1.153688,1.165254,0.196922,...,-0.205637,-0.322458,-1.762814,0.759375,-2.074316,-0.244014,0.226863,3.298445,2.223615,-0.887021
3,-0.209174,-0.478619,-0.429630,-0.486592,0.971733,-2.080028,-0.026012,1.153688,-0.858182,0.196922,...,-0.205637,-0.322458,0.567275,0.759375,0.482087,-0.244014,0.226863,-0.303173,-0.449718,-0.887021
4,-0.663122,-0.478619,-0.024926,-0.486592,0.971733,-0.051162,-0.991662,1.153688,1.165254,0.196922,...,-0.205637,-0.322458,0.567275,0.759375,0.482087,-0.244014,0.226863,-0.303173,-0.449718,-0.887021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,2.514516,0.457294,-0.429630,0.086938,-0.992766,0.963272,0.456813,1.153688,1.165254,0.196922,...,-0.205637,-0.322458,-1.762814,0.759375,0.482087,-0.244014,0.226863,-0.303173,-0.449718,1.127369
253676,-1.571019,1.393207,-0.429630,-0.486592,0.971733,-3.094461,-0.991662,1.153688,1.165254,0.196922,...,-0.205637,-0.322458,-1.762814,-1.316872,-2.074316,-0.244014,0.226863,-0.303173,2.223615,-0.887021
253677,-0.057858,-1.414532,-0.429630,-0.486592,-1.975015,-0.051162,-1.957312,-0.866785,-0.858182,0.196922,...,-0.205637,-0.322458,0.567275,0.759375,-2.074316,-0.244014,0.226863,-0.303173,-0.449718,-0.887021
253678,-0.814438,0.457294,-0.429630,-0.486592,-0.337933,-0.051162,-2.440138,1.153688,-0.858182,0.196922,...,-0.205637,-0.322458,-1.762814,0.759375,0.482087,-0.244014,0.226863,-0.303173,-0.449718,1.127369


#### Algoritmos de ensamble

In [168]:
xntr, xnte, yntr, ynte = train_test_split(xn, y, test_size = 0.2, random_state = 42)

# Entrenar un modelo de Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(xntr, yntr)
rf_predictions = rf_classifier.predict(xnte)

# Entrenar un modelo de AdaBoost
ada_classifier = AdaBoostClassifier(random_state=42)
ada_classifier.fit(xntr, yntr)
ada_predictions = ada_classifier.predict(xnte)

# Entrenar un modelo de Gradient Boosting
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(xntr, yntr)
gb_predictions = gb_classifier.predict(xnte)

# Entrenar un modelo de XGBoost
xgb_classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(xntr, yntr)
xgb_predictions = xgb_classifier.predict(xnte)

In [169]:
# Evaluar el rendimiento de cada modelo
rf_accuracy = accuracy_score(ynte, rf_predictions)
ada_accuracy = accuracy_score(ynte, ada_predictions)
gb_accuracy = accuracy_score(ynte, gb_predictions)
xgb_accuracy = accuracy_score(ynte, xgb_predictions)

# Mostrar la precisión de cada modelo
{'Random Forest Accuracy': rf_accuracy, 'AdaBoost Accuracy': ada_accuracy, 'Gradient Boosting Accuracy': gb_accuracy, 'XGBoost Accuracy': xgb_accuracy}

{'Random Forest Accuracy': 0.8599219489120151,
 'AdaBoost Accuracy': 0.8664262062440871,
 'Gradient Boosting Accuracy': 0.8675102491327656,
 'XGBoost Accuracy': 0.8667612740460423}

Como se puede ver, se obtuvo el mismo accuracy que con el modelo inicial.

#### Árbol de clasificación

In [170]:
xntr, xnte, yntr, ynte = train_test_split(xn, y, test_size = 0.2, random_state = 42)

model = DecisionTreeClassifier()
model.fit(xntr, yntr)
ynpr = model.predict(xnte)

accuracy = accuracy_score(ynte, ynpr)
print(" DecisionTreeClassifier Accuracy:", accuracy)

 DecisionTreeClassifier Accuracy: 0.7975796278776411


Aquí se obtuvo prácticamente lo mismo que en el modelo inicial, pero cambio de 79.6% a 79.7%.

#### Clasificación con k-vecinos

In [171]:
xntr, xnte, yntr, ynte = train_test_split(xn, y, test_size = 0.2, random_state = 42)

xntr = np.ascontiguousarray(xntr)
xnte = np.ascontiguousarray(xnte)

model = KNeighborsClassifier(n_neighbors=1)
model.fit(xntr, yntr)
ynpr = model.predict(xnte)

accuracy = accuracy_score(ynte, ynpr)
print("KNeighborsClassifier Accuracy:", accuracy)

KNeighborsClassifier Accuracy: 0.8031378114159571


Igual que para el árbol de clasificación, cambio de 80.5% en el modelo inicial a 80.3%.

#### Redes neuronales

In [172]:
dn = xn.copy()
dn['Diabetes_binary'] = y.tolist()
dn

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,Age,Education,Income,HighBP_1,HighChol_1,CholCheck_1,...,HeartDiseaseorAttack_1,PhysActivity_1,Fruits_1,Veggies_1,HvyAlcoholConsump_1,AnyHealthcare_1,NoDocbcCost_1,DiffWalk_1,Sex_1,Diabetes_binary
0,1.757936,2.329121,1.998592,1.233999,0.316900,-1.065595,-1.474487,1.153688,1.165254,0.196922,...,-0.322458,-1.762814,-1.316872,0.482087,-0.244014,0.226863,-0.303173,2.223615,-0.887021,0
1,-0.511806,0.457294,-0.429630,-0.486592,-0.337933,0.963272,-2.440138,-0.866785,-0.858182,-5.078164,...,-0.322458,0.567275,-1.316872,-2.074316,-0.244014,-4.407954,3.298445,-0.449718,-0.887021,0
2,-0.057858,2.329121,3.617407,2.954590,0.316900,-1.065595,0.939638,1.153688,1.165254,0.196922,...,-0.322458,-1.762814,0.759375,-2.074316,-0.244014,0.226863,3.298445,2.223615,-0.887021,0
3,-0.209174,-0.478619,-0.429630,-0.486592,0.971733,-2.080028,-0.026012,1.153688,-0.858182,0.196922,...,-0.322458,0.567275,0.759375,0.482087,-0.244014,0.226863,-0.303173,-0.449718,-0.887021,0
4,-0.663122,-0.478619,-0.024926,-0.486592,0.971733,-0.051162,-0.991662,1.153688,1.165254,0.196922,...,-0.322458,0.567275,0.759375,0.482087,-0.244014,0.226863,-0.303173,-0.449718,-0.887021,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,2.514516,0.457294,-0.429630,0.086938,-0.992766,0.963272,0.456813,1.153688,1.165254,0.196922,...,-0.322458,-1.762814,0.759375,0.482087,-0.244014,0.226863,-0.303173,-0.449718,1.127369,0
253676,-1.571019,1.393207,-0.429630,-0.486592,0.971733,-3.094461,-0.991662,1.153688,1.165254,0.196922,...,-0.322458,-1.762814,-1.316872,-2.074316,-0.244014,0.226863,-0.303173,2.223615,-0.887021,1
253677,-0.057858,-1.414532,-0.429630,-0.486592,-1.975015,-0.051162,-1.957312,-0.866785,-0.858182,0.196922,...,-0.322458,0.567275,0.759375,-2.074316,-0.244014,0.226863,-0.303173,-0.449718,-0.887021,0
253678,-0.814438,0.457294,-0.429630,-0.486592,-0.337933,-0.051162,-2.440138,1.153688,-0.858182,0.196922,...,-0.322458,-1.762814,0.759375,0.482087,-0.244014,0.226863,-0.303173,-0.449718,1.127369,0


In [173]:
dn = dn.sample(n = 10000, random_state = 42)

trainn = dn.iloc[:8000,:]   #Equivale al 80% de los datos.
valn = dn.iloc[8000:9000,:] #Equivale al 10% de los datos.
testn = dn.iloc[9000:,:]    #Equivale al 10% de los datos.

In [174]:
traintn = MyDataset(trainn, 'Diabetes_binary')
testtn = MyDataset(testn, 'Diabetes_binary')
valtn = MyDataset(valn, 'Diabetes_binary')

In [175]:
traindn = DataLoader(traintn, batch_size = 2,
                            shuffle = False,
                            num_workers = 0,
                            collate_fn = None,
                            pin_memory = False,)

testdn = DataLoader(testtn, batch_size = 3,
                          shuffle = False,
                          num_workers = 0,
                          collate_fn = None,
                          pin_memory = False,)

valdn = DataLoader(valtn, batch_size = 3,
                        shuffle = False,
                        num_workers = 0,
                        collate_fn = None,
                        pin_memory = False,)

In [176]:
modeln = Net(num_inputs = 21, num_hidden = 3, num_outputs = 1)

optimizern = torch.optim.SGD(modeln.parameters(), lr = 0.01)
criterionn = nn.MSELoss()

In [177]:
modeln.to(device)

Net(
  (fc1): Linear(in_features=21, out_features=3, bias=True)
  (act_fn): Sigmoid()
  (fc2): Linear(in_features=3, out_features=1, bias=True)
)

In [178]:
train_model(modeln, optimizern, criterionn, traindn, valdn, num_epochs = 100)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 0 	Training Loss: 0.103764 	Validation Loss: 0.123648
Validation loss decreased (inf --> 0.123648).  Saving model ...
Epoch: 1 	Training Loss: 0.099529 	Validation Loss: 0.124128
Epoch: 2 	Training Loss: 0.099190 	Validation Loss: 0.124245
Epoch: 3 	Training Loss: 0.099029 	Validation Loss: 0.124306
Epoch: 4 	Training Loss: 0.098887 	Validation Loss: 0.124353
Epoch: 5 	Training Loss: 0.098752 	Validation Loss: 0.124392
Epoch: 6 	Training Loss: 0.098621 	Validation Loss: 0.124423
Epoch: 7 	Training Loss: 0.098497 	Validation Loss: 0.124449
Epoch: 8 	Training Loss: 0.098379 	Validation Loss: 0.124470
Epoch: 9 	Training Loss: 0.098268 	Validation Loss: 0.124487
Epoch: 10 	Training Loss: 0.098166 	Validation Loss: 0.124501
Epoch: 11 	Training Loss: 0.098071 	Validation Loss: 0.124512
Epoch: 12 	Training Loss: 0.097985 	Validation Loss: 0.124519
Epoch: 13 	Training Loss: 0.097907 	Validation Loss: 0.124524
Epoch: 14 	Training Loss: 0.097837 	Validation Loss: 0.124527
Epoch: 15 	Train

In [179]:
modeln.load_state_dict(torch.load('diabetesmodel.pt'))

<All keys matched successfully>

In [180]:
print(f"Accuracy of the model: {100.0*eval_model(modeln, testdn):4.2f}%")

Accuracy of the model: 23.60%


En este caso, sí se ve una mejora significante para redes neuronales, ya que paso de 13.7% en todos los anteriores a 23.6%.

#### Normalización utilizando atributos con ganancia mayor o igual a 10%

Para esta parte se utiliza el dataset normalizado y se toman los atributos que tienen ganancia de 10%.

##### Algoritmos de ensamble

In [181]:
xg1n = xn[['BMI', 'Age']]

In [182]:
xg1ntr, xg1nte, yg1ntr, yg1nte = train_test_split(xg1n, y, test_size = 0.2, random_state = 42)

# Entrenar un modelo de Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(xg1ntr, yg1ntr)
rf_predictions = rf_classifier.predict(xg1nte)

# Entrenar un modelo de AdaBoost
ada_classifier = AdaBoostClassifier(random_state=42)
ada_classifier.fit(xg1ntr, yg1ntr)
ada_predictions = ada_classifier.predict(xg1nte)

# Entrenar un modelo de Gradient Boosting
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(xg1ntr, yg1ntr)
gb_predictions = gb_classifier.predict(xg1nte)

# Entrenar un modelo de XGBoost
xgb_classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(xg1ntr, yg1ntr)
xgb_predictions = xgb_classifier.predict(xg1nte)

In [183]:
# Evaluar el rendimiento de cada modelo
rf_accuracy = accuracy_score(yg1nte, rf_predictions)
ada_accuracy = accuracy_score(yg1nte, ada_predictions)
gb_accuracy = accuracy_score(yg1nte, gb_predictions)
xgb_accuracy = accuracy_score(yg1nte, xgb_predictions)

# Mostrar la precisión de cada modelo
{'Random Forest Accuracy': rf_accuracy, 'AdaBoost Accuracy': ada_accuracy, 'Gradient Boosting Accuracy': gb_accuracy, 'XGBoost Accuracy': xgb_accuracy}

{'Random Forest Accuracy': 0.8613213497319457,
 'AdaBoost Accuracy': 0.8617549668874173,
 'Gradient Boosting Accuracy': 0.8621885840428887,
 'XGBoost Accuracy': 0.8613804793440555}

En este caso se mantienen en el rango de 86%, que es casi igual a los anteriores.

##### Árbol de clasificación

In [184]:
xg1ntr, xg1nte, yg1ntr, yg1nte = train_test_split(xg1n, y, test_size = 0.2, random_state = 42)

model = DecisionTreeClassifier()
model.fit(xg1ntr, yg1ntr)
yg1npr = model.predict(xg1nte)

accuracy = accuracy_score(yg1nte, yg1npr)
print(" DecisionTreeClassifier Accuracy:", accuracy)

 DecisionTreeClassifier Accuracy: 0.8614593188268685


Aquí se obtiene un accuracy del 86.1%, igualando el resultado con los atributos con ganancia mayor o igual al 10%.

##### Clasificación con k-vecinos

In [185]:
xg1ntr, xg1nte, yg1ntr, yg1nte = train_test_split(xg1n, y, test_size = 0.2, random_state = 42)

xg1ntr = np.ascontiguousarray(xg1ntr)
xg1nte = np.ascontiguousarray(xg1nte)

model = KNeighborsClassifier(n_neighbors=1)
model.fit(xg1ntr, yg1ntr)
yg1npr = model.predict(xg1nte)

accuracy = accuracy_score(yg1nte, yg1npr)
print("KNeighborsClassifier Accuracy:", accuracy)

KNeighborsClassifier Accuracy: 0.7711486912645853


Por el contrario, aquí se obtuvo un accuracy del 77.1%, siendo el peor hasta ahora.

##### Redes neuronales

In [186]:
dg1n = xg1n.copy()
dg1n['Diabetes_binary'] = y.tolist()
dg1n

Unnamed: 0,BMI,Age,Diabetes_binary
0,1.757936,0.316900,0
1,-0.511806,-0.337933,0
2,-0.057858,0.316900,0
3,-0.209174,0.971733,0
4,-0.663122,0.971733,0
...,...,...,...
253675,2.514516,-0.992766,0
253676,-1.571019,0.971733,1
253677,-0.057858,-1.975015,0
253678,-0.814438,-0.337933,0


In [187]:
dg1n = dg1n.sample(n = 10000, random_state = 42)

traing1n = dg1n.iloc[:8000,:]   #Equivale al 80% de los datos.
valg1n = dg1n.iloc[8000:9000,:] #Equivale al 10% de los datos.
testg1n = dg1n.iloc[9000:,:]    #Equivale al 10% de los datos.

In [188]:
traintg1n = MyDataset(traing1n, 'Diabetes_binary')
testtg1n = MyDataset(testg1n, 'Diabetes_binary')
valtg1n = MyDataset(valg1n, 'Diabetes_binary')

In [189]:
traindg1n = DataLoader(traintg1n, batch_size = 2,
                            shuffle = False,
                            num_workers = 0,
                            collate_fn = None,
                            pin_memory = False,)

testdg1n = DataLoader(testtg1n, batch_size = 3,
                          shuffle = False,
                          num_workers = 0,
                          collate_fn = None,
                          pin_memory = False,)

valdg1n = DataLoader(valtg1n, batch_size = 3,
                        shuffle = False,
                        num_workers = 0,
                        collate_fn = None,
                        pin_memory = False,)

In [190]:
modelg1n = Net(num_inputs = 2, num_hidden = 3, num_outputs = 1)

optimizerg1n = torch.optim.SGD(modelg1n.parameters(), lr = 0.01)
criteriong1n = nn.MSELoss()

In [191]:
modelg1n.to(device)

Net(
  (fc1): Linear(in_features=2, out_features=3, bias=True)
  (act_fn): Sigmoid()
  (fc2): Linear(in_features=3, out_features=1, bias=True)
)

In [192]:
train_model(modelg1n, optimizerg1n, criteriong1n, traindg1n, valdg1n, num_epochs = 100)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 0 	Training Loss: 0.109143 	Validation Loss: 0.122894
Validation loss decreased (inf --> 0.122894).  Saving model ...
Epoch: 1 	Training Loss: 0.108320 	Validation Loss: 0.122973
Epoch: 2 	Training Loss: 0.108241 	Validation Loss: 0.123024
Epoch: 3 	Training Loss: 0.108165 	Validation Loss: 0.123083
Epoch: 4 	Training Loss: 0.108095 	Validation Loss: 0.123149
Epoch: 5 	Training Loss: 0.108033 	Validation Loss: 0.123217
Epoch: 6 	Training Loss: 0.107980 	Validation Loss: 0.123280
Epoch: 7 	Training Loss: 0.107936 	Validation Loss: 0.123336
Epoch: 8 	Training Loss: 0.107899 	Validation Loss: 0.123385
Epoch: 9 	Training Loss: 0.107867 	Validation Loss: 0.123426
Epoch: 10 	Training Loss: 0.107840 	Validation Loss: 0.123461
Epoch: 11 	Training Loss: 0.107815 	Validation Loss: 0.123491
Epoch: 12 	Training Loss: 0.107794 	Validation Loss: 0.123516
Epoch: 13 	Training Loss: 0.107775 	Validation Loss: 0.123538
Epoch: 14 	Training Loss: 0.107757 	Validation Loss: 0.123556
Epoch: 15 	Train

In [193]:
modelg1n.load_state_dict(torch.load('diabetesmodel.pt'))

<All keys matched successfully>

In [194]:
print(f"Accuracy of the model: {100.0*eval_model(modelg1n, testdg1n):4.2f}%")

Accuracy of the model: 15.50%


Aquí se obtiene una mejora a comparación de todos los otros exceptuando el resultado de la normalización sola.

#### Normalización utilizando atributos con ganancia mayor o igual a 5%

Para esta parte se utiliza el dataset normalizado y se toman los atributos que tienen ganancia de 5%.

##### Algoritmos de ensamble

In [195]:
xg2n = xn[['BMI', 'Age', 'Income', 'HighBP_1', 'PhysHlth', 'GenHlth', 'Education', 'MentHlth']]

In [196]:
xg2ntr, xg2nte, yg2ntr, yg2nte = train_test_split(xg2n, y, test_size = 0.2, random_state = 42)

# Entrenar un modelo de Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(xg2ntr, yg2ntr)
rf_predictions = rf_classifier.predict(xg2nte)

# Entrenar un modelo de AdaBoost
ada_classifier = AdaBoostClassifier(random_state=42)
ada_classifier.fit(xg2ntr, yg2ntr)
ada_predictions = ada_classifier.predict(xg2nte)

# Entrenar un modelo de Gradient Boosting
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(xg2ntr, yg2ntr)
gb_predictions = gb_classifier.predict(xg2nte)

# Entrenar un modelo de XGBoost
xgb_classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(xg2ntr, yg2ntr)
xgb_predictions = xgb_classifier.predict(xg2nte)

In [197]:
# Evaluar el rendimiento de cada modelo
rf_accuracy = accuracy_score(yg2nte, rf_predictions)
ada_accuracy = accuracy_score(yg2nte, ada_predictions)
gb_accuracy = accuracy_score(yg2nte, gb_predictions)
xgb_accuracy = accuracy_score(yg2nte, xgb_predictions)

# Mostrar la precisión de cada modelo
{'Random Forest Accuracy': rf_accuracy, 'AdaBoost Accuracy': ada_accuracy, 'Gradient Boosting Accuracy': gb_accuracy, 'XGBoost Accuracy': xgb_accuracy}

{'Random Forest Accuracy': 0.8472682119205298,
 'AdaBoost Accuracy': 0.8649676758120467,
 'Gradient Boosting Accuracy': 0.8650268054241564,
 'XGBoost Accuracy': 0.864435509303059}

Aquí, otra vez, se mantienen al rededor del 86%, exceptuando Random Forest que disminuyo a 84.7%.

##### Árbol de clasificación

In [198]:
xg2ntr, xg2nte, yg2ntr, yg2nte = train_test_split(xg2n, y, test_size = 0.2, random_state = 42)

model = DecisionTreeClassifier()
model.fit(xg2ntr, yg2ntr)
yg2npr = model.predict(xg2nte)

accuracy = accuracy_score(yg2nte, yg2npr)
print(" DecisionTreeClassifier Accuracy:", accuracy)

 DecisionTreeClassifier Accuracy: 0.8173092084515926


En este caso disminuyo a 81.7%.

##### Clasificación con k-vecinos

In [199]:
xg2ntr, xg2nte, yg2ntr, yg2nte = train_test_split(xg2n, y, test_size = 0.2, random_state = 42)

xg2ntr = np.ascontiguousarray(xg2ntr)
xg2nte = np.ascontiguousarray(xg2nte)

model = KNeighborsClassifier(n_neighbors=1)
model.fit(xg2ntr, yg2ntr)
yg2npr = model.predict(xg2nte)

accuracy = accuracy_score(yg2nte, yg2npr)
print("KNeighborsClassifier Accuracy:", accuracy)

KNeighborsClassifier Accuracy: 0.8017187007253233


Para este se mantiene en el rango del 80%, como a gran mayoría.

##### Redes neuronales

In [200]:
dg2n = xg2n.copy()
dg2n['Diabetes_binary'] = y.tolist()
dg2n

Unnamed: 0,BMI,Age,Income,HighBP_1,PhysHlth,GenHlth,Education,MentHlth,Diabetes_binary
0,1.757936,0.316900,-1.474487,1.153688,1.233999,2.329121,-1.065595,1.998592,0
1,-0.511806,-0.337933,-2.440138,-0.866785,-0.486592,0.457294,0.963272,-0.429630,0
2,-0.057858,0.316900,0.939638,1.153688,2.954590,2.329121,-1.065595,3.617407,0
3,-0.209174,0.971733,-0.026012,1.153688,-0.486592,-0.478619,-2.080028,-0.429630,0
4,-0.663122,0.971733,-0.991662,1.153688,-0.486592,-0.478619,-0.051162,-0.024926,0
...,...,...,...,...,...,...,...,...,...
253675,2.514516,-0.992766,0.456813,1.153688,0.086938,0.457294,0.963272,-0.429630,0
253676,-1.571019,0.971733,-0.991662,1.153688,-0.486592,1.393207,-3.094461,-0.429630,1
253677,-0.057858,-1.975015,-1.957312,-0.866785,-0.486592,-1.414532,-0.051162,-0.429630,0
253678,-0.814438,-0.337933,-2.440138,1.153688,-0.486592,0.457294,-0.051162,-0.429630,0


In [201]:
dg2n = dg2n.sample(n = 10000, random_state = 42)

traing2n = dg2n.iloc[:8000,:]   #Equivale al 80% de los datos.
valg2n = dg2n.iloc[8000:9000,:] #Equivale al 10% de los datos.
testg2n = dg2n.iloc[9000:,:]    #Equivale al 10% de los datos.

In [202]:
traintg2n = MyDataset(traing2n, 'Diabetes_binary')
testtg2n = MyDataset(testg2n, 'Diabetes_binary')
valtg2n = MyDataset(valg2n, 'Diabetes_binary')

In [203]:
traindg2n = DataLoader(traintg2n, batch_size = 2,
                            shuffle = False,
                            num_workers = 0,
                            collate_fn = None,
                            pin_memory = False,)

testdg2n = DataLoader(testtg2n, batch_size = 3,
                          shuffle = False,
                          num_workers = 0,
                          collate_fn = None,
                          pin_memory = False,)

valdg2n = DataLoader(valtg2n, batch_size = 3,
                        shuffle = False,
                        num_workers = 0,
                        collate_fn = None,
                        pin_memory = False,)

In [204]:
modelg2n = Net(num_inputs = 8, num_hidden = 3, num_outputs = 1)

optimizerg2n = torch.optim.SGD(modelg2n.parameters(), lr = 0.01)
criteriong2n = nn.MSELoss()

In [205]:
modelg2n.to(device)

Net(
  (fc1): Linear(in_features=8, out_features=3, bias=True)
  (act_fn): Sigmoid()
  (fc2): Linear(in_features=3, out_features=1, bias=True)
)

In [206]:
train_model(modelg2n, optimizerg2n, criteriong2n, traindg2n, valdg2n, num_epochs = 100)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 0 	Training Loss: 0.110338 	Validation Loss: 0.121141
Validation loss decreased (inf --> 0.121141).  Saving model ...
Epoch: 1 	Training Loss: 0.102520 	Validation Loss: 0.122892
Epoch: 2 	Training Loss: 0.101808 	Validation Loss: 0.123433
Epoch: 3 	Training Loss: 0.101508 	Validation Loss: 0.123774
Epoch: 4 	Training Loss: 0.101340 	Validation Loss: 0.123990
Epoch: 5 	Training Loss: 0.101216 	Validation Loss: 0.124140
Epoch: 6 	Training Loss: 0.101109 	Validation Loss: 0.124252
Epoch: 7 	Training Loss: 0.101011 	Validation Loss: 0.124342
Epoch: 8 	Training Loss: 0.100916 	Validation Loss: 0.124417
Epoch: 9 	Training Loss: 0.100825 	Validation Loss: 0.124479
Epoch: 10 	Training Loss: 0.100735 	Validation Loss: 0.124532
Epoch: 11 	Training Loss: 0.100648 	Validation Loss: 0.124578
Epoch: 12 	Training Loss: 0.100563 	Validation Loss: 0.124617
Epoch: 13 	Training Loss: 0.100481 	Validation Loss: 0.124652
Epoch: 14 	Training Loss: 0.100403 	Validation Loss: 0.124682
Epoch: 15 	Train

In [207]:
modelg2n.load_state_dict(torch.load('diabetesmodel.pt'))

<All keys matched successfully>

In [208]:
print(f"Accuracy of the model: {100.0*eval_model(modelg2n, testdg2n):4.2f}%")

Accuracy of the model: 17.20%


Y por ultimo, aqui tambien aumenta el accuracy, pero no sobrepasa el 23.6% obtenido antes.

Y como conclusión, para los algoritmos de ensamble, los mejores resultados fueron los siguientes:
* Random Forest: 86.1% con los atributos con ganancia mayor o igual al 10% o con el dataset normalizado y usando los atributos con ganancia mayor o igual al 10%.
* AdaBoost: 86.6% con el modelo inicial o el dataset normalizado.
* Gradient Boosting: 86.7% con el modelo inicial o el dataset normalizado.
* XGBoost: 86.6% con el modelo inicial o el dataset normalizado.

Ahora, para el árbol de clasificación, el mejor resultado fue de 86.1% con los atributos con ganancia mayor o igual al 10% o con el dataset normalizado y usando los atributos con ganancia mayor o igual al 10%.

Para k-vecinos fue de 80.9% con los atributos con ganancia mayor o igual al 10%.

Y por último, para redes neuronales el mejor fue de 23.6% con el dataset normalizado.