In [1]:
pip install ucimlrepo



In [2]:
import pandas as pd
import torch
import torch.utils.data
import torch.nn as nn
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [3]:
from ucimlrepo import fetch_ucirepo
import ssl

# Ignore ssl certificate verification
ssl._create_default_https_context = ssl._create_unverified_context

# fetch dataset
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# data (as pandas dataframes)
X_dataframe = cdc_diabetes_health_indicators.data.features
y_dataframe = cdc_diabetes_health_indicators.data.targets

# metadata
print(cdc_diabetes_health_indicators.metadata)

# variable information
print(cdc_diabetes_health_indicators.variables)

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

In [4]:
# Converting DataFrames to Pytorch tensors... because who wants to learn Pandas right now?
X = torch.from_numpy(X_dataframe.to_numpy())
y = torch.from_numpy(y_dataframe.to_numpy())

print(type(X), type(y))
print(X.shape, y.shape)

<class 'torch.Tensor'> <class 'torch.Tensor'>
torch.Size([253680, 21]) torch.Size([253680, 1])


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Converting labels to 1-hot encoding
def to1hot(labels):
    return torch.eye(2)[labels]

labels = to1hot(y[:,0])
print(labels[:3])

# Splitting data into train, validation, and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=.3, stratify=labels, shuffle=True, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=.5, stratify=y_test, shuffle=True, random_state=42)
print("X_test:", X_test.shape, "X_val:", X_val.shape, "X_train:", X_train.shape)

# Standardizing features
standardizer = StandardScaler()
X_train = torch.from_numpy(standardizer.fit_transform(X_train))
X_val = torch.from_numpy(standardizer.transform(X_val))
X_test = torch.from_numpy(standardizer.transform(X_test))

tensor([[1., 0.],
        [1., 0.],
        [1., 0.]])
X_test: torch.Size([38052, 21]) X_val: torch.Size([38052, 21]) X_train: torch.Size([177576, 21])


In [6]:
from torch.utils.data import DataLoader, Dataset

# Making Datasets for training, validation, and testing
class HealthData(Dataset):
    def __init__(self, participants):
        self.participants = participants

    def __len__(self):
        # The total number of particpants in the dataset
        return len(self.participants)

    def __getitem__(self, idx):
        # Extract the particpant data
        participant = self.participants[idx]
        return participant


# Each dataset is an R x 23 tensor, where the final 2 columns serve as a 1-hot encoding of the labels
train_dataset = HealthData(torch.concat((X_train, y_train), dim=1).float())
validation_dataset = HealthData(torch.concat((X_val, y_val), dim=1).float())
test_dataset = HealthData(torch.concat((X_test, y_test), dim=1).float())

In [7]:
# Define model with residual connections and implement batch normalization
class BatchNormDNN(nn.Module):
    def __init__(self, input_dim=21, hidden_dim=5, num_classes=2, num_layers=15, activation=nn.LeakyReLU):
        super(BatchNormDNN, self).__init__()

        # Define layers
        layers = [nn.Linear(input_dim, hidden_dim)]
        for i in range(1, num_layers-1):
            layers.append(nn.Linear(hidden_dim, hidden_dim))

        layers.append(nn.Linear(hidden_dim, num_classes))

        self.layers = nn.ModuleList(layers)
        self.relu = activation()
        self.batch_norm = torch.nn.BatchNorm1d(hidden_dim)

        # Initialize weights
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, mean=0, std=0.01)

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            if i == 0:
                x = self.relu(self.batch_norm(layer(x)))
            # Don't apply relu or batch_norm to last layer
            elif i == len(self.layers) - 1:
                x =  layer(x)
            else:
                # Residual connections
                x = x + self.relu(self.batch_norm(layer(x)))
        return x

In [8]:
import torch.optim as optim

def train(model, loss, dataloader, optimizer):
    """Helper function to train our model."""
    total_error = 0.
    for it, batch in enumerate(dataloader):
        inputs = batch[:, :-2]
        labels = batch[:, -2:]
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Compute model predictions
        pred = model(inputs)

        optimizer.zero_grad()

        # Compute the loss
        l = loss(pred, labels)
        total_error += l.item()

        l.backward()

        # Update the weights
        optimizer.step()

    return total_error / len(dataloader)


def fit(model, loss, dataloader, epochs=41):
    optimizer = optim.Adam(model.parameters(), lr=0.03)
    for ep in range(epochs):
        err = train(model, loss, dataloader, optimizer)
        if ep % 10 == 0:
            print(f"[Ep{ep}] Loss {err:.3f}")


In [9]:
def evaluate_model(model, dataloader):
    all_predictions = []
    all_labels = []

    for batch in dataloader:
        inputs = batch[:, :-2]
        labels = batch[:, -2:]
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Get model predictions
        predictions = model(inputs)

        # Collect predictions and labels
        all_predictions.extend(torch.argmax(predictions, dim=1).tolist())
        all_labels.extend(torch.argmax(labels, dim=1).tolist())

    # Confusion matrix, Accuracy, Error
    cm = confusion_matrix(all_labels, all_predictions)
    accuracy = accuracy_score(all_labels, all_predictions)
    error = 1 - accuracy

    print("Confusion Matrix:")
    print(cm)
    print("Accuracy:", accuracy)
    print("Error:", error)
    return accuracy

In [10]:
# Define Loss Function
loss = nn.CrossEntropyLoss()

# Define data loader for easy batching
dataloader = DataLoader(train_dataset, batch_size=10000, shuffle=True, drop_last=True)
valloader = DataLoader(validation_dataset, batch_size=5000, shuffle=True, drop_last=True)

# Model
print("LeakyReLU activation")
leaky_relu_model = BatchNormDNN(num_layers=5, hidden_dim=30, activation=nn.LeakyReLU)
leaky_relu_model = leaky_relu_model.to(device)
fit(leaky_relu_model, loss, dataloader, epochs=50)
print()
evaluate_model(leaky_relu_model, valloader)

print("Sigmoid activation")
sigmoid_model = BatchNormDNN(num_layers=5, hidden_dim=30, activation=nn.Sigmoid)
sigmoid_model = sigmoid_model.to(device)
fit(sigmoid_model, loss, dataloader, epochs=50)
print()
evaluate_model(sigmoid_model, valloader)

print("TanH activation")
tanh_model = BatchNormDNN(num_layers=5, hidden_dim=30, activation=nn.Tanh)
tanh_model = tanh_model.to(device)
fit(tanh_model, loss, dataloader, epochs=50)
print()
evaluate_model(tanh_model, valloader)

LeakyReLU activation
[Ep0] Loss 0.381
[Ep10] Loss 0.312
[Ep20] Loss 0.310
[Ep30] Loss 0.310
[Ep40] Loss 0.309

Confusion Matrix:
[[29566   571]
 [ 4034   829]]
Accuracy: 0.8684285714285714
Error: 0.13157142857142856
Sigmoid activation
[Ep0] Loss 0.436
[Ep10] Loss 0.313
[Ep20] Loss 0.311
[Ep30] Loss 0.311
[Ep40] Loss 0.311

Confusion Matrix:
[[29753   406]
 [ 4189   652]]
Accuracy: 0.8687142857142857
Error: 0.13128571428571434
TanH activation
[Ep0] Loss 0.465
[Ep10] Loss 0.313
[Ep20] Loss 0.312
[Ep30] Loss 0.311
[Ep40] Loss 0.312

Confusion Matrix:
[[29605   564]
 [ 4016   815]]
Accuracy: 0.8691428571428571
Error: 0.1308571428571429


0.8691428571428571

## We can clearly see from the validation set that even though negligible, **LeakyReLU** has the best accuracy among all the different activation functions used!

In [11]:
loss = nn.CrossEntropyLoss()
dataloader = DataLoader(train_dataset, batch_size=10000, shuffle=True, drop_last=True)
accuracies = {}
for i in [10, 30, 50, 100]:
    for j in [5, 10, 21, 50, 100]:
        print(f"HIDDEN LAYERS = {i}")
        print(f"DIMENSIONS = {j}")
        experiment_model = BatchNormDNN(num_layers=i, hidden_dim=j)
        experiment_model = experiment_model.to(device)
        fit(experiment_model, loss, dataloader, epochs=50)
        print()
        accuracy = evaluate_model(experiment_model, valloader)
        print()
        print()
        accuracies[f"{i},{j}"] = accuracy

HIDDEN LAYERS = 10
DIMENSIONS = 5
[Ep0] Loss 0.374
[Ep10] Loss 0.314
[Ep20] Loss 0.314
[Ep30] Loss 0.313
[Ep40] Loss 0.313

Confusion Matrix:
[[29446   667]
 [ 4004   883]]
Accuracy: 0.8665428571428572
Error: 0.13345714285714283


HIDDEN LAYERS = 10
DIMENSIONS = 10
[Ep0] Loss 0.394
[Ep10] Loss 0.313
[Ep20] Loss 0.312
[Ep30] Loss 0.312
[Ep40] Loss 0.311

Confusion Matrix:
[[29691   454]
 [ 4108   747]]
Accuracy: 0.8696571428571429
Error: 0.1303428571428571


HIDDEN LAYERS = 10
DIMENSIONS = 21
[Ep0] Loss 0.391
[Ep10] Loss 0.312
[Ep20] Loss 0.312
[Ep30] Loss 0.311
[Ep40] Loss 0.309

Confusion Matrix:
[[29683   452]
 [ 4205   660]]
Accuracy: 0.8669428571428571
Error: 0.13305714285714287


HIDDEN LAYERS = 10
DIMENSIONS = 50
[Ep0] Loss 0.559
[Ep10] Loss 0.311
[Ep20] Loss 0.309
[Ep30] Loss 0.307
[Ep40] Loss 0.305

Confusion Matrix:
[[29504   638]
 [ 3979   879]]
Accuracy: 0.8680857142857142
Error: 0.13191428571428576


HIDDEN LAYERS = 10
DIMENSIONS = 100
[Ep0] Loss 0.851
[Ep10] Loss 0.312
[Ep

In [21]:
best_key = max(accuracies, key=accuracies.get)
best_i, best_j = map(int, best_key.split(','))

print(f"Best hidden layers: {best_i}")
print(f"Best dimensions: {best_j}")

# Fit the final Model
test_model_bn = BatchNormDNN(num_layers=best_i, hidden_dim=best_j).to(device)
test_dataloader = DataLoader(test_dataset, batch_size=5000, shuffle=True, drop_last=True)
fit(test_model_bn, loss, test_dataloader, epochs=50)

Best hidden layers: 10
Best dimensions: 10
[Ep0] Loss 0.417
[Ep10] Loss 0.314
[Ep20] Loss 0.310
[Ep30] Loss 0.311
[Ep40] Loss 0.310


In [19]:
for model in {test_model_bn}:
    test_dataloader = DataLoader(test_dataset, batch_size=5000, shuffle=True, drop_last=True)
    test_error = 0
    test_acc= 0
    print({model})
    for it, batch in enumerate(test_dataloader):
        test_inputs = batch[:, :-2]
        test_inputs = test_inputs.to(device)
        test_labels = batch[:, -2:]
        test_labels = test_labels.to(device)
        # Compute model predictions
        test_pred = model(test_inputs)
        l = loss(test_pred, test_labels)
        test_acc += (sum(torch.argmax(test_pred, dim=1) == torch.argmax(test_labels, dim=1)) / len(batch)).item()
        test_error += l.item()
    avg_test_acc = test_acc / len(test_dataloader)
    avg_error = test_error / len(test_dataloader)
    print("Average Test Accuracy:", avg_test_acc)
    print("Average Test Loss:", avg_error)

{BatchNormDNN(
  (layers): ModuleList(
    (0): Linear(in_features=21, out_features=10, bias=True)
    (1-8): 8 x Linear(in_features=10, out_features=10, bias=True)
    (9): Linear(in_features=10, out_features=2, bias=True)
  )
  (relu): LeakyReLU(negative_slope=0.01)
  (batch_norm): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)}
Average Test Accuracy: 0.867485693522862
Average Test Loss: 0.3056148375783648


In [22]:
test_acc = evaluate_model(test_model_bn, test_dataloader)

Confusion Matrix:
[[29566   547]
 [ 4106   781]]
Accuracy: 0.8670571428571429
Error: 0.13294285714285714
