In [15]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

df = pd.read_csv("diabetes.csv")
df.sample()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
176,6,85,78,0,0,31.2,0.382,42,0


In [16]:
def preprocess_data(df: pd.DataFrame):
    x, y = df.drop(columns=["Outcome"]), df["Outcome"]
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(x)
    y_array = y.values
    x_train, x_val, y_train, y_val = train_test_split(x_scaled,y_array,test_size=0.2,random_state=42,stratify=y)
    return x_train, x_val, y_train, y_val, scaler

In [17]:
X_train, X_val, y_train, y_val, scaler = preprocess_data(df)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((614, 8), (154, 8), (614,), (154,))

In [18]:
class DiabetesDataset(Dataset):
    def __init__(self, x, y):
        super().__init__()

        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y.reshape(-1,1), dtype=torch.float32)

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [19]:
train_ds = DiabetesDataset(X_train, y_train)
val_ds = DiabetesDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=256, shuffle=False)

len(train_ds), len(val_ds)

(614, 154)

In [20]:
def build_model(input_dim: int) -> nn.Module:
    # model = nn.Sequential(
    #     nn.Linear(input_dim, 32),
    #     nn.ReLU(),
    #     nn.Linear(32, 16),
    #     nn.ReLU(),
    #     nn.Linear(16, 1),
    #     nn.Sigmoid(),
    # )

    model = nn.Sequential(
        nn.Linear(input_dim, 64),
        nn.ReLU(),
        nn.Linear(64, 32),
        nn.ReLU(),
        nn.Linear(32, 1),
        nn.Sigmoid()
    )
    return model


In [21]:
input_dim = X_train.shape[1]
model = build_model(input_dim)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model

Sequential(
  (0): Linear(in_features=8, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=1, bias=True)
  (5): Sigmoid()
)

In [22]:
def train_one_epoch(model: nn.Module,
                    train_loader: DataLoader,
                    criterion,
                    optimizer) -> float:
    model.train()
    losses = []
    batches = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        batches += 1
    return sum(losses) / batches

In [23]:
def evaluate(model: nn.Module, val_loader: DataLoader) -> float:
    model.eval()

    all_true_labels = []
    all_predicted_label = []

    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            predictions = (outputs >= 0.5).float()
            all_predicted_label.extend(predictions.numpy().flatten())
            all_true_labels.extend(labels.numpy().flatten())

    return float(accuracy_score(all_true_labels, all_predicted_label))

In [24]:
epochs = 10
train_losses = []
val_accuracies = []

for epoch in range(epochs):
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)

    train_losses.append(train_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{epoch} | Train loss: {train_loss:.4f} | Val acc: {val_acc:.4f}")

Epoch 1/0 | Train loss: 0.6520 | Val acc: 0.6623
Epoch 2/1 | Train loss: 0.6085 | Val acc: 0.6753
Epoch 3/2 | Train loss: 0.5734 | Val acc: 0.6883
Epoch 4/3 | Train loss: 0.5382 | Val acc: 0.7403
Epoch 5/4 | Train loss: 0.5081 | Val acc: 0.7143
Epoch 6/5 | Train loss: 0.4856 | Val acc: 0.7273
Epoch 7/6 | Train loss: 0.4694 | Val acc: 0.7273
Epoch 8/7 | Train loss: 0.4676 | Val acc: 0.7208
Epoch 9/8 | Train loss: 0.4555 | Val acc: 0.7273
Epoch 10/9 | Train loss: 0.4473 | Val acc: 0.7273


In [25]:
### TEST
train_loader_test = DataLoader(DiabetesDataset(X_train, y_train), batch_size=64, shuffle=True)

model_te = build_model(X_train.shape[1])
criterion_te = torch.nn.BCELoss()
optimizer_te = torch.optim.Adam(model_te.parameters(), lr=0.001)

loss1 = train_one_epoch(model_te, train_loader_test, criterion_te, optimizer_te)

assert isinstance(loss1, float)
assert 0.0 < loss1 < 10.0   # loose bounds

print("✔ Test passed!")

✔ Test passed!


In [26]:
### TEST
val_loader_test = DataLoader(DiabetesDataset(X_val, y_val), batch_size=256, shuffle=False)

acc_val = evaluate(model_te, val_loader_test)

print("Accuracy score for validatoin:", acc_val)

assert isinstance(acc_val, float)
assert 0.0 <= acc_val <= 1.0

print("✔ Test passed!")

Accuracy score for validatoin: 0.7272727272727273
✔ Test passed!
