In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
)

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("E:\Java Projects\conda envir\exam prep ana\healthcare-dataset-stroke-data.csv")

In [3]:
num_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [4]:
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()

In [5]:
label_encoder = LabelEncoder()
df["gender"] = label_encoder.fit_transform(df["gender"])
df["ever_married"] = label_encoder.fit_transform(df["ever_married"])
df["work_type"] = label_encoder.fit_transform(df["work_type"])
df["Residence_type"] = label_encoder.fit_transform(df["Residence_type"])

In [6]:
df["smoking_status"] = label_encoder.fit_transform(df["smoking_status"])
label_encoder.classes_

array(['Unknown', 'formerly smoked', 'never smoked', 'smokes'],
      dtype=object)

In [7]:
df.loc[df["smoking_status"] == 4, "smoking_status"] = np.nan

In [8]:
median_imputer = SimpleImputer(strategy="median")

In [9]:
df[["bmi"]] = median_imputer.fit_transform(df[["bmi"]])

In [10]:
knn_imputer = KNNImputer(n_neighbors=31)

In [11]:
df["smoking_status"] = knn_imputer.fit_transform(df[["smoking_status"]])
df["heart_disease"] = knn_imputer.fit_transform(df[["heart_disease"]])
df["hypertension"] = knn_imputer.fit_transform(df[["hypertension"]])

In [12]:
data_stroke = df.loc[df['stroke'] == 1]

In [13]:
data_no_stroke = df.loc[df['stroke'] == 0]

In [14]:
data_no_stroke = data_no_stroke.sample(350)

In [15]:
data = pd.concat([data_no_stroke, data_stroke])
data = data.sample(frac=1)

In [16]:
x = data.drop(columns=["stroke"]).values.astype("float32")
y = data["stroke"].values.astype("float32")

x.shape, y.shape

((599, 11), (599,))

In [17]:
x_train, x_val, y_train, y_val = train_test_split(x,y,test_size=0.2)

In [18]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

In [19]:
class StrokeDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


train_ds = StrokeDataset(x_train, y_train)
val_ds = StrokeDataset(x_val, y_val)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=256, shuffle=False)

len(train_ds), len(val_ds)

(479, 120)

In [20]:
class StrokeNet(nn.Module):
    def __init__(self, in_features: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze(1)
    
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device)

model = StrokeNet(in_features=x_train.shape[1]).to(device)
model

Using device:  cpu


StrokeNet(
  (net): Sequential(
    (0): Linear(in_features=11, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [21]:
pos_weight_value = len(data_no_stroke) / len(data_stroke)
print(f"Positive weight value: {pos_weight_value}")

pos_weight_tensor = torch.tensor([pos_weight_value], dtype=torch.float32).to(device)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

Positive weight value: 1.4056224899598393


In [22]:
def train_one_epoch(epoch_index: int):
    model.train()
    losses = []

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch_index}", leave=False)

    for x_batch, y_batch in progress_bar:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(x_batch)
        loss = criterion(logits, y_batch)

        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    return float(np.mean(losses))

def evaluate():
    model.eval()
    all_probs = []
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            x_batch = x_batch.to(device)
            logits = model(x_batch)

            probs = torch.sigmoid(logits).cpu().numpy()
            preds = (probs >= 0.5).astype(int)

            all_probs.append(probs)
            all_preds.append(preds)
            all_targets.append(y_batch.numpy())

        all_probs = np.concatenate(all_probs).reshape(-1)
        all_preds = np.concatenate(all_preds).reshape(-1)
        all_targets = np.concatenate(all_targets).reshape(-1)

        acc = accuracy_score(all_targets, all_preds)
        return acc, all_targets, all_preds

In [23]:
EPOCHS = 20

for epoch in range(EPOCHS):
    train_loss = train_one_epoch(epoch)
    val_acc, _, _ = evaluate()
    print(
        f"Epoch {epoch+1:02d}/{EPOCHS} | "
        f"train_loss = {train_loss:.4f} | val_acc = {val_acc:.4f}"
    )

print("Training finished!")

                                                                   

Epoch 01/20 | train_loss = 0.8056 | val_acc = 0.4333


                                                           

Epoch 02/20 | train_loss = 0.7708 | val_acc = 0.6583


                                                           

Epoch 03/20 | train_loss = 0.7450 | val_acc = 0.7000


                                                           

Epoch 04/20 | train_loss = 0.7122 | val_acc = 0.7167


                                                           

Epoch 05/20 | train_loss = 0.6827 | val_acc = 0.7417


                                                           

Epoch 06/20 | train_loss = 0.6488 | val_acc = 0.7750


                                                           

Epoch 07/20 | train_loss = 0.6227 | val_acc = 0.7917


                                                           

Epoch 08/20 | train_loss = 0.5999 | val_acc = 0.7833


                                                           

Epoch 09/20 | train_loss = 0.5794 | val_acc = 0.7917


                                                           

Epoch 10/20 | train_loss = 0.5533 | val_acc = 0.7917


                                                            

Epoch 11/20 | train_loss = 0.5508 | val_acc = 0.7750


                                                            

Epoch 12/20 | train_loss = 0.5355 | val_acc = 0.7667


                                                            

Epoch 13/20 | train_loss = 0.5237 | val_acc = 0.7750


                                                            

Epoch 14/20 | train_loss = 0.5101 | val_acc = 0.7833


                                                            

Epoch 15/20 | train_loss = 0.5048 | val_acc = 0.7667


                                                            

Epoch 16/20 | train_loss = 0.4970 | val_acc = 0.7583


                                                            

Epoch 17/20 | train_loss = 0.4892 | val_acc = 0.7583


                                                            

Epoch 18/20 | train_loss = 0.4981 | val_acc = 0.7583


                                                            

Epoch 19/20 | train_loss = 0.4819 | val_acc = 0.7583


                                                            

Epoch 20/20 | train_loss = 0.4829 | val_acc = 0.7667
Training finished!


In [24]:
val_acc, y_true, y_pred = evaluate()

print("Validation accuracy: ", val_acc)
print()
print("Classification error:")
print(classification_report(y_true, y_pred, digits=3))

Validation accuracy:  0.7666666666666667

Classification error:
              precision    recall  f1-score   support

         0.0      0.865     0.780     0.821        82
         1.0      0.609     0.737     0.667        38

    accuracy                          0.767       120
   macro avg      0.737     0.759     0.744       120
weighted avg      0.784     0.767     0.772       120

