In [4]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
)

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, classification_report

In [3]:
%pip install torch

Collecting torch
  Downloading torch-2.9.1-cp313-cp313-win_amd64.whl.metadata (30 kB)
Downloading torch-2.9.1-cp313-cp313-win_amd64.whl (110.9 MB)
   ---------------------------------------- 0.0/110.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/110.9 MB ? eta -:--:--
   ---------------------------------------- 0.8/110.9 MB 2.1 MB/s eta 0:00:53
   ---------------------------------------- 1.3/110.9 MB 2.5 MB/s eta 0:00:45
    --------------------------------------- 2.1/110.9 MB 2.7 MB/s eta 0:00:40
    --------------------------------------- 2.6/110.9 MB 2.9 MB/s eta 0:00:38
   - -------------------------------------- 3.4/110.9 MB 3.0 MB/s eta 0:00:36
   - -------------------------------------- 3.9/110.9 MB 3.1 MB/s eta 0:00:36
   - -------------------------------------- 4.7/110.9 MB 3.1 MB/s eta 0:00:35
   - -------------------------------------- 5.5/110.9 MB 3.1 MB/s eta 0:00:34
   -- ------------------------------------- 6.3/110.9 MB 3.2 MB/s eta 0:00:33
   -- ---

In [5]:
df = pd.read_csv("E:\Java Projects\conda envir\exam prep ana\healthcare-dataset-stroke-data.csv")

In [6]:
df.sample(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
2115,8345,Female,49.0,0,0,Yes,Private,Rural,114.76,24.7,never smoked,0
3995,30961,Male,45.0,0,0,Yes,Private,Rural,95.62,29.5,smokes,0
2291,48875,Male,12.0,0,0,No,children,Rural,196.91,19.7,Unknown,0
945,65256,Female,57.0,0,0,Yes,Self-employed,Rural,128.28,34.2,never smoked,0
4607,68209,Male,47.0,0,0,Yes,Govt_job,Rural,58.23,31.4,formerly smoked,0
4450,65103,Female,59.0,0,0,Yes,Private,Urban,81.51,25.6,formerly smoked,0
3514,5455,Male,49.0,0,0,Yes,Private,Rural,78.34,32.5,Unknown,0
4258,22330,Female,45.0,0,0,Yes,Self-employed,Urban,82.94,29.3,Unknown,0
3781,42628,Female,69.0,0,1,No,Private,Urban,193.45,34.5,never smoked,0
2314,24289,Male,82.0,0,0,Yes,Private,Urban,89.83,24.7,smokes,0


In [8]:
num_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [9]:
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()

In [10]:
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df['ever_married'] = label_encoder.fit_transform(df['ever_married'])
df['work_type'] = label_encoder.fit_transform(df['work_type'])
df['Residence_type'] = label_encoder.fit_transform(df['Residence_type'])


In [11]:
df['smoking_status'] = label_encoder.fit_transform(df['smoking_status'])

In [12]:
df.loc[df['smoking_status'] == 4, 'smoking_status'] = np.nan

In [14]:
median_imputer = SimpleImputer(strategy="median")

In [15]:
df[["bmi"]] = median_imputer.fit_transform(df[["bmi"]])

In [16]:
knn_imputer = KNNImputer(n_neighbors=31)

In [17]:
df['smoking_status'] = knn_imputer.fit_transform(df[['smoking_status']])
df['heart_disease'] = knn_imputer.fit_transform(df[['heart_disease']])
df['hypertension'] = knn_imputer.fit_transform(df[['hypertension']])

In [18]:
data_stroke = df.loc[df['stroke'] == 1]
data_stroke.info()

<class 'pandas.core.frame.DataFrame'>
Index: 249 entries, 0 to 248
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 249 non-null    int64  
 1   gender             249 non-null    int64  
 2   age                249 non-null    float64
 3   hypertension       249 non-null    float64
 4   heart_disease      249 non-null    float64
 5   ever_married       249 non-null    int64  
 6   work_type          249 non-null    int64  
 7   Residence_type     249 non-null    int64  
 8   avg_glucose_level  249 non-null    float64
 9   bmi                249 non-null    float64
 10  smoking_status     249 non-null    float64
 11  stroke             249 non-null    int64  
dtypes: float64(6), int64(6)
memory usage: 25.3 KB


In [19]:
data_no_stroke = df.loc[df['stroke'] == 0]
data_no_stroke.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4861 entries, 249 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4861 non-null   int64  
 1   gender             4861 non-null   int64  
 2   age                4861 non-null   float64
 3   hypertension       4861 non-null   float64
 4   heart_disease      4861 non-null   float64
 5   ever_married       4861 non-null   int64  
 6   work_type          4861 non-null   int64  
 7   Residence_type     4861 non-null   int64  
 8   avg_glucose_level  4861 non-null   float64
 9   bmi                4861 non-null   float64
 10  smoking_status     4861 non-null   float64
 11  stroke             4861 non-null   int64  
dtypes: float64(6), int64(6)
memory usage: 493.7 KB


In [20]:
data_no_stroke = data_no_stroke.sample(350)
data_no_stroke.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 350 entries, 3540 to 1652
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 350 non-null    int64  
 1   gender             350 non-null    int64  
 2   age                350 non-null    float64
 3   hypertension       350 non-null    float64
 4   heart_disease      350 non-null    float64
 5   ever_married       350 non-null    int64  
 6   work_type          350 non-null    int64  
 7   Residence_type     350 non-null    int64  
 8   avg_glucose_level  350 non-null    float64
 9   bmi                350 non-null    float64
 10  smoking_status     350 non-null    float64
 11  stroke             350 non-null    int64  
dtypes: float64(6), int64(6)
memory usage: 35.5 KB


In [21]:
data = pd.concat([data_no_stroke, data_stroke])
data = data.sample(frac=1)
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
169,210,1,81.0,0.0,0.0,1,3,0,91.54,31.4,2.0,1
225,39186,0,57.0,0.0,1.0,1,2,1,216.58,31.0,0.0,1
1329,40571,1,29.0,0.0,0.0,0,2,1,73.75,28.3,2.0,0
116,60744,1,61.0,1.0,0.0,1,3,0,76.11,27.3,3.0,1
625,21886,0,40.0,0.0,0.0,1,2,1,71.20,27.1,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
229,69959,0,80.0,1.0,0.0,0,2,1,66.03,35.4,2.0,1
193,69112,1,68.0,1.0,1.0,1,2,0,271.74,31.1,3.0,1
224,8899,1,49.0,0.0,0.0,0,2,0,104.86,31.9,3.0,1
171,60739,0,79.0,1.0,1.0,0,3,0,60.94,28.1,2.0,1


In [22]:
X = data.drop(columns=["stroke"]).values.astype("float32")
y = data["stroke"].values.astype("float32")

X.shape, y.shape

((599, 11), (599,))

In [23]:
x_train, x_val, y_train, y_val = train_test_split(X,y,test_size=0.2)

In [24]:
scaler = StandardScaler()

In [27]:
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
x_train[:3]

array([[-1.1900709 ,  1.2068182 ,  0.9793981 ,  2.1684973 ,  3.3128538 ,
         0.6232221 ,  0.8291108 ,  0.99791443,  1.4798639 , -0.08865789,
        -0.45043677],
       [-1.1970099 , -0.8286252 , -1.8908887 , -0.4611488 , -0.30185455,
        -1.6045644 ,  1.7613739 ,  0.99791443, -0.89433086, -1.7856481 ,
        -1.4223251 ],
       [ 0.7857315 , -0.8286252 ,  1.2792788 , -0.4611488 , -0.30185455,
         0.6232221 ,  0.8291108 , -1.0020899 , -0.69595456,  0.2790231 ,
        -0.45043677]], dtype=float32)

In [28]:
class StrokeDataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.from_numpy(x)
        self.y = torch.from_numpy(y)

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
train_ds = StrokeDataset(x_train, y_train)
val_ds = StrokeDataset(x_val, y_val)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=256, shuffle=False)

len(train_ds), len(val_ds)

(479, 120)

In [29]:
class StrokeNet(nn.Module):
    def __init__(self, in_features: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        return self.net(x).squeeze(1)
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = StrokeNet(in_features=x_train.shape[1]).to(device)
model

Using device: cpu


StrokeNet(
  (net): Sequential(
    (0): Linear(in_features=11, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [35]:
pos_weight_value = len(data_no_stroke) / len(data_stroke)
pos_weight_tensor = torch.tensor([pos_weight_value], dtype=torch.float32).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [36]:
def train_one_epoch(epoch_idx: int):
    model.train()
    losses = []

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch_idx+1}", leave=False)

    for x_batch, y_batch in progress_bar:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(x_batch)
        loss = criterion(logits, y_batch)

        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    return float(np.mean(losses))

def evaluate():
    model.eval()
    all_probs = []
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            x_batch = x_batch.to(device)
            logits = model(x_batch)

            probs = torch.sigmoid(logits).cpu().numpy()
            preds = (probs >= 0.5).astype(int)

            all_probs.append(probs)
            all_preds.append(preds)
            all_targets.append(y_batch.numpy())

    all_probs = np.concatenate(all_probs).reshape(-1)
    all_preds = np.concatenate(all_preds).reshape(-1)
    all_targets = np.concatenate(all_targets).reshape(-1)

    acc = accuracy_score(all_targets, all_preds)
    return acc, all_targets, all_preds

In [37]:
EPOCHS = 20

for epoch in range(EPOCHS):
    train_loss = train_one_epoch(epoch)
    val_acc, _, _ = evaluate()
    print(
        f"Epoch {epoch+1:02d}/{EPOCHS} | "
        f"train_loss = {train_loss:.4f} | val_acc = {val_acc:.4f}"
    )
print("Training finished!")

                                                                   

Epoch 01/20 | train_loss = 0.8024 | val_acc = 0.5833


                                                           

Epoch 02/20 | train_loss = 0.7741 | val_acc = 0.7333


                                                           

Epoch 03/20 | train_loss = 0.7440 | val_acc = 0.7417


                                                                   

Epoch 04/20 | train_loss = 0.7154 | val_acc = 0.7500


                                                           

Epoch 05/20 | train_loss = 0.6793 | val_acc = 0.7500


                                                           

Epoch 06/20 | train_loss = 0.6494 | val_acc = 0.7667


                                                           

Epoch 07/20 | train_loss = 0.6315 | val_acc = 0.7667


                                                           

Epoch 08/20 | train_loss = 0.6033 | val_acc = 0.7750


                                                           

Epoch 09/20 | train_loss = 0.5849 | val_acc = 0.7750


                                                            

Epoch 10/20 | train_loss = 0.5757 | val_acc = 0.7750


                                                            

Epoch 11/20 | train_loss = 0.5719 | val_acc = 0.7667


                                                            

Epoch 12/20 | train_loss = 0.5478 | val_acc = 0.7667


                                                            

Epoch 13/20 | train_loss = 0.5530 | val_acc = 0.7667


                                                            

Epoch 14/20 | train_loss = 0.5347 | val_acc = 0.7667


                                                            

Epoch 15/20 | train_loss = 0.5413 | val_acc = 0.7833


                                                            

Epoch 16/20 | train_loss = 0.5282 | val_acc = 0.7917


                                                            

Epoch 17/20 | train_loss = 0.5274 | val_acc = 0.7750


                                                            

Epoch 18/20 | train_loss = 0.5299 | val_acc = 0.7750


                                                            

Epoch 19/20 | train_loss = 0.5169 | val_acc = 0.7833


                                                            

Epoch 20/20 | train_loss = 0.5111 | val_acc = 0.7750
Training finished!




In [38]:
val_acc, y_true, y_pred = evaluate()
print("Validation accuracy:", val_acc)
print()
print("Classification report:")
print(classification_report(y_true,y_pred,digits=3))

Validation accuracy: 0.775

Classification report:
              precision    recall  f1-score   support

         0.0      0.817     0.754     0.784        65
         1.0      0.733     0.800     0.765        55

    accuracy                          0.775       120
   macro avg      0.775     0.777     0.775       120
weighted avg      0.778     0.775     0.775       120

