In [6]:
# !pip install torch scikit-learn pandas

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, log_loss
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset


In [7]:
# Load data
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/heart_dataset_train_all.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/heart_dataset_test.csv')
print(df.shape, df_test.shape)
# Check if there are any NaNs and remove them (for training and test data)
df = df.dropna()
df_test = df_test.dropna()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(273, 14) (31, 14)


In [8]:
# Separating features from labels
X = df.drop(columns='target')
y = df['target']
X_test = df_test.drop(columns='target')
y_test = df_test['target']

# Split training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Value Fields and Category Fields
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = list(set(X.columns) - set(numeric_cols))

# Convert object type columns to string type
for col in categorical_cols:
    X_train[col] = X_train[col].astype(str)
    X_val[col] = X_val[col].astype(str)
    X_test[col] = X_test[col].astype(str) # This is the crucial change


# Building a preprocessor
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

# Fit-transform
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)
X_test_proc = preprocessor.transform(X_test)

# Convert to Tensor
X_train_tensor = torch.tensor(X_train_proc, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val_proc, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_proc, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)


In [9]:
def get_dataloader(X, y, batch_size=32):
    return DataLoader(TensorDataset(X, y), batch_size=batch_size, shuffle=True)

def train_model(model, optimizer, criterion, train_loader, val_loader, epochs=100):
    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
    # Evaluation after the last training
    model.eval()
    with torch.no_grad():
        y_train_pred = model(X_train_tensor)
        y_val_pred = model(X_val_tensor)
        y_test_pred = model(X_test_tensor)
    return {
        'train_acc': accuracy_score(y_train_tensor, y_train_pred >= 0.5),
        'val_acc': accuracy_score(y_val_tensor, y_val_pred >= 0.5),
        'test_acc': accuracy_score(y_test_tensor, y_test_pred >= 0.5),
        'train_loss': log_loss(y_train_tensor, y_train_pred),
        'val_loss': log_loss(y_val_tensor, y_val_pred),
        'test_loss': log_loss(y_test_tensor, y_test_pred)
    }


In [10]:
input_dim = X_train_tensor.shape[1]
results = []

hidden_sizes = [8, 16, 32]
learning_rates = [0.001, 0.01, 0.1]


for h in hidden_sizes:
    for lr in learning_rates:
        model = nn.Sequential(
            nn.Linear(input_dim, h),
            nn.ReLU(),
            nn.Linear(h, 1),
            nn.Sigmoid()
        )
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = nn.BCEWithLogitsLoss()
        train_loader = get_dataloader(X_train_tensor, y_train_tensor, batch_size=32)
        val_loader = get_dataloader(X_val_tensor, y_val_tensor, batch_size=32)
        result = train_model(model, optimizer, criterion, train_loader, val_loader, epochs=50)
        result.update({'hidden_size': h, 'learning_rate': lr})
        results.append(result)

# The results are organized into a DataFrame
results_df = pd.DataFrame(results)
results_df.sort_values(by='val_acc', ascending=False)


Unnamed: 0,train_acc,val_acc,test_acc,train_loss,val_loss,test_loss,hidden_size,learning_rate
3,0.842593,0.925926,0.709677,0.405541,0.241608,0.642166,16,0.001
6,0.847222,0.925926,0.709677,0.415144,0.253939,0.685191,32,0.001
1,0.916667,0.907407,0.677419,0.711477,0.596657,1.470861,8,0.01
0,0.819444,0.888889,0.677419,0.407594,0.259426,0.639823,8,0.001
2,0.898148,0.87037,0.709677,3.401132,1.804441,5.153871,8,0.1
4,0.893519,0.851852,0.709677,1.333676,1.036434,1.907168,16,0.01
5,0.888889,0.851852,0.741935,3.702525,3.323871,6.544828,16,0.1
7,0.893519,0.851852,0.741935,2.088002,1.504202,2.245832,32,0.01
8,0.888889,0.851852,0.741935,3.653885,2.440895,7.215479,32,0.1
