<a href="https://colab.research.google.com/github/BenBuchanan2001/BenBuchanan2001/blob/main/Heart_Disease_Prediction(joes).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading and preprocessing data

In [None]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
import joblib  # For saving the scaler

# Team Member: Ben Buchanan
def load_and_analyse_data(data_path):
    """Load a CSV file into a DataFrame and print a quick summary of it.

    What the code does:
    - Loads data from a CSV file (using the given file path) into a pandas DataFrame.
    - Prints the first few rows to give a preview of the data.
    - Iterates over each column to print its name and datatype.
    - Displays descriptive statistics for the "Heart Disease" column.
    - Returns the loaded DataFrame for further analysis.

    Args:
        data_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame read from ``data_path``, unmodified.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        KeyError: If the file has no "Heart Disease" column.
    """
    raw_data = pd.read_csv(data_path)

    print(f"Data head: \n {raw_data.head()}")

    # One line per column so the dtypes can be checked at a glance.
    for column, dtype in raw_data.dtypes.items():
        print(f"Column name: {column}, Datatype: {dtype}")

    print(raw_data["Heart Disease"].describe())

    return raw_data


def categorise(row):
    """
    Map the "Heart Disease" entry of a row to a binary label.

    Returns 1 for "Presence" and 0 for "Absence"; any other value raises
    a ValueError that names the offending value.
    """
    diagnosis = row["Heart Disease"]

    if diagnosis == "Presence":
        return 1
    if diagnosis == "Absence":
        return 0

    raise ValueError(f"Unexpected value: {diagnosis}")


def preprocess_data(raw_data, save_scalar=True, scalar_path="scalars/scaler.pkl"):
    """Split the frame into standardised features and the target column.

    Every column except "Heart Disease" and "Target" is treated as a feature
    and standardised with a freshly fitted ``StandardScaler``.

    Args:
        raw_data: DataFrame containing the feature columns plus the
            "Heart Disease" and "Target" columns.
        save_scalar: When True, the fitted scaler is written to
            ``scalar_path`` with ``joblib.dump``.
        scalar_path: Destination file for the fitted scaler. Its parent
            directory is created if it has one and it does not exist yet.

    Returns:
        Tuple ``(X_scaled, y)``: the scaled features as a DataFrame with the
        original column names and index, and the "Target" column.

    Raises:
        KeyError: If "Heart Disease" or "Target" is missing from ``raw_data``.
    """
    X = raw_data.drop(["Heart Disease", "Target"], axis=1)
    y = raw_data["Target"]

    # NOTE(review): the scaler is fitted on every row passed in. If the caller
    # splits into train/test afterwards, test-set statistics are included in
    # the fit -- confirm this is intended.
    scalar = StandardScaler()

    X_scaled = pd.DataFrame(scalar.fit_transform(X), columns=X.columns, index=X.index)

    if save_scalar:
        # dirname() is "" for a bare file name, and os.makedirs("") raises
        # FileNotFoundError, so only create the directory when there is one.
        scalar_dir = os.path.dirname(scalar_path)
        if scalar_dir:
            os.makedirs(scalar_dir, exist_ok=True)
        joblib.dump(scalar, scalar_path)

    return X_scaled, y


data_path = "Heart_Disease_Prediction.csv"
output_dir = "preprocessed_data"

raw_data = load_and_analyse_data(data_path)

# Author's note from the dtype listing: every column is int64 or float64 except
# the target attribute "Heart Disease". That target is the only column needing
# encoding, and it only takes the values "Absence" and "Presence", so a small
# hand-written 0/1 mapping (categorise) is used instead of a library encoder.
raw_data["Target"] = raw_data.apply(categorise, axis=1)
print(raw_data)

X, y = preprocess_data(raw_data)

# Inspect the processed features and labels.
print("Processed X:", X, sep="\n")
print("Processed y:", y, sep="\n")

# Persist the scaled features and the labels for later use.
os.makedirs(output_dir, exist_ok=True)
X.to_csv(f"{output_dir}/X.csv")
y.to_csv(f"{output_dir}/y.csv")

print(f"Scaled data saved to {output_dir}")

FileNotFoundError: [Errno 2] No such file or directory: 'Heart_Disease_Prediction.csv'

# Split data into train and test

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Model Definition

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class HeartDataset(Dataset):
    """Torch dataset pairing float32 feature rows with float32 labels.

    ``features`` may be a pandas DataFrame or anything ``torch.tensor``
    accepts; ``labels`` may be a pandas Series or anything ``torch.tensor``
    accepts. Labels are stored as a column, i.e. reshaped to ``(-1, 1)``.
    """

    def __init__(self, features, labels):
        # Unwrap pandas containers to their underlying arrays first.
        feature_data = features.values if isinstance(features, pd.DataFrame) else features
        self.features = torch.tensor(feature_data, dtype=torch.float32)

        label_data = labels.values if isinstance(labels, pd.Series) else labels
        self.labels = torch.tensor(label_data, dtype=torch.float32).reshape(-1, 1)

    def __len__(self):
        """Number of samples, taken from the feature tensor."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at ``idx``."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

class HeartDiseasePredictionModel(nn.Module):
    """Small feed-forward binary classifier.

    Layer sizes are ``input_dim -> 16 -> 8 -> 1`` with ReLU between the
    linear layers and a sigmoid on the output, so the forward pass yields
    one value in (0, 1) per input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        first_hidden, second_hidden = 16, 8
        layers = [
            nn.Linear(input_dim, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, 1),
            nn.Sigmoid(),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        probabilities = self.network(x)
        return probabilities

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=100, patience=10):
    """Train ``model`` with early stopping on the validation loss.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``. Training stops early once the
    validation loss has failed to improve for ``patience`` consecutive
    epochs. Before returning, the weights from the epoch with the lowest
    validation loss are loaded back into the model, so the returned model
    matches the best recorded validation loss rather than the last epoch run.

    Args:
        model: ``nn.Module`` to train; moved to CUDA when available, else CPU.
        train_loader: Iterable of ``(inputs, labels)`` batches supporting ``len()``.
        val_loader: Iterable of ``(inputs, labels)`` batches supporting ``len()``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Maximum number of epochs to run.
        patience: Number of consecutive non-improving epochs tolerated
            before stopping.

    Returns:
        Tuple ``(model, train_loss_list, val_losses)``: the same model object
        (with the best weights loaded) and the per-epoch mean batch losses.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_loss_list = []
    val_losses = []
    best_val_loss = float("inf")
    best_state = None  # snapshot of the weights at the best validation loss

    epochs_no_improve = 0

    for epoch in range(num_epochs):
        # Training phase.
        model.train()
        running_loss = 0.0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Mean of the per-batch losses (each batch weighted equally).
        train_loss = running_loss / len(train_loader)
        train_loss_list.append(train_loss)

        # Validation phase: no gradients, layers in eval mode.
        model.eval()
        running_val_loss = 0.0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                running_val_loss += loss.item()

        val_loss = running_val_loss / len(val_loader)
        val_losses.append(val_loss)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            # Clone the tensors: state_dict() returns references that later
            # optimizer steps would otherwise overwrite in place.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Roll back to the best epoch; best_state stays None if no epoch ever
    # improved on infinity (e.g. zero epochs or NaN losses).
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, train_loss_list, val_losses






In [None]:
# Number of input features: the second dimension of the training matrix
# (for a DataFrame this equals the number of columns).
input_dim = X_train.shape[1]

train_dataset = HeartDataset(X_train, y_train)
test_dataset = HeartDataset(X_test, y_test)

# Only the training loader shuffles.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model = HeartDiseasePredictionModel(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# NOTE(review): the test loader is passed as the validation loader, so early
# stopping is driven by test-set loss -- confirm this is intended.
trained_model, train_losses, val_losses = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
)

Epoch 1/100, Train Loss: 0.6929, Val Loss: 0.6788
Epoch 2/100, Train Loss: 0.6874, Val Loss: 0.6737
Epoch 3/100, Train Loss: 0.6809, Val Loss: 0.6688
Epoch 4/100, Train Loss: 0.6767, Val Loss: 0.6639
Epoch 5/100, Train Loss: 0.6681, Val Loss: 0.6588
Epoch 6/100, Train Loss: 0.6633, Val Loss: 0.6531
Epoch 7/100, Train Loss: 0.6566, Val Loss: 0.6465
Epoch 8/100, Train Loss: 0.6472, Val Loss: 0.6387
Epoch 9/100, Train Loss: 0.6382, Val Loss: 0.6299
Epoch 10/100, Train Loss: 0.6300, Val Loss: 0.6198
Epoch 11/100, Train Loss: 0.6188, Val Loss: 0.6086
Epoch 12/100, Train Loss: 0.6082, Val Loss: 0.5965
Epoch 13/100, Train Loss: 0.5923, Val Loss: 0.5833
Epoch 14/100, Train Loss: 0.5746, Val Loss: 0.5681
Epoch 15/100, Train Loss: 0.5543, Val Loss: 0.5503
Epoch 16/100, Train Loss: 0.5385, Val Loss: 0.5292
Epoch 17/100, Train Loss: 0.5197, Val Loss: 0.5064
Epoch 18/100, Train Loss: 0.4925, Val Loss: 0.4821
Epoch 19/100, Train Loss: 0.4681, Val Loss: 0.4590
Epoch 20/100, Train Loss: 0.4501, Val Lo

# Model evaluation

In [None]:
def evaluate_model(model, test_loader):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

