In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from google.colab import files
uploaded = files.upload()
data = pd.read_csv("CVD_cleaned.csv")

# Handle Age_Category ranges (convert to numeric midpoints)
age_map = {
    '18-24': 21,
    '25-29': 27,
    '30-34': 32,
    '35-39': 37,
    '40-44': 42,
    '45-49': 47,
    '50-54': 52,
    '55-59': 57,
    '60-64': 62,
    '65-69': 67,
    '70-74': 72,
    '75-79': 77,
    '80': 80,
    '80+': 81
}

# Apply mapping
data['Age_Category'] = data['Age_Category'].replace(age_map)

# Ensure the column is numeric after replacement
data['Age_Category'] = pd.to_numeric(data['Age_Category'], errors='coerce')

## Encode categorical variables
from sklearn.preprocessing import LabelEncoder

binary_cols = ['General_Health', 'Exercise', 'Heart_Disease',
               'Skin_Cancer', 'Other_Cancer', 'Depression', 'Arthritis', 'Sex',
               'Smoking_History']
for col in binary_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

## One-hot encoding for Checkup and diabetes column
data = pd.get_dummies(
    data,
    columns=['Checkup', 'Diabetes'],
    drop_first=False,
    dtype=int
)

# Print Checkup one-hot columns only
print("\nCHECKUP COLUMNS")
print(data.filter(like="Checkup").head())

# Print Diabetes one-hot columns only
print("\nDIABETES COLUMNS")
print(data.filter(like="Diabetes").head())

# Print all other columns (exclude Checkup and Diabetes)
other_cols = data.drop(columns=data.filter(regex="Checkup|Diabetes").columns)

print("\nALL OTHER COLUMNS")
print(other_cols.head())

print("\nNew shape after encoding:", data.shape)

print("Preprocessing complete.")


# Train/Test Split (80/20)
X = data.drop(columns=['Heart_Disease']).values
Y = data['Heart_Disease'].values

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=0, stratify=Y
)

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM Models
from sklearn.svm import SVC, LinearSVC
import time

# Originally tested 4 different kernels
# Only linear kernel (others commented out)
kernels = ["linear"]
# kernels = ["linear", "poly", "rbf", "sigmoid"]  # original list

results = {
    "Kernel": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1": [],
    "Train_Time_sec": []
}

for kernel in kernels:
    print(f"\nTraining SVM kernel = {kernel}")

    start = time.time()

    # Linear SVM
    if kernel == "linear":
        model = LinearSVC(C=1.0, max_iter=8000, class_weight='balanced')

    # Non-linear SVMs (slower on 300k rows)
    # else:
    #     model = SVC(kernel=kernel, C=1.0, gamma="scale", class_weight='balanced')

    model.fit(X_train_scaled, Y_train)
    train_time = time.time() - start

    y_pred = model.predict(X_test_scaled)

    # Metrics
    acc = accuracy_score(Y_test, y_pred)
    prec = precision_score(Y_test, y_pred, zero_division=0)
    rec = recall_score(Y_test, y_pred, zero_division=0)
    f1 = f1_score(Y_test, y_pred, zero_division=0)

    # Confusion Matrix
    cm = confusion_matrix(Y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)
    print(f"TN={TN}, FP={FP}, FN={FN}, TP={TP}\n")

    results["Kernel"].append(kernel)
    results["Accuracy"].append(acc)
    results["Precision"].append(prec)
    results["Recall"].append(rec)
    results["F1"].append(f1)
    results["Train_Time_sec"].append(train_time)

# Results to DataFrame
df_results = pd.DataFrame(results)
print("\n=== SVM Results ===")
print(df_results.round(4))





Saving CVD_cleaned.csv to CVD_cleaned (1).csv


  data['Age_Category'] = data['Age_Category'].replace(age_map)



CHECKUP COLUMNS
   Checkup_5 or more years ago  Checkup_Never  \
0                            0              0   
1                            0              0   
2                            0              0   
3                            0              0   
4                            0              0   

   Checkup_Within the past 2 years  Checkup_Within the past 5 years  \
0                                1                                0   
1                                0                                0   
2                                0                                0   
3                                0                                0   
4                                0                                0   

   Checkup_Within the past year  
0                             0  
1                             1  
2                             1  
3                             1  
4                             1  

DIABETES COLUMNS
   Diabetes_No  Diabetes_No, pre-diabet

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Upload dataset
from google.colab import files
uploaded = files.upload()
data = pd.read_csv("CVD_cleaned.csv")

# Handle Age_Category ranges (convert to numeric midpoints)
age_map = {
    '18-24': 21,
    '25-29': 27,
    '30-34': 32,
    '35-39': 37,
    '40-44': 42,
    '45-49': 47,
    '50-54': 52,
    '55-59': 57,
    '60-64': 62,
    '65-69': 67,
    '70-74': 72,
    '75-79': 77,
    '80': 80,
    '80+': 81
}

# Apply mapping
data['Age_Category'] = data['Age_Category'].replace(age_map)

# Ensure the column is numeric after replacement
data['Age_Category'] = pd.to_numeric(data['Age_Category'], errors='coerce')

## Encode categorical variables
from sklearn.preprocessing import LabelEncoder

binary_cols = ['General_Health', 'Exercise', 'Heart_Disease',
               'Skin_Cancer', 'Other_Cancer', 'Depression', 'Arthritis', 'Sex',
               'Smoking_History']
for col in binary_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

## One-hot encoding for Checkup and diabetes column
data = pd.get_dummies(
    data,
    columns=['Checkup', 'Diabetes'],
    drop_first=False,
    dtype=int
)


# Train/Test split (80/20)
X = data.drop(columns=['Heart_Disease']).values
Y = data['Heart_Disease'].values

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=0, stratify=Y
)

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Validation Accuracy
from sklearn.svm import LinearSVC

model = LinearSVC(C=1.0, class_weight='balanced', max_iter=8000)

val_scores = cross_val_score(model, X_train_scaled, Y_train,
                             cv=5, scoring='accuracy')

print("Validation accuracies for each fold:", val_scores)
print("Mean validation accuracy:", val_scores.mean())


# Train final Linear SVM on full training set
model.fit(X_train_scaled, Y_train)

# Test accuracy at default threshold
y_pred = model.predict(X_test_scaled)

print("\n=== Default Threshold Results ===")
cm = confusion_matrix(Y_test, y_pred, labels=[0,1])
TN, FP, FN, TP = cm.ravel()
print(f"TN={TN}, FP={FP}, FN={FN}, TP={TP}")
print(f"Accuracy:  {accuracy_score(Y_test, y_pred):.4f}")
print(f"Precision: {precision_score(Y_test, y_pred):.4f}")
print(f"Recall:    {recall_score(Y_test, y_pred):.4f}")
print(f"F1 Score:  {f1_score(Y_test, y_pred):.4f}")


# Decision Threshold Tuning (-0.25 to 0.1, excluding 0)
scores = model.decision_function(X_test_scaled)

thresholds = [-0.25, -0.2, -0.15, -0.1, -0.05, 0.05, 0.1]

print("\n=== Threshold Tuning Results ===")
for t in thresholds:
    y_pred_t = (scores > t).astype(int)

    acc = accuracy_score(Y_test, y_pred_t)
    prec = precision_score(Y_test, y_pred_t, zero_division=0)
    rec = recall_score(Y_test, y_pred_t, zero_division=0)
    f1 = f1_score(Y_test, y_pred_t, zero_division=0)

    cm = confusion_matrix(Y_test, y_pred_t, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()

    print(f"\nThreshold = {t}")
    print(f"TN={TN}, FP={FP}, FN={FN}, TP={TP}")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")



Saving CVD_cleaned.csv to CVD_cleaned (1).csv


  data['Age_Category'] = pd.to_numeric(data['Age_Category'].replace(age_map), errors='coerce')


New shape after encoding: (308854, 26)
Validation accuracies for each fold: [0.7044944  0.7001032  0.70309812 0.70027926 0.69892343]
Mean validation accuracy: 0.7013796839253188

=== Default Threshold Results ===
TN=39478, FP=17299, FN=1058, TP=3936
Accuracy:  0.7028
Precision: 0.1854
Recall:    0.7881
F1 Score:  0.3001

=== Threshold Tuning Results ===

Threshold = -0.25
TN=30364, FP=26413, FN=461, TP=4533
Accuracy:  0.5649
Precision: 0.1465
Recall:    0.9077
F1 Score:  0.2523

Threshold = -0.2
TN=32190, FP=24587, FN=546, TP=4448
Accuracy:  0.5931
Precision: 0.1532
Recall:    0.8907
F1 Score:  0.2614

Threshold = -0.15
TN=33980, FP=22797, FN=642, TP=4352
Accuracy:  0.6206
Precision: 0.1603
Recall:    0.8714
F1 Score:  0.2708

Threshold = -0.1
TN=35814, FP=20963, FN=746, TP=4248
Accuracy:  0.6486
Precision: 0.1685
Recall:    0.8506
F1 Score:  0.2813

Threshold = -0.05
TN=37700, FP=19077, FN=896, TP=4098
Accuracy:  0.6767
Precision: 0.1768
Recall:    0.8206
F1 Score:  0.2910

Threshold 