In [1]:
!pip install xgboost



In [2]:
# ============================================
# Machine Learning Assignment 2
# Heart Disease Classification - Train Models
# ============================================

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier


# ============================================
# Step 1: Load Dataset
# ============================================

print("\nðŸ“Œ Loading dataset...")

df = pd.read_csv("heart_disease_uci.csv")

print("Dataset Shape:", df.shape)


# ============================================
# Step 2: Convert Target Column (num â†’ binary)
# ============================================

print("\nðŸ“Œ Converting target column (num)...")

# 0 â†’ No disease
# 1,2,3,4 â†’ Disease present
df["num"] = df["num"].apply(lambda x: 1 if x > 0 else 0)

print("Target Value Counts:\n", df["num"].value_counts())


# ============================================
# Step 3: Handle Missing Values (NaNs)
# ============================================

print("\nðŸ“Œ Handling missing values...")

# Separate categorical and numeric columns
categorical_cols = df.select_dtypes(include=["object"]).columns
numeric_cols = df.select_dtypes(exclude=["object"]).columns

# Fill numeric missing values with mean
num_imputer = SimpleImputer(strategy="mean")
df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

# Fill categorical missing values with most frequent value
cat_imputer = SimpleImputer(strategy="most_frequent")
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

print("âœ… Missing values handled successfully!")


# ============================================
# Step 4: Encode Categorical Columns
# ============================================

print("\nðŸ“Œ Encoding categorical columns...")

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("Dataset Shape After Encoding:", df.shape)


# ============================================
# Step 5: Split Features and Target
# ============================================

X = df.drop("num", axis=1)
y = df["num"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training Set Shape:", X_train.shape)
print("Testing Set Shape :", X_test.shape)


# ============================================
# Step 6: Define Models (All 6 Required)
# ============================================

models = {
    "Logistic Regression": LogisticRegression(max_iter=5000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=200),
    "XGBoost": XGBClassifier(eval_metric="logloss")
}


# ============================================
# Step 7: Train Models + Evaluate Metrics
# ============================================

print("\nðŸ“Œ Training models and calculating metrics...\n")

results = []

for name, model in models.items():
    print("Training:", name)

    # Train model
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Probabilities for AUC
    y_prob = model.predict_proba(X_test)[:, 1]

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append([name, acc, auc, prec, rec, f1, mcc])


# ============================================
# Step 8: Comparison Table Output
# ============================================

comparison_df = pd.DataFrame(
    results,
    columns=["Model", "Accuracy", "AUC", "Precision", "Recall", "F1 Score", "MCC"]
)

print("\n============================================")
print("âœ… FINAL MODEL COMPARISON TABLE")
print("============================================\n")

print(comparison_df)

# Save results to CSV (optional)
comparison_df.to_csv("model_comparison_results.csv", index=False)

print("\nðŸ“Œ Results saved as: model_comparison_results.csv")
print("\nðŸŽ‰ Training Completed Successfully!\n")



ðŸ“Œ Loading dataset...
Dataset Shape: (920, 16)

ðŸ“Œ Converting target column (num)...
Target Value Counts:
 num
1    509
0    411
Name: count, dtype: int64

ðŸ“Œ Handling missing values...
âœ… Missing values handled successfully!

ðŸ“Œ Encoding categorical columns...
Dataset Shape After Encoding: (920, 23)
Training Set Shape: (736, 22)
Testing Set Shape : (184, 22)

ðŸ“Œ Training models and calculating metrics...

Training: Logistic Regression
Training: Decision Tree
Training: KNN
Training: Naive Bayes
Training: Random Forest
Training: XGBoost

âœ… FINAL MODEL COMPARISON TABLE

                 Model  Accuracy       AUC  Precision    Recall  F1 Score  \
0  Logistic Regression  0.847826  0.916820   0.878505  0.862385  0.870370   
1        Decision Tree  0.782609  0.789480   0.863158  0.752294  0.803922   
2                  KNN  0.809783  0.909297   0.877551  0.788991  0.830918   
3          Naive Bayes  0.836957  0.909602   0.883495  0.834862  0.858491   
4        Random Forest  0.