In [22]:
import os
import shutil
import warnings

import joblib
import kagglehub
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score, 
                           recall_score, f1_score, matthews_corrcoef, 
                           confusion_matrix, classification_report)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings('ignore')

# Run banner: identifies the assignment and dataset in the cell output.
print("="*60)
print("MACHINE LEARNING ASSIGNMENT 2 - MODEL TRAINING")
print("Dataset: Heart Disease UCI")
print("="*60)

# Create models directory if it doesn't exist.
# exist_ok=True makes this a single idempotent call instead of a separate
# exists-check followed by a create (which can race and is more verbose).
os.makedirs('models', exist_ok=True)

# Step 1: Load and prepare the dataset
print("\n1. LOADING DATASET...")
# The CSV location can be overridden with the HEART_DATA_PATH environment
# variable so the notebook can run on machines other than the author's;
# when the variable is unset the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "HEART_DATA_PATH",
    "I:\\BITS MTech\\Semester-1\\3-Machine Learning\\Assignment-2\\dataset\\heart.csv",
)
df = pd.read_csv(DATA_PATH)

# Quick overview: shape, feature count (all columns except the target),
# number of rows and the class balance of the target column.
print(f"\nDataset Shape: {df.shape}")
print(f"Features: {df.shape[1]-1}")
print(f"Instances: {df.shape[0]}")
print(f"\nTarget Distribution:")
print(df['HeartDisease'].value_counts())
print(f"\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (doing so appended a stray "None" line).
df.info()

# Step 2: Data Preprocessing
print("\n2. PREPROCESSING DATA...")
# Separate the feature columns from the target column.
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Encode categorical variables if any.
# Every object-typed column gets its OWN LabelEncoder, kept in a dict keyed by
# column name. Re-fitting one shared encoder per column (as before) left only
# the last column's mapping available, so the string -> integer mapping of the
# other columns could not be reproduced when scoring new data.
label_encoders = {}
le = LabelEncoder()  # keeps `le` defined even if there are no object columns
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

# Split the data (80/20, stratified on the target so both splits keep the
# same class proportions; fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features for certain algorithms.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Save the fitted preprocessing objects for later use: the scaler and the
# per-column label encoders are both needed to transform new raw inputs the
# same way the training data was transformed.
joblib.dump(scaler, 'models/scaler.pkl')
joblib.dump(label_encoders, 'models/label_encoders.pkl')

# Step 3: Define models
print("\n3. TRAINING MODELS...")
# Candidate classifiers, registered in the order they are trained and reported.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Decision Tree'] = DecisionTreeClassifier(max_depth=5, random_state=42)
models['K-Nearest Neighbor'] = KNeighborsClassifier(n_neighbors=5)
models['Naive Bayes'] = GaussianNB()
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
models['XGBoost'] = xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')

# Per-model evaluation results, keyed by the model's display name
results = dict()

# Fit each classifier on the training split, score it on the test split,
# record the metrics and persist the fitted estimator to disk.
for name, model in models.items():
    print(f"\nTraining {name}...")

    # Logistic Regression and KNN are fed the standardized features;
    # every other model gets the unscaled feature frame.
    wants_scaled = name in ('Logistic Regression', 'K-Nearest Neighbor')
    X_train_use = X_train_scaled if wants_scaled else X_train
    X_test_use = X_test_scaled if wants_scaled else X_test

    # Fit, then predict hard labels for the held-out split
    model.fit(X_train_use, y_train)
    y_pred = model.predict(X_test_use)

    # AUC is computed from the positive-class probability (column 1) when the
    # estimator exposes predict_proba; otherwise from the hard predictions.
    if not hasattr(model, "predict_proba"):
        auc_score = roc_auc_score(y_test, y_pred)
    else:
        y_pred_proba = model.predict_proba(X_test_use)[:, 1]
        auc_score = roc_auc_score(y_test, y_pred_proba)

    # Label-based metrics on the test split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    # Store the metrics rounded to 4 decimals, followed by the fitted model,
    # its predictions and the ground-truth labels.
    raw_scores = {
        'Accuracy': accuracy,
        'AUC': auc_score,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'MCC': mcc,
    }
    results[name] = {metric: round(value, 4) for metric, value in raw_scores.items()}
    results[name].update({
        'Model': model,
        'Predictions': y_pred,
        'True': y_test.values,
    })

    # File name is the lower-cased display name with spaces and hyphens
    # turned into underscores (e.g. "K-Nearest Neighbor" -> "k_nearest_neighbor").
    filename = name.lower().translate(str.maketrans({' ': '_', '-': '_'}))
    joblib.dump(model, f'models/{filename}.pkl')

    # Short per-model summary in the cell output
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  AUC: {auc_score:.4f}")
    print(f"  F1 Score: {f1:.4f}")
    print(f"  MCC: {mcc:.4f}")

# Step 4: Display results table
section_rule = "=" * 60
print("\n" + section_rule)
print("4. MODEL COMPARISON TABLE")
print(section_rule)

# Transpose so that each model becomes a row, then keep only the scalar
# metric columns (the result entries also hold the estimator and arrays).
metric_columns = ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']
comparison_df = pd.DataFrame(results).transpose()
comparison_table = comparison_df.loc[:, metric_columns]
print(comparison_table.to_string())

# Persist the table next to the saved models
comparison_table.to_csv('models/model_comparison.csv')
print("\nResults saved to models/model_comparison.csv")

# Step 5: Cross-validation scores
print("\n" + "="*60)
print("5. CROSS-VALIDATION SCORES (5-Fold)")
print("="*60)

# 5-fold cross-validated accuracy on the training split for every model.
for name, model in models.items():
    if name in ['Logistic Regression', 'K-Nearest Neighbor']:
        # These models need standardized inputs. Wrapping the scaler and the
        # model in a pipeline re-fits the scaler on the training part of each
        # fold only; cross-validating on the pre-scaled X_train_scaled let the
        # validation part of every fold influence the scaling statistics
        # (data leakage) and made the scores slightly optimistic.
        cv_estimator = make_pipeline(StandardScaler(), model)
    else:
        cv_estimator = model

    # cross_val_score works on clones of the estimator, so the already-fitted
    # objects stored in `models` are left untouched.
    cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=5, scoring='accuracy')
    # Report mean accuracy with a +/- 2 standard deviation band across folds
    print(f"{name:25} CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

print("\n" + "="*60)
print("MODEL TRAINING COMPLETED SUCCESSFULLY!")
print("="*60)

MACHINE LEARNING ASSIGNMENT 2 - MODEL TRAINING
Dataset: Heart Disease UCI

1. LOADING DATASET...

Dataset Shape: (918, 12)
Features: 11
Instances: 918

Target Distribution:
HeartDisease
1    508
0    410
Name: count, dtype: int64

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), 