In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score


In [2]:
# Load and clean
# The file has no header row, so column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
    'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
    'ca', 'thal', 'target'
]
# '?' entries (and anything else non-numeric) become NaN, and rows containing
# any NaN are dropped. Chained instead of inplace so re-running the cell always
# starts from the freshly downloaded frame.
df = (
    pd.read_csv(url, names=columns)
    .replace('?', np.nan)
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)
# Binarise the label: 0 stays 0, every positive value becomes 1.
df['target'] = (df['target'] > 0).astype(int)

# Feature selection (based on Step 3 output)
selected_features = ['cp', 'thalach', 'exang', 'oldpeak', 'ca', 'thal']
X = df[selected_features]
y = df['target']

# Train/Test split
# Split BEFORE scaling: fitting the scaler on all rows would leak the test
# rows' mean/variance into the training features (and into CV model selection).
# Same random_state/stratify as before, so the row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale
# Statistics are learned from the training rows only and then applied to both
# partitions. X_train / X_test are NumPy arrays, as before.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole dataset transformed with the train-fitted scaler; kept so that any
# later cell referencing X_scaled keeps working.
X_scaled = scaler.transform(X)


In [3]:
# Search space for the random forest: 3 * 4 * 2 * 2 * 2 = 96 combinations
rf_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 3, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

# Exhaustive 5-fold cross-validated search, candidates ranked by F1
rf = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=rf_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_rf.fit(X_train, y_train)

# Report the winning configuration and its score on the held-out split
print("Best Random Forest Params:", grid_rf.best_params_)
print("Random Forest F1 Score:", grid_rf.score(X_test, y_test))


Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Random Forest Params: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Random Forest F1 Score: 0.8148148148148148


In [4]:
# Candidate SVM settings: 10 log-spaced C values (1e-3 .. 1e3) crossed with
# two gamma heuristics and three kernels
svm_params = dict(
    C=np.logspace(-3, 3, 10),
    gamma=['scale', 'auto'],
    kernel=['linear', 'rbf', 'poly'],
)

# Randomised search: 10 sampled candidates, 5-fold CV each, ranked by F1
svm = SVC(probability=True, random_state=42)
random_svm = RandomizedSearchCV(
    estimator=svm,
    param_distributions=svm_params,
    n_iter=10,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
random_svm.fit(X_train, y_train)

# Report the winning configuration and its score on the held-out split
print("Best SVM Params:", random_svm.best_params_)
print("SVM F1 Score:", random_svm.score(X_test, y_test))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best SVM Params: {'kernel': 'linear', 'gamma': 'scale', 'C': np.float64(10.0)}
SVM F1 Score: 0.8076923076923077
