In [None]:
# ----------------------------------------
# 🧠 Step 1: Import Required Libraries
# ----------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
import joblib


In [None]:
# ----------------------------------------
# 📥 Step 2: Load the Dataset
# ----------------------------------------
file_path = "processed.cleveland.data"

# Column names are supplied explicitly via `names=`, so every line of the
# file is read as a data row.
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
    'restecg', 'thalach', 'exang', 'oldpeak',
    'slope', 'ca', 'thal', 'target'
]

# '?' entries are parsed as NaN; any row containing a NaN is then dropped and
# 'ca' / 'thal' are cast to float64.
df = (
    pd.read_csv(file_path, names=columns, na_values='?')
    .dropna()
    .astype({'ca': 'float64', 'thal': 'float64'})
)

# Collapse the target to a binary label: 1 when its integer value is above 0,
# otherwise 0.
df['target'] = (df['target'].astype('int64') > 0).astype('int64')


In [None]:
# ----------------------------------------
# 🧹 Step 3: Data Cleaning & Inspection
# ----------------------------------------
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()

# Summary statistics of the columns.
print(df.describe())

# Count of missing values per column.
print(df.isnull().sum())


In [None]:
# ----------------------------------------
# 📊 Step 4: Exploratory Data Analysis (EDA)
# ----------------------------------------
# Each seaborn call returns the Axes it drew on; the title is set on that
# Axes and the figure is shown before the next plot starts.

# Number of rows per target value.
ax = sns.countplot(data=df, x='target')
ax.set_title('Heart Disease Presence (1) vs Absence (0)')
plt.show()

# Pairwise correlation between all columns, annotated with two decimals.
corr_matrix = df.corr()
ax = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
ax.set_title("Feature Correlation Heatmap")
plt.show()

# Distribution of 'chol' for each target value.
ax = sns.boxplot(data=df, x='target', y='chol')
ax.set_title('Cholesterol Levels vs Heart Disease')
plt.show()


In [None]:
# ----------------------------------------
# 🏗️ Step 5: Feature Engineering
# ----------------------------------------
# NOTE: this cell rewrites `df` in place of the loaded frame. Running it a
# second time without re-running the load cell fails, because get_dummies
# below removes the 'cp' column that 'thalach_cp' is computed from.

# Bucket age into decades. right=False makes each bin closed on the left and
# open on the right ([29, 40), [40, 50), ... [70, 80)), so every label matches
# the decade it names (age 40 -> '40s') and an age of exactly 29 lands in the
# first bin. With pandas' default right-closed bins, 40 would be labelled
# '30s' and 29 would fall outside every bin and become NaN.
# Ages of 80 or more are outside the last bin and get NaN.
# NOTE(review): the lower edge 29 is presumably the youngest age in the data;
# confirm against the dataset.
df['age_group'] = pd.cut(
    df['age'],
    bins=[29, 40, 50, 60, 70, 80],
    labels=['30s', '40s', '50s', '60s', '70s'],
    right=False,
)

# Ratio and interaction features built from the raw columns. 'thalach_cp'
# uses the original numeric 'cp' column, so it has to be computed before 'cp'
# is one-hot encoded below.
df['chol_per_age'] = df['chol'] / df['age']
df['thalach_cp'] = df['thalach'] * df['cp']
df['age_oldpeak'] = df['age'] * df['oldpeak']

# One-hot encode the categorical columns; drop_first removes the first level
# of each so the dummy columns are not perfectly collinear.
df = pd.get_dummies(df, columns=['cp', 'thal', 'slope', 'age_group'], drop_first=True)


In [None]:
# ----------------------------------------
# 🧪 Step 6: Preprocessing
# ----------------------------------------
# Separate the label column from the feature columns.
y = df['target']
X = df.drop(columns='target')

# Fit a StandardScaler on the feature columns and apply it to them; the
# fitted scaler object is kept in `scaler`.
# NOTE(review): the scaler is fitted on every row, i.e. before any train/test
# split or cross-validation fold exists — confirm this is intended, since
# held-out rows then contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
# ----------------------------------------
# 📦 Step 7: Train-Test Split
# ----------------------------------------
# Hold out 20% of the rows as a test set. stratify=y keeps the proportion of
# the two target classes the same in the training and test parts, which
# matters on a small dataset; random_state fixes the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# ----------------------------------------
# 📊 Step 8: Model Comparison
# ----------------------------------------
# Candidate classifiers. random_state is fixed on the estimators that accept
# one so repeated runs print the same scores.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(kernel='linear', probability=True, random_state=42)
}

# 5-fold cross-validated accuracy per model, stored as the array of fold scores.
cv_results = {}
for name, model in models.items():
    # Scaling happens inside the pipeline, on the unscaled features X, so each
    # fold's scaler is fitted on that fold's training part only. Scoring the
    # globally pre-scaled matrix instead would let validation rows influence
    # the scaling statistics. cross_val_score clones the pipeline, so the
    # estimators stored in `models` stay unfitted.
    fold_pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(fold_pipeline, X, y, cv=5, scoring='accuracy')
    cv_results[name] = scores
    print(f"{name}: Mean Accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")


In [None]:
# ----------------------------------------
# 🛠️ Step 9: Hyperparameter Tuning
# ----------------------------------------
# Candidate grids per model family. Only rf_params is used in this cell.
log_params = {'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}
knn_params = {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']}
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 4, 6], 'min_samples_split': [2, 5]}
svm_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

# Tune on the training split only, so the test split stays unseen during the
# search and can give an honest estimate afterwards. random_state makes the
# forest, and therefore the selected parameters, reproducible.
rf_model = GridSearchCV(
    RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy'
)
rf_model.fit(X_train, y_train)
print("Best Random Forest:", rf_model.best_params_)

# Accuracy of the refitted best estimator on the held-out test split.
test_accuracy = accuracy_score(y_test, rf_model.predict(X_test))
print(f"Held-out Test Accuracy = {test_accuracy:.4f}")

# Save the best model and scaler
# The model was fitted on features transformed by `scaler`, so the two
# artifacts have to be used together at prediction time.
joblib.dump(rf_model.best_estimator_, "model_pro.pkl")
joblib.dump(scaler, "scaler_pro.pkl")
