In [11]:

# Project Title: Heart Disease Detection using Classification Algorithms
# Improved Model Training for Higher Accuracy

# Step 1: Import libraries
import pandas as pd
import pickle
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Step 2: Load dataset

# The path is written as a raw string: it contains backslashes followed by
# "M" and "h", which are not valid escape sequences. A plain literal only works
# by accident and triggers a SyntaxWarning on recent Python versions. The
# resulting path value is unchanged.
# NOTE: this is an absolute, machine-specific location; adjust it when running
# the notebook elsewhere.
data = pd.read_csv(r"D:\MUFG\heart_disease_dataset Capstone project 2.csv")
# Display the first few rows of the dataset
print("Dataset Preview:")
print(data.head())

# Separate features and target variable:
# X holds every column except "heart_disease"; y is the "heart_disease" column.
X = data.drop("heart_disease", axis=1)
y = data["heart_disease"]

Dataset Preview:
   age  sex  chest_pain_type  resting_blood_pressure  cholesterol  \
0   58    1                1                     134          246   
1   52    0                2                     112          245   
2   59    0                2                     126          196   
3   67    1                1                     165          201   
4   51    1                0                     146          174   

   fasting_blood_sugar  resting_ecg  max_heart_rate  exercise_induced_angina  \
0                    0            0             155                        0   
1                    0            0             143                        1   
2                    0            0             162                        1   
3                    0            0             180                        0   
4                    1            1             133                        1   

   st_depression  st_slope  num_major_vessels  thalassemia  heart_disease  
0          

In [3]:
# Count missing values per column (isna is the canonical name for isnull)
data.isna().sum()

age                        0
sex                        0
chest_pain_type            0
resting_blood_pressure     0
cholesterol                0
fasting_blood_sugar        0
resting_ecg                0
max_heart_rate             0
exercise_induced_angina    0
st_depression              0
st_slope                   0
num_major_vessels          0
thalassemia                0
heart_disease              0
dtype: int64

In [4]:
# Step 3: Split data into training and testing sets (80/20)
# stratify=y keeps the class proportions of y in both partitions;
# random_state fixes the shuffle so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


In [5]:
# Step 4: Feature Scaling
# Scaling helps models like Logistic Regression and SVM perform better.
# The scaler is fitted on the training partition only and then applied to both
# partitions, so no test-set statistics enter the fit.
# NOTE: X_train / X_test are overwritten with their scaled versions, so this
# cell should be run exactly once after the split cell.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [6]:
# Step 5: Initialize models with tuned hyperparameters
# Candidates are registered one at a time, keyed by display name; insertion
# order is the order in which they are later trained and reported.
models = {}

models["Decision Tree"] = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=6,
    min_samples_split=4,
    random_state=42,
)

models["Random Forest"] = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    min_samples_split=3,
    random_state=42,
)

models["Logistic Regression"] = LogisticRegression(
    C=1.5,
    solver="liblinear",
    max_iter=2000,
    random_state=42,
)

# probability=True is enabled on the SVM in addition to the kernel settings.
models["SVM"] = SVC(
    kernel="rbf",
    C=3,
    gamma=0.1,
    probability=True,
    random_state=42,
)

In [7]:
# Step 6: Train and evaluate each model
# Every candidate is fitted on the training partition and scored on the test
# partition; the accuracy per model name is collected in `results`.
print("\nModel Performance After Optimization:\n")
results = {}

for name in models:
    model = models[name]
    # fit() returns the estimator itself, so prediction can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)

    acc = results[name] = accuracy_score(y_test, y_pred)

    print(
        "-------------------------------------------------------------",
        f"{name} Accuracy: {acc:.4f}",
        classification_report(y_test, y_pred),
        sep="\n",
    )
# NOTE: after the loop, `model` still refers to the last estimator in `models`.


Model Performance After Optimization:

-------------------------------------------------------------
Decision Tree Accuracy: 0.5875
              precision    recall  f1-score   support

           0       0.58      0.31      0.40        36
           1       0.59      0.82      0.69        44

    accuracy                           0.59        80
   macro avg       0.58      0.56      0.54        80
weighted avg       0.59      0.59      0.56        80

-------------------------------------------------------------
Random Forest Accuracy: 0.6625
              precision    recall  f1-score   support

           0       0.65      0.56      0.60        36
           1       0.67      0.75      0.71        44

    accuracy                           0.66        80
   macro avg       0.66      0.65      0.65        80
weighted avg       0.66      0.66      0.66        80

-------------------------------------------------------------
Logistic Regression Accuracy: 0.6750
              precisi

In [8]:
# Step 7: Compare all accuracies
# Prints one "<model name>: <accuracy>" line per entry of `results`,
# in the order the models were evaluated.
print(
    "\n-------------------------------------------------------------\n"
    "Model Comparison Summary (Improved Accuracies):"
)
for name in results:
    score = results[name]
    print(f"{name}: {score:.4f}")


-------------------------------------------------------------
Model Comparison Summary (Improved Accuracies):
Decision Tree: 0.5875
Random Forest: 0.6625
Logistic Regression: 0.6750
SVM: 0.6750


In [9]:
# Step 8 (Optional): Find the best model
# Picks the model name with the highest test accuracy; when several models tie,
# max() keeps the first one encountered in `results`.
best_model = max(results, key=lambda model_name: results[model_name])
print(f"\n✅ Best Performing Model: {best_model} ({results[best_model]:.4f} Accuracy)")



✅ Best Performing Model: Logistic Regression (0.6750 Accuracy)


In [12]:
# Step 9: Model saving
# Persist the trained classifier, the fitted scaler and the feature column
# order.
#
# The classifier is looked up by name in `models`. The bare `model` variable is
# only the leftover of the training loop and refers to the last estimator that
# was trained (SVM), not to the Logistic Regression model this file is meant to
# hold.
#
# NOTE: the file name says "linear_regression" although the estimator is a
# LogisticRegression classifier; the name is kept unchanged so that any
# existing loader of this file keeps working.
with open("linear_regression_model.pkl", "wb") as f:
    pickle.dump(models["Logistic Regression"], f)

# The scaler must be applied to new inputs before calling the saved model,
# because the model was trained on scaled features.
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Column names, in training order, of the feature frame X.
with open("feature_columns.json", "w") as f:
    json.dump(X.columns.tolist(), f)

print("Model, scaler, and feature columns saved.")


Model, scaler, and feature columns saved.
