In [1]:
import pandas as pd
df = pd.read_csv(r'C:\Users\Bhavya\OneDrive\Desktop\Heart Disease Prediction\framingham.csv')
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [2]:
df.shape

(4240, 16)

In [3]:
df.drop('education',axis=1, inplace=True)

In [4]:
df.isnull().sum()

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [5]:
bin_cols = ["male", "currentSmoker", "prevalentStroke", "prevalentHyp", "diabetes"]
for col in bin_cols:
    mode_val = df[col].mode()[0]
    df[col] = df[col].fillna(mode_val)

In [6]:
df.isnull().sum()

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [7]:
import numpy as np

numeric_cols = ["cigsPerDay", "BPMeds", "totChol", "BMI","heartRate", "glucose"]
for col in numeric_cols:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)

In [8]:
df.isnull().sum()

male               0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [9]:
df['TenYearCHD'].value_counts()

TenYearCHD
0    3596
1     644
Name: count, dtype: int64

In [10]:
from sklearn.utils import resample

df_majority = df[df['TenYearCHD'] == 0]
df_minority = df[df['TenYearCHD'] == 1]

df_minority_upsampled = resample(df_minority,
                                  replace=True,
                                  n_samples=len(df_majority),
                                  random_state=42)

df_balanced = pd.concat([df_majority, df_minority_upsampled])

In [11]:
df_balanced['TenYearCHD'].value_counts()

TenYearCHD
0    3596
1    3596
Name: count, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df_balanced.drop(columns=['TenYearCHD'])
Y = df_balanced['TenYearCHD']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [13]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

In [14]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()

rf_model.fit(X_train_scaled,y_train)

y_pred = rf_model.predict(X_test_scaled)

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [16]:
accuracy_score(y_test, y_pred)

0.9715079916608756

In [17]:
confusion_matrix(y_test,y_pred)

array([[699,  36],
       [  5, 699]], dtype=int64)

In [18]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439



In [19]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [20]:
classifiers = [
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    GaussianNB(),
    XGBClassifier()
]

results = {}

results_df = pd.DataFrame(columns=['Model', 'Accuracy', 'F1-Score', 'Precision', 'Recall'])  

for clf in classifiers:
    clf_name = clf.__class__.__name__
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy}")

    
    print(f"Classification Report for {clf_name}:")
    print(classification_report(y_test, y_pred))

    
    print(f"Confusion Matrix for{clf_name}:")
    print(confusion_matrix(y_test, y_pred))
    print("="*50)

    
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    f1_score = report['weighted avg']['f1-score']
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']

    
    results_df = pd.concat([results_df, pd.DataFrame([{'Model': clf_name, 'Accuracy': accuracy, 'F1-Score': f1_score,
                                    'Precision': precision, 'Recall':recall}])], ignore_index=True)
    
results_df 

RandomForestClassifier Accuracy: 0.9694232105628909
Classification Report for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix forRandomForestClassifier:
[[696  39]
 [  5 699]]


  results_df = pd.concat([results_df, pd.DataFrame([{'Model': clf_name, 'Accuracy': accuracy, 'F1-Score': f1_score,


AdaBoostClassifier Accuracy: 0.6719944405837387
Classification Report for AdaBoostClassifier:
              precision    recall  f1-score   support

           0       0.69      0.66      0.67       735
           1       0.66      0.68      0.67       704

    accuracy                           0.67      1439
   macro avg       0.67      0.67      0.67      1439
weighted avg       0.67      0.67      0.67      1439

Confusion Matrix forAdaBoostClassifier:
[[486 249]
 [223 481]]
GradientBoostingClassifier Accuracy: 0.7289784572619875
Classification Report for GradientBoostingClassifier:
              precision    recall  f1-score   support

           0       0.76      0.69      0.72       735
           1       0.70      0.77      0.74       704

    accuracy                           0.73      1439
   macro avg       0.73      0.73      0.73      1439
weighted avg       0.73      0.73      0.73      1439

Confusion Matrix forGradientBoostingClassifier:
[[508 227]
 [163 541]]
Logistic

Unnamed: 0,Model,Accuracy,F1-Score,Precision,Recall
0,RandomForestClassifier,0.969423,0.969422,0.970503,0.969423
1,AdaBoostClassifier,0.671994,0.672015,0.672474,0.671994
2,GradientBoostingClassifier,0.728978,0.728702,0.73132,0.728978
3,LogisticRegression,0.658791,0.65883,0.659053,0.658791
4,SVC,0.683113,0.683126,0.683656,0.683113
5,KNeighborsClassifier,0.787352,0.783833,0.812481,0.787352
6,DecisionTreeClassifier,0.913829,0.9134,0.925501,0.913829
7,GaussianNB,0.583044,0.530092,0.635597,0.583044
8,XGBClassifier,0.906185,0.905977,0.912148,0.906185


In [21]:
results_df

Unnamed: 0,Model,Accuracy,F1-Score,Precision,Recall
0,RandomForestClassifier,0.969423,0.969422,0.970503,0.969423
1,AdaBoostClassifier,0.671994,0.672015,0.672474,0.671994
2,GradientBoostingClassifier,0.728978,0.728702,0.73132,0.728978
3,LogisticRegression,0.658791,0.65883,0.659053,0.658791
4,SVC,0.683113,0.683126,0.683656,0.683113
5,KNeighborsClassifier,0.787352,0.783833,0.812481,0.787352
6,DecisionTreeClassifier,0.913829,0.9134,0.925501,0.913829
7,GaussianNB,0.583044,0.530092,0.635597,0.583044
8,XGBClassifier,0.906185,0.905977,0.912148,0.906185


In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

rf_classifier = RandomForestClassifier()

rf_classifier.fit(X_train_scaled, y_train)

y_pred = rf_classifier.predict(X_test_scaled)

accuracy_rf = accuracy_score(y_test, y_pred)
print("Random Forest Classifier Accuracy:", accuracy_rf)

print("Classification Report for Random Forest Classifier:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_test, y_pred))

Random Forest Classifier Accuracy: 0.9742876997915219
Classification Report for Random Forest Classifier:
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       735
           1       0.96      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for Random Forest Classifier:
[[703  32]
 [  5 699]]


In [23]:
print("predicted class ",rf_classifier.predict(X_test_scaled[10].reshape(1, -1))[0])
print("actual class ", y_test.iloc[10])

predicted class  0
actual class  0


In [24]:
print("predicted class ",rf_classifier.predict(X_test_scaled[200].reshape(1, -1))[0])
print("actual class ", y_test.iloc[200])

predicted class  1
actual class  1


In [25]:
import pickle

pickle.dump(rf_classifier, open(r"C:\Users\Bhavya\OneDrive\Desktop\Heart Disease Prediction\Models\rf_classifier.pkl", "wb"))
pickle.dump(scaler, open(r"C:\Users\Bhavya\OneDrive\Desktop\Heart Disease Prediction\Models\scaler.pkl", "wb"))

In [26]:
with open(r"C:\Users\Bhavya\OneDrive\Desktop\Heart Disease Prediction\Models\rf_classifier.pkl", "rb") as file:
    rf_classifier = pickle.load(file)

with open(r"C:\Users\Bhavya\OneDrive\Desktop\Heart Disease Prediction\Models\scaler.pkl", "rb") as file:
    scaler = pickle.load(file)

In [43]:
def predict(rf_classifier, scaler, male, age, currentSmoker, cigsPerday, BPMeds, prevalentStroke, prevalentHyp, diabetes, totchol, sysBP, diaBP, BMI, heartRate, glucose):
    
    male_encoded = 1 if male.lower() == "male" else 0
    currentSmoker_encoded = 1 if currentSmoker.lower() == "yes" else 0
    BPMeds_encoded = 1 if BPMeds.lower() == "yes" else 0
    prevalentStroke_encoded = 1 if prevalentStroke.lower() == "yes" else 0
    prevalentHyp_encoded = 1 if prevalentHyp.lower() == "yes" else 0
    diabetes_encoded = 1 if diabetes.lower() == "yes" else 0

    features = np.array([[male_encoded, age, currentSmoker_encoded, cigsPerDay, BPMeds_encoded, prevalentStroke_encoded, prevalentHyp_encoded, diabetes_encoded, totchol, sysBP, diaBP, BMI, heartRate, glucose]])

    scaled_features = scaler.transform(features)

    result = rf_classifier.predict(scaled_features)
    
    return result[0]

In [45]:
male = "female"
age = 56.00
currentSmoker = "yes"
cigsPerDay = 3.00
BPMeds = "no"
prevalentStroke = "no"
prevalentHyp = "yes"
diabetes = 'no'
totchol = 285.00
sysBP = 145.00
diaBP = 100.00
BMI = 30.14
heartRate = 80.00
glucose = 86.00

result = predict(rf_classifier, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totchol, sysBP, diaBP, BMI, heartRate, glucose)

if result==1:
    print("This patient has heart disease")
else:
    print("This patient has not heart disease")

This patient has not heart disease


