# Heart Disease Prediction.

# Loading DataSet.

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/content/datasets_4123_6408_framingham.csv')

In [None]:
df.head(3)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0


In [None]:
df.drop('education', axis= 1, inplace= True)

In [None]:
df.isnull().sum()

Unnamed: 0,0
male,0
age,0
currentSmoker,0
cigsPerDay,29
BPMeds,53
prevalentStroke,0
prevalentHyp,0
diabetes,0
totChol,50
sysBP,0


# Note: When dealing with categorical cols replace them with mode and in-case of continous cols replace them with median.

# Defining Categorical cols.

In [None]:
categorical_cols = ["male", "currentSmoker", "prevalentStroke", "prevalentHyp", "diabetes"]

# Fill the missing values with mode.
for cols in categorical_cols:
    mode_val = df[cols].mode()[0]
    df[cols].fillna(mode_val)

missing_values = df.isnull().sum()

In [None]:
missing_values

Unnamed: 0,0
male,0
age,0
currentSmoker,0
cigsPerDay,29
BPMeds,53
prevalentStroke,0
prevalentHyp,0
diabetes,0
totChol,50
sysBP,0


# Defining Numerical cols.

In [None]:
import numpy as np
numeric_cols = ["cigsPerDay", "BPMeds", "totChol", "BMI", "heartRate", "glucose"]
for cols in numeric_cols:
    median_val = df[cols].median()
    df[cols].fillna(median_val, inplace= True)

missing_values = df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[cols].fillna(median_val, inplace= True)


In [None]:
missing_values

Unnamed: 0,0
male,0
age,0
currentSmoker,0
cigsPerDay,0
BPMeds,0
prevalentStroke,0
prevalentHyp,0
diabetes,0
totChol,0
sysBP,0


# Balancing DataSet

In [None]:
df['TenYearCHD'].value_counts()

Unnamed: 0_level_0,count
TenYearCHD,Unnamed: 1_level_1
0,3596
1,644


In [None]:
from sklearn.utils import resample

df_maj = df[df['TenYearCHD'] == 0]
df_min = df[df['TenYearCHD'] == 1]

df_min_upsample = resample(df_min, replace= True, n_samples= len(df_maj))
df_balance = pd.concat([df_maj, df_min_upsample])

In [None]:
df_balance['TenYearCHD'].value_counts()

Unnamed: 0_level_0,count
TenYearCHD,Unnamed: 1_level_1
0,3596
1,3596


# Apply train_test_split

In [None]:
from sklearn.model_selection import train_test_split
x = df_balance.drop(columns= 'TenYearCHD')
y = df_balance['TenYearCHD']

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size= 0.2, random_state= 42)

In [None]:
xtrain.head(3)

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
284,1,68,0,0.0,0.0,0,1,1,164.0,142.0,85.0,30.28,70.0,120.0
784,0,43,1,15.0,0.0,0,0,0,199.0,137.0,81.0,21.85,70.0,72.0
1128,0,63,1,10.0,0.0,0,1,0,236.0,189.0,103.0,27.91,60.0,74.0


# Applying StandardScaler on uneven data.

In [None]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

ss_Xtrain = ss.fit_transform(xtrain)
ss_Xtest = ss.transform(xtest)

In [None]:
ss_Xtrain

array([[ 1.05449879,  1.92051628, -1.01418014, ...,  0.97736811,
        -0.51269392,  1.15276119],
       [-0.94831783, -0.97170032,  0.98601813, ..., -0.99360766,
        -0.51269392, -0.394436  ],
       [-0.94831783,  1.34207296,  0.98601813, ...,  0.42325037,
        -1.3635357 , -0.32996945],
       ...,
       [ 1.05449879, -0.16187967,  0.98601813, ...,  0.5050821 ,
        -0.68286227,  0.31469605],
       [ 1.05449879, -0.393257  ,  0.98601813, ..., -0.64523828,
        -0.85303063, -0.1365698 ],
       [ 1.05449879, -0.74032299,  0.98601813, ...,  1.05919985,
         0.33814786,  1.92635979]])

# Applying Random Forest Method.

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(ss_Xtrain, ytrain)
pred = rfc.predict(ss_Xtest)

In [None]:
pred

array([1, 1, 1, ..., 1, 1, 1])

# Getting Accuracy, Report, Confusion_Matrix

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
accuracy_score(ytest, pred)

0.970813064628214

In [None]:
print(classification_report(ytest, pred))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439



In [None]:
confusion_matrix(ytest, pred)

array([[700,  35],
       [  7, 697]])

# Traning the mode with 10 diff Classifiers.

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
classifiers = [
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    GaussianNB(),
    XGBClassifier()
]

for clf in classifiers:
    clf_name = clf.__class__.__name__ # This line goto the name using __class__.__name__ and store it in clf_name.
    clf.fit(ss_Xtrain, ytrain)
    y_pred = clf.predict(ss_Xtest)

    # Calculating accuracy
    accuracy = accuracy_score(ytest, pred)
    print(f"{clf_name} Accuracy: {accuracy}")

    # Calculating Classification report
    print(f"Classification Report for {clf_name}:")
    print(classification_report(ytest, pred))

    # Calculating Confusion matrix
    print(f"Confusion Matrix for {clf_name}:")
    print(confusion_matrix(ytest, pred))
    print("="*50) # Actually a seperator.

RandomForestClassifier Accuracy: 0.970813064628214
Classification Report for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for RandomForestClassifier:
[[700  35]
 [  7 697]]
AdaBoostClassifier Accuracy: 0.970813064628214
Classification Report for AdaBoostClassifier:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for AdaBoostClassifier:
[[700  35]
 [  7 697]]
GradientBoostingClas

# Show 10 Models Results.

In [None]:
# Creating a DataFrame.
results_df = pd.DataFrame(columns=['Model', 'Accuracy', 'F1-Score', 'Precision', 'Recall'])

# Train and evaluate each classifier
for clf in classifiers:
    clf_name = clf.__class__.__name__
    clf.fit(ss_Xtrain, ytrain)
    y_pred = clf.predict(ss_Xtest)

    # Calculate evaluation metrics
    accuracy = accuracy_score(ytest, pred)
    report = classification_report(ytest, pred, output_dict=True)
    f1_score = report['weighted avg']['f1-score']
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']

    # Append results to DataFrame using pd.concat
    results_df = pd.concat([results_df, pd.DataFrame([{'Model': clf_name, 'Accuracy': accuracy, 'F1-Score': f1_score,
                                    'Precision': precision, 'Recall': recall}])], ignore_index=True)

results_df

  results_df = pd.concat([results_df, pd.DataFrame([{'Model': clf_name, 'Accuracy': accuracy, 'F1-Score': f1_score,


Unnamed: 0,Model,Accuracy,F1-Score,Precision,Recall
0,RandomForestClassifier,0.970813,0.970814,0.971551,0.970813
1,AdaBoostClassifier,0.970813,0.970814,0.971551,0.970813
2,GradientBoostingClassifier,0.970813,0.970814,0.971551,0.970813
3,LogisticRegression,0.970813,0.970814,0.971551,0.970813
4,SVC,0.970813,0.970814,0.971551,0.970813
5,KNeighborsClassifier,0.970813,0.970814,0.971551,0.970813
6,DecisionTreeClassifier,0.970813,0.970814,0.971551,0.970813
7,GaussianNB,0.970813,0.970814,0.971551,0.970813
8,XGBClassifier,0.970813,0.970814,0.971551,0.970813


# Selecting Our Best Model for traning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Instantiate the RandomForestClassifier
rf_classifier = RandomForestClassifier()

# Train the RandomForestClassifier
rf_classifier.fit(ss_Xtrain, ytrain)

# Predict on the test set
y_pred_rf = rf_classifier.predict(ss_Xtest)

# Calculate accuracy
accuracy_rf = accuracy_score(ytest, y_pred_rf)
print("Random Forest Classifier Accuracy:", accuracy_rf)

# Classification report
print("Classification Report for Random Forest Classifier:")
print(classification_report(ytest, y_pred_rf))

# Confusion matrix
print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(ytest, y_pred_rf))

Random Forest Classifier Accuracy: 0.9742876997915219
Classification Report for Random Forest Classifier:
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       735
           1       0.96      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for Random Forest Classifier:
[[705  30]
 [  7 697]]


# Making Some Predictions

In [None]:
print('Predicted Class: ', rf_classifier.predict(ss_Xtest[10].reshape(1, -1)))
print('Actual Class: ', ytest.iloc[10])

Predicted Class:  [0]
Actual Class:  0


# Saving Model

In [None]:
import pickle
pickle.dump(rf_classifier, open('rfc_model.pkl', 'wb'))
pickle.dump(ss, open('scaler_model.pkl', 'wb'))

In [None]:
with open('rfc_model.pkl', 'rb') as file:
    rfc_classifier = pickle.load(file)

with open('scaler_model.pkl', 'rb') as file:
    scaler = pickle.load(file)

# Defining Function.

In [None]:
import numpy as np

def predict(rf_classifier,male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose):
# Encoding Categorical Cols.
    male_encoded = 1 if male.lower() == 'male' else 0
    currentSmoker_encoded = 1 if currentSmoker.lower() == 'yes' else 0
    BPMeds_encoded = 1 if BPMeds.lower() == 'yes' else 0
    prevalentStroke_encoded = 1 if prevalentStroke.lower() == 'yes' else 0
    prevalentHyp_encoded = 1 if prevalentHyp.lower() == 'yes' else 0
    diabetes_encoded = 1 if diabetes.lower() == 'yes' else 0
# Making a 2D Array.
    features = np.array([[male_encoded, age, currentSmoker_encoded, cigsPerDay, BPMeds_encoded, prevalentStroke_encoded, prevalentHyp_encoded, diabetes_encoded, totChol, sysBP, diaBP, BMI, heartRate, glucose]])

    scaled_features = scaler.transform(features)
    result = rf_classifier.predict(scaled_features)
    return result

In [None]:
male = "female"
age = 56.00
currentSmoker = "yes"
cigsPerDay = 3.00
BPMeds = "no"
prevalentStroke = "no"
prevalentHyp = "yes"
diabetes = 'no'
totChol = 285.00
sysBP = 145.00
diaBP = 100.00
BMI = 30.14
heartRate = 80.00
glucose = 86.00

result = predict(rf_classifier, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose)
print(result)


if result == 1:
    print("The person has heart disease.")
else:
    print("The person does not have heart disease.")

[0]
The person does not have heart disease.




In [85]:
import sklearn
sklearn.__version__

'1.6.0'