In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('datasets_4123_6408_framingham.csv')
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
df.shape

(4240, 16)

In [4]:
df.isnull().sum()


male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [5]:
df.drop('education',axis=1,inplace=True)

In [6]:
# filling missing value mode() and min()

bin_colo = ['male','currentSmoker','prevalentHyp','diabetes']

for col in bin_colo:
    mode_val = df[col].mode()[0]
    df.fillna({col: mode_val}, inplace=True)



In [7]:
df.head()

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [8]:
# filling missing value mode() and min()

bin_colo = ['cigsPerDay','BPMeds','BMI','heartRate','glucose', 'totChol']

for col in bin_colo:
    median_val = df[col].median()
    df.fillna({col: median_val}, inplace=True)


In [9]:
df.isnull().sum()

male               0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [10]:
df['TenYearCHD'].value_counts() # model is bias due to high embalace data

TenYearCHD
0    3596
1     644
Name: count, dtype: int64

In [11]:
from sklearn.utils import resample

# separate majority and minority classes
df_majority =df[df['TenYearCHD'] == 0]
df_minority =df[df['TenYearCHD'] == 1]

df_minority_upsample = resample(df_minority,
                                replace=True,
                                n_samples=len(df_majority)
                                ,random_state=42)
df_balanced = pd.concat([df_majority,df_minority_upsample])

In [12]:
df_balanced['TenYearCHD'].value_counts()

TenYearCHD
0    3596
1    3596
Name: count, dtype: int64

In [13]:
df_balanced

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
4,0,46,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
5,0,43,0,0.0,0.0,0,1,0,228.0,180.0,110.0,30.30,77.0,99.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025,0,57,0,0.0,0.0,0,1,0,207.0,175.0,80.0,20.86,83.0,75.0,1
3570,0,62,0,0.0,0.0,0,1,0,276.0,185.0,95.0,26.21,80.0,110.0,1
3537,1,58,0,0.0,0.0,0,1,0,320.0,139.0,81.5,23.65,80.0,82.0,1
622,0,64,0,0.0,0.0,0,0,0,293.0,116.0,80.0,26.81,80.0,87.0,1


In [14]:
# Scale the value of datASET in 0 to 1
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# separate the data into x feature and y labels
x = df_balanced.drop(columns=['TenYearCHD'])
y = df_balanced['TenYearCHD']

# separate train test split 
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [15]:
# Initialize StandardScaler
scaler = StandardScaler()

# Fit scaler to training data and transform both training and testing data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [16]:
x_train_scaled

array([[-0.94172615,  1.4582083 , -1.02624378, ...,  0.840088  ,
        -0.50731448, -0.34072887],
       [-0.94172615, -1.66694628,  0.97442734, ..., -1.050363  ,
        -0.0898974 ,  0.25133934],
       [-0.94172615,  1.34246184,  0.97442734, ...,  0.42574258,
        -1.34214864, -0.34072887],
       ...,
       [ 1.06187982,  1.92119417, -1.02624378, ..., -0.6972277 ,
        -1.34214864, -0.18492145],
       [ 1.06187982,  1.68970124,  0.97442734, ...,  0.59289329,
        -0.0898974 ,  4.08420194],
       [ 1.06187982, -0.74097455,  0.97442734, ...,  1.0660946 ,
         0.32751968,  1.84057505]])

In [17]:
x_test_scaled

array([[-0.94172615, -0.97246749,  0.97442734, ...,  0.48930693,
        -0.0898974 ,  0.65643864],
       [ 1.06187982, -0.16224222, -1.02624378, ..., -1.00092406,
         0.32751968, -0.40305184],
       [ 1.06187982,  0.64798304,  0.97442734, ..., -0.02391638,
        -1.17518181,  0.06437043],
       ...,
       [-0.94172615, -0.85672102, -1.02624378, ..., -1.0433003 ,
        -1.34214864, -0.21608293],
       [-0.94172615,  0.64798304,  0.97442734, ..., -0.22873486,
         1.57977092, -0.24724442],
       [-0.94172615,  0.06925071, -1.02624378, ..., -0.36292627,
        -1.50911548,  1.21734536]])

In [19]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   - -------------------------------------- 2.4/56.8 MB 16.8 MB/s eta 0:00:04
   --- ------------------------------------ 5.0/56.8 MB 14.4 MB/s eta 0:00:04
   ----- ---------------------------------- 7.6/56.8 MB 13.4 MB/s eta 0:00:04
   ------ --------------------------------- 8.9/56.8 MB 11.5 MB/s eta 0:00:05
   ------- -------------------------------- 10.5/56.8 MB 10.4 MB/s eta 0:00:05
   -------- ------------------------------- 12.1/56.8 MB 9.8 MB/s eta 0:00:05
   --------- ------------------------------ 13.6/56.8 MB 9.6 MB/s eta 0:00:05
   ---------- ----------------------------- 15.2/56.8 MB 9.3 MB/s eta 0:00:05
   ----------- ---------------------------- 17.0/56.8 MB 9.2 MB/s eta 0:00:05
  

# Training 10 Models With Different Metrics

In [18]:
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report,confusion_matrix

In [17]:
# Define a list of classification 
classifiers = [
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    GaussianNB(),
    XGBClassifier()
]


# Create a dictionary to store the result
results = {}

# Train and evaluate each classifier 
for clf in classifiers:
    clf_name = clf.__class__.__name__
    clf.fit(x_train_scaled,y_train)
    y_pred = clf.predict(x_test_scaled)

    # caculate accuracy 
    accuracy = accuracy_score(y_test,y_pred)*100
    print(f"{clf_name} Accuracy: {accuracy}:")

   # Classification report
    print(f"Classification Report for {clf_name}:")
    print(classification_report(y_test, y_pred))
    
    # Confusion matrix
    print(f"Confusion Matrix for {clf_name}:")
    print(confusion_matrix(y_test, y_pred))
    print("="*50) 

RandomForestClassifier Accuracy: 96.87282835302294:
Classification Report for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for RandomForestClassifier:
[[695  40]
 [  5 699]]
AdaBoostClassifier Accuracy: 65.18415566365532:
Classification Report for AdaBoostClassifier:
              precision    recall  f1-score   support

           0       0.68      0.61      0.64       735
           1       0.63      0.70      0.66       704

    accuracy                           0.65      1439
   macro avg       0.65      0.65      0.65      1439
weighted avg       0.65      0.65      0.65      1439

Confusion Matrix for AdaBoostClassifier:
[[447 288]
 [213 491]]
GradientBoostingCl

## Show Each Model Results

In [25]:
import pandas as pd
# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=['Model', 'Accuracy', 'F1-Score', 'Precision', 'Recall'])

# Train and evaluate each classifier
for clf in classifiers:
    clf_name = clf.__class__.__name__
    clf.fit(x_train_scaled, y_train)
    y_pred = clf.predict(x_test_scaled)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)*100
    report = classification_report(y_test, y_pred, output_dict=True)
    f1_score = report['weighted avg']['f1-score']
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    
    # Append results to DataFrame
    results_df = pd.concat([results_df, pd.DataFrame([{
    'Model': clf_name,
    'Accuracy': accuracy,
    'F1-Score': f1_score,
    'Precision': precision,
    'Recall': recall
}])], ignore_index=True)

results_df

  results_df = pd.concat([results_df, pd.DataFrame([{


Unnamed: 0,Model,Accuracy,F1-Score,Precision,Recall
0,RandomForestClassifier,96.803336,0.968031,0.969241,0.968033
1,AdaBoostClassifier,65.184156,0.651286,0.65429,0.651842
2,GradientBoostingClassifier,72.897846,0.728702,0.73132,0.728978
3,LogisticRegression,65.879083,0.65883,0.659053,0.658791
4,SVC,68.311327,0.683126,0.683656,0.683113
5,KNeighborsClassifier,78.735233,0.783833,0.812481,0.787352
6,DecisionTreeClassifier,91.174427,0.911278,0.923972,0.911744
7,GaussianNB,58.304378,0.530092,0.635597,0.583044
8,XGBClassifier,90.618485,0.905977,0.912148,0.906185


## Best Model 

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Instantiation the randomForestClassifier
rf_classifier = RandomForestClassifier()

rf_classifier.fit(x_train_scaled,y_train)

y_pred_rf = rf_classifier.predict(x_test_scaled)

# calculate accuracy
accuracy_rf = accuracy_score(y_test,y_pred_rf)*100
print("Random Forest Classifier Accuracy:", accuracy_rf)

# classification report
print("Classification Report for Random Forest Classifier:")
print(classification_report(y_test, y_pred_rf))

# Confusion matrix
print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_test, y_pred_rf))

Random Forest Classifier Accuracy: 97.15079916608757
Classification Report for Random Forest Classifier:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for Random Forest Classifier:
[[699  36]
 [  5 699]]


## Single Prediction

In [33]:
# test 
print("Prediction Class: ",rf_classifier.predict(x_test_scaled[100].reshape(1,-1))[0])
print("Actual class: ",y_test.iloc[100])

Prediction Class:  0
Actual class:  0


In [31]:
# test 2:
print("predcted class ",rf_classifier.predict(x_test_scaled[300].reshape(1,-1))[0])
print("actual class ", y_test.iloc[300])

predcted class  1
actual class  1


In [32]:
# test 3:
print("predcted class ",rf_classifier.predict(x_test_scaled[110].reshape(1,-1))[0])
print("actual class ", y_test.iloc[110])

predcted class  1
actual class  1


## Save Models

In [37]:
import pickle
pickle.dump(rf_classifier,open("rf_classifier.pkl",'wb'))
pickle.dump(scaler,open("scaler.pkl",'wb'))

## Load models to test1 

In [40]:
import pickle


# Load the randomforestclassifiers model
with open("rf_classifier.pkl","rb") as file:
    rf_classifier = pickle.load(file)

# Load the scaler 
with open("scaler.pkl","rb") as file:
    scaler = pickle.load(file)



## Prediction System

In [41]:
import numpy as np

def predict(model, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose):
    # Encode categorical variables
    male_encoded = 1 if male.lower() == "male" else 0
    currentSmoker_encoded = 1 if currentSmoker.lower() == "yes" else 0
    BPMeds_encoded = 1 if BPMeds.lower() == "yes" else 0
    prevalentStroke_encoded = 1 if prevalentStroke.lower() == "yes" else 0
    prevalentHyp_encoded = 1 if prevalentHyp.lower() == "yes" else 0
    diabetes_encoded = 1 if diabetes.lower() == "yes" else 0
    
    # Prepare features array
    features = np.array([[male_encoded, age, currentSmoker_encoded, cigsPerDay, BPMeds_encoded, prevalentStroke_encoded, prevalentHyp_encoded, diabetes_encoded, totChol, sysBP, diaBP, BMI, heartRate, glucose]])
    
    # scalling
    scaled_features = scaler.transform(features)
    
    # predict by model
    result = model.predict(scaled_features)
    
    return result[0]

In [42]:
# test 1:
male = "female"
age = 56.00
currentSmoker = "yes"
cigsPerDay = 3.00
BPMeds = "no"
prevalentStroke = "no"
prevalentHyp = "yes"
diabetes = 'no'
totChol = 285.00
sysBP = 145.00
diaBP = 100.00
BMI = 30.14
heartRate = 80.00
glucose = 86.00


result = predict(rf_classifier, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose)


if result == 1:
    print("The Patient has Heart Diseas")
else: 
    print("The Patiennt has No Heart Deseas")

The Patiennt has No Heart Deseas




In [43]:
male = 'female'
age = 63.0
currentSmoker = 'yes'
cigsPerDay = 3.0
BPMeds = 'no'
prevalentStroke = 'no'
prevalentHyp = 'yes'
diabetes = 'no'
totChol = 267.0
sysBP = 156.5
diaBP = 92.5
BMI = 27.1
heartRate = 60.0
glucose = 79.0
result = 1.0



result = predict(rf_classifier, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose)


if result == 1:
    print("The Patient has Heart Diseas")
else: 
    print("The Patiennt has No Heart Deseas")

The Patient has Heart Diseas


