## Import libraries

In [38]:
import pandas as pd
df=pd.read_csv('datasets_4123_6408_framingham.csv')
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [39]:
df.shape

(4240, 16)

In [40]:
df.drop('education',axis=1, inplace=True)

In [41]:
df.isnull().sum() # Missing value information

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [42]:
# Define the binary columns
bin_cols=["male","currentSmoker","prevalentStroke","prevalentHyp","diabetes"]
# Fill missing values for binary features with the most frequent values
for col in bin_cols:
    mode_values=df[col].mode()[0]
    df[col].fillna(mode_values,inplace=True)

In [43]:
df.isnull().sum()

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [44]:
import numpy as np
# Define the binary columns
numeric_cols=["cigsPerDay","BPMeds","totChol","BMI","heartRate","glucose"]
# Fill missing values for binary features with the most frequent values
for col in numeric_cols:
    median_values=df[col].median()
    df[col].fillna (median_values,inplace=True)

In [45]:
df.isnull().sum()

male               0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [46]:
df['TenYearCHD'].value_counts() # Checking the balance of dataset

TenYearCHD
0    3596
1     644
Name: count, dtype: int64

In [47]:
from sklearn.utils import resample
import pandas as pd

# Assuming df is your original DataFrame
# Separate majority and minority classes
df_majority = df[df['TenYearCHD'] == 0]
df_minority = df[df['TenYearCHD'] == 1]

# Upsample the minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,               # Sample with replacement
                                 n_samples=len(df_majority), # Match majority class size
                                 random_state=42)            # Reproducibility

# Combine the majority class with the upsampled minority class
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Shuffle the resulting dataframe (optional but recommended)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)


In [48]:
df_balanced['TenYearCHD'].value_counts()

TenYearCHD
1    3596
0    3596
Name: count, dtype: int64

## Train test Split

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#
x=df_balanced.drop(columns=['TenYearCHD'])
y=df_balanced['TenYearCHD']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)


In [54]:
x_train

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
6256,0,36,0,0.0,0.0,0,0,0,211.0,100.0,61.5,22.19,60.0,73.0
4668,0,37,1,20.0,0.0,0,0,0,186.0,135.0,91.0,21.48,66.0,84.0
940,1,48,1,20.0,0.0,0,1,0,230.0,140.5,89.0,23.34,66.0,80.0
1511,1,35,1,20.0,0.0,0,0,0,234.0,122.5,76.5,25.16,75.0,85.0
6034,1,48,1,17.0,0.0,0,1,0,250.0,177.0,124.0,26.40,75.0,69.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,0,38,1,10.0,0.0,0,0,0,220.0,114.0,73.5,27.06,68.0,67.0
5191,1,53,1,20.0,0.0,0,1,0,216.0,110.0,79.0,24.76,75.0,74.0
5226,1,43,1,15.0,0.0,0,0,0,210.0,115.0,77.5,25.10,70.0,68.0
5390,0,50,1,9.0,0.0,0,1,0,161.0,145.0,89.0,20.30,66.0,81.0


In [59]:
# Initialize standardscaler
scaler=StandardScaler()

#Fit scaler to training data and transform both training and testing data
x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.transform(x_test)

In [60]:
x_train_scaled

array([[-0.9407409 , -1.77949352, -1.02624378, ..., -0.92420219,
        -1.33120285, -0.37509849],
       [-0.9407409 , -1.66361212,  0.97442734, ..., -1.09131873,
        -0.83320285, -0.01376629],
       [ 1.06299194, -0.38891674,  0.97442734, ..., -0.65352047,
        -0.83320285, -0.14515982],
       ...,
       [ 1.06299194, -0.96832373,  0.97442734, ..., -0.23925974,
        -0.50120285, -0.5393404 ],
       [-0.9407409 , -0.15715395,  0.97442734, ..., -1.36906172,
        -0.83320285, -0.11231143],
       [-0.9407409 ,  1.58106702, -1.02624378, ...,  0.8670047 ,
         0.32879715, -0.04661467]])

## Model traning

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import xgboost as xgb

# If you want XGBoost, install it via `pip install xgboost` and import:
# import xgboost as xgb

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM (RBF Kernel)': SVC(probability=True),  # probability=True if you want probabilities later
    'AdaBoost': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': xgb.XGBClassifier(),
    'Naive Bayes': GaussianNB()
}

for name, model in models.items():
    model.fit(x_train_scaled, y_train)
    y_pred = model.predict(x_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} accuracy: {acc:.3f}")


Logistic Regression accuracy: 0.676
Decision Tree accuracy: 0.915
Random Forest accuracy: 0.974
SVM (RBF Kernel) accuracy: 0.709
AdaBoost accuracy: 0.672
Gradient Boosting accuracy: 0.733
XGBoost accuracy: 0.926
Naive Bayes accuracy: 0.587


In [73]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create and train the model
rfc = RandomForestClassifier()
rfc.fit(x_train_scaled, y_train)

# Predict
y_pred = rfc.predict(x_test_scaled)

# Evaluate
acc = accuracy_score(y_test, y_pred)
print(f"Random Forest accuracy: {acc:.3f}")



Random Forest accuracy: 0.976


In [74]:
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
# Accuracy score
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification Report (precision, recall, f1-score)
cr = classification_report(y_test, y_pred)
print("Classification Report:\n", cr)


Accuracy: 0.9756775538568451
Confusion Matrix:
 [[691  32]
 [  3 713]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98       723
           1       0.96      1.00      0.98       716

    accuracy                           0.98      1439
   macro avg       0.98      0.98      0.98      1439
weighted avg       0.98      0.98      0.98      1439



## Model Predictions :

In [78]:
import numpy as np

def predict(model, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose):
    # Encode categorical variables
    male_encoded = 1 if male.lower() == "male" else 0
    currentSmoker_encoded = 1 if currentSmoker.lower() == "yes" else 0
    BPMeds_encoded = 1 if BPMeds.lower() == "yes" else 0
    prevalentStroke_encoded = 1 if prevalentStroke.lower() == "yes" else 0
    prevalentHyp_encoded = 1 if prevalentHyp.lower() == "yes" else 0
    diabetes_encoded = 1 if diabetes.lower() == "yes" else 0
    
    # Prepare features array
    features = np.array([[male_encoded, age, currentSmoker_encoded, cigsPerDay, BPMeds_encoded, prevalentStroke_encoded, prevalentHyp_encoded, diabetes_encoded, totChol, sysBP, diaBP, BMI, heartRate, glucose]])
    
    # scalling
    scaled_features = scaler.transform(features)
    
    # predict by model
    result = model.predict(scaled_features)
    
    return result[0]

In [82]:
# test 1:
male = "female"
age = 56.00
currentSmoker = "yes"
cigsPerDay = 3.00
BPMeds = "no"
prevalentStroke = "no"
prevalentHyp = "yes"
diabetes = 'no'
totChol = 285.00
sysBP = 145.00
diaBP = 100.00
BMI = 30.14
heartRate = 80.00
glucose = 86.00


result = predict(rfc, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose)


if result == 1:
    print("The Patient has Heart Diseas")
else: 
    print("The Patient has No Heart Deseas")

The Patient has No Heart Deseas




## Model Save

In [89]:
import pickle
pickle.dump(rfc,open("rfc_classifier.pkl",'wb'))
pickle.dump(scaler,open("scaler.pkl",'wb'))