In [16]:
import pandas as pd 

In [17]:
df = pd.read_csv("Desktop/Heart_Disease_Proj/datasets.csv")

In [18]:
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


## Data Preprocessing

In [19]:
df.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [20]:
df.drop("education", axis = 1 , inplace = True)

In [21]:
df.head()

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [22]:
df.columns


Index(['male', 'age', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [23]:
df.isnull().sum()

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [24]:
df.shape

(4240, 15)

In [25]:
# Defining the binary columns
categorical = ["male", "currentSmoker", "prevalentStroke", "prevalentHyp", "diabetes"]
for col in categorical :
    mode_value = df[col].mode()[0]
    df[col].fillna(mode_value,inplace = True)
missing_values = df.isnull().sum()
missing_values

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [26]:
# Fill missing values for numeric features
continuous = ["cigsPerDay", "BPMeds", "totChol", "BMI", "heartRate", "glucose"]
for col in continuous:
    median_value = df[col].median()
    df[col].fillna(median_value, inplace = True)
    
missing_values = df.isnull().sum()
missing_values

male               0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

## Resample  Dataset

In [27]:
df["TenYearCHD"].value_counts()

0    3596
1     644
Name: TenYearCHD, dtype: int64

In [28]:
from sklearn.utils import resample

df_majority = df[df["TenYearCHD"]==0]
df_minority = df[df["TenYearCHD"]==1]

In [29]:
df_minority_upscaled = resample(df_minority,replace= True ,n_samples = len(df_majority),random_state = 42)

In [30]:
import pandas as pd
df_resampled = pd.concat([df_majority,df_minority_upscaled])

In [31]:
df_resampled["TenYearCHD"].value_counts()

1    3596
0    3596
Name: TenYearCHD, dtype: int64

## Spliting Training And Testing data

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 

In [33]:
X = df_resampled.drop(columns = ["TenYearCHD"])
y = df_resampled["TenYearCHD"]

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

                    

In [34]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Selection

In [35]:
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier 
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

In [36]:
classifiers = [RandomForestClassifier(),AdaBoostClassifier(),
               GradientBoostingClassifier(),LogisticRegression(),SVC(), 
               KNeighborsClassifier(), DecisionTreeClassifier(),GaussianNB(),XGBClassifier()]

In [38]:
for clf in classifiers:
    clf_name =clf.__class__.__name__
    clf.fit(X_train_scaled,y_train)
    y_pred=clf.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test,y_pred)
    print (f"Accuracy Score {clf_name}:{accuracy} ")
    
    classification = classification_report(y_test,y_pred)
    print (f"Classification report for {clf_name}= {classification}")
    
    confusion = confusion_matrix(y_test,y_pred)
    print(f"Confusion Matrix for {clf_name} : {confusion}")
    print("="*60)
          
        
    

Accuracy Score RandomForestClassifier:0.970813064628214 
Classification report for RandomForestClassifier=               precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for RandomForestClassifier : [[698  37]
 [  5 699]]
Accuracy Score AdaBoostClassifier:0.6719944405837387 
Classification report for AdaBoostClassifier=               precision    recall  f1-score   support

           0       0.69      0.66      0.67       735
           1       0.66      0.68      0.67       704

    accuracy                           0.67      1439
   macro avg       0.67      0.67      0.67      1439
weighted avg       0.67      0.67      0.67      1439

Confusion Matrix for AdaBoostClassifier : [[486 249]
 [223 481]]
Accur

## RandomForest

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report , accuracy_score , confusion_matrix

In [42]:
rf = RandomForestClassifier()
rf.fit(X_train_scaled,y_train)
rf_pred = rf.predict(X_test_scaled)

accuracy_rf = accuracy_score(y_test,rf_pred)
print(f"Accuracy for RandomForest = {accuracy_rf}")

confusion_rf = confusion_matrix(y_test,rf_pred)
print("Confusion Matrix: ")
print(confusion_rf)

classification_rf = classification_report(y_test,rf_pred)
print(f"classification report : {classification_rf}")

Accuracy for RandomForest = 0.9735927727588604
Confusion Matrix: 
[[702  33]
 [  5 699]]
classification report :               precision    recall  f1-score   support

           0       0.99      0.96      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439



## Predictions from Dataset

### Prediction 1

In [60]:
print("Prediction :", rf.predict(X_test_scaled[11].reshape(1,-1))[0])
print("actual class ", y_test.iloc[11])

Prediction : 1
actual class  1


### Prediction 2

In [61]:
print("Prediction :", rf.predict(X_test_scaled[101].reshape(1,-1))[0])
print("actual class ", y_test.iloc[101])

Prediction : 1
actual class  1


### Prediction 3

In [62]:
print("Prediction :", rf.predict(X_test_scaled[1001].reshape(1,-1))[0])
print("actual class ", y_test.iloc[1001])

Prediction : 1
actual class  1


## Predictive system

In [63]:
import numpy as np

def predict(model, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose):
    # Encode categorical variables
    male_encoded = 1 if male.lower() == "male" else 0
    currentSmoker_encoded = 1 if currentSmoker.lower() == "yes" else 0
    BPMeds_encoded = 1 if BPMeds.lower() == "yes" else 0
    prevalentStroke_encoded = 1 if prevalentStroke.lower() == "yes" else 0
    prevalentHyp_encoded = 1 if prevalentHyp.lower() == "yes" else 0
    diabetes_encoded = 1 if diabetes.lower() == "yes" else 0
    
    # Prepare features array
    features = np.array([[male_encoded, age, currentSmoker_encoded, cigsPerDay, BPMeds_encoded, prevalentStroke_encoded, prevalentHyp_encoded, diabetes_encoded, totChol, sysBP, diaBP, BMI, heartRate, glucose]])
    
    # scalling
    scaled_features = scaler.transform(features)
    
    # predict by model
    result = model.predict(scaled_features)
    
    return result[0]

## Testing on Random variable

In [66]:
# test 1:
male = "female"
age = 46.00
currentSmoker = "yes"
cigsPerDay = 2.00
BPMeds = "no"
prevalentStroke = "no"
prevalentHyp = "yes"
diabetes = 'no'
totChol = 289.00
sysBP = 140.00
diaBP = 103.00
BMI = 31.14
heartRate = 82.00
glucose = 86.00


result = predict(rf, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose)


if result == 1:
    print("The Patient has Heart Diseas")
else: 
    print("The Patiennt has No Heart Deseas")

The Patiennt has No Heart Deseas


In [67]:
male = 'female'
age = 63.0
currentSmoker = 'yes'
cigsPerDay = 3.0
BPMeds = 'no'
prevalentStroke = 'no'
prevalentHyp = 'yes'
diabetes = 'no'
totChol = 267.0
sysBP = 156.5
diaBP = 92.5
BMI = 27.1
heartRate = 60.0
glucose = 79.0
result = 1.0



result = predict(rf, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose)


if result == 1:
    print("The Patient has Heart Diseas")
else: 
    print("The Patiennt has No Heart Deseas")

The Patient has Heart Diseas
