In [61]:
import pandas as pd
import numpy as np

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
import plotly.graph_objects as go
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [63]:
data=pd.read_csv("./data/heart_attack_prediction_dataset.csv")

In [64]:
data.head(3)

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0


In [65]:
# Splitting Blood Pressure column into min & max Blood Pressure
data[["Max BP","Min BP"]] = data['Blood Pressure'].str.split('/',expand=True)

In [66]:
data['Max BP'] = data['Max BP'].astype('int64')
data['Min BP'] = data['Min BP'].astype('int64')

In [67]:
data = data[['Age','Sex','Cholesterol','Heart Rate','Diabetes','Smoking','Alcohol Consumption','Previous Heart Problems','Medication Use','Triglycerides','Heart Attack Risk','Max BP','Min BP']]

In [68]:
data.head(3)

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Smoking,Alcohol Consumption,Previous Heart Problems,Medication Use,Triglycerides,Heart Attack Risk,Max BP,Min BP
0,67,Male,208,72,0,1,0,0,0,286,0,158,88
1,21,Male,389,98,1,1,1,1,0,235,0,165,93
2,21,Female,324,72,1,0,0,1,1,587,0,174,99


In [69]:
X = data.drop(['Heart Attack Risk'],axis=1)

In [70]:
y = data[['Heart Attack Risk']]

In [71]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns



In [72]:
num_features

Index(['Age', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Smoking',
       'Alcohol Consumption', 'Previous Heart Problems', 'Medication Use',
       'Triglycerides', 'Max BP', 'Min BP'],
      dtype='object')

In [73]:
# Display Pipeline

from sklearn import set_config
set_config(display='diagram')

In [74]:
num_pipe = Pipeline(
    steps=[
        
        ('Scaling',StandardScaler())        
 ]
)

In [75]:
cat_pipe = Pipeline(
    steps  = [
        
        ('Onehotencoder',OneHotEncoder())       
    ]
)

In [76]:
preprocessor = ColumnTransformer(
    [
        ("StandardScaler",num_pipe, num_features),
        ("OneHotEncoder",cat_pipe,cat_features)       
    ] 
)

In [77]:
# splitting data into train & test Set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=30)

In [78]:
X_train.head(3)

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Smoking,Alcohol Consumption,Previous Heart Problems,Medication Use,Triglycerides,Max BP,Min BP
4151,22,Male,350,44,0,1,0,1,0,535,115,77
3192,31,Male,383,42,1,1,1,0,0,609,170,83
6685,22,Male,336,93,1,1,1,0,0,439,107,80


In [79]:
X_test.head(3)

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Smoking,Alcohol Consumption,Previous Heart Problems,Medication Use,Triglycerides,Max BP,Min BP
971,37,Male,165,71,1,1,1,1,1,594,170,87
8676,18,Male,167,87,1,1,1,1,1,415,158,81
8209,67,Female,295,44,0,1,0,1,1,202,159,88


In [80]:
preprocessor.fit_transform(X_test)


array([[-0.75919733, -1.14864613, -0.19136577, ...,  0.11391151,
         0.        ,  1.        ],
       [-1.65125259, -1.12392295,  0.58634329, ..., -0.29190981,
         0.        ,  1.        ],
       [ 0.64931097,  0.45836052, -1.50374979, ...,  0.18154839,
         1.        ,  0.        ],
       ...,
       [ 1.16576401, -0.64182096,  1.21823189, ...,  0.11391151,
         1.        ,  0.        ],
       [ 0.46150986, -1.21045408,  0.68355692, ...,  0.45209594,
         0.        ,  1.        ],
       [ 1.25966456,  1.33603339,  1.46126597, ...,  1.6695599 ,
         1.        ,  0.        ]])

In [81]:
preprocessor.fit(X_test)

In [82]:
preprocessor.get_feature_names_out()

array(['StandardScaler__Age', 'StandardScaler__Cholesterol',
       'StandardScaler__Heart Rate', 'StandardScaler__Diabetes',
       'StandardScaler__Smoking', 'StandardScaler__Alcohol Consumption',
       'StandardScaler__Previous Heart Problems',
       'StandardScaler__Medication Use', 'StandardScaler__Triglycerides',
       'StandardScaler__Max BP', 'StandardScaler__Min BP',
       'OneHotEncoder__Sex_Female', 'OneHotEncoder__Sex_Male'],
      dtype=object)

In [83]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.fit_transform(X_test),columns=preprocessor.get_feature_names_out())

In [84]:
model = LogisticRegression()


In [85]:
model.fit(X_train,y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [86]:
y_pred = model.predict(X_test)


In [87]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6385212231857599

In [88]:
import numpy as np
import plotly.graph_objects as go
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Assuming that X_train, X_test, y_train, y_test are already defined

# SVM requires feature scaling for better performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_scores = lr_model.predict_proba(X_test)[:,1]

# Dicision Tree model
dt_model =  DecisionTreeClassifier()
dt_model.fit(X_train_scaled, y_train)
dt_scores = dt_model.predict_proba(X_test_scaled)[:,1]

# Random Forest model
rf_model =  RandomForestClassifier()
rf_model.fit(X_train_scaled, y_train)
rf_scores = rf_model.predict_proba(X_test_scaled)[:,1]


# Generate ROC curve data for logistic regression model
lr_fpr, lr_tpr, lr_thresholds = roc_curve(y_test, lr_scores)
lr_auc = roc_auc_score(y_test, lr_scores)

# Generate ROC curve data for Dicision Tree model
dt_fpr, dt_tpr, dt_thresholds = roc_curve(y_test, dt_scores)
dt_auc = roc_auc_score(y_test, dt_scores)

# Generate ROC curve data for Random Forest model
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, rf_scores)
rf_auc = roc_auc_score(y_test, rf_scores)


# Generate a trace for the Logistic Regression ROC curve
trace0 = go.Scatter(
    x=lr_fpr,
    y=lr_tpr,
    mode='lines',
    name=f'Logistic Regression (Area = {lr_auc:.2f})'
)

# Generate a trace for the Dicision Tree ROC curve
trace1 = go.Scatter(
    x=dt_fpr,
    y=dt_tpr,
    mode='lines',
    name=f'Dicision Tree(Area = {dt_auc:.2f})'
)

# Generate a trace for the Random Forest ROC curve
trace2 = go.Scatter(
    x=rf_fpr,
    y=rf_tpr,
    mode='lines',
    name=f'Random Forest(Area = {rf_auc:.2f})'
)


# Diagonal line
trace3 = go.Scatter(
    x=[0, 1], 
    y=[0, 1], 
    mode='lines', 
    name='Random (Area = 0.5)', 
    line=dict(dash='dash')
)

data = [trace0, trace1, trace2, trace3]

# Define layout with square aspect ratio
layout = go.Layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    autosize=False,
    width=800,
    height=800,
    showlegend=True
)

# Define figure and add data
fig = go.Figure(data=data, layout=layout)

# Show figure
fig.show()



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [89]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier()
    
}

In [90]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    #this is a validation(test) score
    ac=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("Accuracy Score:",ac)

    r2_list.append(ac)
    
    print('='*35)
    print('\n')


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



Logistic Regression
Model Training Performance
Accuracy Score: 0.6385212231857599


Decision Tree Classifier
Model Training Performance
Accuracy Score: 0.5362848014605203


Random Forest Classifier
Model Training Performance
Accuracy Score: 0.6125057051574624


