In [166]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [185]:
# Load the raw dataset from a CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer='./heart_attack_prediction_dataset.csv')

In [186]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [187]:
# Print the frame's summary: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Patient ID                       8763 non-null   object 
 1   Age                              8763 non-null   int64  
 2   Sex                              8763 non-null   object 
 3   Cholesterol                      8763 non-null   int64  
 4   Blood Pressure                   8763 non-null   object 
 5   Heart Rate                       8763 non-null   int64  
 6   Diabetes                         8763 non-null   int64  
 7   Family History                   8763 non-null   int64  
 8   Smoking                          8763 non-null   int64  
 9   Obesity                          8763 non-null   int64  
 10  Alcohol Consumption              8763 non-null   int64  
 11  Exercise Hours Per Week          8763 non-null   float64
 12  Diet                

In [188]:
# Remove 'Patient ID' so this text column is not picked up as a categorical
# feature later on (presumably a unique identifier per row -- confirm).
df = df.drop(columns=['Patient ID'])

In [189]:
# Split the 'Blood Pressure' strings (e.g. '158/88') on '/' and cast both
# parts to integers.
bp_parts = df['Blood Pressure'].str.split('/', expand=True).astype(int)

# Transposing gives one row per part; unpacking into exactly two names keeps
# the original failure mode if a value ever splits into more than two pieces.
bp_first, bp_second = bp_parts.T.values
df['BP1'] = bp_first    # part before the '/' (presumably systolic -- confirm)
df['BP2'] = bp_second   # part after the '/' (presumably diastolic -- confirm)

# The raw text column is no longer needed once both numeric parts exist.
df = df.drop(columns=['Blood Pressure'])

In [190]:
# Target: the 'Heart Attack Risk' column.
y = df['Heart Attack Risk']

# Features: every other column of the prepared frame.
X = df.drop(columns=['Heart Attack Risk'])

In [191]:
# Partition the feature columns by dtype so that each group can be routed to
# its own transformer.
#
# Both selections are taken from X (the feature matrix), never from df: df
# still contains the target column, and a non-numeric target would otherwise
# be listed as a categorical feature that does not exist in X.
#
# 'number' covers every numeric dtype (not just the default int/float widths),
# so numeric columns stored as e.g. int32 or uint8 are not silently dropped.
numerical_columns = X.select_dtypes(include='number').columns
categorical_columns = X.select_dtypes(include=['object', 'category']).columns

In [192]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Data preprocessing

In [193]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [194]:
# Column-wise preprocessing shared by every model below:
#   * numeric columns are standardised,
#   * categorical columns are one-hot encoded; the first level of each column
#     is dropped and categories unseen during fit are ignored at transform time.
# Columns listed in neither group are discarded (ColumnTransformer's default).
preprocessing = ColumnTransformer(
    transformers=[
        (
            'numerical',
            StandardScaler(),
            numerical_columns,
        ),
        (
            'categorical',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            categorical_columns,
        ),
    ]
)

In [195]:
# Shallow decision tree (depth <= 3) on top of the shared preprocessing step.
#
# random_state is pinned (same seed as the train/test split) because
# scikit-learn randomly permutes the candidate features at every split, so
# ties between equally good splits could otherwise be resolved differently
# from one run to the next.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('classifier', DecisionTreeClassifier(max_depth=3, random_state=42)),
])

In [196]:
# Fit preprocessing and classifier together, using the training split only.
pipeline.fit(X=X_train, y=y_train)

In [197]:
# Predict labels for the held-out rows with the currently fitted pipeline.
y_pred = pipeline.predict(X=X_test)

In [198]:
# Share of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6417569880205363

In [199]:
# L2-penalised logistic regression behind the shared preprocessing step.
# Rebinding `pipeline` replaces the previously defined model.
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('classifier', LogisticRegression(penalty='l2')),
    ]
)

In [200]:
# Train the pipeline end to end on the training split.
pipeline.fit(X=X_train, y=y_train)

In [201]:
# Label the held-out rows using the pipeline fitted in the previous cell.
y_pred = pipeline.predict(X=X_test)

In [202]:
# Held-out accuracy of the predictions computed above.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6428978893325727

In [152]:
# Support-vector classifier with scikit-learn's default settings, behind the
# shared preprocessing step. Rebinding `pipeline` replaces the previous model.
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('classifier', SVC()),
    ]
)

In [153]:
# Fit the whole pipeline (transformers, then classifier) on the training rows.
pipeline.fit(X=X_train, y=y_train)

In [154]:
# Run the fitted pipeline over the held-out features.
y_pred = pipeline.predict(X=X_test)

In [155]:
# Compare the predictions with the true held-out labels.
accuracy_score(y_true=y_test, y_pred=y_pred)


0.6417569880205363

In [156]:
from sklearn.ensemble import RandomForestClassifier

In [161]:
# Random forest with trees limited to depth 5, behind the shared
# preprocessing step.
#
# random_state is pinned (same seed as the train/test split): the forest draws
# bootstrap samples and random feature subsets, so without a seed every run
# trains a different model and the reported accuracy is not reproducible.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('classifier', RandomForestClassifier(max_depth=5, random_state=42)),
])

In [162]:
# Learn the preprocessing statistics and the classifier from the training split.
pipeline.fit(X=X_train, y=y_train)

In [163]:
# Predictions for the held-out split from the pipeline fitted just above.
y_pred = pipeline.predict(X=X_test)

In [164]:
# Accuracy on the held-out split.
accuracy_score(y_true=y_test, y_pred=y_pred)


0.6417569880205363

In [167]:
# Hyper-parameter grid for the random forest. Every key carries the
# 'classifier__' prefix so GridSearchCV routes it to the pipeline step of
# that name.
param_grid = {
    'classifier__n_estimators': [100, 200],       # number of trees in the forest
    'classifier__max_depth': [10, 20, 30],        # maximum depth of each tree
    'classifier__min_samples_split': [2, 5, 10],  # minimum samples required to split a node
    'classifier__min_samples_leaf': [1, 2, 4],    # minimum samples required at a leaf node
}

# Build the searched estimator explicitly instead of reusing whatever the
# global `pipeline` currently holds: the grid keys above are only valid for a
# random forest, so the search must not depend on which model cell ran last.
# The seed makes the search (and the selected parameters) reproducible.
rf_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Exhaustive search with 5-fold cross-validation on the training split:
# 2 * 3 * 3 * 3 = 54 candidates, i.e. 270 fits plus the final refit.
# n_jobs=-1 spreads the fits over all available cores.
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [168]:
# Show which hyper-parameter combination won the cross-validated search.
print("Best Parameters:", grid_search.best_params_)

# Evaluate the refitted best pipeline on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Report held-out accuracy as a percentage with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Accuracy: 64.18%
