**Importing the Heart Attack Dataset**

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('heart_attack_prediction_dataset.csv')

In [3]:
data.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


**Preprocessing the Data**

In [4]:
data.isnull().sum()

Patient ID                         0
Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            0
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         0
Heart Attack Risk                  0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Patient ID                       8763 non-null   object 
 1   Age                              8763 non-null   int64  
 2   Sex                              8763 non-null   object 
 3   Cholesterol                      8763 non-null   int64  
 4   Blood Pressure                   8763 non-null   object 
 5   Heart Rate                       8763 non-null   int64  
 6   Diabetes                         8763 non-null   int64  
 7   Family History                   8763 non-null   int64  
 8   Smoking                          8763 non-null   int64  
 9   Obesity                          8763 non-null   int64  
 10  Alcohol Consumption              8763 non-null   int64  
 11  Exercise Hours Per Week          8763 non-null   float64
 12  Diet                

In [6]:
y = data['Heart Attack Risk']

In [7]:
X = data.iloc[:,:-1]

In [8]:
X.shape

(8763, 25)

In [9]:
object_columns = X.select_dtypes(include=['object']).columns

In [10]:
list(object_columns)

['Patient ID',
 'Sex',
 'Blood Pressure',
 'Diet',
 'Country',
 'Continent',
 'Hemisphere']

In [11]:
X_one_hot_encoded = pd.get_dummies(X,columns=list(object_columns))

In [12]:
X_one_hot_encoded.head()

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,...,Country_United States,Country_Vietnam,Continent_Africa,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America,Hemisphere_Northern Hemisphere,Hemisphere_Southern Hemisphere
0,67,208,72,0,0,1,0,0,4.168189,0,...,0,0,0,0,0,0,0,1,0,1
1,21,389,98,1,1,1,1,1,1.813242,1,...,0,0,0,0,0,0,1,0,1,0
2,21,324,72,1,0,0,0,0,2.078353,1,...,0,0,0,0,0,1,0,0,1,0
3,84,383,73,1,1,1,0,1,9.82813,1,...,0,0,0,0,0,0,1,0,1,0
4,66,318,93,1,1,1,1,0,5.804299,1,...,0,0,0,1,0,0,0,0,1,0


In [13]:
X_one_hot_encoded.shape

(8763, 12729)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X_one_hot_encoded,y,random_state=42)

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()

In [18]:
X_train = scaler.fit_transform(X_train)

In [19]:
X_test = scaler.transform(X_test)

In [20]:
from sklearn.decomposition import PCA

In [21]:
X_train.shape

(6572, 12729)

In [23]:
pca = PCA(n_components=300)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

#Model Building

## 1. KNN

In [25]:
def euclidean(point, data):
    '''Euclidean distance between a point  & data'''
    return np.sqrt(np.sum((point - data)**2, axis=1))

In [26]:
def most_common(lst):
    '''Returns the most common element in a list'''
    return max(set(lst), key=lst.count)

In [27]:
class KNeighborsClassifier():
    def __init__(self, k=5, dist_metric=euclidean):
        self.k = k
        self.dist_metric = dist_metric
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    def predict(self, X_test):
        neighbors = []
        for x in X_test:
            distances = self.dist_metric(x, self.X_train)
            y_sorted = [y for _, y in sorted(zip(distances, self.y_train))]
            neighbors.append(y_sorted[:self.k])
        return list(map(most_common, neighbors))
    def evaluate(self, X_test, y_test):
        y_pred = self.predict(X_test)
        accuracy = sum(y_pred == y_test) / len(y_test)
        return accuracy

In [28]:
import matplotlib.pyplot as plt

In [29]:


knn = KNeighborsClassifier(k=5)




In [30]:
knn.fit(X_train, y_train)
knn_accuracy = knn.evaluate(X_test, y_test)

## 2. SVM

In [31]:
from sklearn import svm

In [32]:
svm = svm.SVC(kernel='poly', degree=2)
svm.fit(X_train,y_train)

In [33]:
from sklearn.metrics import accuracy_score
predictions = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, predictions)

In [34]:
svm_accuracy

0.6458238247375627

## 3. Decision Tree

In [35]:
from sklearn.tree import DecisionTreeClassifier

In [37]:
tree = DecisionTreeClassifier()
tree.fit(X_train,y_train)

In [38]:
predictions = tree.predict(X_test)
tree_accuracy = accuracy_score(predictions,y_test)

## 4. Gradient Boost

In [39]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report


In [40]:
gradient = GradientBoostingClassifier(random_state=42)
gradient.fit(X_train,y_train)

In [41]:
predictions = gradient.predict(X_test)
classification_report(y_test, predictions)

'              precision    recall  f1-score   support\n\n           0       0.65      1.00      0.78      1415\n           1       0.00      0.00      0.00       776\n\n    accuracy                           0.65      2191\n   macro avg       0.32      0.50      0.39      2191\nweighted avg       0.42      0.65      0.51      2191\n'

## 5. Random Forest

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train,y_train)

In [44]:
predictions = forest.predict(X_test)

In [45]:
forest_accuracy = accuracy_score(predictions,y_test)

# Hyperparameter Search

## 1. SVM

In [50]:


from sklearn.model_selection import GridSearchCV


param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svm, param_grid, refit = True, verbose = 3)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.640 total time=  16.4s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.640 total time=   9.6s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.641 total time=   9.7s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.641 total time=  12.7s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.640 total time=  11.1s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.640 total time=  10.9s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.640 total time=  10.2s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.641 total time=  11.4s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.641 total time=  10.2s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.640 total time=   8.1s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.640 total time=   9.5s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [51]:
grid_predictions = grid.predict(X_test)

print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.65      1.00      0.78      1415
           1       0.00      0.00      0.00       776

    accuracy                           0.65      2191
   macro avg       0.32      0.50      0.39      2191
weighted avg       0.42      0.65      0.51      2191



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 2. Gradient Boost

In [None]:
grid = {

    'learning_rate':[0.01,0.05,0.1],

    'n_estimators':[100,150,200],

}


gb = GradientBoostingClassifier()

gb_cv = GridSearchCV(gb, grid, cv = 4)

gb_cv.fit(X_train,y_train)

In [None]:
print("Test Score:",gb_cv.score(X_test,y_test))