# Importing the dataset

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


dataset = pd.read_csv("heart.csv");

print(dataset);

     Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  \
0     40   M           ATA        140          289          0     Normal   
1     49   F           NAP        160          180          0     Normal   
2     37   M           ATA        130          283          0         ST   
3     48   F           ASY        138          214          0     Normal   
4     54   M           NAP        150          195          0     Normal   
..   ...  ..           ...        ...          ...        ...        ...   
913   45   M            TA        110          264          0     Normal   
914   68   M           ASY        144          193          1     Normal   
915   57   M           ASY        130          131          0     Normal   
916   57   F           ATA        130          236          0        LVH   
917   38   M           NAP        138          175          0     Normal   

     MaxHR ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0      172              N  

In [9]:
print(dataset.columns)

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')


In [10]:
Y = dataset['HeartDisease'];
X = dataset.drop('HeartDisease', axis =1);

#Seperating causes and outcome

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)


## Data Standardization using inbuilt libraries

In [12]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols),('cat', categorical_transformer, categorical_cols)])            


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

rf_model = RandomForestClassifier(n_estimators=100)  #initialize 100 decision trees which predict the outcome individually and then aggregate the results and then provide the final results

pipeline = Pipeline(steps=[('preprocessor', preprocessor),('classifier', rf_model)])

pipeline.fit(X_train, Y_train)

y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(Y_test, y_pred))

print(y_pred)
print(np.array(list(Y_test)))

Accuracy: 0.89

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89        89
           1       0.89      0.91      0.90        95

    accuracy                           0.89       184
   macro avg       0.89      0.89      0.89       184
weighted avg       0.89      0.89      0.89       184

[0 1 1 0 0 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 0 1 1 0 1 1 1
 1 0 1 0 1 1 1 1 0 1 0 0 1 0 1 1 0 0 1 1 1 1 0 0 1 0 1 0 0 1 0 1 1 0 0 0 1
 0 0 1 0 1 0 1 1 1 1 0 0 0 0 0 0 1 0 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 1
 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 1 0 1 0 1 0
 1 0 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 0 1 0 0 0 0 0 1 1 1]
[0 1 1 0 0 0 0 0 1 1 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 0 1 1 0 1 1 1
 1 0 1 0 1 1 0 0 0 1 1 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 0 0 1
 0 0 1 0 1 0 1 1 1 1 0 0 0 1 0 0 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 0 1 1 1 0 0
 0 0 1 1 1 0 1 1 1 0 0 1 0 1 0 1 0 

In [14]:
import joblib

joblib.dump(pipeline, 'heart_disease_prediction.pkl')

['heart_disease_prediction.pkl']