In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import r2_score, accuracy_score
import pickle
import warnings
warnings.filterwarnings('ignore')

In [70]:
df = pd.read_csv('datasets/heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [71]:
df.shape

(918, 12)

In [72]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,918.0,53.510893,9.432617,28.0,47.0,54.0,60.0,77.0
RestingBP,918.0,132.396514,18.514154,0.0,120.0,130.0,140.0,200.0
Cholesterol,918.0,198.799564,109.384145,0.0,173.25,223.0,267.0,603.0
FastingBS,918.0,0.233115,0.423046,0.0,0.0,0.0,0.0,1.0
MaxHR,918.0,136.809368,25.460334,60.0,120.0,138.0,156.0,202.0
Oldpeak,918.0,0.887364,1.06657,-2.6,0.0,0.6,1.5,6.2
HeartDisease,918.0,0.553377,0.497414,0.0,0.0,1.0,1.0,1.0


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [76]:
cat_cols = [col for col in df.columns if df[col].dtype in ['object']]
cat_cols

['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope']

In [77]:
for i in cat_cols:
    print(df[i].unique())

['M' 'F']
['ATA' 'NAP' 'ASY' 'TA']
['Normal' 'ST' 'LVH']
['Up' 'Flat' 'Down']


In [74]:
df.drop(['Oldpeak','ExerciseAngina'],axis=1,inplace=True)

In [79]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,Up,0
1,49,F,NAP,160,180,0,Normal,156,Flat,1
2,37,M,ATA,130,283,0,ST,98,Up,0
3,48,F,ASY,138,214,0,Normal,108,Flat,1
4,54,M,NAP,150,195,0,Normal,122,Up,0


In [66]:
cat_cols

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [80]:
df[cat_cols]

Unnamed: 0,Sex,ChestPainType,RestingECG,ST_Slope
0,M,ATA,Normal,Up
1,F,NAP,Normal,Flat
2,M,ATA,ST,Up
3,F,ASY,Normal,Flat
4,M,NAP,Normal,Up
...,...,...,...,...
913,M,TA,Normal,Flat
914,M,ASY,Normal,Flat
915,M,ASY,Normal,Flat
916,F,ATA,LVH,Flat


In [45]:
lb = LabelEncoder()

for i in cat_cols:
    df[i] = lb.fit_transform(df[i])

In [56]:
df.drop('FastingBS',axis=1,inplace=True)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Age             918 non-null    int64
 1   Sex             918 non-null    int32
 2   ChestPainType   918 non-null    int32
 3   RestingBP       918 non-null    int64
 4   Cholesterol     918 non-null    int64
 5   RestingECG      918 non-null    int32
 6   MaxHR           918 non-null    int64
 7   ExerciseAngina  918 non-null    int32
 8   ST_Slope        918 non-null    int32
 9   HeartDisease    918 non-null    int64
 10  predict         918 non-null    int64
dtypes: int32(5), int64(6)
memory usage: 61.1 KB


In [58]:
x = df.drop('HeartDisease',axis=1)
y = df['HeartDisease']

In [93]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=43)

In [94]:
rf= RandomForestClassifier()
rf.fit(x_train,y_train)
rf_pred = rf.predict(x_test)
print(accuracy_score(y_test,rf_pred))
print(r2_score(y_test,rf_pred))

0.8478260869565217
0.38247632746014615


In [95]:
rf_pred_df = rf.predict(x)
df['predict'] = rf_pred_df

In [96]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ST_Slope,HeartDisease,predict
0,40,M,ATA,140,289,0,Normal,172,Up,0,0
1,49,F,NAP,160,180,0,Normal,156,Flat,1,0
2,37,M,ATA,130,283,0,ST,98,Up,0,0
3,48,F,ASY,138,214,0,Normal,108,Flat,1,1
4,54,M,NAP,150,195,0,Normal,122,Up,0,0
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,Flat,1,1
914,68,M,ASY,144,193,1,Normal,141,Flat,1,1
915,57,M,ASY,130,131,0,Normal,115,Flat,1,1
916,57,F,ATA,130,236,0,LVH,174,Flat,1,1


In [91]:
with open('data.pickle', 'wb') as f:
    pickle.dump(rf, f)

In [83]:
x

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,RestingECG,MaxHR,ExerciseAngina,ST_Slope,predict
0,40,1,1,140,289,1,172,0,2,0
1,49,0,2,160,180,1,156,0,1,0
2,37,1,1,130,283,2,98,0,2,0
3,48,0,0,138,214,1,108,1,1,1
4,54,1,2,150,195,1,122,0,2,0
...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,110,264,1,132,0,1,1
914,68,1,0,144,193,1,141,0,1,1
915,57,1,0,130,131,1,115,1,1,1
916,57,0,1,130,236,0,174,0,1,1
