In [3]:
import pandas as pd
import numpy as np

In [4]:
heart=pd.read_csv("heart.csv").copy()

In [5]:
heart.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [6]:
missing_values= [col for col in heart.columns
                if heart[col].isnull().any()]

In [7]:
print(heart.shape)

(918, 12)


In [8]:
print(missing_values)

[]


we have no missing values in our dataset

In [9]:
print(heart.columns)

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')


In [10]:
y=heart.iloc[:,-1]
X=heart.iloc[:, 0:-1]

In [11]:
cats=(X.dtypes==object)
categorical_values= list(cats[cats].index)
print(categorical_values)
for v in categorical_values:
    print(v+":\t"+str(X[v].nunique()))

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
Sex:	2
ChestPainType:	4
RestingECG:	3
ExerciseAngina:	2
ST_Slope:	3


In [10]:
values=list(X.columns)

# print(values)
numerical_values=values.copy()

for x in categorical_values:
    numerical_values.remove(x)
print(numerical_values)

['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']


we have now devided the columns into categorical and numerical values

In [12]:
for x in categorical_values:
    print(x)
    print(X[x].unique())

Sex
['M' 'F']
ChestPainType
['ATA' 'NAP' 'ASY' 'TA']
RestingECG
['Normal' 'ST' 'LVH']
ExerciseAngina
['N' 'Y']
ST_Slope
['Up' 'Flat' 'Down']


In [13]:
from sklearn.preprocessing import OrdinalEncoder
oe= OrdinalEncoder()

In [14]:
oec=["Sex", "ExerciseAngina"]
X[oec]=oe.fit_transform(X[oec])

In [15]:
X.loc[X["ChestPainType"]=="NAP", "ChestPainType"]=0
X.loc[X["ChestPainType"]=="TA", "ChestPainType"]=1
X.loc[X["ChestPainType"]=="ATA", "ChestPainType"]=2
X.loc[X["ChestPainType"]=="ASY", "ChestPainType"]=3

In [16]:
X.loc[X["RestingECG"]=="Normal", "RestingECG"]=0
X.loc[X["RestingECG"]=="ST", "RestingECG"]=1
X.loc[X["RestingECG"]=="LVH", "RestingECG"]=2

In [17]:
X.loc[X["ST_Slope"]=="Up", "ST_Slope"]=1
X.loc[X["ST_Slope"]=="Flat", "ST_Slope"]=0
X.loc[X["ST_Slope"]=="Down", "ST_Slope"]=2

In [17]:
print(X.head())

   Age  Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  \
0   40  1.0             2        140          289          0          0   
1   49  0.0             0        160          180          0          0   
2   37  1.0             2        130          283          0          1   
3   48  0.0             3        138          214          0          0   
4   54  1.0             0        150          195          0          0   

   MaxHR  ExerciseAngina  Oldpeak ST_Slope  
0    172             0.0      0.0        1  
1    156             0.0      1.0        0  
2     98             0.0      0.0        1  
3    108             1.0      1.5        0  
4    122             0.0      0.0        1  


## Now it is time to train our dataset with multiple models and see the the one with the best result

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [19]:
decisiontreemodel= DecisionTreeClassifier()
dt=cross_val_score(decisiontreemodel, X, y, cv=5)
print(dt)

[0.80434783 0.82608696 0.77173913 0.69398907 0.67759563]


In [20]:
nee=[10, 25, 50, 100, 200, 500, 1000, 2000, 5000]
for ne in nee:
    randomforestmodel=RandomForestClassifier(n_estimators=ne)
    rf=cross_val_score(randomforestmodel, X, y, cv=5)
    print(ne)
    print(rf.mean())

10
0.8115169874079354
25
0.8256771204561654
50
0.812568306010929
100
0.8288964124495131
200
0.821287716797339
500
0.8256296032311713
1000
0.8256355428842955
2000
0.8223746733190783
5000
0.8278213352340223


In [24]:
Xb=X.copy()
for xb in Xb.columns:
    Xb[xb]=pd.to_numeric(Xb[xb])
for ne in nee:
    xgboostmodel=XGBClassifier(n_estimators=ne, learning_rate=0.05, n_jobs=4)
    xg=cross_val_score(xgboostmodel, Xb, y, cv=5)
    print(ne)
    print(xg.mean())

10
0.8147303397481587




25
0.8180090282727489




50
0.8201888809693514








100
0.8223627940128295








200
0.8179912093133762
















500
0.807103825136612
















1000
0.8169042527916369
















2000
0.8169042527916369
















5000
0.8158172962698979


In [26]:
allmodels=[dt, rf, xg]
for x in allmodels:
    print(x.mean())

0.754751722499406
0.8278213352340223
0.8158172962698979


In [27]:
import pickle
filename = 'finalized_model.sav'


model=RandomForestClassifier(n_estimators=500)
model.fit(Xb,y)
pickle.dump(model, open(filename, 'wb'))

# some time later...
# print(y_test.shape)
# print(X_test.shape)
# load the model from disk
loaded_model = pickle.load(open("finalized_model.sav", 'rb'))


1.0


In [28]:
print(type(X.loc[2]))

<class 'pandas.core.series.Series'>


In [29]:
prediction=heart.iloc[1, :-1]
prediction.Sex=1
prediction.ChestPainType=2
prediction.RestingECG=0
prediction.ExerciseAngina=0
prediction.ST_Slope=1

# prediction=pd.to_numeric(prediction)
p=np.array(prediction).reshape(1,11)
r=loaded_model.predict(p)
print(r[0])

0


In [30]:
print(heart.head(1))

   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
