Import the necessary modules

In [1]:
import numpy as np
import pandas as pd


Read data file

In [2]:
# Load heart.csv into a DataFrame; every later cell works from `data`.
csv_path = "../input/heart.csv"
data = pd.read_csv(csv_path)

**DATA ANALYZE**

data.head() shows the first 5 rows
data.info() shows a summary of the data
data.isnull().sum() counts the missing values in each column
data.dtypes shows the data type of each column



In [3]:
# Preview the first rows to see the available columns and typical values.
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Print the row count, per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.2 KB


In [5]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
# Show the dtype pandas inferred for each column.
data.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

1. We have 3 categorical columns: cp, exang, and slope. We need to encode them before modelling.

First, we create dummy variables for them

In [7]:
# One-hot encode each categorical column, using the column name as the prefix.
# drop_first=True omits the first level of each column so the dummies are not
# redundant with one another.
cp, exang, slope = (
    pd.get_dummies(data[column], prefix=column, drop_first=True)
    for column in ('cp', 'exang', 'slope')
)

We then add the dummy columns to the data

In [8]:
# Place the dummy columns to the right of the original ones (column-wise
# concatenation, aligned on the row index), then preview the result.
frames_to_join = [data, cp, exang, slope]
new_data = pd.concat(frames_to_join, axis="columns")
new_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,cp_1,cp_2,cp_3,exang_1,slope_1,slope_2
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,0,0,1,0,0,0
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,0,1,0,0,0,0
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,1,0,0,0,0,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,1,0,0,0,0,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,0,0,0,1,0,1


We then drop the original categorical columns from the data

In [9]:
# Remove the original categorical columns now that their dummy encodings exist.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the columns were already removed by the first one.
new_data = new_data.drop(columns=['cp', 'exang', 'slope'], errors='ignore')
new_data.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,oldpeak,ca,thal,target,cp_1,cp_2,cp_3,exang_1,slope_1,slope_2
0,63,1,145,233,1,0,150,2.3,0,1,1,0,0,1,0,0,0
1,37,1,130,250,0,1,187,3.5,0,2,1,0,1,0,0,0,0
2,41,0,130,204,0,0,172,1.4,0,2,1,1,0,0,0,0,1
3,56,1,120,236,0,1,178,0.8,0,2,1,1,0,0,0,0,1
4,57,0,120,354,0,1,163,0.6,0,2,1,0,0,0,1,0,1


Now we separate target and the rest

In [10]:
# Features are every column except the label; the label is the 'target' column.
X = new_data.drop(columns=['target'])
y = new_data['target']

We then split the data so we can train and evaluate our models. Let's use 80% for training and 20% for testing.

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
TEST_FRACTION = 0.2
SPLIT_SEED = 2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

**MAKE MODELS**

*Logistic Regression*

In [12]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings, then report its accuracy
# on the held-out test split.
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_test, y_test)



0.9180327868852459

*K nearest neighbors*

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier with default settings and report its
# accuracy on the held-out test split.
# NOTE(review): the features are not scaled before this distance-based model,
# which may explain the lower score - worth confirming with a scaler.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.score(X_test, y_test)


0.6885245901639344

*Decision Tree Classifier*

In [14]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree (seeded so the fitted tree is repeatable) and report its
# accuracy on the held-out test split.
dt = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
dt.score(X_test, y_test)

0.9016393442622951

*Gradient boosting classifier*

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting classifier and report its accuracy on the held-out
# test split. random_state is pinned (same seed as the decision-tree cell) so
# the fitted ensemble, and therefore the score, is the same on every run.
gbc = GradientBoostingClassifier(random_state=1)
gbc.fit(X_train, y_train)
gbc.score(X_test, y_test)

0.8688524590163934

*Gaussian NB*


In [16]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier and report its accuracy on the
# held-out test split.
nb = GaussianNB().fit(X_train, y_train)
nb.score(X_test, y_test)

0.8852459016393442

*random forest classifier*

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Train forests with 1 to 9 trees and print the test accuracy of each.
# random_state is pinned so the printed scores are identical on every run;
# without it each execution builds different trees and the table changes.
# NOTE: these scores are computed on the test split, so picking n_estimators
# from this table would tune the model on test data.
for i in range(1, 10):
    rfc = RandomForestClassifier(n_estimators=i, random_state=1)
    rfc.fit(X_train, y_train)
    print('n_estimators : ', i, "score : ", rfc.score(X_test, y_test))

n_estimators :  1 score :  0.8032786885245902
n_estimators :  2 score :  0.7704918032786885
n_estimators :  3 score :  0.819672131147541
n_estimators :  4 score :  0.8524590163934426
n_estimators :  5 score :  0.819672131147541
n_estimators :  6 score :  0.819672131147541
n_estimators :  7 score :  0.8688524590163934
n_estimators :  8 score :  0.7704918032786885
n_estimators :  9 score :  0.819672131147541


*Support vector machine*

In [18]:
from sklearn.svm import SVC

# Fit a support vector classifier with a linear kernel and report its accuracy
# on the held-out test split.
svc = SVC(kernel='linear').fit(X_train, y_train)
svc.score(X_test, y_test)

0.8852459016393442

Of all the models, logistic regression gives the best test accuracy (about 0.918 on this train/test split).