In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
%matplotlib inline



In [2]:
# read in data using pandas
heart_data = pd.read_csv('..\\Datasets\\''Heart.csv', index_col = 0)
# to remove the automatic index use     index_col = 0 or put None to display

In [3]:
type(heart_data)

pandas.core.frame.DataFrame

In [4]:
heart_data.shape

(303, 14)

In [5]:
heart_data.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [6]:
# Summary statistics
summary_stat = heart_data.describe()
summary_stat


Unnamed: 0,Age,Sex,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0
mean,54.438944,0.679868,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241
std,9.038662,0.467299,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438
min,29.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0
max,77.0,1.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0


In [7]:
# Checking for missing values
heart_data.isnull().sum().sort_values()

Age          0
Sex          0
ChestPain    0
RestBP       0
Chol         0
Fbs          0
RestECG      0
MaxHR        0
ExAng        0
Oldpeak      0
Slope        0
AHD          0
Thal         2
Ca           4
dtype: int64

In [8]:
heart_data[heart_data[['Ca', 'Thal']].isna().values == True]

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
88,53,0,nonanginal,128,216,0,2,115,0,0.0,1,0.0,,No
167,52,1,nonanginal,138,223,0,0,169,0,0.0,1,,normal,No
193,43,1,asymptomatic,132,247,1,2,143,1,0.1,2,,reversable,Yes
267,52,1,asymptomatic,128,204,1,0,156,1,1.0,2,0.0,,Yes
288,58,1,nontypical,125,220,0,0,144,0,0.4,2,,reversable,No
303,38,1,nonanginal,138,175,0,0,173,0,0.0,1,,normal,No


In [9]:
heart_data[heart_data.isna().values == True]

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
88,53,0,nonanginal,128,216,0,2,115,0,0.0,1,0.0,,No
167,52,1,nonanginal,138,223,0,0,169,0,0.0,1,,normal,No
193,43,1,asymptomatic,132,247,1,2,143,1,0.1,2,,reversable,Yes
267,52,1,asymptomatic,128,204,1,0,156,1,1.0,2,0.0,,Yes
288,58,1,nontypical,125,220,0,0,144,0,0.4,2,,reversable,No
303,38,1,nonanginal,138,175,0,0,173,0,0.0,1,,normal,No


In [10]:
heart_data['ChestPain'].unique()

array(['typical', 'asymptomatic', 'nonanginal', 'nontypical'],
      dtype=object)

In [11]:
heart_data['Thal'].unique()

array(['fixed', 'normal', 'reversable', nan], dtype=object)

In [12]:
heart_data['ChestPain'].count

<bound method Series.count of 1           typical
2      asymptomatic
3      asymptomatic
4        nonanginal
5        nontypical
6        nontypical
7      asymptomatic
8      asymptomatic
9      asymptomatic
10     asymptomatic
11     asymptomatic
12       nontypical
13       nonanginal
14       nontypical
15       nonanginal
16       nonanginal
17       nontypical
18     asymptomatic
19       nonanginal
20       nontypical
21          typical
22          typical
23       nontypical
24       nonanginal
25     asymptomatic
26       nonanginal
27       nonanginal
28          typical
29     asymptomatic
30     asymptomatic
           ...     
274    asymptomatic
275         typical
276         typical
277      nonanginal
278      nonanginal
279      nontypical
280    asymptomatic
281    asymptomatic
282      nonanginal
283    asymptomatic
284      nontypical
285    asymptomatic
286    asymptomatic
287    asymptomatic
288      nontypical
289      nontypical
290      nontypical
291      n

In [13]:
# See columns that are categorical and how many categories they have

for col in heart_data.columns:
    if heart_data[col].dtypes == 'object':
        num_of_unique_cat = len(heart_data[col].unique())
        print("Features '{col_name}' has '{unique_cat}' unique categories".format(col_name = col, unique_cat = num_of_unique_cat))

Features 'ChestPain' has '4' unique categories
Features 'Thal' has '4' unique categories
Features 'AHD' has '2' unique categories


In [14]:
dummy_1 = pd.get_dummies(heart_data['ChestPain'])
dummy_2 = pd.get_dummies(heart_data['Thal'])

In [15]:
dummy_1

Unnamed: 0,asymptomatic,nonanginal,nontypical,typical
1,0,0,0,1
2,1,0,0,0
3,1,0,0,0
4,0,1,0,0
5,0,0,1,0
6,0,0,1,0
7,1,0,0,0
8,1,0,0,0
9,1,0,0,0
10,1,0,0,0


In [16]:
dummy_2

Unnamed: 0,fixed,normal,reversable
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0
5,0,1,0
6,0,1,0
7,0,1,0
8,0,1,0
9,0,0,1
10,0,0,1


In [17]:
# Drop chestpain and thal columns and add dummy_1 and dummy_2
heart_data = heart_data.drop(['ChestPain', 'Thal'], axis = 1)

# when you set axis to 0, it picks from the horizontal, 1 is vertical

heart_data

Unnamed: 0,Age,Sex,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,AHD
1,63,1,145,233,1,2,150,0,2.3,3,0.0,No
2,67,1,160,286,0,2,108,1,1.5,2,3.0,Yes
3,67,1,120,229,0,2,129,1,2.6,2,2.0,Yes
4,37,1,130,250,0,0,187,0,3.5,3,0.0,No
5,41,0,130,204,0,2,172,0,1.4,1,0.0,No
6,56,1,120,236,0,0,178,0,0.8,1,0.0,No
7,62,0,140,268,0,2,160,0,3.6,3,2.0,Yes
8,57,0,120,354,0,0,163,1,0.6,1,0.0,No
9,63,1,130,254,0,2,147,0,1.4,2,1.0,Yes
10,53,1,140,203,1,2,155,1,3.1,3,0.0,Yes


In [18]:
heart_data = pd.concat([heart_data, dummy_1], axis = 1)
heart_data = pd.concat([heart_data, dummy_2], axis = 1)

In [19]:
heart_data.head()

Unnamed: 0,Age,Sex,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,AHD,asymptomatic,nonanginal,nontypical,typical,fixed,normal,reversable
1,63,1,145,233,1,2,150,0,2.3,3,0.0,No,0,0,0,1,1,0,0
2,67,1,160,286,0,2,108,1,1.5,2,3.0,Yes,1,0,0,0,0,1,0
3,67,1,120,229,0,2,129,1,2.6,2,2.0,Yes,1,0,0,0,0,0,1
4,37,1,130,250,0,0,187,0,3.5,3,0.0,No,0,1,0,0,0,1,0
5,41,0,130,204,0,2,172,0,1.4,1,0.0,No,0,0,1,0,0,1,0


In [20]:
# Summary Statistics
summary_stat = heart_data.describe()
summary_stat

Unnamed: 0,Age,Sex,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,asymptomatic,nonanginal,nontypical,typical,fixed,normal,reversable
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,0.475248,0.283828,0.165017,0.075908,0.059406,0.547855,0.386139
std,9.038662,0.467299,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,0.500213,0.4516,0.371809,0.265288,0.236774,0.498528,0.487668
min,29.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,56.0,1.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,61.0,1.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
max,77.0,1.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Checking again for missing values
heart_data.isnull().sum().sort_values()

Age             0
fixed           0
typical         0
nontypical      0
nonanginal      0
asymptomatic    0
AHD             0
normal          0
Slope           0
ExAng           0
MaxHR           0
RestECG         0
Fbs             0
Chol            0
RestBP          0
Sex             0
Oldpeak         0
reversable      0
Ca              4
dtype: int64

In [22]:
# Data separation
target = heart_data['AHD']
heart_data = heart_data.drop(['AHD'], axis = 1)

In [23]:
# Replace missing values with the mean
imp = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imp.fit(heart_data)
heart_data = pd.DataFrame(data = imp.transform(heart_data), columns = heart_data.columns)

In [24]:
# Checking again for missing values
heart_data.isnull().sum().sort_values()

Age             0
fixed           0
typical         0
nontypical      0
nonanginal      0
asymptomatic    0
Ca              0
Slope           0
Oldpeak         0
ExAng           0
MaxHR           0
RestECG         0
Fbs             0
Chol            0
RestBP          0
Sex             0
normal          0
reversable      0
dtype: int64

In [25]:
# Convert targets to numerics
#target = {0 if x == 'No' else 1 for x in target}

In [26]:
# Convert targets to numerics
count = 0
for x in target:
    if x == 'Yes':
        target[count:] = 1
    else:
         target[count:] = 0
    count += 1

target

1      0
2      1
3      1
4      0
5      0
6      0
7      1
8      0
9      1
10     1
11     0
12     0
13     1
14     0
15     0
16     0
17     1
18     0
19     0
20     0
21     0
22     0
23     1
24     1
25     1
26     0
27     0
28     0
29     0
30     1
      ..
274    0
275    1
276    0
277    0
278    0
279    1
280    0
281    1
282    0
283    1
284    0
285    1
286    1
287    1
288    0
289    0
290    0
291    1
292    0
293    1
294    1
295    1
296    0
297    1
298    1
299    1
300    1
301    1
302    1
303    0
Name: AHD, Length: 303, dtype: int64

In [27]:
target.values

array([0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,

In [28]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(heart_data, target, random_state = 1)

In [29]:
logreg = LogisticRegression()

In [30]:
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
# Predicting and testing the result
y_pred = logreg.predict(X_test)

In [32]:
accuracy_score(y_test, y_pred)

0.868421052631579

In [33]:
classification_report(y_test, y_pred)

'             precision    recall  f1-score   support\n\n          0       0.86      0.90      0.88        41\n          1       0.88      0.83      0.85        35\n\navg / total       0.87      0.87      0.87        76\n'

In [34]:
confusion_matrix(y_test, y_pred)
print('\n')

classification_report(y_test, y_pred)





'             precision    recall  f1-score   support\n\n          0       0.86      0.90      0.88        41\n          1       0.88      0.83      0.85        35\n\navg / total       0.87      0.87      0.87        76\n'

In [35]:
knn= KNeighborsClassifier()

In [36]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [37]:
pred_2 = knn.predict(X_test)

In [38]:
classification_report(y_test, pred_2)

'             precision    recall  f1-score   support\n\n          0       0.72      0.71      0.72        41\n          1       0.67      0.69      0.68        35\n\navg / total       0.70      0.70      0.70        76\n'