In [57]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

In [58]:
# Load Heart_s.csv directly from the course repository.
# read_csv already returns a fresh DataFrame, so no empty placeholder frame
# needs to be created beforehand.
df = pd.read_csv("https://github.com/mpourhoma/CS4661/raw/master/Heart_s.csv")

Importing the libraries and reading the data (cells above)

Below, we print our dataset, checking it

In [59]:
# Preview every 10th row rather than dumping the whole frame.
df.iloc[::10]

Unnamed: 0,Age,Gender,ChestPain,RestBP,Chol,RestECG,MaxHR,Oldpeak,Thal,AHD
0,63,f,typical,145,233,2,150,2.3,fixed,No
10,57,f,asymptomatic,140,192,0,148,0.4,fixed,No
20,64,f,typical,110,211,2,144,1.8,normal,No
30,69,m,typical,140,239,0,151,1.8,normal,No
40,65,m,asymptomatic,150,225,2,114,1.0,reversable,Yes
50,41,m,nontypical,105,198,0,168,0.0,normal,No
60,51,m,asymptomatic,130,305,0,142,1.2,reversable,Yes
70,65,m,nonanginal,155,269,0,148,0.8,normal,No
80,45,f,asymptomatic,104,208,2,148,3.0,normal,No
90,62,m,asymptomatic,160,164,2,145,6.2,reversable,Yes


Restricting our data to the numeric feature columns only, leaving out the categorical columns Gender, ChestPain, and Thal

In [60]:
# Numeric predictors only; the categorical columns are left out for now.
feature_columns = [
    'Age',
    'RestBP',
    'Chol',
    'RestECG',
    'MaxHR',
    'Oldpeak',
]
x = df.loc[:, feature_columns]  # feature matrix
y = df.loc[:, 'AHD']            # target labels

Checking our altered dataset

In [61]:
# Features and target must have the same number of rows.
print(x.shape, y.shape, sep="\n")

(301, 6)
(301,)


In [62]:
# Every 10th row of the feature matrix.
x.iloc[::10]

Unnamed: 0,Age,RestBP,Chol,RestECG,MaxHR,Oldpeak
0,63,145,233,2,150,2.3
10,57,140,192,0,148,0.4
20,64,110,211,2,144,1.8
30,69,140,239,0,151,1.8
40,65,150,225,2,114,1.0
50,41,105,198,0,168,0.0
60,51,130,305,0,142,1.2
70,65,155,269,0,148,0.8
80,45,104,208,2,148,3.0
90,62,160,164,2,145,6.2


In [63]:
# Every 10th target label.
y.iloc[::10]

0       No
10      No
20      No
30      No
40     Yes
50      No
60     Yes
70      No
80      No
90     Yes
100     No
110    Yes
120    Yes
130     No
140    Yes
150     No
160    Yes
170     No
180    Yes
190    Yes
200     No
210    Yes
220     No
230    Yes
240     No
250    Yes
260    Yes
270    Yes
280    Yes
290    Yes
300     No
Name: AHD, dtype: object

Splitting our data into training and testing sets

In [64]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=6,
)

KNN Results

In [65]:
# k-nearest-neighbours baseline on the numeric-only features.
k = 3
myKnn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)  # fit() returns the estimator
y_predict = myKnn.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print(accuracy)

0.6447368421052632


Decision Tree Results

In [66]:
# Decision-tree baseline; random_state pins the tree so the score is repeatable.
mydt = DecisionTreeClassifier(random_state=5).fit(x_train, y_train)
y_predict = mydt.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print(accuracy)

0.618421052631579


Log Regression Results

In [67]:
# Logistic-regression baseline on the numeric-only features.
# NOTE(review): this model uses the default max_iter, whereas the later
# logistic-regression cells pass max_iter=1000 - confirm the difference is
# intended before comparing the scores.
mylr = LogisticRegression().fit(x_train, y_train)
y_predict = mylr.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print(accuracy)

0.6710526315789473


Logistic regression gave the best accuracy at 0.6710526315789473. The worst accuracy came from the decision tree at 0.618421052631579.

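Label encoding process (each category is mapped to a single integer code; note that this is not true one-hot encoding, which would create one 0/1 column per category)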

In [68]:
def genderNum(x):
    """Translate a gender label into its integer code.

    'm' maps to 0 and 'f' maps to 1. Any other value returns None.
    """
    codes = {'m': 0, 'f': 1}
    return codes.get(x)
    
def chestpainNum(x):
    """Translate a chest-pain label into its integer code.

    'typical' -> 0, 'asymptomatic' -> 1, 'nonanginal' -> 2, 'nontypical' -> 3.
    Any other value returns None.
    """
    codes = {
        'typical': 0,
        'asymptomatic': 1,
        'nonanginal': 2,
        'nontypical': 3,
    }
    return codes.get(x)
    
def thalNum(x):
    """Translate a Thal label into its integer code.

    'fixed' -> 0, 'normal' -> 1, 'reversable' -> 2.
    Any other value returns None.
    """
    codes = {'fixed': 0, 'normal': 1, 'reversable': 2}
    return codes.get(x)

Data and Column update

In [69]:
# Replace the three categorical columns with their integer codes.
# Each encoder returns None for any value it does not recognise, so running
# this cell a second time on the already-encoded (numeric) columns would wipe
# them out. The dtype guard makes the cell safe to re-run.
for column, encoder in (
    ('Gender', genderNum),
    ('ChestPain', chestpainNum),
    ('Thal', thalNum),
):
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].apply(encoder)

Print and check our new Data

In [70]:
# Every 10th row, now with the categorical columns encoded as integers.
df.iloc[::10]

Unnamed: 0,Age,Gender,ChestPain,RestBP,Chol,RestECG,MaxHR,Oldpeak,Thal,AHD
0,63,1,0,145,233,2,150,2.3,0,No
10,57,1,1,140,192,0,148,0.4,0,No
20,64,1,0,110,211,2,144,1.8,1,No
30,69,0,0,140,239,0,151,1.8,1,No
40,65,0,1,150,225,2,114,1.0,2,Yes
50,41,0,3,105,198,0,168,0.0,1,No
60,51,0,1,130,305,0,142,1.2,2,Yes
70,65,0,2,155,269,0,148,0.8,1,No
80,45,1,1,104,208,2,148,3.0,1,No
90,62,0,1,160,164,2,145,6.2,2,Yes


In [71]:
# Extended feature set: the numeric predictors plus the encoded
# Gender, ChestPain and Thal columns.
feat_columns = [
    'Age',
    'Gender',
    'ChestPain',
    'RestBP',
    'Chol',
    'RestECG',
    'MaxHR',
    'Oldpeak',
    'Thal',
]
x = df.loc[:, feat_columns]
y = df.loc[:, 'AHD']

# Same test fraction and seed as the earlier split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=6,
)

Checking our new data

In [72]:
# Features and target must have the same number of rows.
print(x.shape, y.shape, sep="\n")

(301, 9)
(301,)


In [73]:
# Every 10th row of the extended feature matrix.
x.iloc[::10]

Unnamed: 0,Age,Gender,ChestPain,RestBP,Chol,RestECG,MaxHR,Oldpeak,Thal
0,63,1,0,145,233,2,150,2.3,0
10,57,1,1,140,192,0,148,0.4,0
20,64,1,0,110,211,2,144,1.8,1
30,69,0,0,140,239,0,151,1.8,1
40,65,0,1,150,225,2,114,1.0,2
50,41,0,3,105,198,0,168,0.0,1
60,51,0,1,130,305,0,142,1.2,2
70,65,0,2,155,269,0,148,0.8,1
80,45,1,1,104,208,2,148,3.0,1
90,62,0,1,160,164,2,145,6.2,2


In [74]:
# Every 10th target label.
y.iloc[::10]

0       No
10      No
20      No
30      No
40     Yes
50      No
60     Yes
70      No
80      No
90     Yes
100     No
110    Yes
120    Yes
130     No
140    Yes
150     No
160    Yes
170     No
180    Yes
190    Yes
200     No
210    Yes
220     No
230    Yes
240     No
250    Yes
260    Yes
270    Yes
280    Yes
290    Yes
300     No
Name: AHD, dtype: object

Knn results for new data

In [75]:
# k-nearest-neighbours on the extended feature set.
k = 3
myKnn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)  # fit() returns the estimator
y_predict = myKnn.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print(accuracy)

0.6447368421052632


Decision Tree results for new data

In [76]:
# Decision tree on the extended feature set; random_state pins the tree.
mydt = DecisionTreeClassifier(random_state=5).fit(x_train, y_train)
y_predict = mydt.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print(accuracy)

0.6710526315789473


Log Regression results for new data

In [77]:
# Logistic regression on the extended feature set, with the solver's
# iteration cap raised to 1000.
mylr = LogisticRegression(max_iter=1000).fit(x_train, y_train)
y_predict = mylr.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print(accuracy)

0.7368421052631579


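Comparing the results before and after adding the encoded categorical features: KNN accuracy remained unchanged, the decision tree's accuracy increased from 0.618421052631579 to 0.6710526315789473, and logistic regression's accuracy increased from 0.6710526315789473 to 0.7368421052631579. (Note that the second logistic regression model also sets max_iter=1000, so its two runs differ in more than just the feature set.)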

Cross Validation

Knn with Cross Validation 

In [78]:
# 10-fold cross-validated accuracy for k-nearest-neighbours on the full data.
k = 3
cross_knn = KNeighborsClassifier(n_neighbors=k)
accuracies = cross_val_score(
    estimator=cross_knn,
    X=x,
    y=y,
    scoring='accuracy',
    cv=10,
)
print(accuracies)
print("Mean:", accuracies.mean())

[0.70967742 0.63333333 0.56666667 0.66666667 0.63333333 0.5
 0.66666667 0.7        0.56666667 0.73333333]
Mean: 0.6376344086021505


Decision Tree With Cross Validation

In [79]:
# 10-fold cross-validated accuracy for the decision tree on the full data.
cross_dt = DecisionTreeClassifier(random_state=5)
accuracies = cross_val_score(
    estimator=cross_dt,
    X=x,
    y=y,
    scoring='accuracy',
    cv=10,
)
print(accuracies)
print("Mean:", accuracies.mean())

[0.80645161 0.73333333 0.66666667 0.7        0.83333333 0.63333333
 0.7        0.7        0.76666667 0.63333333]
Mean: 0.7173118279569893


Log Regression with Cross Validation

In [80]:
# 10-fold cross-validated accuracy for logistic regression on the full data.
cross_lr = LogisticRegression(max_iter=1000)
accuracies = cross_val_score(
    estimator=cross_lr,
    X=x,
    y=y,
    scoring='accuracy',
    cv=10,
)
print(accuracies)
print("Mean:", accuracies.mean())

[0.77419355 0.7        0.76666667 0.9        0.83333333 0.76666667
 0.8        0.73333333 0.73333333 0.8       ]
Mean: 0.7807526881720429


Logistic regression produced the best cross-validated results, with a mean accuracy of 0.7807526881720429