### I will use :
- KNN 
- Decision Trees
- Logistic Regression
- SVM :
    - poly
    - linear
    - sigmoid
    - rbf

#### add primary imports

In [114]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier


#### Load csv

In [115]:
df = pd.read_csv("heart.csv")
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [116]:
df.dtypes

age           int64
sex           int64
cp            int64
trtbps        int64
chol          int64
fbs           int64
restecg       int64
thalachh      int64
exng          int64
oldpeak     float64
slp           int64
caa           int64
thall         int64
output        int64
dtype: object

#### Grouped the data to X and Y


In [117]:
feature_df = df[['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh', 'exng' , 'oldpeak', "slp" ,  'caa', 'thall']]
X = np.asarray(feature_df)
print(X[0:5])



y = np.asarray(df['output'])
print(y[0:5])

[[ 63.    1.    3.  145.  233.    1.    0.  150.    0.    2.3   0.    0.
    1. ]
 [ 37.    1.    2.  130.  250.    0.    1.  187.    0.    3.5   0.    0.
    2. ]
 [ 41.    0.    1.  130.  204.    0.    0.  172.    0.    1.4   2.    0.
    2. ]
 [ 56.    1.    1.  120.  236.    0.    1.  178.    0.    0.8   2.    0.
    2. ]
 [ 57.    0.    0.  120.  354.    0.    1.  163.    1.    0.6   2.    0.
    2. ]]
[1 1 1 1 1]


#### Train/Test dataset

In [118]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (242, 13) (242,)
Test set: (61, 13) (61,)


#### <span style="color: red;">KNN modeling : 
- #### use all parameters to prediction :

In [119]:
Ks = 100                                               
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))


for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])


print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1) 


The best accuracy was with 0.7377049180327869 with k= 31


#### add a magic loop to find the best parameters :

In [120]:
import itertools

best_accuracy = 0
best_features = None
best_k = None

# create a list of all feature combinations
feature_combinations = []
for r in range(1, len(df.columns)-1):
    feature_combinations.extend(itertools.combinations(df.columns[:-1], r))

# loop through all feature combinations
for features in feature_combinations:
    feature_df = df[list(features)]
    X = np.asarray(feature_df)
    y = np.asarray(df['output'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

    Ks = 100
    mean_acc = np.zeros((Ks-1))

    for n in range(1, Ks):
        # Train Model and Predict
        neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
        yhat = neigh.predict(X_test)
        mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)

    # print(f"Features: {features}\nBest accuracy: {mean_acc.max()} with k={mean_acc.argmax()+1}\n")

    # check if this feature subset is the best so far
    if mean_acc.max() > best_accuracy:
        best_accuracy = mean_acc.max()
        best_features = features
        best_k = mean_acc.argmax() + 1
        print(f"Features: {features}\nBest accuracy: {mean_acc.max()} with k={mean_acc.argmax()+1}\n")
        
        
print(f"Best feature subset: {best_features}\nBest accuracy: {best_accuracy} with k={best_k}")


# output ------------------------------------------------ 
# Features: ('age',)
# Best accuracy: 0.639344262295082 with k=52

# Features: ('cp',)
# Best accuracy: 0.8032786885245902 with k=3

# Features: ('cp', 'restecg')
# Best accuracy: 0.819672131147541 with k=28

# Features: ('cp', 'oldpeak')
# Best accuracy: 0.8360655737704918 with k=13

# Features: ('exng', 'caa')
# Best accuracy: 0.8688524590163934 with k=7

# Features: ('cp', 'exng', 'caa')
# Best accuracy: 0.9016393442622951 with k=20

# Features: ('cp', 'oldpeak', 'caa', 'thall')
# Best accuracy: 0.9180327868852459 with k=20

# Features: ('sex', 'cp', 'exng', 'oldpeak', 'slp', 'caa', 'thall')
# Best accuracy: 0.9344262295081968 with k=5

# Best feature subset: ('sex', 'cp', 'exng', 'oldpeak', 'slp', 'caa', 'thall')
# Best accuracy: 0.9344262295081968 with k=5


Features: ('age',)
Best accuracy: 0.639344262295082 with k=52

Features: ('cp',)
Best accuracy: 0.8032786885245902 with k=3

Features: ('cp', 'restecg')
Best accuracy: 0.819672131147541 with k=28

Features: ('cp', 'oldpeak')
Best accuracy: 0.8360655737704918 with k=13

Features: ('exng', 'caa')
Best accuracy: 0.8688524590163934 with k=7

Features: ('cp', 'exng', 'caa')
Best accuracy: 0.9016393442622951 with k=20



KeyboardInterrupt: 

##### I find the best feature now time to fit :

In [None]:

feature = df[['sex', 'cp', 'exng', 'oldpeak', 'slp', 'caa', 'thall']]
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split( feature, y, test_size=0.2, random_state=4)


Ks = 5                                               
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))

 
neigh = KNeighborsClassifier(n_neighbors = Ks).fit(X_train_knn,y_train_knn)
yhat_knn=neigh.predict(X_test_knn)
mean_acc[Ks-2] = metrics.accuracy_score(y_test_knn, yhat)
std_acc[Ks-2]=np.std(yhat_knn==y_test_knn)/np.sqrt(yhat_knn.shape[0])


print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1) 



The best accuracy was with 0.9344262295081968 with k= 4


##### time to test for real parameters :

In [None]:
real_parameters = [[1,2,0,0.5,2,0,3]]
feature_names = ['sex', 'cp', 'exng', 'oldpeak', 'slp', 'caa', 'thall']
real_parameters_with_names = pd.DataFrame(real_parameters, columns=feature_names)
neigh.predict(real_parameters_with_names)

array([1], dtype=int64)

#### <span style="color: red;">decision tree modeling : 
- use all parameters to prediction :


In [None]:
feature_df = df[['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh', 'exng' , 'oldpeak', "slp" ,  'caa', 'thall']]
X = np.asarray(feature_df)
y = np.asarray(df['output'])

X_train_Tree, X_test_Tree, y_train_Tree, y_test_Tree = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train_Tree.shape,  X_train_Tree.shape)
print ('Test set:', X_test_Tree.shape,  y_test_Tree.shape)

Train set: (242, 13) (242,)
Test set: (61, 13) (61,)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics


heartTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
heartTree.fit(X_train_Tree,y_train_Tree)
predTree = heartTree.predict(X_test_Tree)
print("Accuracy: ", metrics.accuracy_score(y_test_Tree, predTree))


Accuracy:  0.8524590163934426


#### Add a magic loop to find the best parameters :

In [130]:
import itertools

best_accuracy = 0
best_features = None
best_k = None

# create a list of all feature combinations
feature_combinations = []
for r in range(1, len(df.columns)-1):
    feature_combinations.extend(itertools.combinations(df.columns[:-1], r))

# loop through all feature combinations
for features in feature_combinations:
    feature_df = df[list(features)]
    X = np.asarray(feature_df)
    y = np.asarray(df['output'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

   
    heartTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
    heartTree.fit(X_train,y_train)
    predTree = heartTree.predict(X_test)
    
    # check if this feature subset is the best so far
    if metrics.accuracy_score(y_test, predTree) > best_accuracy:
        best_accuracy = metrics.accuracy_score(y_test, predTree)
        best_features = features
        print("Accuracy: ", metrics.accuracy_score(y_test, predTree) , "best feature :", best_features)        
        
        
print("Accuracy: ", best_accuracy , "best feature :", best_features)   



# output -----------------------------------------------------------------------------------------------
# Accuracy:  0.5573770491803278 best feature : ('age',)
# Accuracy:  0.8032786885245902 best feature : ('cp',)
# Accuracy:  0.8688524590163934 best feature : ('exng', 'caa')
# Accuracy:  0.9016393442622951 best feature : ('cp', 'caa', 'thall')
# Accuracy:  0.9180327868852459 best feature : ('age', 'sex', 'cp', 'chol', 'exng', 'slp', 'caa')
# Accuracy:  0.9180327868852459 best feature : ('age', 'sex', 'cp', 'chol', 'exng', 'slp', 'caa')     



Accuracy:  0.5573770491803278 best feature : ('age',)
Accuracy:  0.8032786885245902 best feature : ('cp',)
Accuracy:  0.8688524590163934 best feature : ('exng', 'caa')
Accuracy:  0.9016393442622951 best feature : ('exng', 'caa', 'thall')
Accuracy:  0.9016393442622951 best feature : ('exng', 'caa', 'thall')
