![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)

# <center> Machine Learning Methods </center>
## <center> Exercise 03 - Heart Disease Classification - Solution </center>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/MachineLearningMethod/Exercises/Exercise03_Classification_Solution.ipynb)

In [1]:
#-- Widen the notebook container to 80% of the page width.
#-- `display` / `HTML` are imported from the public `IPython.display` module;
#-- importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import numpy             as np
import pandas            as pd
import seaborn           as sns
import matplotlib.pyplot as plt

### Load the dataset:
https://www.kaggle.com/cherngs/heart-disease-cleveland-uci?select=heart_cleveland_upload.csv

In [3]:
#-- Read the heart disease table; the CSV path is relative to the working directory
dData = pd.read_csv('heart_cleveland_upload.csv')
#-- Last expression of the cell: display the loaded frame
dData

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


#### Columns full name (based on the data description):
(Try to understand the following code)

In [4]:
#-- Descriptive names, listed in the same order as the columns of `dData`
lFullName = [
    'Age',
    'Sex',
    'ChestPainType',
    'RestingBloodPressure',
    'Cholesterol',
    'FastingBloodSugar',
    'RestECG',
    'MaxHeartRateAchieved',
    'ExerciseInducedAngina',
    'StDepression',
    'StSlope',
    'NumMajorVessels',
    'Thalassemia',
    'Target',
]

#-- Pad every short name to the length of the longest one so the `=` signs line up
space = max(len(colName) for colName in dData.columns)
for (colName, longName) in zip(dData.columns, lFullName):
    print(f'{colName:{space}s} = {longName}')

age       = Age
sex       = Sex
cp        = ChestPainType
trestbps  = RestingBloodPressure
chol      = Cholesterol
fbs       = FastingBloodSugar
restecg   = RestECG
thalach   = MaxHeartRateAchieved
exang     = ExerciseInducedAngina
oldpeak   = StDepression
slope     = StSlope
ca        = NumMajorVessels
thal      = Thalassemia
condition = Target


### Get data and normalization:

In [5]:
#-- Labels: the `condition` column; features: all the remaining columns
vY = dData['condition'].values
mX = dData.drop(columns=['condition']).values

#-- Sanity check of the dimensions: (samples, features) and (samples,)
mX.shape, vY.shape

((297, 13), (297,))

In [6]:
#-- Standardize each feature (column): remove the mean, then divide by the standard deviation
mX = mX - mX.mean(axis=0)
mX = mX / mX.std(axis=0)

### Basic classification:
Let us try several default classifiers

In [7]:
from sklearn.linear_model    import LogisticRegression
from sklearn.svm             import SVC
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.tree            import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict, KFold

#-- Classifiers to compare, as (estimator, display name) pairs
#-- (hyper-parameters can be passed to the constructors)
lModels = [
    (SVC(C=1),                 'SVM'                ),
    (LogisticRegression(),     'Logistic Regression'),
    (KNeighborsClassifier(),   'KNN'                ),
    (DecisionTreeClassifier(), 'Tree'               ),
]

#-- K-fold with as many folds as samples: each fold holds a single sample (leave-one-out)
N    = len(vY)
oLoo = KFold(n_splits=N)
for (oModel, modelName) in lModels:
    vHatY    = cross_val_predict(oModel, mX, vY, cv=oLoo)
    accuracy = (vHatY == vY).mean()
    print(f'{modelName:19s} = {100*accuracy:2.2f}%')
    

SVM                 = 82.83%
Logistic Regression = 83.16%
KNN                 = 82.49%
Tree                = 69.02%


### Exercise A:
* Use `KNeighborsClassifier` and get above 84% leave-one-out cross validation accuracy.  
(Play with the hyper-parameters)

In [8]:
#-- KNN with 13 neighbors (default uniform weights), N folds = leave-one-out
oKnn  = KNeighborsClassifier(n_neighbors=13)
vHatY = cross_val_predict(oKnn, mX, vY, cv=KFold(n_splits=N, shuffle=True))
(vHatY == vY).mean()

0.8484848484848485

In [9]:
#-- KNN with 13 neighbors, votes weighted by distance, N folds = leave-one-out
oKnn  = KNeighborsClassifier(n_neighbors=13, weights='distance')
vHatY = cross_val_predict(oKnn, mX, vY, cv=KFold(n_splits=N, shuffle=True))
(vHatY == vY).mean()

0.8451178451178452

In [10]:
#-- KNN with 11 neighbors and the Minkowski metric with p = 1, N folds = leave-one-out
oKnn  = KNeighborsClassifier(n_neighbors=11, p=1)
vHatY = cross_val_predict(oKnn, mX, vY, cv=KFold(n_splits=N, shuffle=True))
(vHatY == vY).mean()

0.8417508417508418

### Meaningful categorical data:
(Based on the data description)

In [11]:
#-- Work on a copy so the original frame `dData` keeps its short column names
dData2         = dData.copy()
#-- Replace the column labels with the descriptive names (positional, same order as the columns)
dData2.columns = lFullName
dData2

Unnamed: 0,Age,Sex,ChestPainType,RestingBloodPressure,Cholesterol,FastingBloodSugar,RestECG,MaxHeartRateAchieved,ExerciseInducedAngina,StDepression,StSlope,NumMajorVessels,Thalassemia,Target
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


In [12]:
#-- Integer code -> readable label, per categorical column (based on the data description).
#-- The label strings later become the dummy-feature column names, so they are kept exactly as they were.
dCodeToLabel = {
    'Sex'                   : {0: 'female',              1: 'male'},
    'ChestPainType'         : {0: 'asymptomatic',        1: 'atypical angina',       2: 'non-anginal pain', 3: 'typical angina'},
    'FastingBloodSugar'     : {0: 'lower than 120mg/ml', 1: 'greater than 120mg/ml'},
    'RestECG'               : {0: 'normal',              1: 'ST-T wave abnormality', 2: 'left ventricular hypertrophy'},
    'ExerciseInducedAngina' : {0: 'no',                  1: 'yes'},
    'StSlope'               : {0: 'upsloping',           1: 'flat',                  2: 'downsloping'},
    'Thalassemia'           : {0: 'normal',              1: 'fixed defect',          2: 'reversable defect'},
    'Target'                : {0: 'no heart disease',    1: 'heart disease'},
}

#-- Replace each column as a whole instead of writing strings into an integer column
#-- through `.loc` (an incompatible-dtype assignment that recent pandas deprecates).
#-- Values without an entry in the table are left unchanged, so re-running the cell is harmless.
for (colName, dLabels) in dCodeToLabel.items():
    dData2[colName] = dData2[colName].map(lambda code: dLabels.get(code, code))

dData2

Unnamed: 0,Age,Sex,ChestPainType,RestingBloodPressure,Cholesterol,FastingBloodSugar,RestECG,MaxHeartRateAchieved,ExerciseInducedAngina,StDepression,StSlope,NumMajorVessels,Thalassemia,Target
0,69,male,asymptomatic,160,234,greater than 120mg/ml,left ventricular hypertrophy,131,no,0.1,flat,1,normal,no heart disease
1,69,female,asymptomatic,140,239,lower than 120mg/ml,normal,151,no,1.8,upsloping,2,normal,no heart disease
2,66,female,asymptomatic,150,226,lower than 120mg/ml,normal,114,no,2.6,downsloping,0,normal,no heart disease
3,65,male,asymptomatic,138,282,greater than 120mg/ml,left ventricular hypertrophy,174,no,1.4,flat,1,normal,heart disease
4,64,male,asymptomatic,110,211,lower than 120mg/ml,left ventricular hypertrophy,144,yes,1.8,flat,0,normal,no heart disease
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,male,typical angina,152,223,lower than 120mg/ml,normal,181,no,0.0,upsloping,0,reversable defect,heart disease
293,39,male,typical angina,118,219,lower than 120mg/ml,normal,140,no,1.2,flat,0,reversable defect,heart disease
294,35,male,typical angina,120,198,lower than 120mg/ml,normal,130,yes,1.6,flat,0,reversable defect,heart disease
295,35,female,typical angina,138,183,lower than 120mg/ml,normal,182,no,1.4,upsloping,0,normal,no heart disease


### Convert non-numeric features to dummy features:

In [13]:
#-- One-hot encode every non-numeric column; `drop_first=True` drops the first category of each.
#-- The dtype is pinned so the dummy columns are numeric 0/1 regardless of the pandas version
#-- (pandas 2.0 and later default to boolean dummies).
dData3 = pd.get_dummies(dData2, drop_first=True, dtype=np.uint8)
dData3

Unnamed: 0,Age,RestingBloodPressure,Cholesterol,MaxHeartRateAchieved,StDepression,NumMajorVessels,Sex_male,ChestPainType_atypical angina,ChestPainType_non-anginal pain,ChestPainType_typical angina,FastingBloodSugar_lower than 120mg/ml,RestECG_left ventricular hypertrophy,RestECG_normal,ExerciseInducedAngina_yes,StSlope_flat,StSlope_upsloping,Thalassemia_normal,Thalassemia_reversable defect,Target_no heart disease
0,69,160,234,131,0.1,1,1,0,0,0,0,1,0,0,1,0,1,0,1
1,69,140,239,151,1.8,2,0,0,0,0,1,0,1,0,0,1,1,0,1
2,66,150,226,114,2.6,0,0,0,0,0,1,0,1,0,0,0,1,0,1
3,65,138,282,174,1.4,1,1,0,0,0,0,1,0,0,1,0,1,0,0
4,64,110,211,144,1.8,0,1,0,0,0,1,1,0,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,152,223,181,0.0,0,1,0,0,1,1,0,1,0,0,1,0,1,0
293,39,118,219,140,1.2,0,1,0,0,1,1,0,1,0,1,0,0,1,0
294,35,120,198,130,1.6,0,1,0,0,1,1,0,1,1,1,0,0,1,0
295,35,138,183,182,1.4,0,0,0,0,1,1,0,1,0,0,1,1,0,1


### Exercise B:
1. Use `dData3` and get above 85% leave-one-out cross validation accuracy.
2. You are allowed to use only 6 features (from the given 18).  
Get more than 80% accuracy with only 6 features.  
What is the right approach for this task?
**Extra:** Get above 85.5% (using only 6 features).

In [14]:
#-- Features: everything except the label column.
#-- The explicit float cast guarantees a numeric array: the in-place standardization below
#-- needs a float dtype, and mixed numeric / boolean dummy columns would otherwise give an object array.
mX3 = dData3.drop(columns='Target_no heart disease').values.astype(np.float64)
#-- Labels: 1 means "no heart disease" (the dummy column kept by `drop_first`)
vY3 = dData3['Target_no heart disease'].values

#-- Standardize each feature (column) to zero mean and unit standard deviation
mX3 -= mX3.mean(0)
mX3 /= mX3.std(0)

mX3.shape, vY3.shape

((297, 18), (297,))

In [15]:
#-- Sweep the linear SVM regularization parameter C over 21 evenly spaced values in [0.1, 10]
vC   = np.linspace(0.1, 10, 21)
lRes = []

for C in vC:
    oSVM  = SVC(kernel='linear', C=C)
    #-- N folds for N samples: every fold is a single sample (leave-one-out)
    vHatY = cross_val_predict(oSVM, mX3, vY3, cv=KFold(N, shuffle=True))
    lRes.append({'C': C, 'Accuracy': np.mean(vY3 == vHatY)})

#-- Build the results table once from the collected records (instead of growing it row by row)
dRes = pd.DataFrame(lRes, columns=['C', 'Accuracy'])
dRes.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,C,Accuracy
10,5.05,0.851852
11,5.545,0.851852
19,9.505,0.851852
18,9.01,0.851852
17,8.515,0.851852
16,8.02,0.851852
15,7.525,0.851852
14,7.03,0.851852
13,6.535,0.851852
12,6.04,0.851852


In [16]:
#-- List the column names of `dData3` (the last one is the label column)
dData3.columns

Index(['Age', 'RestingBloodPressure', 'Cholesterol', 'MaxHeartRateAchieved',
       'StDepression', 'NumMajorVessels', 'Sex_male',
       'ChestPainType_atypical angina', 'ChestPainType_non-anginal pain',
       'ChestPainType_typical angina', 'FastingBloodSugar_lower than 120mg/ml',
       'RestECG_left ventricular hypertrophy', 'RestECG_normal',
       'ExerciseInducedAngina_yes', 'StSlope_flat', 'StSlope_upsloping',
       'Thalassemia_normal', 'Thalassemia_reversable defect',
       'Target_no heart disease'],
      dtype='object')

In [17]:
#-- The 6 selected features (a subset of the `dData3` columns)
lCol = [
    'MaxHeartRateAchieved',
    'NumMajorVessels',
    'ChestPainType_atypical angina',
    'ChestPainType_typical angina',
    'Sex_male',
    'Thalassemia_normal',
]

#-- Keep all the rows, only the selected columns
dData4 = dData3.loc[:, lCol]
dData4

Unnamed: 0,MaxHeartRateAchieved,NumMajorVessels,ChestPainType_atypical angina,ChestPainType_typical angina,Sex_male,Thalassemia_normal
0,131,1,0,0,1,1
1,151,2,0,0,0,1
2,114,0,0,0,0,1
3,174,1,0,0,1,1
4,144,0,0,0,1,1
...,...,...,...,...,...,...
292,181,0,0,1,1,0
293,140,0,0,1,1,0
294,130,0,0,1,1,0
295,182,0,0,1,0,1


In [18]:
#-- Labels are the same as before (copied, so `vY3` is not shared)
vY4 = vY3.copy()
#-- Selected features as a float32 matrix
mX4 = dData4.values.astype(np.float32)

#-- Standardize each feature (column): remove the mean, then divide by the standard deviation
mX4 = mX4 - mX4.mean(axis=0)
mX4 = mX4 / mX4.std(axis=0)

mX4.shape, vY4.shape

((297, 6), (297,))

In [19]:
#-- Same sweep of the linear SVM parameter C, now using only the 6 selected features
vC   = np.linspace(0.1, 10, 21)
lRes = []

for C in vC:
    oSVM  = SVC(kernel='linear', C=C)
    #-- N folds for N samples: every fold is a single sample (leave-one-out)
    vHatY = cross_val_predict(oSVM, mX4, vY4, cv=KFold(N, shuffle=True))
    lRes.append({'C': C, 'Accuracy': np.mean(vY4 == vHatY)})

#-- Build the results table once from the collected records (instead of growing it row by row)
dRes = pd.DataFrame(lRes, columns=['C', 'Accuracy'])
dRes.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,C,Accuracy
20,10.0,0.851852
12,6.04,0.851852
19,9.505,0.851852
18,9.01,0.851852
17,8.515,0.851852
16,8.02,0.851852
15,7.525,0.851852
14,7.03,0.851852
8,4.06,0.851852
9,4.555,0.851852


In [20]:
#-- RBF-kernel SVM on the 6 selected features, evaluated with N folds (leave-one-out)
oSVM  = SVC(kernel='rbf', C=10, gamma=2e-4)
oCV   = KFold(n_splits=N, shuffle=True)
vHatY = cross_val_predict(oSVM, mX4, vY4, cv=oCV)
(vHatY == vY4).mean()

0.8552188552188552