In [42]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

### Dataset loading

In [43]:
dataset = pd.read_csv('heart.csv')
dataset.head()

Unnamed: 0,ID,AGE,SEX,CP,TRESTBPS,CHOL,FBS,RESTECG,THALACH,EXANG,OLDPEAK,SLOPE,CA,THAL,TARGET
0,1,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,2,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,3,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,4,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,5,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


### Label isolation
Here the labels are contained in the 'TARGET' column, corresponding to a low (0) or high risk of heart failure (1).
In this numerical shape, the labels are already fit for prediction using sklearn.

In [50]:
labels = np.asarray(dataset.TARGET)
labels

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### Features isolation
We drop the 'target' column, and keep all the other columns as predictive features.
We then vectorize the remaining items and turn the dataset into an array.

In [51]:
dataset_features = dataset.drop(['TARGET', 'ID'], axis=1).to_dict(orient='records')
features = DictVectorizer().fit_transform(dataset_features).toarray()
features

array([[ 63.,   0., 233., ...,   1., 150., 145.],
       [ 37.,   0., 250., ...,   2., 187., 130.],
       [ 41.,   0., 204., ...,   2., 172., 130.],
       ...,
       [ 68.,   2., 193., ...,   3., 141., 144.],
       [ 57.,   1., 131., ...,   3., 115., 130.],
       [ 57.,   1., 236., ...,   2., 174., 130.]])

### Set splitting
We split the resulting datasets in a training and testing set, with 80/20 distribution.

In [52]:
features_train, features_test, labels_train, labels_test = train_test_split(
features, labels, test_size=0.20)

### Model selection, training and testing
![](ml_map.png)


We train a Support Vector Classifier using a Linear kernel, as is encouraged by scikit-learn when evaluating our dataset (predicting exercice, small number of data, pre-labeled data).

In [53]:
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(features_train, labels_train)
train_accuracy = svm_model_linear.score(features_test, labels_test)
test_accuracy = svm_model_linear.score(features_train, labels_train)
print("Train accuracy:", train_accuracy)
print("Test accuracy:",test_accuracy)   

Train accuracy: 0.7868852459016393
Test accuracy: 0.8884297520661157
