## Modeling Heart Disease

#### Import libraries

In [82]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import (train_test_split, GridSearchCV)
from sklearn_pandas import DataFrameMapper
from sklearn.neighbors import (KNeighborsClassifier, NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
# from xgboost import XGBClassifier
from sklearn.metrics import (make_scorer, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score)
from sklearn.metrics import confusion_matrix
# from sklearn.decomposition import PCA


In [2]:
data = pd.read_csv('heart_clean.csv')

### Split data into training/test sets so test data does not influence z-score normalization

In [7]:
random_seed = 24

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns='target'), data.target,
                                                    test_size=0.3, stratify=data.target, random_state=random_seed)

### Normalize continuous data (z-score) and one hot encode categorical

In [9]:
#categorical = ['cp', 'restecg', 'slope', 'ca', 'thal']
#binary_cat = ['sex', 'fbs', 'exang'] ## 'target' is omitted
numerical = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [10]:
SS = preprocessing.StandardScaler()
#OH = preprocessing.OneHotEncoder(categories = 'auto', sparse=False)
mapper = DataFrameMapper([([n], SS) for n in numerical], default=None, df_out=True)

In [11]:
X_train = mapper.fit_transform(X_train.astype('float'))
X_test = mapper.transform(X_test.astype('float'))

In [29]:
nca = NeighborhoodComponentsAnalysis(random_state=random_seed)
knn = KNeighborsClassifier()
neighbors_pipe = Pipeline([('nca', nca), ('knn', knn)])

In [15]:
neighbors_pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('nca',
                 NeighborhoodComponentsAnalysis(callback=None, init='auto',
                                                max_iter=50, n_components=None,
                                                random_state=24, tol=1e-05,
                                                verbose=0, warm_start=False)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

In [16]:
print(neighbors_pipe.score(X_test, y_test))

0.46153846153846156


In [35]:
params = {'n_neighbors': [i for i in range(1, 10)], 'weights': ['uniform', 'distance'],
          'algorithm' : ['ball_tree', 'kd_tree', 'brute']}

In [88]:
model = GridSearchCV(knn, params, cv=5, return_train_score=True, iid=False, scoring = make_scorer(precision_score))

In [89]:
model.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid=False, n_jobs=None,
             param_grid={'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=make_scorer(precision_score), verbose=0)

In [90]:
print('Best parameters:', model.best_params_)
print('Best score:', '{:.3f}'.format(model.best_score_))

Best parameters: {'algorithm': 'ball_tree', 'n_neighbors': 2, 'weights': 'uniform'}
Best score: 0.854


In [91]:
y_pred = model.best_estimator_.predict(X_test)

In [92]:
print(confusion_matrix(y_test, y_pred))

[[16 26]
 [31 18]]
