## k-NN

In [32]:
#import libraries
from sklearn.preprocessing import StandardScaler
from sklearn import neighbors, datasets

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, classification_report

from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [33]:
# Standardize the features.
# The scaler is fitted on the training split only; the statistics it learns
# there are then applied unchanged to both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [37]:
# Seed NumPy's global random number generator.
np.random.seed(211)

# Grid search over the k-NN hyper-parameters.
# Tuned parameters: k (n_neighbors, odd values 1..21) and the vote weighting.
knn_base = neighbors.KNeighborsClassifier(p=2, metric='minkowski')  # p=2 Minkowski distance
knn_param_grid = [{'n_neighbors': list(range(1, 22, 2)),
                   'weights': ['uniform', 'distance']}]

gs = GridSearchCV(estimator=knn_base,
                  param_grid=knn_param_grid,
                  scoring='accuracy',  # a single metric (accuracy) drives model selection
                  cv=5,                # 5-fold cross-validation
                  n_jobs=-1)           # run the candidate fits in parallel

# Fit every candidate on the standardized training features.
gs.fit(X_train_std, y_train)

In [38]:
# Report the winning grid-search candidate, one item per line:
# its mean cross-validated score, its hyper-parameters, and the fitted estimator.
print(gs.best_score_, gs.best_params_, gs.best_estimator_, sep='\n')

0.8482152785499834
{'n_neighbors': 11, 'weights': 'uniform'}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')


In [None]:
# Cross-validate the k-NN model selected by the grid search and report the
# mean and standard deviation of its accuracy across the folds.
#
# The hyper-parameters are read from `gs.best_params_` rather than copied by
# hand from the printed estimator, so this cell cannot drift out of sync with
# the grid search when the data or the parameter grid changes. The distance
# settings (p=2, Minkowski) mirror the base estimator given to the grid search.
clf3 = neighbors.KNeighborsClassifier(p=2, metric='minkowski', **gs.best_params_)

scores = cross_val_score(clf3, X_train_std, y_train, scoring='accuracy', cv=5)

# np.std is the standard deviation of the per-fold accuracies (not the variance).
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores),
                                      np.std(scores)))

## ROC

In [None]:
# Compare the three classifiers by ROC AUC (cross-validated on the training
# split) and by their ROC curves on the held-out test split.
clf_labels = ['Logistic regression', 'Decision tree', 'KNN']
all_clf = [clf1, clf2, clf3]

# The k-NN model (clf3) was tuned and cross-validated on the standardized
# features, and the ROC code below assumes normalized data, so every model is
# evaluated on X_train_std / X_test_std rather than on the raw features.
# NOTE(review): clf1 and clf2 are defined outside this section - confirm they
# are also meant to be trained on standardized features.
print('5-fold cross validation:\n')
for clf, label in zip(all_clf, clf_labels):  # For all classifiers
    scores = cross_val_score(estimator=clf,  # Estimate AUC based on cross-validation
                             X=X_train_std,
                             y=y_train,
                             cv=5,
                             scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"  # Print performance statistics based on cross-validation
          % (scores.mean(), scores.std(), label))

colors = ['orange', 'blue', 'green']   # One color per classifier
linestyles = [':', '--', '-.']         # One line style per classifier
for clf, label, clr, ls in zip(all_clf, clf_labels, colors, linestyles):

    # Assumes the label of the positive class is 1: column 1 of predict_proba
    # is used as the score for the ROC curve.
    y_pred = clf.fit(X_train_std,
                     y_train).predict_proba(X_test_std)[:, 1]  # Refit on the full training split, score the test split
    fpr, tpr, _ = roc_curve(y_true=y_test,     # Build ROC curve (thresholds are not needed)
                            y_score=y_pred)
    roc_auc = auc(x=fpr, y=tpr)                # Compute Area Under the Curve (AUC)
    plt.plot(fpr, tpr,                         # Plot ROC curve and create label with AUC value
             color=clr,
             linestyle=ls,
             label='%s (auc = %0.2f)' % (label, roc_auc))

plt.legend(loc='lower right')    # Where to place the legend
plt.plot([0, 1], [0, 1],         # Diagonal reference line: a random classifier
         linestyle='--',
         color='gray',
         linewidth=2)

plt.xlim([-0.1, 1.1])   # Limits for x axis
plt.ylim([-0.1, 1.1])   # Limits for y axis
plt.grid(alpha=0.5)
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')


#plt.savefig('ROC_all_classifiers', dpi=300)
plt.show()