In [126]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

import imblearn
from imblearn.over_sampling import RandomOverSampler

%matplotlib inline
# Fixed random seed for the notebook.
# NOTE(review): presumably passed as `random_state` to the train/test split,
# the sampler and the models further down — not visible in this cell, confirm.
RSEED = 42




## Apply classifier to data

In [None]:
def clf_train_predict(clf, train, train_label, test):
    """Fit ``clf`` on the training data and return its predictions for ``test``.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)`` and ``predict(X)``
        The classifier to train. It is fitted in this call.
    train : training features, forwarded unchanged to ``clf.fit``.
    train_label : training targets, forwarded unchanged to ``clf.fit``.
    test : features to predict on, forwarded unchanged to ``clf.predict``.

    Returns
    -------
    Whatever ``clf.predict(test)`` returns.
    """
    # The return value of ``fit`` is deliberately not used: the very same
    # ``clf`` object that was fitted is the one asked to predict.
    clf.fit(train, train_label)
    return clf.predict(test)

## Print simple metrics

In [None]:
def simple_metrics(test_label, y_pred):
    """Print accuracy and f1-score, then return the confusion matrix.

    Parameters
    ----------
    test_label : true labels, passed as the first argument to each metric.
    y_pred : predicted labels, passed as the second argument to each metric.

    Returns
    -------
    The result of ``confusion_matrix(test_label, y_pred)``. The matrix is
    returned only, not printed.

    Notes
    -----
    ``f1_score`` is called with its default arguments.
    """
    # Each metric is evaluated and printed in turn (accuracy first, f1 second),
    # so the accuracy line is emitted even if the f1 computation fails.
    reported_metrics = (
        ('accuracy_score', accuracy_score),
        ('f1-score', f1_score),
    )
    for label, metric_fn in reported_metrics:
        print(label, metric_fn(test_label, y_pred))

    return confusion_matrix(test_label, y_pred)

## Function to plot confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import itertools


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : 2-D array-like
        The confusion matrix. Rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis. Nested lists are accepted.
    classes : sequence
        Tick labels; the same labels are used on both axes.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing and
        plotting. A row whose sum is 0 is shown as all zeros instead of NaN.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.

    Returns
    -------
    None. A new 10x10 inch figure is created and left as the current figure.

    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    # Accept nested lists as well as arrays; everything below needs an ndarray.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; rows without any
        # samples stay at 0 rather than becoming NaN (and emitting a warning).
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Plot the confusion matrix
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)

    # Integer counts are printed as integers; anything else (normalized
    # values, or a float matrix passed in directly) gets two decimals, since
    # the 'd' format code raises on floats.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    thresh = cm.max() / 2.

    # Labeling the plot: white text on dark cells, black text on light cells.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # grid(None) *toggles* the grid, which switches it ON under the default
    # style; pass False to reliably turn it off.
    plt.grid(False)
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)
    # Run the layout pass last so the axis labels are included in it.
    plt.tight_layout()

# Confusion matrix
# How to use?
# cm = confusion_matrix(test_labels, rf_predictions)
# plot_confusion_matrix(cm, classes = ['Poor Health', 'Good Health'],
#                       title = 'Health Confusion Matrix')