In [145]:
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
import matplotlib.pyplot as plt

# The Importance of Balanced Data

## Introduction

In machine learning you typically want to build a model that does what it is suppose to do.

If you have an image classifier, you want your model to correctly classify images.

!["https://www.google.ca/url?sa=i&rct=j&q=&esrc=s&source=images&cd=&cad=rja&uact=8&ved=2ahUKEwjToc6I8pDdAhVNCDQIHayCA9wQjRx6BAgBEAU&url=https%3A%2F%2Fblog.acolyer.org%2F2016%2F04%2F20%2Fimagenet-classification-with-deep-convolutional-neural-networks%2F&psig=AOvVaw3_8QOTYYyy1ixeoaof-d_z&ust=1535584557428370"](../data/img/imagenet_example.png)

So how do you determine how well your model is at doing its job?

With classification, for example, people generally take the proportion of correctly classified images as the model's accuracy score.

$$Accuracy = \frac{\#\ of\ correctly\ classfied\ images}{\#\ of\ total\ classified\ images}$$

A high accuracy score usually equates to a good model.

!["https://plot.ly/~botevmg/5/imagenet-large-scale-visual-recognition-challenge-accuracy.png"](../data/img/imagenet_scores.png)

**BUT... this is not always true...**

## Welcome to healthcare

Let's say you develop a binary classification model that can determine whether a person is HIV+ or HIV-.

If this model is tested on every person in Canada and yields an overall accuracy score of 99.7%, would you consider it to be a successful model?

### **NO!**

The prevelence of HIV/AIDS in Canada is 212 people per 100,000 (0.212%) [1](https://en.wikipedia.org/wiki/HIV/AIDS_in_Canada). This means that 99.788% of Canada does not have HIV/AIDS.

If youur model **predicts every person to be HIV negative**, it would yield an **accuracy score of 99.788%**.

$$Accuracy = \frac{\#\ of\ correctly\ classfied\ images}{\#\ of\ total\ classified\ images}$$

$$Accuracy = \frac{100,000-212}{100,000}$$

$$Accuracy = 0.99788$$

We cannot always rely on general accuracy metrics to explain the success of our models. We need to make sure that our models do what we want it to do.

In [148]:
212/100000*100

0.212

## Import Data

The data source comes from [here](http://archive.ics.uci.edu/ml/datasets.html?task=cla&area=&type=&view=list).

In [96]:
data = pd.read_csv('../data/thoraric_surgery.csv')
display(data.head())
data.Risk1Yr.value_counts()

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,DGN2,2.88,2.16,PRZ1,F,F,F,T,T,OC14,F,F,F,T,F,60,F
1,DGN3,3.4,1.88,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51,F
2,DGN3,2.76,2.08,PRZ1,F,F,F,T,F,OC11,F,F,F,T,F,59,F
3,DGN3,3.68,3.04,PRZ0,F,F,F,F,F,OC11,F,F,F,F,F,54,F
4,DGN3,2.44,0.96,PRZ2,F,T,F,T,T,OC11,F,F,F,T,F,73,T


F    400
T     70
Name: Risk1Yr, dtype: int64

In [97]:
data.replace('F', 0, inplace=True)
data.replace('T', 1, inplace=True)

dummy_cols = ['DGN', 'PRE6', 'PRE14']
dummies = pd.get_dummies(data[dummy_cols])
display(dummies.head())

data = data.join(dummies)
data.drop(columns=dummy_cols, inplace=True)
data.head()

Unnamed: 0,DGN_DGN1,DGN_DGN2,DGN_DGN3,DGN_DGN4,DGN_DGN5,DGN_DGN6,DGN_DGN8,PRE6_PRZ0,PRE6_PRZ1,PRE6_PRZ2,PRE14_OC11,PRE14_OC12,PRE14_OC13,PRE14_OC14
0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
1,0,0,1,0,0,0,0,1,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,1,0,1,0,0,0
3,0,0,1,0,0,0,0,1,0,0,1,0,0,0
4,0,0,1,0,0,0,0,0,0,1,1,0,0,0


Unnamed: 0,PRE4,PRE5,PRE7,PRE8,PRE9,PRE10,PRE11,PRE17,PRE19,PRE25,...,DGN_DGN5,DGN_DGN6,DGN_DGN8,PRE6_PRZ0,PRE6_PRZ1,PRE6_PRZ2,PRE14_OC11,PRE14_OC12,PRE14_OC13,PRE14_OC14
0,2.88,2.16,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,3.4,1.88,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,2.76,2.08,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,3.68,3.04,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,2.44,0.96,0,1,0,1,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0


In [137]:
# X = data.iloc[:, :-1]
X = data.loc[:, data.columns!='Risk1Yr']
y = data['Risk1Yr']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

lr = LogisticRegression()
lr.fit(X=X_train, y = y_train)
predictions = lr.predict(X_test)

conf_mat = confusion_matrix(y_true=y_test, y_pred=predictions)

print(conf_mat)

print('Accuracy: {}'.format((conf_mat[0, 0] + conf_mat[1, 1]) / (np.sum(conf_mat))))

[[114   2]
 [ 24   1]]
Accuracy: 0.8156028368794326


In [128]:
score = precision_recall_fscore_support(y_true=y_test, y_pred=predictions, average='binary')
score

(0.3333333333333333, 0.04, 0.07142857142857142, None)

In [138]:
label_min_count = y.value_counts().min()
label_min = y.value_counts().idxmin()

data_max_label = data[data.Risk1Yr != label_min]
data_min_label = data[data.Risk1Yr == label_min]

data_max_downsample = data_max_label.sample(n = label_min_count, replace=False, random_state=1234)

data_balanced = data_min_label.append(data_max_downsample)

data_balanced.shape

(140, 28)

In [139]:
X = data_balanced.loc[:, data_balanced.columns!='Risk1Yr']
y = data_balanced['Risk1Yr']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

lr = LogisticRegression()
lr.fit(X=X_train, y = y_train)
predictions = lr.predict(X_test)

conf_mat = confusion_matrix(y_true=y_test, y_pred=predictions)

print(conf_mat)

print('Accuracy: {}'.format((conf_mat[0, 0] + conf_mat[1, 1]) / (np.sum(conf_mat))))

[[11 12]
 [ 6 13]]
Accuracy: 0.5714285714285714


In [140]:
score = precision_recall_fscore_support(y_true=y_test, y_pred=predictions, average='binary')
score

(0.52, 0.6842105263157895, 0.5909090909090909, None)

In [142]:
y = data['Risk1Yr']
label_max_count = y.value_counts().max()
label_max = y.value_counts().idxmax()

data_min_upsample = data_min_label.sample(n = label_max_count, replace=True, random_state=1234)

data_balanced = data_max_label.append(data_min_upsample)

data_balanced.shape

(800, 28)

In [143]:
X = data_balanced.loc[:, data_balanced.columns!='Risk1Yr']
y = data_balanced['Risk1Yr']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

lr = LogisticRegression()
lr.fit(X=X_train, y = y_train)
predictions = lr.predict(X_test)

conf_mat = confusion_matrix(y_true=y_test, y_pred=predictions)

print(conf_mat)

print('Accuracy: {}'.format((conf_mat[0, 0] + conf_mat[1, 1]) / (np.sum(conf_mat))))

[[85 47]
 [31 77]]
Accuracy: 0.675


In [144]:
score = precision_recall_fscore_support(y_true=y_test, y_pred=predictions, average='binary')
score

(0.6209677419354839, 0.7129629629629629, 0.6637931034482759, None)