# Python Machine Learning for Biology
# Evaluation Metrics

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

So far, we have used:
* **Regression problems**: Mean Absolute Error, Mean Squared Error, Root Mean Squared Error
* **Classification problems**: Classification accuracy (so far). There are many other ways to evaluate classifiers, and we'll learn them now.

### Classification Accuracy

#### Let's get the classification accuracy of a logistic regression fit to the cancer dataset
Classification accuracy is the proportion of correct predictions.

In [2]:
# Load the cancer dataset from a CSV file (path is relative to the notebook's working directory).
cancer = pd.read_csv("data/cancer.csv")

In [3]:
# Feature matrix: every column except the first, as a NumPy array.
# NOTE(review): assumes the 'diagnosis' label is column 0 so that it is excluded here -- confirm against the CSV.
X = cancer.iloc[:, 1:].values

In [4]:
# Target vector: the raw 'diagnosis' column as a NumPy array.
y = cancer['diagnosis'].values

In [5]:
# Encoder that maps class labels to consecutive integer codes.
le = LabelEncoder()

In [6]:
# Preview the integer-encoded labels. The result is only displayed, not assigned,
# so `y` itself keeps its original values.
le.fit_transform(y)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,

In [7]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 5)

In [8]:
# Logistic regression classifier with scikit-learn's default settings.
logreg = LogisticRegression()

In [9]:
# Fit the classifier on the training split only.
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Predict a class label for every row of the test set.
y_pred = logreg.predict(X_test)

In [11]:
# Classification accuracy: proportion of test rows whose predicted label equals the true label.
print(metrics.accuracy_score(y_test, y_pred))

0.9736842105263158


### Null Accuracy
The accuracy that can be achieved by always predicting the most frequent class

In [None]:
# Separate encoder used to turn the test labels and the predictions into integer codes.
class_le = LabelEncoder()

In [None]:
# Fit the encoder ONCE on the labels present in both arrays so the true values
# and the predictions share a single label -> integer mapping. Calling
# fit_transform separately on each array refits the encoder each time, which
# would assign different codes if the predictions contained fewer classes
# than the true labels (e.g. a classifier that only ever predicts one class).
class_le.fit(np.concatenate([y_test, y_pred]))
y_test = class_le.transform(y_test)
y_pred = class_le.transform(y_pred)

In [None]:
# Wrap both label arrays in pandas Series so Series methods
# (value_counts, mean) can be used in the cells below.
y_test, y_pred = pd.Series(y_test), pd.Series(y_pred)

In [None]:
# examine the class distribution of the testing set (using a Pandas Series method);
# value_counts() lists the most frequent class first
y_test.value_counts()

In [None]:
# calculate the fraction of ones (the mean of the labels, when they are coded 0/1)
y_test.mean()

In [None]:
# calculate the fraction of zeros (the complement of the fraction of ones, for 0/1 labels)
1 - y_test.mean()

In [None]:
# calculate null accuracy (for binary classification problems coded as 0/1):
# the larger of the two class fractions, i.e. the accuracy of always predicting the majority class
max(y_test.mean(), 1 - y_test.mean())

In [None]:
# calculate null accuracy (for multi-class classification problems):
# value_counts() sorts by frequency, so head(1) is the most frequent class;
# dividing by the number of test rows turns its count into a proportion
y_test.value_counts().head(1) / len(y_test)

In [None]:
# print the first 25 true and predicted responses
# (.values on BOTH Series so each row prints as a plain array; slicing the
# Series directly would print its index and dtype as well)
print('True:', y_test.values[0:25])
print('Pred:', y_pred.values[0:25])

* Classification accuracy is the easiest classification metric to understand
* But, it does not tell you the underlying distribution of response values
* And, it does not tell you what "types" of errors your classifier is making

### Confusion matrix
Table that describes the performance of a classification model

<img src="assets/confusionmatrix.png"/>

#### Get the confusion matrix for the logistic regression of the cancer dataset

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# IMPORTANT: first argument is true values, second argument is predicted values
# (rows of the result are the true classes, columns are the predicted classes)
print(metrics.confusion_matrix(y_test, y_pred))

* Every observation in the testing set is represented in exactly one box
* It's a 2x2 matrix because there are 2 response classes
* It tallies how many of each of the two types of correct predictions and the two types of incorrect predictions were made
#### Basic terminology

* **True Positives (TP):** we correctly predicted that the sample is malignant
* **True Negatives (TN):** we correctly predicted that the sample is benign
* **False Positives (FP):** we incorrectly predicted that the sample is malignant (a "Type I error")
* **False Negatives (FN):** we incorrectly predicted that the sample is benign (a "Type II error")

In [None]:
# show the first 25 true labels next to the first 25 predictions
for row_label, labels in (('True:', y_test), ('Pred:', y_pred)):
    print(row_label, labels.values[0:25])

In [None]:
# save the confusion matrix and pull out its four cells
# (row index = true class, column index = predicted class)
confusion = metrics.confusion_matrix(y_test, y_pred)
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

### Metrics Calculated from the Confusion Matrix
<img src="assets/confusionmatrixmetrics.png"/>

**Classification accuracy:** Overall, how often is the classifier correct?

In [None]:
# accuracy by hand (correct predictions / all predictions), then the library value for comparison
n_total = float(TP + TN + FP + FN)
print((TP + TN) / n_total)
print(metrics.accuracy_score(y_test, y_pred))

**Classification Error:** Overall, how often is the classifier incorrect?
* Also known as "Misclassification Rate"

In [None]:
# misclassification rate by hand (wrong predictions / all predictions), then as 1 - accuracy
n_total = float(TP + TN + FP + FN)
print((FP + FN) / n_total)
print(1 - metrics.accuracy_score(y_test, y_pred))

**Sensitivity:** When the actual value is positive, how often is the prediction correct?
* How "sensitive" is the classifier to detecting positive instances?
* Also known as "True Positive Rate" or "Recall"

In [None]:
# sensitivity by hand (true positives / all actual positives), then the library's recall for comparison
n_actual_positive = float(TP + FN)
print(TP / n_actual_positive)
print(metrics.recall_score(y_test, y_pred))

**Specificity:** When the actual value is negative, how often is the prediction correct?
* How "specific" (or "selective") is the classifier in predicting positive instances?

In [None]:
# specificity: true negatives / all actual negatives
n_actual_negative = float(TN + FP)
print(TN / n_actual_negative)

**False Positive Rate:** When the actual value is negative, how often is the prediction incorrect?

In [None]:
# false positive rate: false positives / all actual negatives (equals 1 - specificity)
n_actual_negative = float(TN + FP)
print(FP / n_actual_negative)

**Precision:** When a positive value is predicted, how often is the prediction correct?
* How "precise" is the classifier when predicting positive instances?

In [None]:
# precision by hand (true positives / all predicted positives), then the library value for comparison
n_predicted_positive = float(TP + FP)
print(TP / n_predicted_positive)
print(metrics.precision_score(y_test, y_pred))

### Demystifying Precision and Recall
<img src="assets/precisionrecall.png"/>

#### Low Recall, Low Precision
<img src="assets/lowPlowR.png"/>
<img src="assets/legen.jpeg"/>
* Let’s say everything inside the solid lines are pictures of actual hot dogs. 
* Everything within the dotted line is what the model thought was a picture of hot dogs.
* Everything in the square is the entire dataset. 
* True negatives (denoted tn) are samples in your data which you correctly classified as not belonging to your class. E.g. your “hot dog” vs “not hot dog” image classifier correctly classified your image of a car as not being a “hot dog”.
* False negatives (denoted fn) are samples in your data which you incorrectly classified as not belonging to your class. E.g. your “hot dog” vs “not hot dog” image classifier incorrectly classified an image of a messed up “hot dog” as not being a “hot dog”.
* True positives (denoted tp) are samples in your data which you correctly classified as belonging to your class. E.g. your “hot dog” vs “not hot dog” classifier correctly classifies a “hot dog” as being a “hot dog”.
* False positives (denoted fp) are samples in your data which you incorrectly classified as belonging to your class. E.g. your “hot dog” vs “not hot dog” classifier incorrectly classifies a hamburger as being a “hot dog”.

#### High Recall, Low Precision
<img src="assets/highRlowP.png"/>
* Our classifier casts a very wide net, catches a lot of fish, but also a lot of other things.
* Our classifier thinks a lot of things are “hot dogs”; legs on beaches, fries and whatnot. 
* However it also thinks a lot of “hot dogs” are “hot dogs”. 
* So from our set of images we got a lot of images classified as “hot dogs”; many of them were in the set of actual “hot dogs”, but a lot of them were also “not hot dogs”.

#### Low Recall, High Precision
<img src="assets/lowRhighP.png"/>
* Our classifier casts a very small but highly specialized net, does not catch a lot of fish, but there is almost only fish in the net.
* Our classifier is very picky, and does not think many things are hot dogs. 
* All the images it thinks are “hot dogs”, are really “hot dogs”. 
* However it also misses a lot of actual “hot dogs”, because it is so very picky. 

#### High Recall, High Precision
<img src="assets/highRhighP.png"/>
* The holy grail, our fish net is wide and highly specialised. 
* We catch a lot of fish (almost all of it) and we almost get only fish, nothing else.
* Our classifier is very good, it is very picky, but still it gets almost all of the images of “hot dogs” which are “hot dogs” correct. 
* We are happy!

Many other metrics can be computed as well: F1 score, Matthews correlation coefficient, etc.

**Conclusion:**
* Confusion matrix gives you a more complete picture of how your classifier is performing
* Also allows you to compute various classification metrics, and these metrics can guide your model selection

**Which metrics should you focus on?**
* Choice of metric depends on your objective
* For example, a spam filter (positive class is "spam"): Optimize for precision or specificity because false negatives (spam goes to the inbox) are more acceptable than false positives (non-spam is caught by the spam filter)
* Or for fraudulent transaction detector (positive class is "fraud"): Optimize for sensitivity because false positives (normal transactions that are flagged as possible fraud) are more acceptable than false negatives (fraudulent transactions that are not detected)

***Which matters more for your research problem?***

### Adjusting the classification threshold
Like a metal detector being adjusted to look for larger and smaller bits of metal.

In [None]:
# first 10 predicted class labels for the test set
logreg.predict(X_test)[0:10]

In [None]:
# first 10 rows of predicted probabilities:
# each row is an observation and each column is the probability that it
# belongs to that class (the columns of a row add up to 1)
logreg.predict_proba(X_test)[0:10, :]

So `.predict` is using these probabilities to choose which class it predicts (if greater than 50%). If we adjust this threshold, we can adjust the sensitivity and specificity. 

In [None]:
# print the first 10 predicted probabilities for class 1
# (column index 1 of the predict_proba output)
logreg.predict_proba(X_test)[0:10, 1]

In [None]:
# store the predicted probabilities for class 1 for every test-set row
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

In [None]:
# histogram of the class-1 predicted probabilities, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.hist(y_pred_prob, bins=8)
ax.set_xlim(0, 1)
ax.set_title('Histogram of predicted probabilities')
ax.set_xlabel('Predicted probability')
ax.set_ylabel('Frequency')

#### Decrease the threshold for predicting malignancy to increase the sensitivity of the classifier

In [None]:
# predict malignant if the predicted probability is greater than 0.3
from sklearn.preprocessing import binarize
# `threshold` is passed by keyword: current scikit-learn releases declare it
# keyword-only and reject a positional value.
# binarize expects a 2-D input, hence the wrapping list and the trailing [0].
y_pred = binarize([y_pred_prob], threshold=0.3)[0]

In [None]:
# print the first 10 predicted probabilities (class 1) to compare against the thresholded classes
y_pred_prob[0:10]

In [None]:
# print the first 10 predicted classes with the lower threshold
# (y_pred now holds the 0.3-threshold predictions, not the default ones)
y_pred[0:10]

In [None]:
# previous confusion matrix (default threshold of 0.5), as saved in `confusion` earlier
print(confusion)

In [None]:
# new confusion matrix (threshold of 0.3), recomputed from the re-thresholded y_pred
print(metrics.confusion_matrix(y_test, y_pred))

* Threshold of 0.5 is used by default (for binary problems) to convert predicted probabilities into class predictions
* Threshold can be adjusted to increase sensitivity or specificity
* Sensitivity and specificity have an inverse relationship

## Independent Exercise
* Calculate the confusion matrix for the diabetes dataset after doing a logistic regression.


**Bonus:** try adjusting the classification threshold. Does it change the sensitivity and specificity?

### ROC Curves and Area Under the Curve (AUC)
*Wouldn't it be nice if we could see how sensitivity and specificity are affected by various thresholds, without actually changing the threshold?*

**ROC curves to the rescue!**

**Receiver Operating Characteristic (ROC) curves** plot the true positive rate against the false positive rate across all classification thresholds. A diagonal line would be the same as random guessing. If the ROC curve falls below the diagonal, it is worse than random guessing. A perfect classifier would fall on the edge of the top left corner (true positive rate of 1, false positive rate of 0). The **Area Under the Curve (AUC)** score summarizes the performance of the model in a single number.

<img src="assets/roc.jpg"/>

#### Find the AUC for the cancer dataset
AUC is the percentage of the ROC plot that is underneath the curve:

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score

In [None]:
# AUC must be computed from the predicted probabilities (a continuous score),
# not from thresholded class labels: with hard 0/1 predictions the ROC curve
# collapses to a single operating point and the score no longer reflects
# how well the model ranks positives above negatives.
print(roc_auc_score(y_true=y_test, y_score=y_pred_prob))

In [None]:
# NOTE: y_pred was reassigned earlier to the 0.3-threshold predictions, so this
# is the accuracy at that threshold rather than at the default 0.5.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

* AUC is useful as a single number summary of classifier performance.
* If you randomly chose one positive and one negative observation, AUC represents the likelihood that your classifier will assign a higher predicted probability to the positive observation.
* AUC is useful even when there is high class imbalance (unlike classification accuracy).

In [None]:
# IMPORTANT: roc_curve takes the true values first and the predicted probabilities second
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)

# draw the curve on an explicit Axes, with both axes clipped to the unit interval
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_title('ROC curve for cancer classifier')
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.grid(True)

* ROC curve can help you to choose a threshold that balances sensitivity and specificity in a way that makes sense for your particular context
* You can't actually see the thresholds used to generate the curve on the ROC curve itself

In [None]:
# define a function that accepts a threshold and prints sensitivity and specificity
def evaluate_threshold(threshold, fpr_values=None, tpr_values=None, threshold_values=None):
    """Print the sensitivity and specificity at a given classification threshold.

    Parameters
    ----------
    threshold : float
        Probability cut-off to evaluate.
    fpr_values, tpr_values, threshold_values : array-like, optional
        Parallel arrays in the layout returned by ``metrics.roc_curve``.
        When omitted, the notebook-level ``fpr``, ``tpr`` and ``thresholds``
        variables are used, so existing ``evaluate_threshold(0.5)`` calls
        keep working.

    Returns
    -------
    None
        The two values are printed, not returned.
    """
    if fpr_values is None:
        fpr_values = fpr
    if tpr_values is None:
        tpr_values = tpr
    if threshold_values is None:
        threshold_values = thresholds

    # ROC points whose threshold is strictly above the requested cut-off; the
    # last such point is the operating point that the cut-off corresponds to.
    above = np.asarray(threshold_values) > threshold
    if above.any():
        sensitivity = np.asarray(tpr_values)[above][-1]
        specificity = 1 - np.asarray(fpr_values)[above][-1]
    else:
        # No ROC threshold exceeds the cut-off (previously an IndexError):
        # report the "nothing predicted positive" corner of the curve.
        sensitivity, specificity = 0.0, 1.0

    print('Sensitivity:', sensitivity)
    print('Specificity:', specificity)

In [None]:
# sensitivity and specificity at the default 0.5 cut-off
evaluate_threshold(0.5)

In [None]:
# sensitivity and specificity at the lowered 0.3 cut-off
evaluate_threshold(0.3)

### Independent Practice
Plot the ROC Curve and calculate the AUC for the diabetes dataset for a logistic regression. 

*What classification threshold do you recommend?*

**Confusion matrix advantages:**
* Allows you to calculate a variety of metrics
* Useful for multi-class problems (more than two response classes)


**ROC/AUC advantages:**
* Does not require you to set a classification threshold
* Still useful when there is high class imbalance

## Diagnostics: Learning and Validation Curves

### Diagnosing bias and variance with learning curves

Learning curves plot the number of training samples and accuracy for both training and validation.  

Models with high bias (low accuracy on both the training and validation sets) are underfit. Increasing the number of model parameters or decreasing regularization can help.  

Models with high variance (a big gap between training and validation accuracy) are overfit. Collecting more data, reducing the complexity of the model, or increasing regularization can help.
<img src="assets/learningcurves.png"/>

#### Plot the learning curves for the cancer data

In [None]:
import matplotlib.pyplot as plt
# learning_curve lives in sklearn.model_selection (the module train_test_split
# is imported from at the top of this notebook); the old sklearn.learning_curve
# module no longer exists in current scikit-learn releases.
from sklearn.model_selection import learning_curve
from sklearn.pipeline import Pipeline

In [None]:
# Two-step pipeline: standardize the features ('scl'), then fit an
# L2-penalized logistic regression ('clf'). Chaining them means the scaler is
# fitted together with the classifier each time the pipeline is fitted.
pipe_lr = Pipeline([
    ('scl', StandardScaler()), 
    ('clf', LogisticRegression(penalty='l2', random_state=0))])

In [None]:
# Learning curve: score the pipeline with 10-fold cross-validation at ten
# training-set sizes, from 10% to 100% of the training data.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=1,
)

# Mean and standard deviation across the CV folds (axis 1) at each size.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

# Mean curves with a +/- one standard deviation band around each.
fig, ax = plt.subplots()
ax.plot(train_sizes, train_mean,
        color='blue', marker='o', markersize=5, label='training accuracy')
ax.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                alpha=0.15, color='blue')
ax.plot(train_sizes, test_mean,
        color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
ax.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                alpha=0.15, color='green')
ax.grid()
ax.set_xlabel('Number of training samples')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
ax.set_ylim([0.8, 1.0])
plt.show()

How does our model look?

### Addressing overfitting and underfitting with validation curves

Validation curves are like learning curves, but instead of plotting training and validation accuracies as a function of sample size, we vary the values of the model parameters. 

#### Plot validation curves for the cancer data

In [None]:
# validation_curve lives in sklearn.model_selection; the old
# sklearn.learning_curve module no longer exists in current scikit-learn releases.
from sklearn.model_selection import validation_curve

In [None]:
# Candidate values for the parameter being varied (C), in powers of ten from 0.001 to 100.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

In [None]:
# Score the pipeline with 10-fold cross-validation at each value in param_range.
# 'clf__C' addresses the C parameter of the pipeline step named 'clf'.
train_scores, test_scores = validation_curve(
    estimator = pipe_lr, 
    X = X_train,
    y=y_train,
    param_name = 'clf__C',
    param_range = param_range,
    cv=10)

In [None]:
# mean training score across the CV folds (axis 1) for each parameter value
train_mean =np.mean(train_scores, axis=1)

In [None]:
# spread of the training score across the CV folds, used for the shaded band
train_std = np.std(train_scores, axis = 1)

In [None]:
# mean validation score across the CV folds (axis 1) for each parameter value
test_mean = np.mean(test_scores, axis=1)

In [None]:
# spread of the validation score across the CV folds, used for the shaded band
test_std = np.std(test_scores, axis = 1)

In [None]:
# Validation curve: mean training and validation scores against C, each with
# a +/- one standard deviation band, on a logarithmic x axis.
fig, ax = plt.subplots()
ax.plot(param_range, train_mean,
        color='blue', marker='o', markersize=5, label='training accuracy')
ax.fill_between(param_range, train_mean + train_std, train_mean - train_std,
                alpha=0.15, color='blue')
ax.plot(param_range, test_mean,
        color='green', marker='s', markersize=5, label='validation accuracy')
ax.fill_between(param_range, test_mean + test_std, test_mean - test_std,
                alpha=0.15, color='green')
ax.grid()
ax.set_xscale('log')
ax.legend(loc='lower right')
ax.set_xlabel('Parameter C')
ax.set_ylabel('Accuracy')
ax.set_ylim([0.8, 1.0])
plt.show()

Which levels of C seem best?