In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import binarize
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Column headers for the CSV, which is read as having no header row of its own.
col_names = [
    'pregnant', 'glucose', 'bp', 'skin',
    'insulin', 'bmi', 'pedigree', 'age',
    'label',
]
# header=None tells pandas that the first line is data; names supplies the headers.
pima = pd.read_csv('pima-indians-diabetes.data', names=col_names, header=None)
pima.head()

In [None]:
# Only a subset of the columns is used as predictors; 'label' is the response.
feature_cols = ['pregnant', 'insulin', 'bmi', 'age']
X = pima.loc[:, feature_cols]
y = pima['label']

In [None]:
# Split into training and testing sets; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
# Logistic regression fitted on the training split.
# `multi_class` is intentionally not passed: the parameter is deprecated in recent
# scikit-learn releases, and with the liblinear solver the default setting resolves
# to the same one-vs-rest behaviour that the explicit 'ovr' requested.
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)

In [None]:
# Class predictions for the testing set, using the fitted model's default decision rule.
y_pred_class = logreg.predict(X_test)

In [None]:
# Classification accuracy: share of test rows whose predicted class equals the true class.
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))

## Weaknesses of the accuracy metric

In [None]:
# Null accuracy is the accuracy achieved by always predicting the most frequent class.
# First step: examine the class distribution of the testing set.
y_test.value_counts()

In [None]:
# Share of ones in the test labels (the mean equals that share when labels are coded 0/1).
y_test.mean()

In [None]:
# Share of zeros in the test labels: the complement of the share of ones.
1-y_test.mean()

In [None]:
# A model that always predicts 0 (patient doesn't have diabetes) would score the null
# accuracy. The original notes report about 67.7% for that baseline versus about 69%
# for this model, i.e. the model barely improves on the trivial predictor.
# One-line null accuracy; this shortcut only works for binary problems coded 0/1.
max(1 - y_test.mean(), y_test.mean())

In [None]:
# Null accuracy that also works for 3 or more classes:
# value_counts() sorts by frequency, so the first entry is the most frequent class.
y_test.value_counts().iloc[:1] / len(y_test)

In [None]:
# Side-by-side look at the first 25 true labels and the corresponding predictions.
print(f'True: {y_test.to_numpy()[:25]}')
print(f'Pred: {y_pred_class[:25]}')

## Confusion Matrix

In [None]:
# Confusion matrix: a table describing the performance of a classification model.
# For this two-class problem it is a 2x2 numpy array (true labels first, predictions second).
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class))

In [None]:
confusion = metrics.confusion_matrix(y_test, y_pred_class)
# Row-major flattening of the 2x2 matrix yields the cells [0, 0], [0, 1], [1, 0], [1, 1],
# i.e. TN, FP, FN, TP (rows index the actual class, columns the predicted class).
TN, FP, FN, TP = confusion.ravel()

### Metrics computed from a confusion matrix

In [None]:
# Classification accuracy: correct predictions divided by all predictions.
# The hand-computed value is printed next to scikit-learn's helper for comparison.
print((TP + TN) / confusion.sum())
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))

In [None]:
# Classification error (misclassification rate): wrong predictions divided by all predictions.
# Equivalently, one minus the accuracy.
print((FP + FN) / confusion.sum())
print(1 - metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))

In [None]:
# Sensitivity (also called true positive rate or recall).
# Question answered: when the actual value is positive, how often is the prediction correct?
print(TP / (FN + TP))
print(metrics.recall_score(y_true=y_test, y_pred=y_pred_class))

In [None]:
# Specificity.
# Question answered: when the actual value is negative, how often is the prediction correct?
print(TN / (FP + TN))
# Author's reading of the results: the classifier is highly specific but not very sensitive.

In [None]:
# False positive rate.
# Question answered: when the actual value is negative, how often is the prediction incorrect?
print(FP / (TN + FP))
print(1 - TN / (FP + TN))  # same quantity expressed as 1 - specificity

In [None]:
# Precision.
# Question answered: when a positive value is predicted, how often is the prediction correct?
print(TP / (FP + TP))
print(metrics.precision_score(y_true=y_test, y_pred=y_pred_class))
# The F1 score and the Matthews correlation coefficient can also be derived from a confusion matrix.

#### How to choose which metrics to optimize?
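It depends on the business objective: for example, when missing a positive case is costlier than raising a false alarm, optimize for sensitivity; when false alarms are the costlier error, optimize for specificity or precision.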

## Adjusting the classification threshold

In [None]:
# Show the first 10 predicted responses (class labels) for the testing set.
logreg.predict(X_test)[:10]

In [None]:
# Show the first 10 rows of predicted class-membership probabilities (one column per class).
logreg.predict_proba(X_test)[:10]

In [None]:
# Keep only the predicted probability of class 1 (column index 1 of predict_proba's output).
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

In [None]:
# Histogram of the predicted probabilities stored in y_pred_prob, using 8 bins.
plt.hist(y_pred_prob, bins=8)
plt.xlim(left=0, right=1)  # show the full 0-1 range on the x axis
plt.xlabel('Predicted probability of diabetes')
plt.title('Histogram of predicted probabilities')
plt.ylabel('Frequency')

In [None]:
# Lower the decision threshold for predicting diabetes in order to increase sensitivity:
# predict class 1 whenever the predicted probability of class 1 is greater than 0.3.
# Only the class-1 column is kept; thresholding both columns would also binarize the
# class-0 probabilities and would not give one prediction per test row.
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
# binarize() expects 2D input and takes `threshold` as a keyword argument, so the
# probabilities are reshaped into a single column and flattened back afterwards.
y_pred_class = binarize(y_pred_prob.reshape(-1, 1), threshold=0.3).ravel()

In [None]:
# Show the first 10 entries of y_pred_prob as assigned in the preceding cell.
y_pred_prob[:10]

In [None]:
# Show the first 10 predicted classes obtained with the lower threshold.
y_pred_class[:10]
# Expected effect of the lower threshold: sensitivity goes up and specificity goes down.
# NOTE(review): this cell does not compute either metric; recompute the confusion matrix to confirm.

## ROC curve and Area Under the Curve (AUC)
Inspect how sensitivity and specificity are affected by the choice of classification threshold, across all possible thresholds.


In [None]:
# IMPORTANT: always pass predicted probabilities (y_pred_prob) to roc_curve, never the
# predicted classes (y_pred_class).
# y_pred_prob is re-derived here as the class-1 column so this cell does not depend on
# what earlier cells left in that variable.
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)

In [None]:
# ROC curve: true positive rate plotted against false positive rate.
plt.plot(fpr, tpr)
plt.xlim(left=0.0, right=1.0)
plt.ylim(bottom=0.0, top=1.0)
plt.xlabel('False positive rate(1-Specificity)')
plt.ylabel('True positive rate(Sensitivity)')
plt.title('ROC curve for diabetes classifier')
plt.grid(True)

In [None]:
# Helper that reports sensitivity and specificity for a given classification threshold.
def evaluate_threshold(threshold):
    """Print the sensitivity and specificity of the ROC point selected by `threshold`.

    Reads the notebook-level arrays `fpr`, `tpr` and `thresholds` produced by
    `metrics.roc_curve`. Among the ROC points whose threshold is strictly
    greater than `threshold`, the last one is reported.
    """
    above = thresholds > threshold
    sensitivity = tpr[above][-1]
    specificity = 1 - fpr[above][-1]
    print('Sensitivity:', sensitivity)
    print('Specificity:', specificity)

In [None]:
# Sensitivity and specificity at a threshold of 0.5.
evaluate_threshold(threshold=0.5)

In [None]:
# Sensitivity and specificity at the lower threshold of 0.3.
evaluate_threshold(threshold=0.3)

In [None]:
# AUC is the percentage of the ROC plot that lies underneath the curve.
# It is useful as a single-number summary of classifier performance, and, unlike
# classification accuracy, it remains useful when there is high class imbalance.
metrics.roc_auc_score(y_test, y_pred_prob)

In [None]:
# Cross-validated AUC: mean of the 'roc_auc' score over 10 folds of the full data set.
cross_val_score(logreg, X, y, scoring='roc_auc', cv=10).mean()