In [None]:
## Importing Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

%matplotlib inline

In [None]:
# Synthetic two-class problem: 10,000 rows, class weights 0.8 / 0.2, fixed seed
X, y = make_classification(
    n_samples=10000,
    n_classes=2,
    weights=[0.8, 0.2],
    random_state=1,
)

In [None]:
# Hold out half of the rows for evaluation; the fixed seed keeps the split repeatable
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=2,
)

In [None]:
# Train a logistic-regression classifier on the training half
# (fit returns the estimator itself, so construction and fitting can be chained)
model = LogisticRegression().fit(trainX, trainy)

# predict_proba gives one column per class; keep column 1, the positive class
probs = model.predict_proba(testX)[:, 1]
probs

In [None]:
# Hard class labels predicted by the fitted model for the test rows;
# the bare name on the last line displays the array as the cell output
yhat = model.predict(testX)
yhat

In [None]:
## Accuracy
# Fraction of test rows whose predicted label equals the true label
test_accuracy = accuracy_score(testy, yhat)
print("Accuracy of the model: ", test_accuracy)

In [None]:
# Number of rows per class label in the full dataset (position i = count of label i)
np.bincount(y)

In [None]:
# Same class balance as above, expressed as fractions of all rows (normalize=True)
pd.DataFrame(y)[0].value_counts(normalize=True)

### Confusion matrix 

In [None]:
# Number of test rows per true class label (position i = count of label i)
np.bincount(testy)

In [None]:
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns, so label each axis explicitly instead of
# using the same names on both.
pd.DataFrame(
    confusion_matrix(testy, yhat),
    index=['Actual Negative', 'Actual Positive'],
    columns=['Predicted Negative', 'Predicted Positive'],
)

In [None]:
# True class counts in the test half, for comparison with the confusion matrix
actual_counts = np.bincount(testy)
print("Actual Split: ", actual_counts)

## Is Accuracy a Trap?

![title](img/trap.png)



## Confusion Matrix
![title](img/confusion.png)

------------

## Type 1 and Type 2 Error

![title](img/error.jpg)

In [None]:
# Raw 2x2 confusion-matrix counts as a DataFrame (rows: true class, columns: predicted class)
contigency = pd.DataFrame(
    data=confusion_matrix(y_true=testy, y_pred=yhat),
)
contigency

In [None]:
# Flatten the 2x2 table row by row: [0,0], [0,1], [1,0], [1,1]
# i.e. true negatives, false positives, false negatives, true positives
tn, fp, fn, tp = contigency.values.ravel()

## True Positive Rate/Sensitivity/Recall - proportion of actual positives which are predicted positive

###### Effectiveness of a classifier to identify positive labels

In [None]:
# Recall / true positive rate: share of the actual positives (fn + tp) predicted positive
recall = tp / (fn + tp)
tpr = recall
print(tpr)

## False Positive Rate/False Alarm Rate

##### It summarizes how often a positive class is predicted when the actual outcome is negative.

In [None]:
# False positive rate: share of the actual negatives (fp + tn) predicted positive
actual_negatives = fp + tn
fpr = fp / actual_negatives
print(fpr)

## Precision - proportion of predicted positives which are actual positive

##### It describes how good a model is at predicting the positive class

In [None]:
# Precision: share of the predicted positives (tp + fp) that are actually positive
predicted_positives = tp + fp
precision = tp / predicted_positives
print("precision:", precision)

## F1 Score - Harmonic Mean of Precision and Recall

![title](img/harmonic.jpg)

In [None]:
# Harmonic mean of the two rates: 2 divided by the sum of their reciprocals
reciprocal_sum = 1 / precision + 1 / recall
f1 = 2 / reciprocal_sum
print("F1 Score: ", f1)

In [None]:
### Classification report
# Text summary produced by sklearn for the true labels vs. the predicted labels
report = classification_report(testy, yhat)
print("Classification Report : \n\n", report)

## ROC Curve - Receiver Operating Characteristic

----
It is a plot of the false positive rate (x-axis) versus the true positive rate (y-axis) for a number of different candidate threshold values between 0.0 and 1.0. Put another way, it plots the false alarm rate versus the hit rate.

![title](img/roc1.png)

In [None]:
# Regenerate the data with equal class weights (0.5 / 0.5) for the ROC walkthrough
X, y = make_classification(
    n_samples=10000,
    n_classes=2,
    weights=[0.5, 0.5],
    random_state=1,
)

# Half of the rows for training, half for evaluation (fixed seed)
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=2,
)

# Fit logistic regression on the training half (fit returns the estimator)
model = LogisticRegression().fit(trainX, trainy)

# predict_proba gives one column per class; keep column 1, the positive class
probs = model.predict_proba(testX)[:, 1]
probs

In [None]:
# Hard class labels predicted by the current `model` for the test rows
yhat = model.predict(testX)

In [None]:
## Accuracy
# Fraction of test rows whose predicted label equals the true label
test_accuracy = accuracy_score(testy, yhat)
print("Accuracy of the model: ", test_accuracy)

In [None]:
# Area under the ROC curve, computed from the positive-class probabilities.
# Stored as `roc_auc` rather than `auc` so the `auc` function imported from
# sklearn.metrics is not overwritten by a float.
roc_auc = roc_auc_score(testy, probs)
print('AUC: %.3f' % roc_auc)

In [None]:
# ROC curve: false positive rate and true positive rate at each candidate threshold
fpr, tpr, thresholds = roc_curve(testy, probs)

# Diagonal reference line for a classifier with no skill
plt.plot([0, 1], [0, 1], linestyle='--', label='No skill')
# ROC curve of the fitted model
plt.plot(fpr, tpr, marker='.', label='Model')

# Label the figure so it can be read on its own
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend();

In [None]:
# Decision thresholds returned by roc_curve, one per point on the curve
thresholds

In [None]:
# Show both rate arrays from roc_curve: fpr is printed, tpr is displayed as the cell output
print(fpr)
tpr

In [None]:
### Random Guess example - Diagonal Line
# Seeded generator so the random baseline is identical on every Restart & Run All
rng = np.random.RandomState(0)

# One uniform score in [0, 1) per test row; the length follows testy rather than
# a hard-coded 5000, so it stays valid if the split size changes
y_rand_prob = rng.rand(testy.shape[0])

# Independent coin-flip labels (0 or 1), used for the accuracy comparison
y_rand = rng.randint(0, 2, testy.shape)

# Number of random scores below / at-or-above the 0.5 threshold
# (kept as the last expression so the counts are actually displayed)
np.bincount((y_rand_prob >= 0.5).astype(np.int64))

In [None]:
# Area under the ROC curve for the random scores.
# Stored as `roc_auc` rather than `auc` so the `auc` function imported from
# sklearn.metrics is not overwritten by a float.
roc_auc = roc_auc_score(testy, y_rand_prob)
print('AUC: %.3f' % roc_auc)

In [None]:
# Accuracy of the coin-flip labels against the true test labels
accuracy_score(y_true=testy, y_pred=y_rand)

In [None]:
# ROC curve of the random scores: false/true positive rate at each threshold
fpr, tpr, thresholds = roc_curve(testy, y_rand_prob)

# Diagonal reference line for a classifier with no skill
plt.plot([0, 1], [0, 1], linestyle='--', label='No skill')
# ROC curve of the random scores
plt.plot(fpr, tpr, marker='.', label='Random scores')

# Label the figure so it can be read on its own
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve - random guess')
plt.legend();

### ROC Space - Example

Consider the following models:
![title](img/ROC_explained.png)    


* PPV - Positive Predictive Value, i.e. Precision

-----------------

![title](img/roc.png)

## Precision-Recall Curve

In [None]:
### Precision Recall Curve ---
# precision-recall curve and f1
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from matplotlib import pyplot
# generate 2 class dataset
# `weights` are class proportions, so a balanced problem is [0.5, 0.5]
# (the previous [1, 1] summed to 2, which is not a valid set of proportions)
X, y = make_classification(n_samples=1000, n_classes=2, weights=[0.5, 0.5], random_state=1)
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2)
# fit a model
model = KNeighborsClassifier(n_neighbors=3)
model.fit(trainX, trainy)
# predict probabilities
probs = model.predict_proba(testX)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# predict class values
yhat = model.predict(testX)
# calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(testy, probs)
# calculate F1 score
f1 = f1_score(testy, yhat)
# calculate precision-recall AUC
# (named `pr_auc` so the imported `auc` function is not replaced by a float)
pr_auc = auc(recall, precision)
# calculate average precision score
ap = average_precision_score(testy, probs)
print('f1=%.3f auc=%.3f ap=%.3f' % (f1, pr_auc, ap))
# plot no skill: a classifier that always predicts positive has precision equal
# to the share of positives in the test labels
no_skill = testy.mean()
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No skill')
# plot the precision-recall curve for the model
pyplot.plot(recall, precision, marker='.', label='Model')
# label the figure so it can be read on its own
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.title('Precision-Recall curve')
pyplot.legend()
# show the plot
pyplot.show()

##### For an imbalanced dataset, the precision-recall curve is more meaningful than the ROC curve

In [None]:
# roc curve and auc on imbalanced dataset
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
# generate 2 class dataset
# NOTE(review): weights sum to 0.99; presumably [0.9, 0.1] was intended. Left as-is
# so this cell keeps generating the same data as the precision-recall cell below.
X, y = make_classification(n_samples=1000, n_classes=2, weights=[0.9,0.09], random_state=1)
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2)
# fit a model
model = KNeighborsClassifier(n_neighbors=3)
model.fit(trainX, trainy)
# predict probabilities
probs = model.predict_proba(testX)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
# (named `roc_auc` so the `auc` function from sklearn.metrics is not overwritten by a float)
roc_auc = roc_auc_score(testy, probs)
print('AUC: %.3f' % roc_auc)
# calculate roc curve
fpr, tpr, thresholds = roc_curve(testy, probs)
# plot no skill
pyplot.plot([0, 1], [0, 1], linestyle='--', label='No skill')
# plot the roc curve for the model
pyplot.plot(fpr, tpr, marker='.', label='Model')
# label the figure so it can be read on its own
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.title('ROC curve - imbalanced dataset')
pyplot.legend()
# show the plot
pyplot.show()

In [None]:
# precision-recall curve and auc
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from matplotlib import pyplot
# generate 2 class dataset
# NOTE(review): weights sum to 0.99; presumably [0.9, 0.1] was intended. Left as-is
# so this cell keeps generating the same data as the ROC cell above.
X, y = make_classification(n_samples=1000, n_classes=2, weights=[0.9,0.09], random_state=1)
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2)
# fit a model
model = KNeighborsClassifier(n_neighbors=3)
model.fit(trainX, trainy)
# predict probabilities
probs = model.predict_proba(testX)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# predict class values
yhat = model.predict(testX)
# calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(testy, probs)
# calculate F1 score
f1 = f1_score(testy, yhat)
# calculate precision-recall AUC
# (named `pr_auc` so the imported `auc` function is not replaced by a float)
pr_auc = auc(recall, precision)
# calculate average precision score
ap = average_precision_score(testy, probs)
print('f1=%.3f auc=%.3f ap=%.3f' % (f1, pr_auc, ap))
# plot no skill: a classifier that always predicts positive has precision equal
# to the share of positives in the test labels, so derive the level from testy
# instead of hard-coding 0.1
no_skill = testy.mean()
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No skill')
# plot the precision-recall curve for the model
pyplot.plot(recall, precision, marker='.', label='Model')
# label the figure so it can be read on its own
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.title('Precision-Recall curve - imbalanced dataset')
pyplot.legend()
# show the plot
pyplot.show()