In [1]:
from sklearn.datasets import load_breast_cancer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Loading datasets

In [2]:
# Feature matrix and binary target of the breast cancer dataset.
X, y = load_breast_cancer(return_X_y=True)

In [3]:
from sklearn.preprocessing import StandardScaler

# Hold out 33% of the examples for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, shuffle=True, random_state=42
)

# Learn the scaling statistics from the training data only...
scaler = StandardScaler()
scaler.fit(X_train)
# ...then apply that same fitted transformation to both partitions.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fitting the data

In [4]:
# Logistic regression classifier created with no arguments (library defaults).
lr = LogisticRegression()

In [5]:
# Train the classifier on the scaled training partition.
lr.fit(X=X_train, y=y_train)

# Making predictions

In [8]:
# Per-class probabilities for the test examples (one column per class).
scores = lr.predict_proba(X=X_test)
scores

array([[1.36008828e-01, 8.63991172e-01],
       [9.99977295e-01, 2.27049622e-05],
       [9.96080057e-01, 3.91994328e-03],
       [7.80003100e-04, 9.99219997e-01],
       [1.14294099e-04, 9.99885706e-01],
       [1.00000000e+00, 3.50087563e-10],
       [9.99999993e-01, 7.08653260e-09],
       [9.54663200e-01, 4.53368002e-02],
       [4.31753581e-01, 5.68246419e-01],
       [1.14096940e-03, 9.98859031e-01],
       [5.99262279e-02, 9.40073772e-01],
       [9.84076010e-01, 1.59239901e-02],
       [7.18535007e-03, 9.92814650e-01],
       [8.28290034e-01, 1.71709966e-01],
       [2.59614386e-03, 9.97403856e-01],
       [9.98448510e-01, 1.55149008e-03],
       [2.75117091e-03, 9.97248829e-01],
       [1.59757689e-05, 9.99984024e-01],
       [1.28262413e-06, 9.99998717e-01],
       [9.99997173e-01, 2.82651584e-06],
       [8.40555409e-02, 9.15944459e-01],
       [1.21140991e-02, 9.87885901e-01],
       [9.99999984e-01, 1.61112524e-08],
       [1.22092033e-04, 9.99877908e-01],
       [1.812191

# Functions for evaluating fp,tp and accuracy

In [None]:
def eval_fp_tp(actual, predicted):
    """Count the false positives and true positives of a binary labeling.

    Parameters
    ----------
    actual : array-like of shape (n,)
        Ground-truth labels; truthy values (1 / True) mark the positive class.
    predicted : array-like of shape (n,)
        Predicted labels with the same convention, e.g. ``scores > t``.

    Returns
    -------
    tuple of int
        ``(fp, tp)``: the number of examples predicted positive that are
        actually negative, and the number predicted positive that are
        actually positive.

    Raises
    ------
    ValueError
        If ``actual`` and ``predicted`` do not have the same shape.
    """
    actual = np.asarray(actual).astype(bool)
    predicted = np.asarray(predicted).astype(bool)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"shape mismatch: actual {actual.shape} vs predicted {predicted.shape}"
        )
    # Plain ints so the counts can be used directly as plot coordinates.
    fp = int(np.count_nonzero(predicted & ~actual))
    tp = int(np.count_nonzero(predicted & actual))
    return fp, tp

In [None]:
def eval_accuracy(actual, predicted):
    """Return the fraction of examples whose predicted label matches the actual one.

    Parameters
    ----------
    actual : array-like of shape (n,)
        Ground-truth binary labels; truthy values (1 / True) mark the positive class.
    predicted : array-like of shape (n,)
        Predicted binary labels with the same convention, e.g. ``scores > t``.

    Returns
    -------
    float
        Accuracy in the range [0, 1].

    Raises
    ------
    ValueError
        If the inputs have different shapes or are empty (accuracy is
        undefined without examples).
    """
    actual = np.asarray(actual).astype(bool)
    predicted = np.asarray(predicted).astype(bool)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"shape mismatch: actual {actual.shape} vs predicted {predicted.shape}"
        )
    if actual.size == 0:
        raise ValueError("cannot compute accuracy of an empty labeling")
    return float(np.mean(actual == predicted))

# fp,tp and accuracy evaluations for different thresholds

Given the scores for the test cases, we might want to find the best possible threshold for classification, i.e., the real value $t$ such that `scores >` $t$ gives the best classification of the examples.

Let us start by considering 100 possible thresholds in the range $[0,1]$:

In [None]:
# 100 evenly spaced thresholds: 0.00, 0.01, ..., 0.99.
# They are built by dividing integers so that 0.5 is represented exactly;
# later cells select the row with `performances[:, 1] == 0.5`.
thresholds = np.arange(100) / 100

and compute the tp, fp, and accuracy values of the labelings obtained by comparing the scores with those thresholds.

In [None]:
performances = []
fps, tps = [], []
# Loop in which we collect the false positives and the true positives
# (`fps`, `tps`), one entry per threshold.
# TODO: the `...` below is still a placeholder and does nothing, so
# `performances`, `fps` and `tps` stay empty until the loop is written.
...

# Intended layout of `performances`: one (acc, t, fp, tp) row per threshold,
# i.e. accuracy, threshold, false positives, true positives.

performances = np.array(performances)

# Plotting

Let us then start plotting the coverage plot for the obtained classifications.

In [None]:
# Coverage plot: false positives on the x axis, true positives on the y axis,
# one point per threshold.
plt.plot(fps, tps)
plt.xlabel("False positives")
plt.ylabel("True positives")
plt.title("Coverage plot");


# Checking performances for threshold 0.5

The `predict_proba` method we used to get the scores returns, for each example, one probability per class; the second column (`scores[:, 1]`) is the probability that the example belongs to the positive class. Usually the positive class is then predicted when this probability is > 0.5 (since in this case it is the class with the largest likelihood).

Let's then see where this classifier (i.e., the one obtained by setting the threshold to 0.5) lies in the coverage plot and whether there are better options.

**Note**: since we saved the interesting stats in the `performances` array, we can retrieve the (fp, tp) position of the classifier we get by setting the threshold to 0.5. We first find the row we are interested in with the expression `performances[:,1] == 0.5`, and then use the resulting boolean vector to retrieve the matching row of the matrix: `performances[performances[:,1] == 0.5]`.

In [None]:
# Coverage curve over all thresholds.
plt.plot(fps, tps)

# Select the row of `performances` whose threshold column is 0.5 and unpack it.
at_half = performances[:, 1] == 0.5
accuracy, threshold, fp, tp = performances[at_half][0]

# Mark that classifier, and draw a slope-1 segment through it for comparison
# with the other points of the curve.
plt.scatter(fp, tp, color='red')
plt.plot([fp - 10, fp + 10], [tp - 10, tp + 10], color="red")

As shown by the red dot and the red line, threshold 0.5 is a good one, but apparently two other points reach a better classification.

Let us see where these points lie in the plot and what their accuracy is.

In [None]:
# Boolean indexing keeps two dimensions even when a single row matches,
# so the accuracy is read from row 0, column 0 of the selected sub-matrix.
half_rows = performances[performances[:, 1] == 0.5]
perf05 = half_rows[0, 0]

# Rows whose accuracy is strictly higher than the one obtained at threshold 0.5.
performances[performances[:, 0] > perf05]

The two points that we are looking for are then at positions (5,121) and (1,117).

In [None]:
plt.plot(fps, tps)
# Classifier obtained with threshold 0.5. `scores` comes from predict_proba and
# has one column per class, so only the positive-class column is thresholded;
# the ground truth for the test examples is `y_test`.
fp, tp = eval_fp_tp(y_test, scores[:, 1] > 0.5)
plt.scatter(fp, tp, color="red")
# The two classifiers found to have a higher accuracy than the 0.5 threshold.
plt.scatter(5, 121, color="orange")
plt.scatter(1, 117, color="orange")
# Slope-1 segment through the 0.5-threshold point, for visual comparison.
plt.plot([fp - 10, fp + 10], [tp - 10, tp + 10], color="red")
plt.xlabel("False positives")
plt.ylabel("True positives");

These two points (which we found by looking only at the accuracies) are indeed the two points that the plot shows to have a better accuracy.