In [1]:
# This demo covers using the ClassificationThesholdTuner tool, which helps
# identify and describe thresholds that may be used for classification
# problems. 

# For simplicity, this notebook uses synthetic data. Another notebook uses
# real data. 

In [2]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "..")
from threshold_tuner import ClassificationThesholdTuner

# Functions to generate synthetic predictions

In [3]:
# Total number of synthetic rows to generate. The generator functions below
# split this evenly between the two classes.
NUM_ROWS = 10_000_000

In [4]:
# The synthetic data consists only of true labels and predicted probabilities,
# as these are all that are needed by ClassificationThesholdTuner; it does
# not examine the original features. 

# Simple case with half the rows true 0 and half true 1. The predictions are
# well-separated. This is a basic smoke test, as any threshold in the 
# vicinity of 0.5 will perfectly separate the classes. 
def generate_data_1(num_rows=None):
    """Build a two-class synthetic data set with well-separated predictions.

    Half of the rows have true label 'A' and half have true label 'B'. Scores
    for class 'A' are drawn from Normal(0.5, 0.1) and scores for class 'B'
    from Normal(1.5, 0.1); they are raw scores, not yet scaled to [0, 1].

    Parameters
    ----------
    num_rows : int, optional
        Total number of rows to generate, split evenly between the two
        classes (an odd value yields num_rows - 1 rows). Defaults to the
        notebook-level NUM_ROWS.

    Returns
    -------
    tuple
        A DataFrame with columns "Y" (true label) and "Pred_Proba" (score),
        and the list of class labels ['A', 'B'].
    """
    if num_rows is None:
        num_rows = NUM_ROWS
    num_rows_per_class = num_rows // 2

    # Re-seed NumPy's global generator so every call returns the same data.
    np.random.seed(0)

    # Draw class 'A' first, then class 'B', and join them as arrays. Going
    # through Python lists for millions of floats is slow and memory hungry.
    scores = np.concatenate([
        np.random.normal(0.5, 0.1, num_rows_per_class),
        np.random.normal(1.5, 0.1, num_rows_per_class),
    ])
    labels = ['A'] * num_rows_per_class + ['B'] * num_rows_per_class

    d = pd.DataFrame({"Y": labels, "Pred_Proba": scores})
    return d, ['A', 'B']

# Similar to generate_data_1(), but the predictions overlap more
def generate_data_2(num_rows=None):
    """Build a two-class synthetic data set whose predictions overlap.

    Half of the rows have true label 'A' and half have true label 'B'. Scores
    for class 'A' are drawn from Normal(0.7, 0.3) and scores for class 'B'
    from Normal(1.4, 0.3); they are raw scores, not yet scaled to [0, 1].

    Parameters
    ----------
    num_rows : int, optional
        Total number of rows to generate, split evenly between the two
        classes (an odd value yields num_rows - 1 rows). Defaults to the
        notebook-level NUM_ROWS.

    Returns
    -------
    tuple
        A DataFrame with columns "Y" (true label) and "Pred_Proba" (score),
        and the list of class labels ['A', 'B'].
    """
    if num_rows is None:
        num_rows = NUM_ROWS
    num_rows_per_class = num_rows // 2

    # Re-seed NumPy's global generator so every call returns the same data.
    np.random.seed(0)

    # Draw class 'A' first, then class 'B', and join them as arrays. Going
    # through Python lists for millions of floats is slow and memory hungry.
    scores = np.concatenate([
        np.random.normal(0.7, 0.3, num_rows_per_class),
        np.random.normal(1.4, 0.3, num_rows_per_class),
    ])
    labels = ['A'] * num_rows_per_class + ['B'] * num_rows_per_class

    d = pd.DataFrame({"Y": labels, "Pred_Proba": scores})
    return d, ['A', 'B']


# Generate the test data

In [5]:
# Use generate_data_2(), whose class predictions overlap, and display the
# resulting frame of true labels and raw scores.
d, target_classes = generate_data_2()
d

Unnamed: 0,Y,Pred_Proba
0,A,1.229216
1,A,0.820047
2,A,0.993621
3,A,1.372268
4,A,1.260267
...,...,...
9999995,B,1.577063
9999996,B,1.540285
9999997,B,0.816586
9999998,B,1.296804


In [6]:
# Show the class labels returned alongside the data.
target_classes

['A', 'B']

In [7]:
# Ensure the probabilities are between 0.0 and 1.0. It is possible for the
# scores coming from MinMaxScaler to be slightly outside this range, so each
# column is clipped after scaling.

scaler = MinMaxScaler()
proba_cols = [x for x in d.columns if "Proba" in x]

for col_name in proba_cols:
    # MinMaxScaler expects a 2-D input; flatten the result back to 1-D and
    # clip it in a single vectorized call rather than looping over every row.
    d[col_name] = np.clip(
        scaler.fit_transform(d[col_name].to_numpy().reshape(-1, 1)).ravel(),
        0.0,
        1.0,
    )

In [8]:
# Start with a default cut-off of 0.5: scores above it are labelled "B",
# everything else "A". Then display the updated frame.

d["Pred"] = np.where(d["Pred_Proba"].gt(0.50), "B", "A")
d

Unnamed: 0,Y,Pred_Proba,Pred
0,A,0.545601,B
1,A,0.436101,A
2,A,0.482552,A
3,A,0.583884,B
4,A,0.553911,B
...,...,...,...
9999995,B,0.638691,B
9999996,B,0.628848,B
9999997,B,0.435175,A
9999998,B,0.563689,B


In [9]:
# Check the column types of the frame passed to the tuner.
d.dtypes

Y              object
Pred_Proba    float64
Pred           object
dtype: object

# Test the visualizations

In [10]:
# Create the tuner instance used by all of the cells below.
tuner = ClassificationThesholdTuner()

In [None]:
# Basic report computed from the predicted class labels only.

tuner.print_stats_labels(y_true=d["Y"], target_classes=target_classes, y_pred=d["Pred"])

In [None]:
# Basic report computed from the predicted probabilities. It includes a call
# to print_stats_labels() and adds further information derived from the
# probabilities.

tuner.print_stats_proba(y_true=d["Y"], target_classes=target_classes, y_pred_proba=d["Pred_Proba"])

In [None]:
# Tabulate precision & recall statistics across ranges of the predicted
# probabilities (10 ranges requested here).

tuner.print_stats_table(
    y_true=d["Y"], target_classes=target_classes, y_pred_proba=d["Pred_Proba"], num_ranges=10
)

# Tools to help adjust the threshold

In [None]:
# Plot the effect of each threshold in a set of candidate thresholds.
# No range is given, so the default set is used: 0.1, 0.2, 0.3 ... up to 0.9.

tuner.plot_by_threshold(y_true=d["Y"], target_classes=target_classes, y_pred_proba=d["Pred_Proba"])

In [None]:
# Plot a narrower set of thresholds, chosen explicitly through the
# start, end, and num_steps parameters.

tuner.plot_by_threshold(
    y_true=d["Y"],
    target_classes=target_classes,
    y_pred_proba=d["Pred_Proba"],
    start=0.50,
    end=0.55,
    num_steps=6,
)

In [None]:
# Comparing candidate thresholds means looking closely at the slices that
# lie between them. Five slices are requested here; the table also reports
# the range before the first and after the last, giving 7 slices in total.

tuner.describe_slices(
    y_true=d["Y"],
    target_classes=target_classes,
    y_pred_proba=d["Pred_Proba"],
    start=0.3,
    end=0.7,
    num_slices=5,
)

In [None]:
# Look more closely at a narrower range of candidate thresholds.

tuner.describe_slices(
    y_true=d["Y"],
    target_classes=target_classes,
    y_pred_proba=d["Pred_Proba"],
    start=0.4,
    end=0.6,
    num_slices=10,
)

In [None]:
# Narrow the range once more.

tuner.describe_slices(
    y_true=d["Y"],
    target_classes=target_classes,
    y_pred_proba=d["Pred_Proba"],
    start=0.5,
    end=0.55,
    num_slices=6,
)

# Select the optimal threshold

In [None]:
# Allow ClassificationThesholdTuner to identify the best threshold, optimizing
# for a specified metric. Here the metric is the macro-averaged F1 score,
# where a higher score is better.

best_threshold = tuner.tune_threshold(
    y_true=d['Y'],
    target_classes=target_classes,
    y_pred_proba=d["Pred_Proba"],
    metric=f1_score,
    average='macro',
    higher_is_better=True,
    max_iterations=5
)
best_threshold

In [None]:
# Turn the probabilities into class predictions using the tuned threshold.
# The third positional argument is passed as None.
tuned_pred = tuner.get_predictions(
    target_classes,
    d["Pred_Proba"],
    None,
    best_threshold,
)

# Peek at the first 10 predictions
tuned_pred[:10]

# Redisplay the metrics with the optimal threshold

In [None]:
# Re-label the rows with the optimized threshold and print the stats again.
# The F1 macro score rises from about 0.87 to about 0.88 (results will vary
# slightly each run).

d["Pred"] = np.where(d["Pred_Proba"].gt(best_threshold), "B", "A")

tuner.print_stats_labels(y_true=d["Y"], target_classes=target_classes, y_pred=d["Pred"])

In [None]:
# Run print_stats_proba() once more, now passing the tuned threshold.
# The threshold is only drawn on the plots, to give some understanding of
# where it sits.

tuner.print_stats_proba(
    y_true=d["Y"], target_classes=target_classes, y_pred_proba=d["Pred_Proba"], thresholds=best_threshold
)

# Tuning to maximize binary F1 Score

In [None]:
# Tune the threshold again, this time for the binary F1 score, where a higher
# score is better. In this case, the pos_label must be specified as well.

best_threshold = tuner.tune_threshold(
    y_true=d['Y'],
    target_classes=target_classes,
    y_pred_proba=d["Pred_Proba"],
    metric=f1_score,
    average='binary',
    pos_label='B',
    higher_is_better=True,
    max_iterations=5
)
best_threshold