In [None]:
import os
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import assessment
import calibration
import utils
from utils import logit, sigmoid
# Use the "default" warning action for every category: each distinct warning
# is printed the first time it is hit at a given location instead of being
# hidden, so deprecations in the numerical stack stay visible.
warnings.filterwarnings("default")

# IPython magics (not plain Python):
#   - autoreload mode 2 re-imports modules before each cell runs, so edits to
#     the project-local modules imported above are picked up without a restart.
#   - "inline" renders matplotlib figures directly below the cell.
%load_ext autoreload
%matplotlib inline
%autoreload 2

# Set plotting aesthetics

mpl.rcParams.update({'font.size': 15})
mpl.rcParams.update({"axes.grid" : True, "grid.linestyle": '--', 
                     "grid.alpha": 0.8, "grid.color": "black"})
mpl.rcParams.update({"lines.linewidth" : 3})

# matplotlib 3.6 renamed the bundled seaborn style sheets with a
# "seaborn-v0_8-" prefix and 3.8 removed the old names. Prefer the new name
# when this matplotlib ships it, and fall back to the historical name so older
# installations behave exactly as before.
_colorblind_style = ('seaborn-v0_8-colorblind'
                     if 'seaborn-v0_8-colorblind' in mpl.style.available
                     else 'seaborn-colorblind')
mpl.style.use(_colorblind_style)

In [None]:
from scipy.stats import norm

# Number of points drawn for training, for calibration and for the test stream.
n_train, n_val, n_test = 1000, 500, 4500

# Seed NumPy's global generator so the simulated stream is reproducible.
np.random.seed(0)

def regression_function(x, bias):
    """Return P(Y = 1 | X = x) for a two-class Gaussian model.

    The class-0 density is a standard normal evaluated at ``x`` and the
    class-1 density is a standard normal evaluated at ``x - 2``; ``bias`` is
    the prior weight given to class 1 and ``1 - bias`` the weight of class 0.

    Parameters
    ----------
    x : float or array-like
        Point(s) at which to evaluate the posterior.
    bias : float
        Prior probability of the positive class.

    Returns
    -------
    float or ndarray
        Weighted class-1 density divided by the sum of both weighted
        densities, with the same shape as ``x``.
    """
    weighted_positive = bias*norm.pdf(x-2)
    weighted_negative = (1-bias)*norm.pdf(x)
    return weighted_positive/(weighted_negative+weighted_positive)

def create_data(n, horizon=6000):
    """Draw the next ``n`` points of the label-shift stream.

    The function keeps a running clock in the attribute ``create_data.count``
    (which the caller must initialise, e.g. to 0, before the first call). For
    each point the prior P(Y = 1) is interpolated linearly between 0.95 at
    ``count == 0`` and 0.05 at ``count == horizon``; the label is a Bernoulli
    draw with that prior and the feature is ``2 * Y`` plus standard normal
    noise. The clock advances by one per generated point, so consecutive
    calls continue the same stream.

    Parameters
    ----------
    n : int
        Number of points to generate.
    horizon : int, optional
        Clock value at which the prior reaches 0.05. Defaults to 6000, the
        value that was previously hard-coded.

    Returns
    -------
    dat : ndarray of shape (n, 1)
        Generated features.
    Y : ndarray of shape (n,)
        Generated labels, stored as 0.0 / 1.0.
    probs : ndarray of shape (n,)
        ``regression_function`` evaluated at each feature with the prior in
        effect when that point was drawn.
    """
    dat = np.zeros((n,1))
    Y = np.zeros((n))
    probs = np.zeros((n))
    for i in range(n):
        bias = (1-create_data.count/horizon)*0.95 + (create_data.count/horizon)*0.05
        Y[i] = int(np.random.random() <= bias)
        # Work with scalars: storing a length-1 array into a scalar slot is
        # deprecated since NumPy 1.25. The random draws are consumed in the
        # same order as before (one uniform, then one normal per point).
        dat[i,0] = 2*Y[i] + np.random.randn()
        probs[i] = regression_function(dat[i,0], bias)

        create_data.count += 1
    return dat, Y, probs

# Start the stream clock at zero; re-running this cell therefore regenerates
# the stream from the beginning of the drift schedule.
create_data.count = 0

# The three draws are made in order (training, calibration, test), so each
# split continues the stream where the previous one stopped.
(x_train, y_train, probs_train), (x_calib, y_calib, probs_calib), (x_test, y_test, probs_test) = [
    create_data(n_points) for n_points in (n_train, n_val, n_test)
]

In [None]:
# Base model: logistic regression fit on the training split only.
model = LogisticRegression()
model.fit(x_train, y_train)

# Fixed (offline) Platt scaling parameters, fit on the logits the base model
# assigns to the calibration split.
calib_logits = logit(model.predict_proba(x_calib)[:,1])
_, a_platt, b_platt = calibration.fit_platt_scaling_parameters(calib_logits, y_calib)

In [None]:
# Online Platt scaling is run over the calibration points followed by the test
# points, in stream order, using the base model's logits as input.
ons_stream_x = np.vstack((x_calib, x_test))
ons_stream_y = np.concatenate((y_calib, y_test))
ons_stream_logits = logit(model.predict_proba(ons_stream_x)[:,1])
pred_probs_ons, a_platt_ons, b_platt_ons = calibration.online_platt_scaling_newton(ons_stream_logits, ons_stream_y)

In [None]:
# Trajectories of the online Platt scaling parameters; the title reports the
# fixed Platt scaling parameters for comparison.
for ons_trajectory, legend_text in ((a_platt_ons, "a ONS"), (b_platt_ons, "b ONS")):
    plt.plot(ons_trajectory, label=legend_text)
plt.legend()
plt.title(f"a_fixed = {a_platt[0]:.2}, b_fixed = {b_platt[0]:.2}")

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(12, 10))
x_vals = np.arange(-3, 5, 0.1).reshape((-1,1))


def _advance_color_cycle(axis, count=1):
    """Plot ``count`` empty lines on ``axis`` to skip entries of its colour cycle."""
    for _ in range(count):
        axis.plot([], [])


# One entry per panel: (axes, stream time t1 at which curves are evaluated, title).
# t1 = 0 is the training panel; the other panels show a 500-point test window.
panel_specs = [
    (ax[0,0], 0, r'$t = 1$ to $t = 1000$ (training)'),
    (ax[0,1], 1500, r'$t = 1501$ to $t = 2000$'),
    (ax[1,0], 3500, r'$t = 3501$ to $t = 4000$'),
    (ax[1,1], 5500, r'$t = 5501$ to $t = 6000$'),
]

# The base model is fixed, so its curve is the same in every panel.
base_model_probs = model.predict_proba(x_vals)[:,1]

for panel_ax, t1, panel_title in panel_specs:
    # Same linear prior schedule as the data generator (0.95 -> 0.05 over 6000 steps).
    true_bias = (1-t1/6000)*0.95 + (t1/6000)*0.05
    true_probs = regression_function(x_vals, true_bias)

    # The empty plots only advance the colour cycle so that each curve keeps
    # the same colour in every panel.
    _advance_color_cycle(panel_ax, 4)
    # Labels are derived from t1 (they previously said "t = 1500" in the
    # t = 3500 and t = 5500 panels as well).
    panel_ax.plot(x_vals, true_probs, label = "True probabilities at t = {}".format(t1), linestyle='solid')
    _advance_color_cycle(panel_ax)
    panel_ax.plot(x_vals, base_model_probs, label = "Fixed base model", linestyle='dotted')

    if t1 == 0:
        # Training panel: no online Platt scaling curve, histogram of the training features.
        utils.normalized_hist(x_train, panel_ax)
    else:
        _advance_color_cycle(panel_ax)
        # The online parameter trajectories start after the 1000 training
        # points, hence the offset of 1000 from stream time.
        ops_index = t1-1000
        ops_probs = sigmoid(a_platt_ons[ops_index]*logit(base_model_probs) + b_platt_ons[ops_index])
        panel_ax.plot(x_vals, ops_probs, label = "Online Platt scaling (OPS) at t = {}".format(t1), linestyle='dashed')
        # x_test starts at stream time 1500 (1000 training + 500 calibration points).
        utils.normalized_hist(x_test[t1-1500:t1-1000], panel_ax)

    panel_ax.set_title(panel_title)

# Invisible full-figure axes that only carry the shared x/y labels.
fig.add_subplot(111, frameon=False)
plt.grid(False)
plt.tick_params(labelcolor='none', which='both', top=False, bottom=False, left=False, right=False)
plt.xlabel(r'Value of $x$', fontsize=22)
plt.ylabel(r'True/predicted   Pr$(Y=1 \mid X = x)$', fontsize=20)
# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs('results', exist_ok=True)
plt.savefig('results/label_shift_1d.pdf')

In [None]:
def print_vals(y_pred_probs, true_probs):
    """Print expected accuracy and calibration error as a LaTeX table fragment.

    Predictions are grouped with ``utils.bin_points`` using the distinct
    predicted values as bin edges. For every bin the absolute gap between the
    mean prediction and the mean true probability is accumulated, weighted by
    the bin's size, and the total is divided by the number of points. The
    accuracy is computed against ``true_probs`` rather than sampled labels:
    a point contributes ``true_probs`` when it is classified as 1 (prediction
    >= 0.5) and ``1 - true_probs`` otherwise.

    Parameters
    ----------
    y_pred_probs : ndarray
        Predicted probabilities of the positive class.
    true_probs : ndarray
        True probabilities of the positive class for the same points.

    Returns
    -------
    None
        The result is printed as ``"<accuracy in percent>\\% & <error>"``.

    Raises
    ------
    AssertionError
        If a bin is empty or the bins do not cover every point exactly once.
    """
    y_pred_classes = (y_pred_probs>=0.5).astype('int')
    # np.unique already returns its values sorted.
    bin_edges = np.unique(y_pred_probs)
    # NOTE(review): assumes utils.bin_points returns an array of integer bin
    # ids in range(len(bin_edges)); confirm against utils.
    bin_assignment = utils.bin_points(y_pred_probs, bin_edges)

    ece = 0
    tot_elem = 0
    for bin_number in range(len(bin_edges)):
        bin_idx = (bin_assignment == bin_number)
        # Count once, with NumPy rather than the Python builtin sum.
        n_elem = int(bin_idx.sum())
        assert(n_elem > 0), "This assert should pass by construction of the code"
        tot_elem += n_elem
        pi_pred = y_pred_probs[bin_idx].mean()
        pi_true = true_probs[bin_idx].mean()
        ece += n_elem*np.abs(pi_pred-pi_true)
    assert(tot_elem == y_pred_probs.size)
    ece /= y_pred_probs.size
    acc = np.sum(np.multiply(true_probs, y_pred_classes) + np.multiply(1-true_probs,1-y_pred_classes))/y_pred_probs.size    

    print("{:.2f}\\% & {:.2}".format(100*acc, ece))

In [None]:
# Fixed base model evaluated on the training split.
y_pred_probs = model.predict_proba(x_train)[:,1]
true_probs = probs_train
print_vals(y_pred_probs, true_probs)

# The base model never changes, so score the whole test stream once.
test_base_probs = model.predict_proba(x_test)[:,1]

for base_val in [0, 2000, 4000]:
    # 500-point window of the test stream starting at test index base_val.
    test_window = slice(base_val, base_val+500)
    # The online Platt scaling run covered the 500 calibration points before
    # the test points, so test index j maps to trajectory index j + 500.
    # NOTE(review): assumes the trajectories hold one entry per stream point.
    ops_window = slice(base_val+500, base_val+1000)

    # Fixed base model on this window.
    y_pred_probs = test_base_probs[test_window]
    true_probs = probs_test[test_window]
    print_vals(y_pred_probs, true_probs)

    # Online Platt scaling on this window. Both the slope and the intercept
    # are taken per time step; previously the intercept was the single value
    # b_platt_ons[base_val+500] applied to the whole window.
    y_pred_probs = sigmoid(np.multiply(
        a_platt_ons[ops_window], 
        logit(test_base_probs[test_window])) + b_platt_ons[ops_window])
    print_vals(y_pred_probs, true_probs)