<a href="https://colab.research.google.com/github/FatemehTahavori/CAPE/blob/master/calibration_isotonic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
import scipy as sp
import pandas as pd
from pathlib import Path 
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import roc_auc_score as AUC
from sklearn.isotonic import IsotonicRegression as IR

# Load the labels and the random-forest predicted probabilities.
import gdown

# Google Drive file ID of the predictions CSV. Named `file_id` (not `id`) so
# the builtin id() is not shadowed for the rest of the session.
file_id = "1d6CChySejVXeGpF2B335tmb_W23Xkt38"

# gdown saves the file under its Drive name in the current working directory
# and returns the path it wrote. Using that path keeps the script working
# outside Colab; the original Colab location is kept as a fallback for gdown
# versions that return None.
downloaded_path = gdown.download(id=file_id, quiet=False)
input_file = downloaded_path or '/content/predictions_random_forest_Adult_dataset.csv'

y_and_p = np.loadtxt(input_file, delimiter=',')

# Column 0 is the label, column 1 is the predicted probability.
y = y_and_p[:, 0]
p = y_and_p[:, 1]

# Recode any -1 label as 0 so the negative class is 0.
y[y == -1] = 0

# Split the data into a training half and a test half.
# NOTE(review): the test half starts at train_indx_end + 1, so the sample at
# index train_indx_end belongs to neither half. For an odd number of samples
# this makes both halves the same length. Kept as-is because changing it
# alters the sizes of the arrays used further down -- confirm whether the gap
# is intentional.
train_indx_end = int(y.shape[0] / 2)
test_index_start = train_indx_end + 1

y_train = y[0:train_indx_end]
y_test = y[test_index_start:]
p_train = p[0:train_indx_end]
p_test = p[test_index_start:]

# Isotonic regression: fit a monotone map from raw scores to labels on the
# training half, then apply it to the test scores. out_of_bounds='clip' maps
# test scores outside the fitted range to the nearest boundary value.
ir = IR(out_of_bounds='clip')
ir.fit(p_train, y_train)
p_calibrated = ir.transform(p_test)
# Defensive: treat any NaN calibrated probability as 0.
p_calibrated[np.isnan(p_calibrated)] = 0

# This calculates log_loss
def log_loss(actual, prediction):
    """Return the mean binary log loss (cross-entropy) of the predictions.

    Args:
        actual: Array-like of true binary labels (0 or 1).
        prediction: Array-like of predicted probabilities for label 1,
            with the same length as ``actual``.

    Returns:
        The average negative log-likelihood as a float.

    Raises:
        ValueError: If ``actual`` contains no samples.
    """
    actual = np.asarray(actual, dtype=float)
    if actual.size == 0:
        raise ValueError("log_loss requires at least one sample")

    # Clip probabilities away from exactly 0 and 1 so both logarithms
    # stay finite.
    epsilon = 1e-15
    prediction = np.clip(
        np.asarray(prediction, dtype=float), epsilon, 1 - epsilon
    )

    # NumPy functions are used directly: the scipy.maximum / scipy.log /
    # scipy.subtract aliases were deprecated and are no longer available in
    # current SciPy releases.
    log_likelihood = (
        actual * np.log(prediction)
        + (1 - actual) * np.log(1 - prediction)
    )
    return float(-np.mean(log_likelihood))

# Score the raw and the calibrated test probabilities with the same metrics:
# AUC, accuracy, log loss and Brier score.
auc = AUC(y_test, p_test)
auc_calibrated = AUC(y_test, p_calibrated)

# Accuracy needs hard labels. np.round rounds halves to the nearest even
# value, so a probability of exactly 0.5 is classed as 0.
accuracy = accuracy_score(y_test, np.round(p_test))
accuracy_calibrated = accuracy_score(y_test, np.round(p_calibrated))

log_loss_score = log_loss(y_test, p_test)
log_loss_calibrated = log_loss(y_test, p_calibrated)

brier_score = brier_score_loss(y_test, p_test)
brier_score_calibrated = brier_score_loss(y_test, p_calibrated)

print("AUC - before/after:        ", auc, "/", auc_calibrated)
print("Accuracy - before/after:   ", accuracy, "/", accuracy_calibrated)
print("Log loss - before/after:   ", log_loss_score, "/", log_loss_calibrated)
print("Brier score - before/after:", brier_score, "/", brier_score_calibrated)


# Export the train/test scores and labels to CSV. Each column is wrapped in a
# Series so that halves of different length are padded with NaN instead of
# making the DataFrame constructor raise; with equal lengths the output is
# the same as building the frame from the raw arrays.
df = pd.DataFrame({
    'p_train': pd.Series(p_train),
    'y_train': pd.Series(y_train),
    'p_test': pd.Series(p_test),
    'y_test': pd.Series(y_test),
})
filepath = Path('output/ir_python.csv')
# Create the output directory if it does not exist yet.
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath, index=False)

"""
AUC - before/after:         0.8781398450768403 / 0.8777692307692309
Accuracy - before/after:    0.8477886977886978 / 0.845945945945946
Log loss - before/after:    0.6305257728714894 / 0.35572214331077795
Brier score - before/after: 0.11168379606879607 / 0.11009118568385362
"""

Downloading...
From: https://drive.google.com/uc?id=1d6CChySejVXeGpF2B335tmb_W23Xkt38
To: /content/predictions_random_forest_Adult_dataset.csv
100%|██████████| 110k/110k [00:00<00:00, 47.6MB/s]

AUC - before/after:         0.8781398450768403 / 0.8777692307692309
Accuracy - before/after:    0.8477886977886978 / 0.845945945945946
Log loss - before/after:    0.6305257728714894 / 0.35572214331077795
Brier score - before/after: 0.11168379606879607 / 0.11009118568385362





'\nAUC - before/after:         0.8781398450768403 / 0.8777692307692309\nAccuracy - before/after:    0.8477886977886978 / 0.845945945945946\nLog loss - before/after:    0.6305257728714894 / 0.35572214331077795\nBrier score - before/after: 0.11168379606879607 / 0.11009118568385362\n'