<a href="https://colab.research.google.com/github/2021aim1014/Master-Thesis-Project/blob/main/Base_Paper_1_calibration_via_confusion_matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the reference repository, keep only its `cifar10h` data folder, and
# create the output directory. Paths are relative to the working directory so
# the cell is not tied to Colab's /content, and -f / -p keep it re-runnable.
!git clone https://github.com/GavinKerrigan/conf_matrix_and_calibration.git
!mv conf_matrix_and_calibration/cifar10h .
!rm -rf conf_matrix_and_calibration
!mkdir -p output

Cloning into 'conf_matrix_and_calibration'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 47 (delta 14), reused 21 (delta 3), pack-reused 0[K
Unpacking objects: 100% (47/47), done.


In [None]:
# %pip installs into the running kernel's environment; versions are pinned to
# the ones this notebook was originally executed with (see the install log).
%pip install -q attrdict==2.0.1 deepdish==0.3.7 pyro-ppl==1.8.3 uncertainty-calibration==0.1.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting attrdict
  Downloading attrdict-2.0.1-py2.py3-none-any.whl (9.9 kB)
Collecting deepdish
  Downloading deepdish-0.3.7-py2.py3-none-any.whl (37 kB)
Collecting pyro-ppl
  Downloading pyro_ppl-1.8.3-py3-none-any.whl (727 kB)
[K     |████████████████████████████████| 727 kB 16.2 MB/s 
[?25hCollecting uncertainty-calibration
  Downloading uncertainty-calibration-0.1.4.tar.gz (15 kB)
Collecting pyro-api>=0.1.1
  Downloading pyro_api-0.1.2-py3-none-any.whl (11 kB)
Collecting parameterized
  Downloading parameterized-0.8.1-py2.py3-none-any.whl (26 kB)
Building wheels for collected packages: uncertainty-calibration
  Building wheel for uncertainty-calibration (setup.py) ... [?25l[?25hdone
  Created wheel for uncertainty-calibration: filename=uncertainty_calibration-0.1.4-py3-none-any.whl size=14223 sha256=8e969912e47298fb8434c1d924d8a243e5469a08f7abeb646199ae5c428b9918
  Stored in di

In [None]:
from tqdm.auto import tqdm
import torch, csv, os, numpy as np
from torch import nn, optim
from sklearn.model_selection import train_test_split

In [None]:
import warnings
from sklearn.metrics import confusion_matrix
from attrdict import AttrDict
from sklearn.cluster import KMeans 

In [None]:
class BaseCalibrator:
    """ Common interface for calibrators.

    Both `fit` and `calibrate` raise NotImplementedError here and are meant
    to be overridden by subclasses.
    """

    def __init__(self):
        # Number of classes; unknown until a subclass sets it.
        self.n_classes = None

    def fit(self, logits, y):
        """ Fit the calibrator from `logits` and labels `y` (not implemented here).
        """
        raise NotImplementedError()

    def calibrate(self, probs):
        """ Map `probs` to calibrated probabilities (not implemented here).
        """
        raise NotImplementedError()

In [None]:
class TSCalibrator(BaseCalibrator):
    """ Maximum likelihood temperature scaling (Guo et al., 2017)

    A single scalar temperature T is fitted by minimising the cross-entropy
    between `logits / T` and hard labels. `calibrate` then applies the fitted
    temperature directly to probability vectors.
    """

    def __init__(self, temperature=1.):
        super().__init__()
        self.temperature = temperature  # Starting value; replaced by the fitted value in fit()

        self.loss_trace = None  # List of per-step loss values, filled in by fit()

    def fit(self, logits, y):
        """ Fits temperature scaling using hard labels.

        Args:
            logits: numpy array of shape (n_samples, n_classes).
            y: numpy array of integer class labels, one per sample.

        Side effects:
            Sets `self.n_classes`, `self.temperature` (a Python float) and
            `self.loss_trace`. Emits a warning when the step budget is used up
            before the gradient falls below the tolerance.
        """
        # Pre-processing
        self.n_classes = logits.shape[1]
        _model_logits = torch.from_numpy(logits)
        # CrossEntropyLoss expects int64 class indices; numpy's default int can be int32.
        _y = torch.from_numpy(y).long()
        # float() so that an integer starting temperature can still require grad.
        _temperature = torch.tensor(float(self.temperature), requires_grad=True)

        # Optimization parameters
        nll = nn.CrossEntropyLoss()  # Supervised hard-label loss
        num_steps = 7500
        learning_rate = 0.05
        grad_tol = 1e-3  # Gradient tolerance for early stopping
        min_temp, max_temp = 1e-2, 1e4  # Upper / lower bounds on temperature

        optimizer = optim.Adam([_temperature], lr=learning_rate)

        loss_trace = []  # Track loss over iterations
        converged = False
        for _ in range(num_steps):
            optimizer.zero_grad()
            loss = nll(_model_logits / _temperature, _y)
            loss.backward()
            optimizer.step()
            loss_trace.append(loss.item())

            # Project back into the allowed range without recording the op in autograd.
            with torch.no_grad():
                _temperature.clamp_(min=min_temp, max=max_temp)

            # Gradient magnitude is read with torch ops rather than np.abs on a tensor.
            if _temperature.grad.abs().item() < grad_tol:
                converged = True
                break

        if not converged:
            warnings.warn('Maximum number of steps reached -- may not have converged (TS)')

        self.loss_trace = loss_trace
        self.temperature = _temperature.item()

    def calibrate(self, probs):
        """ Applies the fitted temperature to probabilities.

        Args:
            probs: numpy array of shape (n_samples, n_classes).

        Returns:
            A new array where each row is `probs ** (1 / T)` renormalised to
            sum to one. The input array is not modified.
        """
        calibrated_probs = probs ** (1. / self.temperature)  # Temper
        calibrated_probs /= np.sum(calibrated_probs, axis=1, keepdims=True)  # Normalize
        return calibrated_probs

In [None]:
class OracleCombiner:
    """ P+L combiner: merges model probabilities with a human hard label.

    The human is summarised by a confusion matrix estimated from labelled data,
    and the model probabilities are passed through a calibrator before the two
    are multiplied together and renormalised.

    Note: `calibration_method` is stored on the instance, but the calibrator is
    always a `TSCalibrator`; extra keyword arguments are accepted and ignored.
    """
    def __init__(self, calibration_method='temperature scaling', **kwargs):
        # Bookkeeping fields; only n_cls is assigned by the methods of this class.
        self.n_train_u = None  # Amount of unlabeled training data
        self.n_train_l = None  # Amount of labeled training data
        self.n_cls = None  # Number of classes

        self.eps = 1e-50  # Floor used when clipping counts and probabilities
        self.use_cv = False

        self.confusion_matrix = None  # conf[i, j] is assumed to be P(h = i | Y = j)
        self.calibration_method = calibration_method
        self.calibrator = TSCalibrator()

    def calibrate(self, model_probs):
        """ Delegates to the underlying calibrator.
        """
        return self.calibrator.calibrate(model_probs)

    def fit(self, model_probs, y_h, y_true):
        """ Estimates the human confusion matrix and fits the calibrator.

        Args:
            model_probs: Array of model probabilities ; shape (n_samples, n_classes)
            y_h: Human hard labels, one per sample
            y_true: Ground-truth labels, one per sample
        """
        self.n_cls = model_probs.shape[1]
        class_ids = np.arange(self.n_cls)

        # sklearn returns counts with rows = Y and columns = h; transposing gives
        # rows = h and columns = Y.
        counts = 1. * confusion_matrix(y_true, y_h, labels=class_ids)
        counts = np.clip(counts.T, self.eps, None)
        # Dividing each column by its total turns entry [i, j] into P(h = i | Y = j).
        self.confusion_matrix = counts / np.sum(counts, axis=0, keepdims=True)

        self.fit_calibrator(model_probs, y_true)

    def fit_calibrator(self, model_probs, y_true):
        """ Fits the calibrator on log-probabilities (clipped away from zero first).
        """
        safe_probs = np.clip(model_probs, self.eps, 1)
        self.calibrator.fit(np.log(safe_probs), y_true)

    def combine_proba(self, model_probs, y_h):
        """ Posterior over classes given the model output and the human label.

        Args:
            model_probs: Array of model probabilities ; shape (n_samples, n_classes)
            y_h: Array of hard labels ; shape (n_samples,)

        Returns:
            Normalized posterior probabilities P(Y | m, h). Entry [i, j] is P(Y = j | h_i, m_i)
        """
        assert model_probs.shape[0] == y_h.size, 'Size mismatch between model probs and human labels'
        assert model_probs.shape[1] == self.n_cls, 'Size mismatch between model probs and number of classes'

        n_samples = model_probs.shape[0]
        calibrated = self.calibrate(model_probs)

        # Row y_h[i] of the confusion matrix holds P(h = y_h[i] | Y = j) for every j.
        human_likelihood = self.confusion_matrix[np.ravel(y_h)]
        y_comb = np.empty((n_samples, self.n_cls))
        y_comb[...] = calibrated * human_likelihood

        # Rows that are numerically all zero fall back to a uniform distribution.
        zero_rows = np.all(np.isclose(y_comb, 0), axis=1)
        y_comb[zero_rows] = 1. / self.n_cls

        row_totals = np.sum(y_comb, axis=1, keepdims=True)
        assert np.all(np.isfinite(row_totals))
        assert np.all(row_totals > 0)
        return y_comb / row_totals

    def combine(self, model_probs, y_h):
        """ Hard-label version of `combine_proba`: the most probable class per sample.
        """
        return np.argmax(self.combine_proba(model_probs, y_h), axis=1)

In [None]:
def get_acc(y_pred, y_true):
    """ Computes the accuracy of predictions.

    Args:
        y_pred: Either hard predictions of shape (n_samples,), or a matrix of
            scores (e.g. probabilities) of shape (n_samples, n_classes), in
            which case the arg-max of each row is used as the prediction.
        y_true: True labels of shape (n_samples,).

    Returns:
        Fraction of samples whose prediction equals the true label.

    Raises:
        ValueError: If y_pred is neither 1D nor 2D.
    """
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    if y_pred.ndim == 1:
        return np.mean(y_pred == y_true)
    if y_pred.ndim == 2:
        # Compare the arg-max class with the labels; y_true must not be passed
        # to np.mean as a second positional argument (that slot is `axis`).
        return np.mean(np.argmax(y_pred, axis=1) == y_true)
    raise ValueError(f'y_pred must be 1D or 2D, got {y_pred.ndim}D')

In [None]:
def load_CIFAR10H(model_name):
    """ Reads human label counts, model probabilities and true labels from CSV.

    Two column layouts are handled:
      * 'r_low_acc' reads 'cifar10h/human_model_truth_cifar10h.csv' with
        columns [10 human counts | 10 model probs | ... | label], where the
        label is 1-based and is shifted to be zero-indexed.
      * any other name reads 'cifar10h/<model_name>.csv' with columns
        [label | 10 human counts | model probs ...].

    Returns:
        Tuple (human_counts, model_probs, true_labels); true_labels is an
        integer array.
    """
    uses_legacy_layout = model_name == 'r_low_acc'

    if uses_legacy_layout:
        data_path = 'cifar10h/human_model_truth_cifar10h.csv'
    else:
        data_path = f'cifar10h/{model_name}.csv'
    data = np.genfromtxt(data_path, delimiter=',')

    if uses_legacy_layout:
        human_counts, model_probs = data[:, :10], data[:, 10:20]
        true_labels = data[:, -1]
        true_labels -= 1  # labels are stored as 1-10 in this file
    else:
        true_labels = data[:, 0]
        human_counts, model_probs = data[:, 1:11], data[:, 11:]

    return human_counts, model_probs, true_labels.astype(int)

In [None]:
def simulate_single_human(human_counts, seed=0):
    """ Draws one human label per row from a matrix of per-class label counts.

    For each row, the counts are expanded into a list of class indices, the
    list is shuffled with a generator seeded by `seed`, and the first entry
    is taken as that row's label.

    Args:
        human_counts: Array of shape (n_rows, n_classes) with label counts.
        seed: Seed for numpy's default_rng.

    Returns:
        Integer array of shape (n_rows,).
    """
    rng = np.random.default_rng(seed)

    n_rows, n_classes = human_counts.shape[0], human_counts.shape[1]
    # Every row is truncated to the size of the least-annotated row.
    n_keep = int(min(np.sum(human_counts, axis=1)))

    shuffled_labels = np.empty(shape=(n_rows, n_keep))
    for row in range(n_rows):
        pool = [cls for cls in range(n_classes) for _ in range(int(human_counts[row, cls]))]
        rng.shuffle(pool)
        shuffled_labels[row, :] = pool[:n_keep]

    first_label = shuffled_labels[:, 0]
    return first_label.astype(int)

In [None]:
# Module-level store of per-class accuracies, keyed by '<model>', 'human_<model>'
# and 'comb_<model>'. Filled in by _run_experiment.
class_wise_acc = {}

def _run_experiment(model_name, y_h=None, model_probs=None, y_true=None, **kwargs):
    """ Runs repeated train/test splits for one model and writes accuracies to CSV.

    For every run the data is split, human and model accuracy are measured on
    the test part, and for every calibration method an OracleCombiner is fitted
    on the training part and scored on the test part.

    Keyword options (with defaults): seed (0), n_runs (25), test_size (0.3),
    calibration_methods (['none']), output_file_acc ('./acc.csv').

    Side effects:
        Writes one CSV row per run to `output_file_acc` and overwrites the
        entries for this model in the module-level `class_wise_acc`.
    """
    seed = kwargs.get('seed', 0)
    n_runs = kwargs.get('n_runs', 25)
    test_size = kwargs.get('test_size', 0.3)
    calibration_methods = kwargs.get('calibration_methods', ['none'])
    output_file_acc = kwargs.get('output_file_acc', './acc.csv')

    rows = []
    for run_idx in range(n_runs):
        # NOTE(review): random_state is run_idx * seed, so run 0 always uses
        # random_state=0 and seed=0 gives every run the same split.
        split = train_test_split(
            y_h, model_probs, y_true, test_size=test_size, random_state=run_idx * seed)
        y_h_tr, y_h_te, probs_tr, probs_te, y_true_tr, y_true_te = split

        model_pred_te = np.argmax(probs_te, axis=1)
        row = [get_acc(y_h_te, y_true_te), get_acc(model_pred_te, y_true_te)]

        # NOTE(review): these entries are overwritten on every run (and, for the
        # combined model, on every calibration method), so only the last survives.
        class_wise_acc[model_name] = print_classwise_accuracy(y_true_te, model_pred_te)
        class_wise_acc['human_'+model_name] = print_classwise_accuracy(y_true_te, y_h_te)

        for method in calibration_methods:
            combiner = OracleCombiner(calibration_method=method)
            combiner.fit(probs_tr, y_h_tr, y_true_tr)

            y_comb_te = combiner.combine(probs_te, y_h_te)
            row.append(get_acc(y_comb_te, y_true_te))
            class_wise_acc["comb_"+model_name] = print_classwise_accuracy(y_true_te, y_comb_te)

        rows.append(row)

    # Save data to CSV
    header = ['human', 'model']
    header.extend(f'comb {method}' for method in calibration_methods)
    with open(output_file_acc, 'w', newline='') as f:
        out = csv.writer(f)
        out.writerow(header)
        out.writerows(rows)

In [None]:
def write_classwise_accuray_to_file(out_dir=None, accuracies=None):
    """ Writes per-class accuracies to '<out_dir>Classwise_Accuracy.csv'.

    Args:
        out_dir: Path prefix the file name is appended to (so it should end
            with a path separator). Defaults to the module-level `out_fpath`.
        accuracies: Mapping from a row name to a dict of per-class accuracies.
            Defaults to the module-level `class_wise_acc`. The values of each
            inner dict are written in iteration order, which is assumed to
            match the header's class order.

    One row is written per entry, however many entries there are.
    """
    if accuracies is None:
        accuracies = class_wise_acc
    if out_dir is None:
        out_dir = out_fpath

    header_acc = ('Model Name', 'plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    # Plain lists instead of a fixed-size numpy reshape, so the row count is not hard-coded.
    rows = [[model_name] + list(acc.values()) for model_name, acc in accuracies.items()]

    output_file_acc = out_dir + "Classwise_Accuracy.csv"
    with open(output_file_acc, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_acc)
        writer.writerows(rows)

In [None]:
def run_experiment_cifar10(out_fpath=None, experiment_args=None, seed=0):
    """ Runs the combination experiment for each CIFAR-10H model.

    Args:
        out_fpath: Path prefix for output files; file names are appended to it
            directly, so it should end with a path separator.
        experiment_args: Options forwarded to `_run_experiment`. The dict is
            copied, so the caller's object is left untouched. None means no
            extra options.
        seed: Seed used to simulate the single human labeler.

    Side effects:
        Writes '<out_fpath><model>_accuracy.csv' per model (asserting that the
        file does not exist yet) and then calls
        `write_classwise_accuray_to_file()`.
    """
    model_names = ['r_low_acc',
                   'resnet-110', 'preresnet-110', 'densenet-bc-L190-k40'
                   ]

    # Private copy: adding 'output_file_acc' below must not leak into the caller's dict.
    base_args = dict(experiment_args) if experiment_args else {}

    for model_name in tqdm(model_names, desc='Models', leave=True):
        # Specify output files
        output_file_acc = out_fpath + f'{model_name}_accuracy.csv'
        assert not os.path.exists(output_file_acc), 'Output filepath already exists'
        run_args = dict(base_args, output_file_acc=output_file_acc)

        # Load data
        human_counts, model_probs, y_true = load_CIFAR10H(model_name)
        y_h = simulate_single_human(human_counts, seed=seed)

        _run_experiment(model_name, y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)

    # NOTE(review): called without arguments, so the class-wise file location is
    # decided by that function, not by this function's out_fpath parameter.
    write_classwise_accuray_to_file()

In [None]:
def print_classwise_accuracy(y_true, y_pred):
    """ Computes the accuracy (in percent) separately for each CIFAR-10 class.

    Args:
        y_true: Iterable of true labels, integers indexing into the class tuple.
        y_pred: Iterable of predicted labels, aligned with y_true.

    Returns:
        Dict mapping class name to accuracy in percent, rounded to one decimal.
        A class with no samples in y_true gets NaN, since its accuracy is
        undefined (this avoids a ZeroDivisionError).
    """
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

    # Per-class tallies of correct predictions and of samples seen.
    correct_pred = dict.fromkeys(classes, 0)
    total_pred = dict.fromkeys(classes, 0)

    for label, prediction in zip(y_true, y_pred):
        classname = classes[label]
        total_pred[classname] += 1
        if label == prediction:
            correct_pred[classname] += 1

    result = {}
    for classname in classes:
        if total_pred[classname] == 0:
            result[classname] = float('nan')
        else:
            accuracy = 100 * float(correct_pred[classname]) / total_pred[classname]
            result[classname] = round(accuracy, 1)
    return result

In [None]:
# Experiment configuration: one seed shared by numpy and torch, plus the
# options that are forwarded to the experiment runner.
seed = 9658
np.random.seed(seed)
torch.manual_seed(seed)

calibration_methods = ['temperature scaling']

args = dict(
    n_runs=1,
    test_size=0.3,
    calibration_methods=calibration_methods,
    seed=seed,
)


In [None]:
# Start from an empty output directory and drop Colab's bundled sample data.
# -f / -p keep the cell from erroring when a directory is missing or present.
!rm -rf output
!mkdir -p output
!rm -rf sample_data

In [None]:
# Output location. File names are appended to this string directly, so the
# trailing slash is required. write_classwise_accuray_to_file also reads this
# module-level name.
out_fpath = './output/'
# Runs every model with the options in `args` and writes the CSV results.
run_experiment_cifar10(out_fpath=out_fpath, experiment_args=args, seed=seed)

Models:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
import pandas as pd

# (result-file stem, display name) pairs, in the order the rows should appear.
# NOTE(review): display names are kept exactly as in the original cell; the
# mapping from file stem to name (e.g. 'resnet-110' -> 'ResNet-164') should be
# confirmed against the paper.
result_files = [
    ('r_low_acc', 'ResNet-110'),
    ('resnet-110', 'ResNet-164'),
    ('preresnet-110', 'PreResNet-164'),
    ('densenet-bc-L190-k40', 'DenseNet-BC'),
]

# Read relative to out_fpath (the directory the experiment wrote to) rather
# than a hard-coded /content path. The name is attached per file so this also
# works when a file holds more than one run.
frames = []
for file_stem, display_name in result_files:
    frame = pd.read_csv(f'{out_fpath}{file_stem}_accuracy.csv')
    frame['Model Name'] = display_name
    frames.append(frame)

df_all_cols = pd.concat(frames, axis = 0)
df_all_cols = df_all_cols[['Model Name', 'human', 'model', 'comb temperature scaling']]
df_all_cols.to_csv(f'{out_fpath}paper_results.csv')

In [None]:
# Remove the per-model CSVs now that they are merged into paper_results.csv.
# Relative paths match the ./output/ directory used above; -f avoids an error
# when the cell is re-run.
!rm -f output/r_low_acc_accuracy.csv
!rm -f output/resnet-110_accuracy.csv
!rm -f output/preresnet-110_accuracy.csv
!rm -f output/densenet-bc-L190-k40_accuracy.csv