In [150]:
# So we can load files from other sub-directories, e.g. datasets.
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import torch
import utils.utils as utils
import json
import numpy as np
from scipy.special import softmax

import calibration as cal


In [17]:
# Report whether PyTorch can see a CUDA device, then ask PyTorch to release
# the GPU memory its caching allocator is holding but no longer using.
print(f'cuda is available: {torch.cuda.is_available()}')
torch.cuda.empty_cache()

cuda is available: True


In [190]:
def load_model(config_path, checkpoint_path):
    """Build the network described by a JSON config and restore a checkpoint.

    Args:
        config_path: Path to a JSON file that provides at least the 'model'
            and 'num_classes' entries.
        checkpoint_path: Checkpoint location handed to ``utils.load_ckp``.

    Returns:
        The network object after ``utils.load_ckp`` has been applied to it.
    """
    with open(config_path) as config_file:
        model_config = json.load(config_file)

    model = utils.initialize(model_config['model'])
    # The order below (device move, new head, checkpoint load) is kept exactly
    # as written, because the project helpers may depend on it.
    if USE_CUDA:
        model = model.cuda()
    model.new_last_layer(model_config['num_classes'])
    utils.load_ckp(checkpoint_path, model)
    return model

# Load datasets.
def load_test_dataset(config, idx):
    """Create the dataset and DataLoader for one entry of config['test_datasets'].

    The dataset name is printed as a progress message. The loader uses the
    notebook-level ``batch_size`` and ``num_workers`` settings and never
    shuffles, so example order is stable across calls.

    Args:
        config: Parsed experiment config containing 'test_datasets' and, for
            entries without their own 'transforms', 'default_test_transforms'.
        idx: Index into config['test_datasets'].

    Returns:
        (test_data, test_loader): the dataset built by ``utils.init_dataset``
        and a ``torch.utils.data.DataLoader`` wrapping it.
    """
    # Work on a shallow copy so that filling in the default transforms does
    # not silently write back into the caller's config dict.
    test_config = dict(config['test_datasets'][idx])
    print(test_config['name'])
    if 'transforms' not in test_config:
        test_config['transforms'] = config['default_test_transforms']
    test_data = utils.init_dataset(test_config)
    test_loader = torch.utils.data.DataLoader(
        test_data, batch_size=batch_size,
        shuffle=False, num_workers=num_workers)
    return test_data, test_loader

def get_outputs_labels(net, loader, device=None):
    """Run ``net`` over every batch of ``loader`` and collect outputs and labels.

    Args:
        net: A ``torch.nn.Module``. It is moved to ``device`` and switched to
            eval mode (both are lasting side effects on the module).
        loader: Iterable yielding ``(images, labels)`` tensor pairs.
        device: Device to run inference on. When omitted, CUDA is used if it
            is available (the previous hard-coded behaviour); otherwise the
            function falls back to the CPU instead of crashing.

    Returns:
        (outputs, labels): NumPy arrays holding the per-batch network outputs
        and labels concatenated along the first axis, in loader order.

    Raises:
        ValueError: If ``loader`` yields no batches (``np.concatenate`` cannot
            join an empty list).
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net.to(device)
    net.eval()
    outputs_list, labels_list = [], []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            outputs_list.append(outputs.detach().cpu().numpy())
            labels_list.append(labels.detach().cpu().numpy())
    outputs = np.concatenate(outputs_list)
    labels = np.concatenate(labels_list)
    return outputs, labels

def get_adjusted_probs(probs, labels):
    """Calibrate each model's outputs with a per-model Platt top-label calibrator.

    For every model a fresh ``cal.PlattTopCalibrator`` is trained on that
    model's outputs and labels for the FIRST loader (index 0) and then applied
    to the model's outputs on every loader.

    The number of models and loaders is taken from ``probs`` itself; earlier
    this function read notebook-level ``M`` and ``L``, which are only defined
    as locals of ``combined_calibration_accs``.

    Args:
        probs: Nested list indexed ``[model][loader]`` of probability arrays.
        labels: Nested list indexed ``[model][loader]`` of label arrays; only
            ``labels[m][0]`` is read.

    Returns:
        Nested list indexed ``[model][loader]`` holding whatever
        ``calibrator.calibrate`` returns for the corresponding ``probs`` entry
        (presumably the calibrated top-label confidence per example — confirm
        against the calibration library).
    """
    adjusted_probs = []
    for m in range(len(probs)):
        # NOTE(review): the positional 0 is passed as the calibrator's first
        # constructor argument; confirm its meaning in the library docs.
        calibrator = cal.PlattTopCalibrator(0, num_bins=10)
        calibrator.train_calibration(probs[m][0], labels[m][0])
        adjusted_probs.append(
            [calibrator.calibrate(loader_probs) for loader_probs in probs[m]])
    return adjusted_probs

def get_acc(preds, labels):
    """Return the fraction of positions where ``preds`` equals ``labels``."""
    is_correct = preds == labels
    return np.mean(is_correct)

def make_none_list(rs, cs):
    """Return an ``rs`` x ``cs`` nested list filled with None.

    Every row is a distinct list object, so assigning into one row never
    affects another.
    """
    grid = []
    for _ in range(rs):
        grid.append([None] * cs)
    return grid

def combine_confs_preds(confs, preds):
    """For each example, return the prediction of the most confident model.

    Args:
        confs: Per-model confidences, indexable as ``[model][example]``.
        preds: Per-model predictions, laid out the same way as ``confs``.

    Returns:
        Array with one entry per example: the prediction taken from the model
        whose confidence is highest for that example (ties go to the model
        with the lowest index, as ``np.argmax`` returns the first maximum).
    """
    conf_matrix = np.array(confs)
    pred_matrix = np.array(preds)
    # Row index = winning model per example; column index = the example itself.
    winners = np.argmax(conf_matrix, axis=0)
    example_ids = np.arange(conf_matrix.shape[1])
    return pred_matrix[winners, example_ids]


In [168]:
# Notebook-level settings that the helper functions read as globals.
# load_model() moves the freshly initialised network to the GPU when USE_CUDA is true.
USE_CUDA = True
# NOTE(review): `cuda` is not referenced by any cell visible here — confirm it is still needed.
cuda = torch.device('cuda') 
# Batch size and worker count used by load_test_dataset() when building DataLoaders.
batch_size, num_workers = 32, 16


In [198]:
def combined_calibration_accs(
    config_paths, checkpoint_paths, model_names, loader_names, loader_indices):
    """Evaluate several models and a confidence-based combination of them.

    Each model is run on each test loader. Its softmax confidences are then
    calibrated by ``get_adjusted_probs``, and for every example the combined
    prediction is the prediction of the model with the highest calibrated
    confidence. Per-model and combined accuracies are printed and returned.

    Args:
        config_paths: One config JSON path per model. The test datasets for
            ALL models are taken from ``config_paths[0]``.
        checkpoint_paths: One checkpoint path per model, same order.
        model_names: Display name per model; its length sets the model count.
        loader_names: Display name per dataset; its length sets the dataset
            count.
        loader_indices: For each dataset, its index in the 'test_datasets'
            list of the first config.

    Returns:
        Tuple ``(stats, combined_preds, preds, confidences, probs,
        adjusted_probs, labels)``. ``stats`` is a nested list with one row per
        model plus a final row for the combination, and one column per
        dataset. ``combined_preds`` has one entry per dataset. The remaining
        items are nested lists indexed ``[model][dataset]``.

    Raises:
        AssertionError: If any model saw different ground-truth labels than
            the first model on some dataset (e.g. misaligned example order).
    """
    M, L = len(model_names), len(loader_names)
    models = [load_model(config_paths[m], checkpoint_paths[m]) for m in range(M)]

    # Dataset definitions come from the first model's config only.
    with open(config_paths[0]) as f:
        config = json.load(f)
    loaders = []
    for l in range(L):
        _, loader = load_test_dataset(config, idx=loader_indices[l])
        loaders.append(loader)

    logits, labels = make_none_list(M, L), make_none_list(M, L)
    probs = make_none_list(M, L)
    confidences, preds = make_none_list(M, L), make_none_list(M, L)
    for m in range(M):
        for l in range(L):
            logits[m][l], labels[m][l] = get_outputs_labels(models[m], loaders[l])
            probs[m][l] = softmax(logits[m][l], axis=-1)
            confidences[m][l] = np.max(probs[m][l], axis=1)
            preds[m][l] = np.argmax(probs[m][l], axis=-1)
            print(f'Ave {model_names[m]} confidence for {loader_names[l]}: {np.mean(confidences[m][l])}')

    # Sanity check on ground-truth labels (not predictions): every model must
    # have seen the same labels in the same order as the first model, because
    # all accuracies below are scored against labels[0]. Checking every model
    # (rather than only models 0 and 1) also makes a single-model call valid.
    for l in range(L):
        for m in range(1, M):
            assert np.array_equal(labels[0][l], labels[m][l]), (
                f'Labels for {loader_names[l]} differ between '
                f'{model_names[0]} and {model_names[m]}')

    adjusted_probs = get_adjusted_probs(probs, labels)
    combined_preds = [None] * L
    for l in range(L):
        cur_loader_adjusted_probs = [adjusted_probs[m][l] for m in range(M)]
        cur_loader_preds = [preds[m][l] for m in range(M)]
        combined_preds[l] = combine_confs_preds(cur_loader_adjusted_probs, cur_loader_preds)

    # Rows 0..M-1 hold per-model accuracies; row M holds the combination.
    stats = make_none_list(M+1, L)
    for l in range(L):
        print(f'Dataset {loader_names[l]}: ')
        for m in range(M):
            stats[m][l] = get_acc(preds[m][l], labels[0][l])
            print(f'Model {model_names[m]}: {stats[m][l]}')
        stats[M][l] = get_acc(combined_preds[l], labels[0][l])
        print('Combined ', stats[M][l])

    return stats, combined_preds, preds, confidences, probs, adjusted_probs, labels


In [199]:
# Experiment: combine a linear model and a fine-tuned model whose ID and OOD
# accuracies differ a lot.
# Datasets: CIFAR and Im-n-C10 (the loader named 'imnc' below).

# One config file per model, in the same order as model_names.
config_paths = [
    '../logs/imnet_cifar_all_linp_0d01/config.json',
    '../logs/imnet_cifar_all_ft_0d01/config.json'
]

# One checkpoint per model, in the same order as config_paths.
checkpoint_paths = [
    '../logs/imnet_cifar_all_linp_0d01/checkpoints/ckp_last',
    '../logs/imnet_cifar_all_ft_0d01/checkpoints/ckp_last'
]

# Display names used in the printed report.
model_names = ['lin_model', 'ft_model']
loader_names = ['cifar', 'imnc']
loader_indices = [0, 4]  # Indices into 'test_datasets' of the config file at config_paths[0].

# The returned tuple is bound to `_`; the accuracies are read from the printed report.
_ = combined_calibration_accs(config_paths, checkpoint_paths, model_names, loader_names, loader_indices)

cifar10-test
imnet-n-cifar
Ave lin_model confidence for cifar: 0.926066517829895
Ave lin_model confidence for imnc: 0.9317657947540283
Ave ft_model confidence for cifar: 0.988318145275116
Ave ft_model confidence for imnc: 0.9081357717514038
Dataset cifar: 
Model lin_model: 0.9052
Model ft_model: 0.966
Combined  0.9646
Dataset imnc: 
Model lin_model: 0.752
Model ft_model: 0.62
Combined  0.758


([[0.9052, 0.752], [0.966, 0.62], [0.9646, 0.758]],
 [array([3, 8, 8, ..., 5, 1, 7]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 9, 1, 9, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 3, 4,
         2, 3, 2, 3, 3, 3, 4, 2, 3, 6, 3, 3, 3, 3, 4, 3, 2, 3, 5, 4, 4, 3,
         3, 3, 2, 3, 3, 3, 3, 2, 3, 2, 3, 2, 4, 3, 2, 3, 3, 3, 6, 3, 3, 2,
         2, 3, 3, 4, 4, 2, 4, 5, 4, 3, 2, 2, 3, 2, 4, 2, 2, 4, 2, 4, 3, 4,
         4, 4, 4, 2, 4, 2, 0, 2, 4, 2, 2, 4, 2, 4, 4, 4, 2, 4, 4, 4, 4, 2,
         2, 2, 4, 0, 4, 2, 4, 2, 4, 5, 5, 2, 3, 5, 5, 5, 3, 5, 4, 5, 3, 5,
         4, 5,

In [201]:
# Experiment: combine the best linear model and the best fine-tuned model.
# Datasets: CIFAR and Im-n-C10 (the loader named 'imnc' below).

# One config file per model, in the same order as model_names.
config_paths = [
    '../logs/imnet_cifar_all_linp_0d003/config.json',
    '../logs/imnet_cifar_all_ft_0d003/config.json'
]

# One checkpoint per model, in the same order as config_paths.
checkpoint_paths = [
    '../logs/imnet_cifar_all_linp_0d003/checkpoints/ckp_last',
    '../logs/imnet_cifar_all_ft_0d003/checkpoints/ckp_last'
]

# Display names used in the printed report.
model_names = ['lin_model', 'ft_model']
loader_names = ['cifar', 'imnc']
loader_indices = [0, 4]  # Indices into 'test_datasets' of the config file at config_paths[0].

# The returned tuple is bound to `_`; the accuracies are read from the printed report.
_ = combined_calibration_accs(config_paths, checkpoint_paths, model_names, loader_names, loader_indices)

cifar10-test
imnet-n-cifar
Ave lin_model confidence for cifar: 0.9127064347267151
Ave lin_model confidence for imnc: 0.9176557660102844
Ave ft_model confidence for cifar: 0.9871745109558105
Ave ft_model confidence for imnc: 0.9197461605072021
Dataset cifar: 
Model lin_model: 0.9076
Model ft_model: 0.9652
Combined  0.9638
Dataset imnc: 
Model lin_model: 0.764
Model ft_model: 0.71
Combined  0.786


In [205]:
# Control experiment: combine two fine-tuned models with each other.
# Datasets: CIFAR and Im-n-C10 (the loader named 'imnc' below).

# One config file per model, in the same order as model_names.
config_paths = [
    '../logs/imnet_cifar_all_ft_0d001/config.json',
    '../logs/imnet_cifar_all_ft_0d003/config.json'
]

# One checkpoint per model, in the same order as config_paths.
checkpoint_paths = [
    '../logs/imnet_cifar_all_ft_0d001/checkpoints/ckp_last',
    '../logs/imnet_cifar_all_ft_0d003/checkpoints/ckp_last'
]

# Display names used in the printed report.
model_names = ['fine-tuned1', 'fine-tuned2']
loader_names = ['cifar', 'imnc']
loader_indices = [0, 4]  # Indices into 'test_datasets' of the config file at config_paths[0].

# The returned tuple is bound to `_`; the accuracies are read from the printed report.
_ = combined_calibration_accs(config_paths, checkpoint_paths, model_names, loader_names, loader_indices)

cifar10-test
imnet-n-cifar
Ave fine-tuned1 confidence for cifar: 0.9832788109779358
Ave fine-tuned1 confidence for imnc: 0.9021713733673096
Ave fine-tuned2 confidence for cifar: 0.9871745109558105
Ave fine-tuned2 confidence for imnc: 0.9197461605072021
Dataset cifar: 
Model fine-tuned1: 0.9598
Model fine-tuned2: 0.9652
Combined  0.9655
Dataset imnc: 
Model fine-tuned1: 0.686
Model fine-tuned2: 0.71
Combined  0.704


In [207]:
# Stress test: what happens when the fine-tuned model is the better one on
# both datasets?
# Datasets: CIFAR and CINIC.

# One config file per model, in the same order as model_names.
config_paths = [
    '../logs/imnet_cifar_all_linp_0d003/config.json',
    '../logs/imnet_cifar_all_ft_0d003/config.json'
]

# One checkpoint per model, in the same order as config_paths.
checkpoint_paths = [
    '../logs/imnet_cifar_all_linp_0d003/checkpoints/ckp_last',
    '../logs/imnet_cifar_all_ft_0d003/checkpoints/ckp_last'
]

# Display names used in the printed report.
model_names = ['lin_model', 'ft_model']
loader_names = ['cifar', 'cinic']
loader_indices = [0, 2]  # Indices into 'test_datasets' of the config file at config_paths[0].

# The returned tuple is bound to `_`; the accuracies are read from the printed report.
_ = combined_calibration_accs(config_paths, checkpoint_paths, model_names, loader_names, loader_indices)

cifar10-test
cinic-val
Ave lin_model confidence for cifar: 0.9127064347267151
Ave lin_model confidence for cinic: 0.8465512990951538
Ave ft_model confidence for cifar: 0.9871745109558105
Ave ft_model confidence for cinic: 0.9467272162437439
Dataset cifar: 
Model lin_model: 0.9076
Model ft_model: 0.9652
Combined  0.9638
Dataset cinic: 
Model lin_model: 0.7216285714285714
Model ft_model: 0.7908714285714286
Combined  0.7834428571428571


In [203]:
# Experiment: the same linear + fine-tuned combination on Entity 30
# (source and target splits).

# One config file per model, in the same order as model_names.
config_paths = [
    '../logs/entity30_moco_linprobe_0.003/config.json',
    '../logs/entity30_moco_ft_0.003/config.json'
]

# One checkpoint per model, in the same order as config_paths.
checkpoint_paths = [
    '../logs/entity30_moco_linprobe_0.003/checkpoints/ckp_last',
    '../logs/entity30_moco_ft_0.003/checkpoints/ckp_last'
]

# Display names used in the printed report.
model_names = ['lin_model', 'ft_model']
loader_names = ['e30_src', 'e30_trg']
loader_indices = [0, 1]  # Indices into 'test_datasets' of the config file at config_paths[0].

# The returned tuple is bound to `_`; the accuracies are read from the printed report.
_ = combined_calibration_accs(config_paths, checkpoint_paths, model_names, loader_names, loader_indices)

source_val_living
target_val_living
Ave lin_model confidence for e30_src: 0.3781175911426544
Ave lin_model confidence for e30_trg: 0.2673947513103485
Ave ft_model confidence for e30_src: 0.9443250298500061
Ave ft_model confidence for e30_trg: 0.8535376787185669
Dataset e30_src: 
Model lin_model: 0.789
Model ft_model: 0.8725
Combined  0.8803333333333333
Dataset e30_trg: 
Model lin_model: 0.6496666666666666
Model ft_model: 0.5401666666666667
Combined  0.6256666666666667


In [206]:
# Experiment: the same linear + fine-tuned combination on Living 17
# (source and target splits).

# One config file per model, in the same order as model_names.
config_paths = [
    '../logs/breeds_moco_linprobe_0.003/config.json',
    '../logs/breeds_moco_ft_0.003/config.json'
]

# One checkpoint per model, in the same order as config_paths.
checkpoint_paths = [
    '../logs/breeds_moco_linprobe_0.003/checkpoints/ckp_last',
    '../logs/breeds_moco_ft_0.003/checkpoints/ckp_last'
]

# Display names used in the printed report.
model_names = ['lin_model', 'ft_model']
loader_names = ['l17_src', 'l17_trg']
loader_indices = [0, 1]  # Indices into 'test_datasets' of the config file at config_paths[0].

# The returned tuple is bound to `_`; the accuracies are read from the printed report.
_ = combined_calibration_accs(config_paths, checkpoint_paths, model_names, loader_names, loader_indices)

source_val_living
target_val_living
Ave lin_model confidence for l17_src: 0.4781600534915924
Ave lin_model confidence for l17_trg: 0.3302820920944214
Ave ft_model confidence for l17_src: 0.969921886920929
Ave ft_model confidence for l17_trg: 0.9005916714668274
Dataset l17_src: 
Model lin_model: 0.9
Model ft_model: 0.9241176470588235
Combined  0.94
Dataset l17_trg: 
Model lin_model: 0.7341176470588235
Model ft_model: 0.6541176470588236
Combined  0.731764705882353


In [82]:
# Scratch check of NumPy paired integer-array indexing, the pattern used in
# combine_confs_preds(): entry k of the result is a[rows[k], cols[k]].
a = np.array([[1, 2],[3, 4],[5, 6]])
a[np.array([0, 0, 0]), np.array([0, 1, 0])]

array([1, 2, 1])

In [14]:
import calibration as cal

def combine_preds(model_probs, labels_id, labels_ood):
    """Compute per-model, per-domain confidences and predictions.

    This completes an unfinished draft that did not parse (empty loop body),
    referenced undefined per-model variables, and stored argmax results under
    ``*_confs`` names. The nested loop of the draft is implemented directly
    over ``model_probs``.

    Args:
        model_probs: Nested sequence indexed ``[model][domain]``; each entry
            is a 2-D array of class probabilities with classes on axis 1.
            (The draft's variable names suggest domain 0 is in-distribution
            and domain 1 is out-of-distribution — TODO confirm.)
        labels_id: Unused; kept so the signature stays unchanged.
        labels_ood: Unused; kept so the signature stays unchanged.

    Returns:
        (confs, preds): two nested lists indexed ``[model][domain]``.
        ``confs`` holds the maximum probability per example and ``preds``
        holds the index of that maximum per example.
    """
    confs, preds = [], []
    for domain_probs in model_probs:
        confs.append([np.max(p, axis=1) for p in domain_probs])
        preds.append([np.argmax(p, axis=1) for p in domain_probs])
    return confs, preds

In [22]:
# Display the config object for inspection.
# NOTE(review): in the cells visible here `config` is only assigned inside
# functions, so this cell presumably relies on a notebook-level `config` left
# over from another cell — confirm it survives Restart & Run All.
config

{'train_dataset': {'classname': 'torchvision.datasets.CIFAR10',
  'args': {'train': True,
   'download': False,
   'root': '/u/scr/ananya/cifar10_dataset'},
  'transforms': [{'classname': 'torchvision.transforms.Resize',
    'args': {'size': 224}},
   {'classname': 'torchvision.transforms.ToTensor'},
   {'classname': 'torchvision.transforms.Normalize',
    'args': {'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225]}}]},
 'default_test_transforms': [{'classname': 'torchvision.transforms.Resize',
   'args': {'size': [224, 224]}},
  {'classname': 'torchvision.transforms.ToTensor'},
  {'classname': 'torchvision.transforms.Normalize',
   'args': {'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225]}}],
 'test_datasets': [{'name': 'cifar10-test',
   'max_test_examples': 1000,
   'classname': 'torchvision.datasets.CIFAR10',
   'args': {'train': False,
    'download': False,
    'root': '/u/scr/ananya/cifar10_dataset'}},
  {'name': 'stl-test',
   'max_test_examples': 1000,
   '