# Calibration Measures

In [1]:
from glob import glob
import os
from tiltify.models.gaussian_nb_model import GaussianNBModel
from tiltify.models.sentence_bert import SentenceBert
from tiltify.models.binary_bert_model import BinaryBERTModel


results_folder = "/home/gebauer/Desktop/repo/tiltify/experiments/IWPE/results_2"

Environment Variables not found. Entering Test Mode!


In [2]:
from tiltify.data_structures.document_collection import DocumentCollection
from sklearn.model_selection import train_test_split

config = {
    "k_ranks": [5, 10, 25],
    "repitions": 2,
    "step_size": 2,
    "random_state": 1337
}

step_size = config["step_size"]
train_doc_sizes = [1]  # [size/10 for size in list(range(0, 10+step_size, step_size))][1:]
print(f"Training Sizes: {train_doc_sizes}")

document_collection = DocumentCollection.from_json_files()
doc_index = list(range(len(document_collection)))

train_docs, test_docs = train_test_split(
    doc_index, test_size=0.33, random_state=config["random_state"], shuffle=False)
train_docs = document_collection[train_docs]
test_set = document_collection[test_docs]
print(f"Corpus having: {len(test_docs)} Test Docs and {len(train_docs)} Train Docs.")

Training Sizes: [1]
Corpus having: 20 Test Docs and 40 Train Docs.


In [3]:
from tqdm import tqdm
import numpy as np

def eval_model(model, doc_set):
    print(f"Starting evaluation of {model.__class__.__name__}...")
    metrics_dict = {}
    

    all_logits = []
    all_labels = []
    for document in tqdm(doc_set):
        labels = model.preprocessor.label_retriever.retrieve_labels(document.blobs)
        labels = model.preprocessor.prepare_labels(labels)
        logits = model.predict(document)
        # log_based_preds = [logit > 0.5 for logit in logits]
        all_logits.append(logits)
        all_labels.append(labels)

    # metrics_dict["all_logits"] = all_logits
    # metrics_dict["all_labels"] = all_labels
    all_labels = sum(all_labels, [])
    all_logits = sum(all_logits, [])
    
    return all_logits, all_labels

# def brier_scores(true, preds, weights=None):
#     bs = np.subtract(preds, true)**2
#     if min(preds) < 0:
#         raise AttributeError(f"Some predictions are below zero: {min(preds)}")
    
#     if weights:
#         bs = np.matmul(bs, weights.T)
#     else:
#         bs = bs.mean()
#     return bs

def min_max_norm(values):
    max_val = 1
    min_val = -1
    new_vals = []
    for val in values:
        new_val = (val - min_val)/(max_val - min_val)
        new_vals.append(new_val)
    return new_vals

def norm_sim(values):
    new_vals = []
    for val in values:
        new_val = (val + 1)/2
        new_vals.append(new_val)
    return new_vals
    

In [5]:
from collections import defaultdict
from sklearn.metrics import brier_score_loss

all_models = [
    SentenceBert,
    BinaryBERTModel, 
    GaussianNBModel
    ]

metrics_dict = defaultdict(list)
for model_cls in all_models:
    relevant_path = os.path.join(results_folder, f"{model_cls.__name__}/Right*/[0-1]")
    all_paths = glob(relevant_path)
    for load_path in all_paths:
        label = load_path.split("/")[-2]
        model = model_cls.load(load_path, label=label)

        logits, labels = eval_model(model, test_set)
        
        save_prefix = f"{model_cls.__name__}__{label}"
        # metrics_dict[f"{save_prefix}__bier_score"].append(brier_scores(norm_labels, logits))
        positive_cnt = sum(labels)
        negative_cnt = len(labels) - positive_cnt
        
        weights = [positive_cnt if label > 0 else negative_cnt for label in labels]
        if isinstance(model, SentenceBert):
            metrics_dict[f"{save_prefix}__brier_score"].append(brier_score_loss(labels, norm_sim(logits), sample_weight=weights))
        else:
            metrics_dict[f"{save_prefix}__brier_score"].append(brier_score_loss(labels, logits, sample_weight=weights))

Starting evaluation of SentenceBert...


100%|██████████| 20/20 [00:58<00:00,  2.95s/it]


Starting evaluation of SentenceBert...


 30%|███       | 6/20 [00:25<00:59,  4.28s/it]


RuntimeError: CUDA out of memory. Tried to allocate 384.00 MiB (GPU 0; 2.95 GiB total capacity; 1.44 GiB already allocated; 131.44 MiB free; 2.13 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
import json

save_path = "/home/gebauer/Desktop/repo/tiltify/experiments/IWPE/results_2/brier_scores.json"

with open(save_path, "w") as f:
    json.dump(metrics_dict, f)

metrics_dict

defaultdict(list,
            {'BinaryBERTModel__Right to Deletion__brier_score': [0.25527092363491977,
              0.24901538831053482],
             'BinaryBERTModel__Right to Withdraw Consent__brier_score': [0.26195873466051056,
              0.23864607295756182],
             'BinaryBERTModel__Right to Information__brier_score': [0.24661778119152072,
              0.2519837284729277],
             'BinaryBERTModel__Right to Complain__brier_score': [0.2742134296117353,
              0.2416008860267396],
             'BinaryBERTModel__Right to Data Portability__brier_score': [0.2361551888246405,
              0.2433711711622055]})