# Confronto delle misure di similarità tra task

#### Configurazioni generali

Importo i modulii necessari.

In [1]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers

Imposto il seme per la riproducibilità.

In [2]:
seed_value = 42

os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)

# Imposto il seme casuale anche per i calcoli CUDA
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)  
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

## Caricamento dei dataset

#### 1. IMDB Reviews

In [3]:
import kagglehub

# Scarico l'ultima versione del dataset
imdb_path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

imdb_dataset_path = imdb_path + "/IMDB Dataset.csv"
imdb_dataset = pd.read_csv(imdb_dataset_path)

In [4]:
LABELS = {"negative": 0, "positive": 1}
imdb_classes = list(LABELS.keys())

In [5]:
def get_data(dataset_path, n_train=5000, n_val=500, n_test=512): 

    dataset = pd.read_csv(dataset_path)
    dataset['sentiment'] = dataset["sentiment"].map(LABELS)

    neg = dataset[ dataset['sentiment'] == LABELS['negative'] ]
    pos = dataset[ dataset['sentiment'] == LABELS['positive'] ]

    if len(neg) < n_train + n_val + n_test or len(pos) < n_train + n_val + n_test:
        raise ValueError("Non ci sono abbastanza esempi per le dimensioni del train, validation e test set specificate.")
    
    neg = neg.sample(frac=1, random_state=42).reset_index(drop=True)
    pos = pos.sample(frac=1, random_state=42).reset_index(drop=True)

    neg_train, pos_train = neg[:n_train], pos[:n_train]
    neg_val, pos_val = neg[n_train:n_train+n_val], pos[n_train:n_train+n_val]
    neg_test, pos_test = neg[n_train+n_val:n_train+n_val+n_test], pos[n_train+n_val:n_train+n_val+n_test]

    train_data = pd.concat([neg_train, pos_train])
    val_data = pd.concat([neg_val, pos_val])
    test_data = pd.concat([neg_test, pos_test])

    train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
    val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)
    test_data = test_data.sample(frac=1, random_state=42).reset_index(drop=True)

    sentences_train, labels_train = train_data['review'] , train_data['sentiment']
    sentences_val, labels_val = val_data['review'] , val_data['sentiment']
    sentences_test, labels_test = test_data['review'] , test_data['sentiment']

    return sentences_train, labels_train, sentences_val, labels_val, sentences_test, labels_test

In [6]:
from torch.utils.data import Dataset

class IMDBDataset(Dataset):

    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self,index):
        sentence = self.sentences[index]
        label = self.labels[index]
        
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt')
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding["token_type_ids"].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)      
        }

In [7]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader

MAX_SEQ_LEN = 128
BATCH_SIZE = 32

imdb_sentences_train, imdb_labels_train, imdb_sentences_val, imdb_labels_val, imdb_sentences_test, imdb_labels_test = get_data(imdb_dataset_path)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

imdb_training_data = IMDBDataset(sentences = imdb_sentences_train,
                           labels = imdb_labels_train,
                           tokenizer = tokenizer,
                           max_len = MAX_SEQ_LEN)

imdb_validation_data = IMDBDataset(sentences = imdb_sentences_val.values,
                           labels = imdb_labels_val.values,
                           tokenizer = tokenizer,
                           max_len = MAX_SEQ_LEN)

imdb_test_data = IMDBDataset(sentences = imdb_sentences_test.values,
                           labels = imdb_labels_test.values,
                           tokenizer = tokenizer,
                           max_len = MAX_SEQ_LEN)


# Creo i DataLoader
train_loader_imdb = DataLoader(imdb_training_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader_imdb = DataLoader(imdb_validation_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader_imdb = DataLoader(imdb_test_data, batch_size=BATCH_SIZE, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

#### 2. SST-2 

In [8]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

sst_dataset = load_dataset('sst2')


sst_data = sst_dataset['train'].shuffle(seed=42)

sst_temp_data, sst_test_data, sst_temp_labels, sst_test_labels = train_test_split(sst_data['sentence'], 
                                                  sst_data['label'], 
                                                  test_size=1024, 
                                                  random_state=42,
                                                  stratify=sst_data['label'])

sst_train_data, sst_val_data, sst_train_labels, sst_val_labels = train_test_split(sst_data['sentence'], 
                                                  sst_data['label'],
                                                  train_size=10000,
                                                  test_size=1000, 
                                                  random_state=42,
                                                  stratify=sst_data['label'])

README.md:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [9]:
from torch.utils.data import Dataset

class SSTDataset(Dataset):

    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self,index):
        sentence = self.sentences[index]
        label = self.labels[index]
        
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt')
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding["token_type_ids"].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
            }

In [10]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader

MAX_SEQ_LEN = 128

# Inizializza il Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#Ottieni i dataset
sst_training_data = SSTDataset(sentences = sst_train_data,
                           labels = sst_train_labels,
                           tokenizer = tokenizer,
                           max_len = MAX_SEQ_LEN)

sst_validation_data = SSTDataset(sentences = sst_val_data,
                           labels = sst_val_labels,
                           tokenizer = tokenizer,
                           max_len = MAX_SEQ_LEN)

sst_test_data = SSTDataset(sentences = sst_test_data,
                           labels = sst_test_labels,
                           tokenizer = tokenizer,
                           max_len = MAX_SEQ_LEN)

# Creo i DataLoader
train_loader_sst = DataLoader(sst_training_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader_sst = DataLoader(sst_validation_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader_sst = DataLoader(sst_test_data, batch_size=BATCH_SIZE, shuffle=False)

#### 3. MNLI

In [11]:
# Carico il datast
mnli_dataset = load_dataset('glue', 'mnli')

# Divido i dati in training set, validation set e test set
mnli_data = mnli_dataset['train'].shuffle(seed=42)

mnli_temp_premises, mnli_test_premises, mnli_temp_hypotheses, mnli_test_hypotheses, mnli_temp_labels, mnli_test_labels = train_test_split(mnli_data['premise'], 
                                                  mnli_data['hypothesis'],                
                                                  mnli_data['label'], 
                                                  test_size=1024, 
                                                  random_state=42,
                                                  stratify=mnli_data['label'])

mnli_train_premises, mnli_val_premises, mnli_train_hypotheses, mnli_val_hypotheses, mnli_train_labels, mnli_val_labels = train_test_split(mnli_data['premise'], 
                                                  mnli_data['hypothesis'],
                                                  mnli_data['label'],
                                                  train_size=10000,
                                                  test_size=1000, 
                                                  random_state=42,
                                                  stratify=mnli_data['label'])

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

(…)alidation_matched-00000-of-00001.parquet:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

(…)dation_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

test_matched-00000-of-00001.parquet:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

test_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [12]:
from torch.utils.data import Dataset

class MNLIDataset(Dataset):

    def __init__(self, premises, hypotheses , labels, tokenizer, max_len):
        self.premises = premises
        self.hypotheses = hypotheses
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.premises)
    
    def __getitem__(self,index):
        premise = self.premises[index]
        hyphotesis = self.hypotheses[index]
        label = self.labels[index]
        
        encoding = self.tokenizer.encode_plus(
            premise,
            hyphotesis,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt')
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding["token_type_ids"].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
            }

In [13]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader

MAX_SEQ_LEN = 256 
BATCH_SIZE = 32

# Inizializza il Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#Ottieni i dataset
mnli_training_data = MNLIDataset(premises = mnli_train_premises,
                            hypotheses = mnli_train_hypotheses,
                            labels = mnli_train_labels,
                            tokenizer = tokenizer,
                            max_len = MAX_SEQ_LEN)

mnli_validation_data = MNLIDataset(premises = mnli_train_premises,
                            hypotheses = mnli_train_hypotheses,
                            labels = mnli_train_labels,
                            tokenizer = tokenizer,
                            max_len = MAX_SEQ_LEN)

mnli_test_data = MNLIDataset(premises = mnli_train_premises,
                            hypotheses = mnli_train_hypotheses,
                            labels = mnli_train_labels,
                            tokenizer = tokenizer,
                            max_len = MAX_SEQ_LEN)

# Creo i DataLoader
train_loader_mnli = DataLoader(mnli_training_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader_mnli = DataLoader(mnli_validation_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader_mnli = DataLoader(mnli_test_data, batch_size=BATCH_SIZE, shuffle=False)


## Caricamento dei modelli

#### 1. IMDB Rewiews

In [14]:
from transformers import BertModel

class BERTClassifierIMDB(nn.Module):
    
    def __init__(self, lora: bool = False, r: int = 16):
        super(BERTClassifierIMDB, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.avg_pooling = nn.AdaptiveAvgPool1d(1) 
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)

        if lora:
            print("Adding LoRA to BERT")
            lora_utils.add_lora_to_bert(self.bert, r=r)
            lora_utils.mark_only_lora_as_trainable(self.bert)

    
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_bert = self.bert(
            input_ids, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids
        )
        last_hidden_state = output_bert.last_hidden_state  
        avg_pooled = self.avg_pooling(last_hidden_state.transpose(1, 2)).squeeze(-1)
        logits = self.linear(avg_pooled)
        return logits

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
imdb_model = BERTClassifierIMDB(lora=False)
imdb_model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTClassifierIMDB(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [17]:
imdb_model.load_state_dict(torch.load("/kaggle/input/saved-models/imbd_best_full_model_state.bin"))

  imdb_model.load_state_dict(torch.load("/kaggle/input/saved-models/imbd_best_full_model_state.bin"))


<All keys matched successfully>

#### 2. SST-2

In [18]:
from transformers import BertModel

class BERTClassifierSST(nn.Module):
    
    def __init__(self, lora: bool = False, r: int = 16):
        super(BERTClassifierSST, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = torch.nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)

        if lora:
            print("Adding LoRA to BERT")
            lora_utils.add_lora_to_bert(self.bert, r=r)
            lora_utils.mark_only_lora_as_trainable(self.bert)

    
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_bert = self.bert(
            input_ids, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output_bert.pooler_output)
        output = self.linear(output_dropout)
        return output

In [19]:
sst_model = BERTClassifierSST(lora=False)
sst_model.to(device)

BERTClassifierSST(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [20]:
sst_model.load_state_dict(torch.load("/kaggle/input/saved-models/sa_full_model_best_model_state.bin"))


  sst_model.load_state_dict(torch.load("/kaggle/input/saved-models/sa_full_model_best_model_state.bin"))


<All keys matched successfully>

#### 3. MNLI

In [21]:
from transformers import BertModel

class BERTClassifierNLI(nn.Module):
    
    def __init__(self, num_classes: int = 3, lora: bool = False, r: int = 16):
        super(BERTClassifierNLI, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

        if lora:
            print("Adding LoRA to BERT")
            lora_utils.add_lora_to_bert(self.bert, r=r)
            lora_utils.mark_only_lora_as_trainable(self.bert)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_bert = self.bert(
            input_ids, 
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output_bert.pooler_output)
        logits = self.linear(output_dropout)
        return logits


In [22]:
mnli_model = BERTClassifierNLI(lora=False)
mnli_model.to(device)

BERTClassifierNLI(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [23]:
mnli_model.load_state_dict(torch.load("/kaggle/input/saved-models/nli_full_model_best_model_state.bin"))


  mnli_model.load_state_dict(torch.load("/kaggle/input/saved-models/nli_full_model_best_model_state.bin"))


<All keys matched successfully>

## Metriche di transfer learning

Installo la libreria tllib.

In [24]:
pip install -i https://test.pypi.org/simple/ tllib==0.4

Looking in indexes: https://test.pypi.org/simple/
Collecting tllib==0.4
  Downloading https://test-files.pythonhosted.org/packages/33/07/38ed6a831287654ea1d9b281fee7a3629f8065f274d8b503b5c64d3c1556/tllib-0.4-py3-none-any.whl.metadata (19 kB)
Downloading https://test-files.pythonhosted.org/packages/33/07/38ed6a831287654ea1d9b281fee7a3629f8065f274d8b503b5c64d3c1556/tllib-0.4-py3-none-any.whl (287 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.6/287.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tllib
Successfully installed tllib-0.4
Note: you may need to restart the kernel to use updated packages.


### LEEP Score

Implementazione del leep score.

In [25]:
def leep_score(pretrained_model, target_dataset_loader, device):
    """
    Calcola il LEEP score dato un modello pre-addestrato e un dataset target.
    
    Args:
        pretrained_model (torch.nn.Module): Modello pre-addestrato θ.
        target_dataset_loader (DataLoader): DataLoader per il dataset target D.
        device (torch.device): Device per il calcolo (CPU/GPU).
        
    Returns:
        float: LEEP score.
    """

    # Imposto il modello in modalità di valutazione e lo sposto sul device
    pretrained_model.eval()  
    pretrained_model.to(device)


    # 1) Calcolo delle distribuzioni dummy θ(xi)
    dummy_distributions = []  # distribuzioni θ(xi) per ogni esempio
    all_target_labels = []    # etichette target yi, serve solo per poter calcolare il numero di classi del dataset target
    
    with torch.no_grad():
        for batch in target_dataset_loader:    # itero sugli esempi del dataset target D

            # Ottengo gli esempi del dataset target e li sposto sul device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device) 

            # Memorizzo l'etichetta 
            all_target_labels.extend(labels.cpu().numpy())
            
            # Mando gli esempi in input al modello e ottengo l'output del modello (ovvero i logits)
            outputs = pretrained_model(input_ids, attention_mask, token_type_ids)

            # Trasformo i logits in probabilità e calcolo così θ(xi)
            prob_class_1 = torch.sigmoid(outputs)  # Visto che il modello ha solo un neurone di output e restituisce quindi la probabilità della classe 1, calcolo manualmente la probabilità della classe 0
            prob_class_0 = 1 - prob_class_1 
            probabilities = torch.cat([prob_class_0, prob_class_1], dim=-1)
            
            # appendo alla lista di dummy distribution
            dummy_distributions.append(probabilities)

    # Concateno tutti i batch in un unico tensore
    dummy_distributions = torch.cat(dummy_distributions, dim=0)  # (n, |Z|)
    
    
    # Determino il numero di classi sorgente e il numero di classi target
    num_source_labels = dummy_distributions.size(1)  # |Z|
    num_target_labels = len(np.unique(all_target_labels))  # |Y|
    
    num_target_examples = dummy_distributions.size(0)  # n
    
    # 2) Calcolo della distribuzione condizionale empirica

    # 2.1) Calcolo della distribuzione condizionale empirica P(y,z)
    joint_distribution = np.zeros((num_target_labels, num_source_labels))
    for y in range(num_target_labels):
        # Trovo gli indici degli esempi nel dataset target che hanno etichetta y
        indices = torch.where(torch.tensor(all_target_labels) == y)[0]
    
        # Sommo θ(xi)_z per ogni esempio xi appartenente alla classe y
        for z in range(num_source_labels):
            joint_distribution[y][z] = dummy_distributions[indices, z].sum().item()
    
    # Normalizzo dividendo per il numero totale di esempi nel dataset target
    joint_distribution /= num_target_examples

    # 2.2) Calcolo della distribuzione marginale empirica P(z)
    marginal_distribution = joint_distribution.sum(axis=0)

    # 2.3) Calcolo della distribuzione condizionale empirica P(y|z)
    conditional_distribution = joint_distribution / marginal_distribution  # (|Y|, |Z|)
    conditional_distribution = np.nan_to_num(conditional_distribution)     # Per gestire i casi in cui P(z) = 0, -inf, +inf

    
    # 3) Calcolo del LEEP score

    log_likelihood_sum = 0.0
    for i in range(num_target_examples):
        # Estrai la distribuzione di probabilità θ(x_i) per l'esempio i
        x_i_probabilities = dummy_distributions[i].cpu().numpy()  # (|Z|,)
        y_i = int(all_target_labels[i])  # Etichetta vera y_i del target
        
        # Calcola p(y_i|x_i; θ, D) = ∑_{z ∈ Z} P̂(y_i|z) * θ(x_i)_z
        eep_prediction = np.sum(
            conditional_distribution[y_i, :] * x_i_probabilities  # Element-wise prodotto
        )

        # Calcola il logaritmo della previsione EEP e aggiungi alla somma
        log_likelihood_sum += np.log(eep_prediction)

    # Calcola la media del logaritmo
    leep_score = log_likelihood_sum / num_target_examples

    return leep_score


# Calcola il LEEP score utilizzando il modello pre-addestrato e i dati target
leeps = leep_score(
    pretrained_model=sst_model,
    target_dataset_loader=train_loader_sst,
    device=device
)

print(leeps)

-0.023772563672319818


In [26]:
def get_model_predictions(model, data_loader):
    model.eval()
    model.to(device)
    
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            # Salvo le etichette vere
            all_labels.extend(labels.cpu().numpy())
            
            # Ottiengo i logits dal modello
            outputs = model(input_ids=input_ids, 
                            attention_mask=attention_mask, 
                            token_type_ids=token_type_ids)
            
            if outputs.size(-1) == 1:  # Nel caso di problema di classificazione binaria
                prob_class_1 = torch.sigmoid(outputs)  # Probabilità della classe 1
                prob_class_0 = 1 - prob_class_1        # Probabilità della classe 0
                probabilities = torch.cat([prob_class_0, prob_class_1], dim=-1)
            else:  # Nel caso di problema di classificazione multi-classe
                probabilities = torch.softmax(outputs, dim=-1)

            all_predictions.extend(probabilities.cpu().numpy())

    return np.array(all_predictions), np.array(all_labels).astype(int)

In [27]:
from tllib.ranking import log_expected_empirical_prediction as leep

def calculate_leep_scores(models, data_loaders):
    leep_scores = []
    for source_name, source_model in models.items():
        for target_name, target_loader in data_loaders.items():
            
            predictions, labels = get_model_predictions(source_model, target_loader)
            score = leep(predictions, labels)
            leep_scores.append({"Source": source_name, "Target": target_name, "Metric": "LEEP", "Score": score})
            print(f"Source: {source_name}, Target: {target_name}, LEEP Score: {score:.4f}")
    return leep_scores

### LogME Score

In [28]:
def extract_embeddings(model, data_loader, device):
    model.eval()
    embeddings = []
    labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            label = batch['labels'].cpu().numpy()

            # Ottiengo l'output del modello
            outputs = model.bert(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )
            # Estraggo la rappresentazione del token [CLS]
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_embeddings)
            labels.append(label)
    
    embeddings = np.vstack(embeddings)
    labels = np.concatenate(labels)
    labels = labels.astype(int)
    
    return embeddings, labels

In [29]:
from tllib.ranking import log_maximum_evidence as logme

def calculate_logme_scores(models, data_loaders, device):
    logme_scores = []
    for source_name, source_model in models.items():
        for target_name, target_loader in data_loaders.items():
            
            embeddings, labels = extract_embeddings(source_model, target_loader, device)
            score = logme(embeddings, labels)
            logme_scores.append({"Source": source_name, "Target": target_name, "Metric": "LogME", "Score": score})
            print(f"Source: {source_name}, Target: {target_name}, LogME Score: {score:.4f}")
    return logme_scores


#### H-Score

In [30]:
from tllib.ranking import h_score

def calculate_h_scores(models, data_loaders, device):
    h_scores = []
    for source_name, source_model in models.items():
        for target_name, target_loader in data_loaders.items():
            
            embeddings, labels = extract_embeddings(source_model, target_loader, device)
            score = h_score(embeddings, labels)
            h_scores.append({"Source": source_name, "Target": target_name, "Metric": "H-Score", "Score": score})
            print(f"Source: {source_name}, Target: {target_name}, H-Score: {score:.4f}")
    
    return h_scores


### NCE Score

In [31]:
def get_source_labels(model, data_loader, device):
    model.eval()
    all_predictions = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)

            # Ottieni i logits dal modello
            logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            
            # Converti i logits in etichette predette (classe 0 o 1)
            predictions = torch.sigmoid(logits).cpu().numpy().flatten() > 0.5
            all_predictions.extend(predictions.astype(int))
    
    return np.array(all_predictions)


In [32]:
from tllib.ranking import negative_conditional_entropy as nce

def calculate_nce_scores(models, data_loaders, device):
    nce_scores = []
    
    for source_name, source_model in models.items():
        for target_name, target_loader in data_loaders.items():
            
            source_labels = get_source_labels(source_model, target_loader, device)
            target_labels = np.array(target_loader.dataset.labels)
            score = nce(source_labels, target_labels)
            nce_scores.append({"Source": source_name, "Target": target_name, "Metric": "NCE", "Score": score})
            print(f"Source: {source_name}, Target: {target_name}, NCE Score: {score:.4f}")
    
    return nce_scores

In [33]:
models = {
    "IMDB": imdb_model,
    "SST": sst_model,
    "MNLI": mnli_model
}

data_loaders = {
    "IMDB": train_loader_imdb,
    "SST": train_loader_sst,
    "MNLI": train_loader_mnli
}


leep_scores = calculate_leep_scores(models, data_loaders)
logme_scores = calculate_logme_scores(models, data_loaders, device)
h_scores = calculate_h_scores(models, data_loaders, device)
nce_scores = calculate_nce_scores(models, data_loaders, device)


Source: IMDB, Target: IMDB, LEEP Score: -0.1040
Source: IMDB, Target: SST, LEEP Score: -0.4576


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: IMDB, Target: MNLI, LEEP Score: -1.0972
Source: SST, Target: IMDB, LEEP Score: -0.4417
Source: SST, Target: SST, LEEP Score: -0.0238


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: SST, Target: MNLI, LEEP Score: -1.0959
Source: MNLI, Target: IMDB, LEEP Score: -0.6912
Source: MNLI, Target: SST, LEEP Score: -0.6721


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: MNLI, Target: MNLI, LEEP Score: -0.1147
Source: IMDB, Target: IMDB, LogME Score: 0.5761
Source: IMDB, Target: SST, LogME Score: -0.2786


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: IMDB, Target: MNLI, LogME Score: -0.5949
Source: SST, Target: IMDB, LogME Score: -0.2925
Source: SST, Target: SST, LogME Score: 1.3886


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: SST, Target: MNLI, LogME Score: -0.5869
Source: MNLI, Target: IMDB, LogME Score: -0.5106
Source: MNLI, Target: SST, LogME Score: -0.3623


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: MNLI, Target: MNLI, LogME Score: 0.6378
Source: IMDB, Target: IMDB, H-Score: 38.7167
Source: IMDB, Target: SST, H-Score: -3.0726


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: IMDB, Target: MNLI, H-Score: -0.9263
Source: SST, Target: IMDB, H-Score: -8.0096
Source: SST, Target: SST, H-Score: 2.1262


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: SST, Target: MNLI, H-Score: 0.6774
Source: MNLI, Target: IMDB, H-Score: 1.2456
Source: MNLI, Target: SST, H-Score: -3.0537


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: MNLI, Target: MNLI, H-Score: -62.4062
Source: IMDB, Target: IMDB, NCE Score: -0.6931
Source: IMDB, Target: SST, NCE Score: -0.6865


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: IMDB, Target: MNLI, NCE Score: -1.0985
Source: SST, Target: IMDB, NCE Score: -0.6931
Source: SST, Target: SST, NCE Score: -0.6864


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: SST, Target: MNLI, NCE Score: -1.0984
Source: MNLI, Target: IMDB, NCE Score: -0.2310
Source: MNLI, Target: SST, NCE Score: -0.2288


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Source: MNLI, Target: MNLI, NCE Score: -0.3662


In [35]:
all_scores = leep_scores + logme_scores + h_scores + nce_scores
results_df = pd.DataFrame(all_scores)

pivot_df = results_df.pivot_table(
    index=["Source", "Target"],
    columns="Metric",
    values="Score"
).reset_index()

# Ordiniamo le righe in base a Source e Target
pivot_df = pivot_df.sort_values(by=["Source", "Target"])

# Visualizziamo il risultato
print(pivot_df)

Metric Source Target    H-Score      LEEP     LogME       NCE
0        IMDB   IMDB  38.716675 -0.104039  0.576065 -0.693146
1        IMDB   MNLI  -0.926340 -1.097195 -0.594876 -1.098514
2        IMDB    SST  -3.072601 -0.457571 -0.278590 -0.686450
3        MNLI   IMDB   1.245621 -0.691206 -0.510617 -0.231047
4        MNLI   MNLI -62.406204 -0.114683  0.637751 -0.366173
5        MNLI    SST  -3.053725 -0.672133 -0.362303 -0.228816
6         SST   IMDB  -8.009583 -0.441727 -0.292533 -0.693110
7         SST   MNLI   0.677395 -1.095884 -0.586855 -1.098433
8         SST    SST   2.126160 -0.023773  1.388554 -0.686380


- Il LEEP assume valori nell'intervallo (− inf,0). Più il suo valore si avvicina a zero, maggiore è la trasferibilità del modello. Questo significa che un modello addestrato sul task di partenza è meglio adattato a risolvere il task di destinazione.
- Il LogME può assumere valori nell'intervallo (−inf,+inf). In generale, valori più elevati di LogME indicano una migliore performance di trasferimento.
- Un H-score più alto segnala una maggiore qualità delle caratteristiche del modello per il trasferimento.
+ Il NCE varia anch'esso nell'intervallo (−inf,0) e a valori più bassi di NCE sono generalmente associati a una minore trasferibilità.