In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import math
from tqdm import tqdm
import json
import os

# Pin CUDA device selection through environment variables (see issue #152):
# the device order is set to PCI bus ID and only device "3" is made visible.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "3",
})

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Paths of the analogy dataset splits, relative to the notebook's working directory.
data_path = '../data/analogy'
valid_path = os.path.join(data_path, 'valid.json')
test_path = os.path.join(data_path, 'test.json')

# Read each split inside a context manager so the file handle is closed as soon
# as parsing finishes (the bare `json.load(open(...))` form leaves it open until
# garbage collection). JSON is decoded as UTF-8 explicitly rather than with the
# platform's default encoding.
with open(valid_path, 'r', encoding='utf-8') as fin:
    valid_data = json.load(fin)
with open(test_path, 'r', encoding='utf-8') as fin:
    test_data = json.load(fin)

In [3]:
# Masked-LM checkpoints to evaluate; each one gets its own results file.
model_names = ['bert-base-uncased', 'bert-large-uncased',
               'roberta-base', 'roberta-large',
               'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
               'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2']

# For every model, score every test sample by masking each token in turn and
# reading the model's log-probability of the original token at the masked
# position. Two perplexities are stored per sample:
#   ppl      - over all tokens of sample['input']
#   tail_ppl - over the token span attributed to sample['output'] only
for model_name in model_names:
    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name).cuda()

    results = []

    for sample in tqdm(test_data):
        uid = sample['uid']
        sentence = sample['input']

        # Tokenize the input sentence (no special tokens are added here)
        tokens = tokenizer.tokenize(sentence)
        # NOTE(review): the tail span is taken to end at the second-to-last
        # token, i.e. this assumes the input ends with exactly one token after
        # the tail (presumably closing punctuation) - confirm against the data.
        end_tail = len(tokens) - 2
        # NOTE(review): assumes tokenizing the output on its own yields as many
        # tokens as it occupies inside the sentence - verify per tokenizer.
        tail = tokenizer.tokenize(sample['output'])
        start_tail = end_tail - len(tail) + 1

        input_ids_list = []
        masked_indices = []

        # Create a list of inputs with each token masked one at a time
        for i in range(len(tokens)):
            masked_tokens = tokens[:]
            masked_tokens[i] = tokenizer.mask_token
            masked_input = tokenizer.convert_tokens_to_string(masked_tokens)
            inputs = tokenizer(masked_input, return_tensors="pt").to('cuda')
            input_ids_list.append(inputs["input_ids"][0])
            # Column index/indices of the mask token inside the encoded input
            masked_indices.append((inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1])

        # Pad input IDs to the same length
        input_ids_batched = torch.nn.utils.rnn.pad_sequence(input_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)

        # Create attention masks (padding positions are ignored by the model)
        attention_masks = (input_ids_batched != tokenizer.pad_token_id).long()

        # Get model predictions in a single batched inference
        with torch.no_grad():
            outputs = model(input_ids_batched, attention_mask=attention_masks)
            logits = outputs.logits

        # Accumulate the log-probability of each original token at its masked
        # position. log_softmax is used instead of softmax followed by
        # math.log: a probability that underflows to 0.0 would make math.log
        # raise "math domain error", and the fused form is also more accurate.
        total_log_prob = 0
        tail_log_prob = 0
        for i, token in enumerate(tokens):
            log_probs = torch.nn.functional.log_softmax(logits[i, masked_indices[i], :], dim=-1)
            token_id = tokenizer.convert_tokens_to_ids(token)
            # Row 0 corresponds to the first mask position found in input i
            token_log_prob = log_probs[0, token_id].item()
            total_log_prob += token_log_prob
            if start_tail <= i <= end_tail:
                tail_log_prob += token_log_prob

        # Calculate perplexity: exp of the negative mean log-probability
        avg_log_prob = total_log_prob / len(tokens)
        avg_tail_log_prob = tail_log_prob / len(tail)
        perplexity = math.exp(-avg_log_prob)
        tail_ppl = math.exp(-avg_tail_log_prob)

        result = {'uid': uid, 'ppl': perplexity, 'tail_ppl': tail_ppl}
        results.append(result)

    # Write one JSON file per model, named after the last path component of
    # the model identifier.
    os.makedirs('results', exist_ok=True)
    output_model_name = model_name.split('/')[-1]
    with open(os.path.join('results', f'{output_model_name}.json'), 'w') as fout:
        json.dump(results, fout)

[2024-12-02 16:08:41,016] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 13656/13656 [03:14<00:00, 70.27it/s]
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initia