In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
from torch.nn.functional import softmax
from tqdm import tqdm

In [2]:
def predict_masked_token(sentence, tokenizer, model, file_handle, model_type):
    """Write the two most probable fillers for the masked token in ``sentence``.

    The sentence is encoded, passed through ``model`` without gradient
    tracking, and the scores at the first mask position are turned into
    probabilities with softmax. The two highest-probability tokens and their
    probabilities are written to ``file_handle``.

    Args:
        sentence: Text containing a ``[MASK]`` placeholder. For
            ``model_type == 'roberta'`` the placeholder is rewritten to
            ``<mask>`` before encoding.
        tokenizer: Object providing ``encode(..., return_tensors='pt')``,
            ``mask_token_id`` and ``convert_ids_to_tokens``.
        model: Callable taking the encoded ids; element ``0`` of its output is
            indexed here as ``[batch, position, vocabulary]``.
        file_handle: Writable text stream that receives the report.
        model_type: ``'albert'`` and ``'roberta'`` trigger removal of the
            tokenizer's word-start marker from predicted tokens; any other
            value leaves the tokens untouched.

    Raises:
        ValueError: If the encoded sentence contains no mask token.
    """
    if model_type == 'roberta':
        sentence = sentence.replace('[MASK]', '<mask>')

    input_ids = tokenizer.encode(sentence, return_tensors='pt')

    # Column indices of every mask token in the (single) encoded sequence.
    masked_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
    if masked_index.numel() == 0:
        # Fail before the forward pass with a message that names the sentence,
        # instead of an opaque IndexError further down.
        raise ValueError(f"No mask token found in sentence: {sentence!r}")

    with torch.no_grad():
        outputs = model(input_ids)
        predictions = outputs[0]

    masked_logits = predictions[0, masked_index, :]
    top_k_weights, top_k_indices = torch.topk(softmax(masked_logits, dim=-1), 2)

    # Row 0 corresponds to the first mask position; only that one is reported.
    predicted_tokens = tokenizer.convert_ids_to_tokens(top_k_indices[0].tolist())

    # Word-start markers prepended by the sub-word tokenizers:
    #   ALBERT (SentencePiece) uses U+2581 "LOWER ONE EIGHTH BLOCK", which only
    #   looks like an underscore; RoBERTa (byte-level BPE) uses U+0120.
    if model_type == 'albert':
        predicted_tokens = [token.lstrip('\u2581') for token in predicted_tokens]
    elif model_type == 'roberta':
        predicted_tokens = [token.lstrip('\u0120') for token in predicted_tokens]

    file_handle.write(f"{sentence}\n")
    file_handle.write(f"Top predicted word: {predicted_tokens[0]}\n")
    file_handle.write(f"Probability of top predicted word {top_k_weights[0][0].item():.8f}\n")
    file_handle.write(f"Second predicted word: {predicted_tokens[1]}\n")
    file_handle.write(f"Probability of second predicted word {top_k_weights[0][1].item():.8f}\n\n")


def process_model(sentences, model_name, tokenizer, model, model_type):
    """Run masked-token prediction over ``sentences`` and log the results.

    Calls ``model.eval()``, then writes one report per sentence (via
    ``predict_masked_token``) to ``<model_name>_predictions.log`` in the
    current working directory, overwriting any existing file.

    Args:
        sentences: Iterable of sentences, each containing a mask placeholder.
        model_name: Name used for the log file and the progress-bar label.
        tokenizer: Tokenizer forwarded to ``predict_masked_token``.
        model: Model forwarded to ``predict_masked_token``.
        model_type: Model family tag forwarded to ``predict_masked_token``.
    """
    model.eval()

    # Namespaced names ("org/model") contain "/", which would otherwise be
    # interpreted as a directory separator when opening the log file.
    safe_name = model_name.replace("/", "_")
    log_filename = f"{safe_name}_predictions.log"

    # Explicit UTF-8: sentences and predicted tokens may contain non-ASCII
    # characters that the platform's default encoding cannot represent.
    with open(log_filename, "w", encoding="utf-8") as file:
        for sentence in tqdm(sentences, desc=f"Processing {model_name}"):
            predict_masked_token(sentence, tokenizer, model, file, model_type)

In [3]:
# Prompt sentences: one masked sentence per row in the 'sent_masked' column.
df = pd.read_csv("CustomPrompts_mlm.csv")
sentences = df.loc[:, 'sent_masked'].to_list()

# (checkpoint name, model family tag) pairs; the tag selects the
# model-specific handling inside predict_masked_token.
model_configs = [
    ("bert-base-uncased", "bert"),
    ("albert-base-v2", "albert"),
    ("roberta-base", "roberta"),
]

In [4]:
# Load each checkpoint in turn and write its predictions log.
for model_name, model_type in model_configs:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    process_model(
        sentences=sentences,
        model_name=model_name,
        tokenizer=tokenizer,
        model=model,
        model_type=model_type,
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

## Top-1 Metric Eval 

In [1]:
import pandas as pd

# (predictions CSV path, label to print) for each evaluated model.
files = [
    ("mlm/bert/bert_preds.csv", "bert"),
    ("mlm/albert/albert_preds.csv", "albert"),
    ("mlm/roberta/roberta_preds.csv", "roberta"),
]

for file_path, model_name in files:
    df = pd.read_csv(file_path)
    # Score = sum of the 'pred' column divided by the number of rows.
    # NOTE(review): presumably 'pred' is a 0/1 flag per prompt — confirm.
    stereotype_score = df['pred'].sum() / df.shape[0]
    print(f"{model_name} Stereotype score: {stereotype_score:.4f}")

bert Stereotype score: 0.1515
albert Stereotype score: 0.0808
roberta Stereotype score: 0.1616
