In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset
import numpy as np
from src.evaluate_crows import evaluate_crows
from src.evaluate_stereoset import evaluate_stereoset
import os
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face Hub checkpoints to evaluate; each one is loaded as a masked LM
# by the evaluation cells. Comment out entries to run on a subset of models.
models_names = [
    'distilbert/distilbert-base-uncased',
    'albert/albert-base-v2',
    'FacebookAI/xlm-roberta-base',
]

# CrowS-Pairs evaluation

In [3]:

# Run evaluate_crows on every checkpoint in models_names, print what it
# returns, and pickle it to results/<model id with '/' -> '-'>/crows.pkl.
for model_name in models_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    print(f'Computing CROWS for:  {model_name}')
    results = evaluate_crows(model, tokenizer)
    print('', results)

    # Hub ids contain '/', so flatten them into a single directory name.
    result_path = f"results/{model_name.replace('/', '-')}"
    os.makedirs(result_path, exist_ok=True)
    with open(os.path.join(result_path, 'crows.pkl'), 'wb') as f:
        pickle.dump(results, f)

    print(25 * '-')


Computing CROWS for:  distilbert/distilbert-base-uncased


100%|██████████| 1508/1508 [01:08<00:00, 22.09it/s]


 {'general': 0.59734219269103, 'race-color': 0.629126213592233, 'socioeconomic': 0.5906432748538012, 'gender': 0.5152671755725191, 'disability': 0.6779661016949152, 'nationality': 0.5031446540880503, 'sexual-orientation': 0.7976190476190477, 'physical-appearance': 0.6190476190476191, 'religion': 0.638095238095238, 'age': 0.5287356321839081}
-------------------------


Some weights of the model checkpoint at albert/albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Computing CROWS for:  albert/albert-base-v2


100%|██████████| 1508/1508 [01:28<00:00, 17.06it/s]


 {'general': 0.5813953488372093, 'race-color': 0.6, 'socioeconomic': 0.7017543859649122, 'gender': 0.4770992366412214, 'disability': 0.7457627118644068, 'nationality': 0.4591194968553459, 'sexual-orientation': 0.8333333333333334, 'physical-appearance': 0.49206349206349204, 'religion': 0.6095238095238096, 'age': 0.4482758620689655}
-------------------------


Some weights of the model checkpoint at FacebookAI/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Computing CROWS for:  FacebookAI/xlm-roberta-base


100%|██████████| 1508/1508 [03:05<00:00,  8.14it/s]

 {'general': 0.535904255319149, 'race-color': 0.596116504854369, 'socioeconomic': 0.5555555555555556, 'gender': 0.4444444444444444, 'disability': 0.6610169491525424, 'nationality': 0.49056603773584906, 'sexual-orientation': 0.44047619047619047, 'physical-appearance': 0.5555555555555556, 'religion': 0.6, 'age': 0.41379310344827586}
-------------------------





# StereoSet

In [3]:
# Intrasentence validation split of StereoSet, loaded once and reused for
# every model in the loop below.
dataset = load_dataset("McGill-NLP/stereoset", "intrasentence", split="validation")

# NOTE(review): the output recorded for this cell ends with
# "TypeError: string indices must be integers, not 'str'" on the first model.
# Presumably it is raised inside evaluate_stereoset while reading a dataset
# row -- confirm the input format that helper expects before re-running.
for model_name in models_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    print(f'Computing StereoSet for:  {model_name}')
    results = evaluate_stereoset(model, tokenizer, dataset)

    # Hub ids contain '/', so flatten them into a single directory name.
    result_path = f"results/{model_name.replace('/', '-')}"
    os.makedirs(result_path, exist_ok=True)
    with open(os.path.join(result_path, 'stereoset.pkl'), 'wb') as f:
        pickle.dump(results, f)

    print(25 * '-')

Generating validation split: 100%|██████████| 2106/2106 [00:00<00:00, 159559.32 examples/s]


Computing StereoSet for:  distilbert/distilbert-base-uncased


Intra-sentence:   0%|          | 0/2106 [00:00<?, ?it/s]


TypeError: string indices must be integers, not 'str'