In [4]:
import os
import pickle

import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM

from src.cda_Debiaser import CDA_Debiaser
from src.evaluate_crows import evaluate_crows
from src.performanceEvaluator import PerformanceEvaluator
from src.stereosetEvaluator import StereosetEvaluator

In [None]:
# Hugging Face Hub checkpoints that every section of this notebook iterates over.
models_names = [
    'distilbert/distilbert-base-uncased',
    'albert/albert-base-v2',
    'FacebookAI/xlm-roberta-base',
]

# Choose the compute backend from what the installed torch build actually
# supports instead of hard-coding it: asking for 'cuda' on a build without CUDA
# fails with "AssertionError: Torch not compiled with CUDA enabled".
# Preference order: CUDA GPU, then Apple-Silicon MPS, then CPU.
# NOTE(review): assumes the project evaluators accept 'cpu' as a device string
# the same way they accept 'mps'/'cuda' -- confirm in src/.
if torch.cuda.is_available():
    device = 'cuda'
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    # getattr guard: older torch releases have no torch.backends.mps module.
    device = 'mps'
else:
    device = 'cpu'

# Base models

## Bias evaluation

### CrowS-Pairs

In [None]:

# Run the CrowS-Pairs evaluation on every base checkpoint and pickle what
# evaluate_crows returns under results/<flattened model id>/crows.pkl.
for model_name in models_names:
    # Load the pretrained masked-LM together with its tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    print('Computing CROWS for: ', model_name)
    results = evaluate_crows(model, tokenizer, device)
    print(f' {results}')

    # The hub id contains '/', which would nest directories; flatten it to '-'.
    result_path = 'results/' + '-'.join(model_name.split('/'))
    os.makedirs(result_path, exist_ok=True)
    crows_file = os.path.join(result_path, 'crows.pkl')
    with open(crows_file, 'wb') as f:
        pickle.dump(results, f)

    print('-' * 25)


Computing CROWS for:  distilbert/distilbert-base-uncased


100%|██████████| 1508/1508 [07:51<00:00,  3.20it/s]


 {'general': 0.5229235880398672, 'race-color': 0.4970873786407767, 'socioeconomic': 0.5789473684210527, 'gender': 0.46564885496183206, 'disability': 0.5254237288135594, 'nationality': 0.5345911949685535, 'sexual-orientation': 0.6547619047619048, 'physical-appearance': 0.3492063492063492, 'religion': 0.6571428571428571, 'age': 0.5517241379310345}
-------------------------


Some weights of the model checkpoint at albert/albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Computing CROWS for:  albert/albert-base-v2


100%|██████████| 1508/1508 [12:31<00:00,  2.01it/s]


 {'general': 0.5009966777408638, 'race-color': 0.45048543689320386, 'socioeconomic': 0.6257309941520468, 'gender': 0.366412213740458, 'disability': 0.6271186440677966, 'nationality': 0.4716981132075472, 'sexual-orientation': 0.7619047619047619, 'physical-appearance': 0.49206349206349204, 'religion': 0.5333333333333333, 'age': 0.6436781609195402}
-------------------------


Some weights of the model checkpoint at FacebookAI/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Computing CROWS for:  FacebookAI/xlm-roberta-base


100%|██████████| 1508/1508 [26:09<00:00,  1.04s/it]

 {'general': 0.4920212765957447, 'race-color': 0.4563106796116505, 'socioeconomic': 0.5847953216374269, 'gender': 0.5134099616858238, 'disability': 0.6101694915254238, 'nationality': 0.49056603773584906, 'sexual-orientation': 0.5714285714285714, 'physical-appearance': 0.5396825396825397, 'religion': 0.4095238095238095, 'age': 0.367816091954023}
-------------------------





### StereoSet IntraSentence

In [None]:

# Run the StereoSet intrasentence evaluation on every base checkpoint and
# pickle the result under results/<flattened model id>/stereoset-intra.pkl.
for model_name in models_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    print('Computing StereoSet (Intrasentence) for: ', model_name)
    # The evaluator is built from the model, its tokenizer and the local
    # StereoSet test split.
    stereoset_evaluator = StereosetEvaluator(
        model,
        tokenizer,
        './data/stereoset/test.json',
        device=device,
    )
    results = stereoset_evaluator.evaluate_intrasentence()
    print(f' {results}')

    # Flatten the '/' of the hub id so results live in a single directory level.
    result_path = 'results/' + '-'.join(model_name.split('/'))
    os.makedirs(result_path, exist_ok=True)
    intra_file = os.path.join(result_path, 'stereoset-intra.pkl')
    with open(intra_file, 'wb') as f:
        pickle.dump(results, f)

    print('-' * 25)

Computing StereoSet (Intrasentence) for:  distilbert/distilbert-base-uncased


100%|██████████| 6392/6392 [04:10<00:00, 25.49it/s]


 {'general': 0.6081990298857769, 'race': 0.5784873949579832, 'gender': 0.6459143968871596, 'profession': 0.6321934945788157, 'religion': 0.6153846153846154}
-------------------------


Some weights of the model checkpoint at albert/albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Computing StereoSet (Intrasentence) for:  albert/albert-base-v2


100%|██████████| 6392/6392 [05:54<00:00, 18.03it/s]


 {'general': 0.5945861367548114, 'race': 0.5660504201680673, 'gender': 0.6095979247730221, 'profession': 0.6213511259382819, 'religion': 0.631578947368421}
-------------------------


Some weights of the model checkpoint at FacebookAI/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Computing StereoSet (Intrasentence) for:  FacebookAI/xlm-roberta-base


100%|██████████| 6392/6392 [13:01<00:00,  8.18it/s]

 {'general': 0.53770337922403, 'race': 0.5218413978494624, 'gender': 0.5719844357976653, 'profession': 0.5450375312760634, 'religion': 0.5506072874493927}
-------------------------





### StereoSet InterSentence

In [None]:
# Run the StereoSet intersentence evaluation on every base checkpoint and
# pickle the result under results/<flattened model id>/stereoset-inter.pkl.
for model_name in models_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    print('Computing StereoSet (Intersentence) for: ', model_name)
    # Same evaluator as the intrasentence section; only the method called differs.
    stereoset_evaluator = StereosetEvaluator(
        model,
        tokenizer,
        './data/stereoset/test.json',
        device=device,
    )
    results = stereoset_evaluator.evaluate_intersentence()
    print(f' {results}')

    # Flatten the '/' of the hub id so results live in a single directory level.
    result_path = 'results/' + '-'.join(model_name.split('/'))
    os.makedirs(result_path, exist_ok=True)
    inter_file = os.path.join(result_path, 'stereoset-inter.pkl')
    with open(inter_file, 'wb') as f:
        pickle.dump(results, f)

    print('-' * 25)

Some weights of the model checkpoint at FacebookAI/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Computing StereoSet (Intersentence) for:  FacebookAI/xlm-roberta-base


100%|██████████| 6374/6374 [1:22:42<00:00,  1.28it/s]

 {'general': 0.4643865704424223, 'race': 0.4306073973532406, 'gender': 0.49533954727030627, 'profession': 0.4940451745379877, 'religion': 0.48132780082987553}
-------------------------





## Performance Evaluation

In [None]:
# Measure downstream performance of every base checkpoint and pickle both
# metrics under results/<flattened model id>/performance.pkl.
for model_name in models_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    perf = PerformanceEvaluator(model, tokenizer, device)

    print(' Computing performance metrics for: ', model_name)
    # Evaluate in the same order as the dict keys: masked-token first, then
    # text classification.
    mte_scores = perf.masked_token_evaluation()
    classification_scores = perf.text_classification_evaluation()
    res = {
        'mte': mte_scores,
        'text_classification': classification_scores,
    }

    # Flatten the '/' of the hub id so results live in a single directory level.
    result_path = 'results/' + '-'.join(model_name.split('/'))
    print(res)
    os.makedirs(result_path, exist_ok=True)
    performance_file = os.path.join(result_path, 'performance.pkl')
    with open(performance_file, 'wb') as f:
        pickle.dump(res, f)

    print('-' * 25)

# CDA Debiased

## Debiasing Stage

In [6]:
# Apply CDA debiasing to every base checkpoint.
# NOTE(review): the evaluation cells further down load checkpoints from
# ./debiased_model/<name>_cda_debiased, so debias() presumably writes there --
# confirm in src.cda_Debiaser.
for model_name in models_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    # The debiaser is given the hub id with '/' flattened to '-'.
    flat_name = '-'.join(model_name.split('/'))
    debiaser = CDA_Debiaser(model, flat_name, tokenizer, device)

    print('Starting CDA debiasing for: ', flat_name)
    debiaser.debias()
    print('-' * 30)

AssertionError: Torch not compiled with CUDA enabled

## Bias Evaluation

### CrowS-Pairs

In [None]:
# Run the CrowS-Pairs evaluation on every CDA-debiased checkpoint and pickle
# the result under results/<flattened model id>_cda_debiased/crows.pkl.
for model_name in models_names:
    # From here on model_name is the flattened id ('/' replaced by '-').
    model_name = '-'.join(model_name.split('/'))
    debiased_tag = model_name + '_cda_debiased'
    model_path = os.path.join('./debiased_model', debiased_tag)

    # Both tokenizer and weights are loaded from the local debiased checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForMaskedLM.from_pretrained(model_path)

    print('Computing CROWS for: ', model_name, 'CDA DEBIASED')
    results = evaluate_crows(model, tokenizer, device)
    print(f' {results}')

    result_path = 'results/' + debiased_tag
    os.makedirs(result_path, exist_ok=True)
    crows_file = os.path.join(result_path, 'crows.pkl')
    with open(crows_file, 'wb') as f:
        pickle.dump(results, f)

    print('-' * 25)


Computing CROWS for:  distilbert-distilbert-base-uncased CDA DEBIASED


100%|██████████| 1508/1508 [10:14<00:00,  2.45it/s]


 {'general': 0.5554817275747508, 'race-color': 0.5300970873786408, 'socioeconomic': 0.6900584795321637, 'gender': 0.4580152671755725, 'disability': 0.5423728813559322, 'nationality': 0.5849056603773585, 'sexual-orientation': 0.6547619047619048, 'physical-appearance': 0.47619047619047616, 'religion': 0.6095238095238096, 'age': 0.5862068965517241}
-------------------------
Computing CROWS for:  albert-albert-base-v2 CDA DEBIASED


 44%|████▎     | 658/1508 [06:09<10:41,  1.33it/s]

### StereoSet IntraSentence

In [None]:
# Run the StereoSet intrasentence evaluation on every CDA-debiased checkpoint
# and pickle the result under
# results/<flattened model id>_cda_debiased/stereoset-intra.pkl.
for model_name in models_names:
    # From here on model_name is the flattened id ('/' replaced by '-').
    model_name = model_name.replace('/', '-')
    model_path = os.path.join('./debiased_model', model_name + '_cda_debiased')

    # Both tokenizer and weights are loaded from the local debiased checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForMaskedLM.from_pretrained(model_path)

    # Tag the progress line with 'CDA DEBIASED' so this log cannot be mistaken
    # for the base-model run, which prints an otherwise near-identical message.
    print('Computing StereoSet (Intrasentence) for: ', model_name, 'CDA DEBIASED')
    stereoset_evaluator = StereosetEvaluator(model,
                                            tokenizer,
                                            './data/stereoset/test.json',
                                            device=device)
    results = stereoset_evaluator.evaluate_intrasentence()
    print(f' {results}')

    result_path = 'results/' + model_name + '_cda_debiased'
    os.makedirs(result_path, exist_ok=True)
    with open(os.path.join(result_path, 'stereoset-intra.pkl'), 'wb') as f:
        pickle.dump(results, f)
    print('-' * 25)

Computing StereoSet (Intrasentence) for:  distilbert-distilbert-base-uncased


100%|██████████| 6392/6392 [03:13<00:00, 33.10it/s]


 {'general': 0.5273040212799249, 'race': 0.44907563025210084, 'gender': 0.5810635538261998, 'profession': 0.609674728940784, 'religion': 0.5020242914979757}
-------------------------
Computing StereoSet (Intrasentence) for:  albert-albert-base-v2


100%|██████████| 6392/6392 [04:37<00:00, 23.07it/s]


 {'general': 0.5111876075731497, 'race': 0.44134453781512606, 'gender': 0.5304798962386511, 'profession': 0.5938281901584654, 'religion': 0.4898785425101215}
-------------------------
Computing StereoSet (Intrasentence) for:  FacebookAI-xlm-roberta-base


100%|██████████| 6392/6392 [11:21<00:00,  9.38it/s]

 {'general': 0.5037546933667084, 'race': 0.459005376344086, 'gender': 0.5278858625162127, 'profession': 0.5475396163469558, 'religion': 0.5425101214574899}
-------------------------





### StereoSet InterSentence

In [None]:
# Run the StereoSet intersentence evaluation on every CDA-debiased checkpoint
# and pickle the result under
# results/<flattened model id>_cda_debiased/stereoset-inter.pkl.
for model_name in models_names:
    # From here on model_name is the flattened id ('/' replaced by '-').
    model_name = model_name.replace('/', '-')
    model_path = os.path.join('./debiased_model', model_name + '_cda_debiased')

    # Both tokenizer and weights are loaded from the local debiased checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForMaskedLM.from_pretrained(model_path)

    # Tag the progress line with 'CDA DEBIASED' so this log cannot be mistaken
    # for the base-model run, which prints an otherwise near-identical message.
    print('Computing StereoSet (Intersentence) for: ', model_name, 'CDA DEBIASED')
    stereoset_evaluator = StereosetEvaluator(model,
                                            tokenizer,
                                            './data/stereoset/test.json',
                                            device=device)
    results = stereoset_evaluator.evaluate_intersentence()
    print(f' {results}')

    result_path = 'results/' + model_name + '_cda_debiased'
    os.makedirs(result_path, exist_ok=True)
    with open(os.path.join(result_path, 'stereoset-inter.pkl'), 'wb') as f:
        pickle.dump(results, f)
    print('-' * 25)

## Performance Evaluation

In [None]:
# Measure downstream performance of every CDA-debiased checkpoint and pickle
# both metrics under results/<flattened model id>_cda_debiased/performance.pkl.
for model_name in models_names:
    # From here on model_name is the flattened id ('/' replaced by '-').
    model_name = '-'.join(model_name.split('/'))
    debiased_tag = model_name + '_cda_debiased'
    model_path = os.path.join('./debiased_model', debiased_tag)

    # Both tokenizer and weights are loaded from the local debiased checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForMaskedLM.from_pretrained(model_path)
    perf = PerformanceEvaluator(model, tokenizer, device)

    print(' Computing performance metrics for: ', model_name, ' CDA DEBIASED')
    # Evaluate in the same order as the dict keys: masked-token first, then
    # text classification.
    mte_scores = perf.masked_token_evaluation()
    classification_scores = perf.text_classification_evaluation()
    res = {
        'mte': mte_scores,
        'text_classification': classification_scores,
    }

    result_path = 'results/' + debiased_tag
    print(res)
    os.makedirs(result_path, exist_ok=True)
    performance_file = os.path.join(result_path, 'performance.pkl')
    with open(performance_file, 'wb') as f:
        pickle.dump(res, f)

    print('-' * 25)