In [1]:
from transformers import BloomForCausalLM, BloomTokenizerFast
from methods import *
import pandas as pd
import torch  # used directly for device selection; do not rely on the star import to provide it

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


## CrowS-Pairs dataset

In [3]:
# Read the CrowS-Pairs sentence file. It is loaded without a header row, so
# the columns carry integer labels.
df = pd.read_csv('./Data/SENTENCE_crows.csv', header=None)

# Keep the rows whose column 4 equals 'gender', restricted to columns 1-3.
filtered_df = df.loc[df[4] == 'gender', [1, 2, 3]]

# 262 sentences
# Column 1 feeds `stereo` and column 2 feeds `normal`; both lists share row order.
stereo = filtered_df[1].tolist()
normal = filtered_df[2].tolist()

In [4]:
# First half of each CrowS-Pairs list (floor division, so an odd trailing
# element is dropped). Both lists are cut with the same rule.
stereo_half, normal_half = (
    sentences[:len(sentences) // 2] for sentences in (stereo, normal)
)

## Equity Evaluation Corpus (EEC) dataset

In [5]:
# Load the Equity Evaluation Corpus, drop the ID column, and keep only the
# rows whose Race is 'European'.
d = pd.read_csv('./Data/SENTENCE_Equity-Evaluation-Corpus.csv').drop(columns=['ID'])
d = d.loc[d['Race'] == 'European']

# Template 1 ("The conversation with ... was ..."), anger rows only.
df1 = d.loc[
    (d['Template'] == 'The conversation with <person object> was <emotional situation word>.')
    & (d['Emotion'] == 'anger')
]
male_rows1 = df1.loc[df1['Gender'] == 'male']
female_rows1 = df1.loc[df1['Gender'] == 'female']
male_sentence_list1 = male_rows1['Sentence'].tolist()
female_sentence_list1 = female_rows1['Sentence'].tolist()

# Template 2 ("... made me feel ..."), anger rows only.
df2 = d.loc[
    (d['Template'] == '<person subject> made me feel <emotion word>.')
    & (d['Emotion'] == 'anger')
]
male_rows2 = df2.loc[df2['Gender'] == 'male']
female_rows2 = df2.loc[df2['Gender'] == 'female']
male_sentence_list2 = male_rows2['Sentence'].tolist()
female_sentence_list2 = female_rows2['Sentence'].tolist()

# Concatenate per gender, template 1 sentences first and template 2 second.
# NOTE(review): fem[i] and masc[i] are later scored as a pair, which assumes the
# CSV orders female and male rows so that matching positions correspond — confirm.
fem = [*female_sentence_list1, *female_sentence_list2]
masc = [*male_sentence_list1, *male_sentence_list2]

In [6]:
# Take the first half (floor division) of each template's sentence list and
# concatenate the halves in template order, separately for each gender.
fem_half = [
    sentence
    for group in (female_sentence_list1, female_sentence_list2)
    for sentence in group[:len(group) // 2]
]
masc_half = [
    sentence
    for group in (male_sentence_list1, male_sentence_list2)
    for sentence in group[:len(group) // 2]
]

## BLOOM-560M

In [7]:
# Load the BLOOM-560M tokenizer and weights from the Hugging Face hub id below.
model_name = "bigscience/bloom-560m"
tokenizer = BloomTokenizerFast.from_pretrained(model_name)

# Move the model to the selected device and switch it to evaluation mode.
# Both calls return the module itself, so they can be chained.
model = BloomForCausalLM.from_pretrained(model_name).to(device).eval()

# Emit an empty line as the cell's only output.
print('')




In [8]:
# Score the full CrowS-Pairs gender pairs with the currently loaded model.
# NOTE(review): the literal 16 is presumably a batch size — confirm against SPBM in `methods`.
res = SPBM(stereo, normal, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.5648854961832062
The model finds the stereotypical sentence more likely in 56.49% of the sentence pairs!


In [10]:
# Score only the first half of the CrowS-Pairs gender pairs.
# NOTE(review): the literal 16 is presumably a batch size — confirm against SPBM in `methods`.
res = SPBM(stereo_half, normal_half, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.5190839694656488
The model finds the stereotypical sentence more likely in 51.91% of the sentence pairs!


In [9]:
# Score the full Equity Evaluation Corpus pairs (female list first, male list second).
# NOTE(review): the message below says "stereotypical", but the first argument here is
# the female sentence list — confirm what SPBM's returned fraction means for these pairs.
res = SPBM(fem, masc, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.74
The model finds the stereotypical sentence more likely in 74.0% of the sentence pairs!


In [11]:
# Score the half-sized Equity Evaluation Corpus pairs (female list first, male list second).
# NOTE(review): the message below says "stereotypical", but the first argument here is
# the female sentence list — confirm what SPBM's returned fraction means for these pairs.
res = SPBM(fem_half, masc_half, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.7
The model finds the stereotypical sentence more likely in 70.0% of the sentence pairs!


## BLOOM-1B1

In [12]:
# Load the BLOOM-1B1 tokenizer and weights; this replaces the previously
# loaded `model` / `tokenizer` used by the evaluation cells that follow.
model_name = "bigscience/bloom-1b1"
tokenizer = BloomTokenizerFast.from_pretrained(model_name)
model = BloomForCausalLM.from_pretrained(model_name)
model.to(device)
# Put the model in evaluation mode explicitly, as the BLOOM-560M and debiased
# loading cells in this notebook do, so every model is scored under the same
# stated conditions.
model.eval()
# Emit an empty line as the cell's only output.
print('')




In [13]:
# Score the full CrowS-Pairs gender pairs with the currently loaded model.
# NOTE(review): the literal 16 is presumably a batch size — confirm against SPBM in `methods`.
res = SPBM(stereo, normal, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.5648854961832062
The model finds the stereotypical sentence more likely in 56.49% of the sentence pairs!


In [14]:
# Score only the first half of the CrowS-Pairs gender pairs.
# NOTE(review): the literal 16 is presumably a batch size — confirm against SPBM in `methods`.
res = SPBM(stereo_half, normal_half, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.5419847328244275
The model finds the stereotypical sentence more likely in 54.2% of the sentence pairs!


In [15]:
# Score the full Equity Evaluation Corpus pairs (female list first, male list second).
# NOTE(review): the message below says "stereotypical", but the first argument here is
# the female sentence list — confirm what SPBM's returned fraction means for these pairs.
res = SPBM(fem, masc, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.75
The model finds the stereotypical sentence more likely in 75.0% of the sentence pairs!


In [16]:
# Score the half-sized Equity Evaluation Corpus pairs (female list first, male list second).
# NOTE(review): the message below says "stereotypical", but the first argument here is
# the female sentence list — confirm what SPBM's returned fraction means for these pairs.
res = SPBM(fem_half, masc_half, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.8
The model finds the stereotypical sentence more likely in 80.0% of the sentence pairs!


## BLOOM-1B7

In [17]:
# Load the BLOOM-1B7 tokenizer and weights; this replaces the previously
# loaded `model` / `tokenizer` used by the evaluation cells that follow.
model_name = "bigscience/bloom-1b7"
tokenizer = BloomTokenizerFast.from_pretrained(model_name)
model = BloomForCausalLM.from_pretrained(model_name)
model.to(device)
# Put the model in evaluation mode explicitly, as the BLOOM-560M and debiased
# loading cells in this notebook do, so every model is scored under the same
# stated conditions.
model.eval()
# Emit an empty line as the cell's only output.
print('')




In [18]:
# Score the full CrowS-Pairs gender pairs with the currently loaded model.
# NOTE(review): the literal 16 is presumably a batch size — confirm against SPBM in `methods`.
res = SPBM(stereo, normal, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.6412213740458015
The model finds the stereotypical sentence more likely in 64.12% of the sentence pairs!


In [19]:
# Score only the first half of the CrowS-Pairs gender pairs.
# NOTE(review): the literal 16 is presumably a batch size — confirm against SPBM in `methods`.
res = SPBM(stereo_half, normal_half, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.5877862595419847
The model finds the stereotypical sentence more likely in 58.78% of the sentence pairs!


In [20]:
# Score the full Equity Evaluation Corpus pairs (female list first, male list second).
# NOTE(review): the message below says "stereotypical", but the first argument here is
# the female sentence list — confirm what SPBM's returned fraction means for these pairs.
res = SPBM(fem, masc, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.66
The model finds the stereotypical sentence more likely in 66.0% of the sentence pairs!


In [21]:
# Score the half-sized Equity Evaluation Corpus pairs (female list first, male list second).
# NOTE(review): the message below says "stereotypical", but the first argument here is
# the female sentence list — confirm what SPBM's returned fraction means for these pairs.
res = SPBM(fem_half, masc_half, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.7
The model finds the stereotypical sentence more likely in 70.0% of the sentence pairs!


## BLOOM-3B

In [22]:
# Load the BLOOM-3B tokenizer and weights; this replaces the previously
# loaded `model` / `tokenizer` used by the evaluation cells that follow.
model_name = "bigscience/bloom-3b"
tokenizer = BloomTokenizerFast.from_pretrained(model_name)
model = BloomForCausalLM.from_pretrained(model_name)
model.to(device)
# Put the model in evaluation mode explicitly, as the BLOOM-560M and debiased
# loading cells in this notebook do, so every model is scored under the same
# stated conditions.
model.eval()
# Emit an empty line as the cell's only output.
print('')




In [23]:
# Score the full CrowS-Pairs gender pairs with the currently loaded model.
# NOTE(review): the literal 16 is presumably a batch size — confirm against SPBM in `methods`.
res = SPBM(stereo, normal, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.6183206106870229
The model finds the stereotypical sentence more likely in 61.83% of the sentence pairs!


In [24]:
# Score only the first half of the CrowS-Pairs gender pairs.
# NOTE(review): the literal 16 is presumably a batch size — confirm against SPBM in `methods`.
res = SPBM(stereo_half, normal_half, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.5572519083969466
The model finds the stereotypical sentence more likely in 55.73% of the sentence pairs!


In [25]:
# Score the full Equity Evaluation Corpus pairs (female list first, male list second).
# NOTE(review): the message below says "stereotypical", but the first argument here is
# the female sentence list — confirm what SPBM's returned fraction means for these pairs.
res = SPBM(fem, masc, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.72
The model finds the stereotypical sentence more likely in 72.0% of the sentence pairs!


In [26]:
# Score the half-sized Equity Evaluation Corpus pairs (female list first, male list second).
# NOTE(review): the message below says "stereotypical", but the first argument here is
# the female sentence list — confirm what SPBM's returned fraction means for these pairs.
res = SPBM(fem_half, masc_half, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.74
The model finds the stereotypical sentence more likely in 74.0% of the sentence pairs!


## DEBIASED BLOOM-560M

In [27]:
# Load the fine-tuned ("debiased") checkpoint from a local directory. The
# tokenizer is still taken from the stock BLOOM-560M hub id.
model_path = '../Models/G_50_16_v3'
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")

# Move the model to the selected device and switch it to evaluation mode.
# Both calls return the module itself, so they can be chained.
model = BloomForCausalLM.from_pretrained(model_path).to(device).eval()

# Emit an empty line as the cell's only output.
print('')




In [28]:
# Score the full CrowS-Pairs gender pairs with the currently loaded model.
# NOTE(review): the literal 16 is presumably a batch size — confirm against SPBM in `methods`.
res = SPBM(stereo, normal, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.5648854961832062
The model finds the stereotypical sentence more likely in 56.49% of the sentence pairs!


In [29]:
# Score only the first half of the CrowS-Pairs gender pairs.
# NOTE(review): the literal 16 is presumably a batch size — confirm against SPBM in `methods`.
res = SPBM(stereo_half, normal_half, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.5801526717557252
The model finds the stereotypical sentence more likely in 58.02% of the sentence pairs!


In [30]:
# Score the full Equity Evaluation Corpus pairs (female list first, male list second).
# NOTE(review): the message below says "stereotypical", but the first argument here is
# the female sentence list — confirm what SPBM's returned fraction means for these pairs.
res = SPBM(fem, masc, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.7
The model finds the stereotypical sentence more likely in 70.0% of the sentence pairs!


In [31]:
# Score the half-sized Equity Evaluation Corpus pairs (female list first, male list second).
# NOTE(review): the message below says "stereotypical", but the first argument here is
# the female sentence list — confirm what SPBM's returned fraction means for these pairs.
res = SPBM(fem_half, masc_half, 16, model, tokenizer, device)

# Raw fraction on the first line, human-readable percentage on the second.
print(
    res,
    f'The model finds the stereotypical sentence more likely in {round(res*100, 2)}% of the sentence pairs!',
    sep='\n',
)

0.7
The model finds the stereotypical sentence more likely in 70.0% of the sentence pairs!
