In [1]:
from transformers import BloomForCausalLM, BloomTokenizerFast
import torch
import json
import random
from sklearn.model_selection import train_test_split
import datasets
from itertools import islice
import evaluate
bleu = evaluate.load("bleu")

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [6]:
def list_difference(A, B):
    result = []
    for a, b in zip(A, B):
        if a.startswith(b):
            result.append(a[len(b):].strip())
        else:
            result.append(a)
    return result

def chunked_iterable(iterable, size):
    """Yields chunks of the given size from an iterable."""
    it = iter(iterable)
    while chunk := list(islice(it, size)):
        yield chunk

def generate_predictions(model, tokenizer, texts, max_length=40, batch_size=36):
    all_new_parts = []
    for batch in chunked_iterable(texts, batch_size):
        tokenized_texts = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=max_length).to(device)
        with torch.no_grad():
            generated_ids = model.generate(input_ids=tokenized_texts['input_ids'], attention_mask=tokenized_texts['attention_mask'], max_new_tokens=20)
        generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        new_parts = list_difference(generated_texts, batch)
        all_new_parts.extend(new_parts)
    return all_new_parts

In [7]:
with open('./Data/adv_train_data.json', 'r') as f:
        train_gender_data_list, n_data = json.load(f)

random.seed(8)
g_label_0 = [datapoint for datapoint in train_gender_data_list if datapoint[2] == 0]
g_label_1 = [datapoint for datapoint in train_gender_data_list if datapoint[2] == 1]
# shuffle such that random data is selected when only taking first 60 000
random.shuffle(g_label_0)
random.shuffle(g_label_1)
g_balanced_label_0 = g_label_0[:60000]
g_balanced_label_1 = g_label_1[:60000]
g_data = g_balanced_label_0 + g_balanced_label_1
_, both = train_test_split(g_data, test_size=0.2, random_state=8) # important: same random seed as in training!
_, g_test = train_test_split(both, test_size=0.5, random_state=8) 

_, n_both = train_test_split(n_data, test_size=0.2, random_state=8)
_, n_test = train_test_split(n_both, test_size=0.5, random_state=8) 

## GENERATION EVALUATION: BLOOM-560M

In [5]:
model_name = "bigscience/bloom-560m"
model = BloomForCausalLM.from_pretrained(model_name).to(device)
tokenizer = BloomTokenizerFast.from_pretrained(model_name)

generated_texts = generate_predictions(model, tokenizer, [a[0] for a in n_test] + [a[0] for a in g_test])
reference_texts = [[a[1]] for a in n_test] + [[a[1]] for a in g_test]

bleu_score = bleu.compute(predictions=generated_texts, references=reference_texts)
print(f"BLEU score: {bleu_score['bleu']}")

667it [06:00,  1.85it/s]


BLEU score: 0.011033502290115366


## GENERATION EVALUATION: DEBIASED BLOOM-560M

In [6]:
model_path = './Models/DB_BLOOM'
model = BloomForCausalLM.from_pretrained(model_path)
model.to(device)
model.eval()
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
print('')

generated_texts = generate_predictions(model, tokenizer, [a[0] for a in n_test] + [a[0] for a in g_test])
reference_texts = [[a[1]] for a in n_test] + [[a[1]] for a in g_test]

bleu_score = bleu.compute(predictions=generated_texts, references=reference_texts)
print(f"BLEU score: {bleu_score['bleu']}")




667it [05:58,  1.86it/s]


BLEU score: 0.01327239717026448
