In [1]:
# Finetuning the toxic and nontoxic language models
import pandas as pd
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments, AdamW, EarlyStoppingCallback
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch import nn
import argparse
import random
from IPython import embed
from utils import *
from training.infilling import text_infill


  from .autonotebook import tqdm as notebook_tqdm


In [30]:
from transformers import RobertaTokenizer, RobertaForMaskedLM

In [31]:
# DEBUG Purpose
# parser = argparse.ArgumentParser()
# args = parser.parse_args()
class MyDict:
    def __init__(self, data):
        self.data = data
        for key, value in data.items():
            setattr(self, key, value)

args = MyDict({
    "tok_type": "distilroberta-base",
    "model_type": "distilroberta-base",
    "train_data": "dataset/train/train_toxic.csv",
    "val_data": "dataset/train/val_toxic.csv",
    "model_dir": "models/toxic",
    "max_source_length": 180,
    "max_target_length": 230,
    "train_batch_size": 32,
    "eval_batch_size": 128,
    "max_steps": 50000,
    "lr": 1e-6,
    "logging_steps": 500,
    "seed": 0,
    "save_total_limit": 2,
    "save_steps": 500,
    "data_type": "jigsaw_full_30",
    "logging_dir": "logs",
    "early_stopping_steps": 5,
    "load_old": None

})

In [32]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if not torch.cuda.is_available():
    print("No GPUs found!")
else:
    print("Found", str(torch.cuda.device_count()), "GPUS!")

seed_everything(args.seed)

# Load in the tokenizer
tokenizer = RobertaTokenizer.from_pretrained(args.tok_type)

mask = tokenizer.mask_token

No GPUs found!


In [33]:
if not os.path.exists(args.model_dir):
    print(args.model_dir)
    os.mkdir(args.model_dir)

output_dir = args.model_dir + "/" + args.model_type.split("/")[-1] + "_" + str(args.lr) + "_" + \
str(args.seed) + "_" + str(args.train_batch_size * torch.cuda.device_count()) + "_" + args.data_type
print(output_dir)

models/toxic/distilroberta-base_1e-06_0_0_jigsaw_full_30


In [34]:
# Logic to continue training - look at previous models saved
try:
    prev_models = os.listdir(output_dir)
    # Alpha sort
    prev_models.sort()
    # Len sort
    prev_models.sort(key=len)
except:
    prev_models = []

In [35]:
# Logic to continue training if we want to load the old model - load pretrained model
if args.load_old and len(prev_models) > 0:
    model = BartForConditionalGeneration.from_pretrained(os.path.join(output_dir, prev_models[-1]), forced_bos_token_id = tokenizer.bos_token_id).to(device)
else:
    # Otherwise train a new model
    model = RobertaForMaskedLM.from_pretrained(args.model_type, forced_bos_token_id = tokenizer.bos_token_id).to(device)

In [None]:
RobertaForMaskedLM

In [36]:
train_texts = []
val_texts = []

# Read/process the data based on which dataset we're using: Jigsaw or Dynabench
# If you want to load your own data, put the data loading logic here
if "jigsaw" in args.data_type:
    train = pd.read_csv(args.train_data)
    val = pd.read_csv(args.val_data)

    train_texts =  train["comment_text"].tolist()
    val_texts = val["comment_text"].tolist()

print(len(train_texts), len(val_texts))

# Reducing dataset for debugging 
train_texts = [value for i, value in enumerate(train_texts) if i < 5]
val_texts = [value for i, value in enumerate(val_texts) if i < 5]

115216 29118


In [37]:
# Tokenize everything
tokenized_labs_train = tokenizer.batch_encode_plus(
        train_texts, 
        max_length = args.max_target_length, 
        padding="max_length", 
        truncation=True, 
        return_tensors = "pt").input_ids

In [38]:
tokenized_labs_val = tokenizer.batch_encode_plus(
    val_texts,
    max_length = args.max_target_length, 
    padding="max_length", 
    truncation=True, 
    return_tensors = "pt").input_ids

In [39]:
tokenized_labs_val[tokenized_labs_val == tokenizer.pad_token_id] = -100
tokenized_labs_train[tokenized_labs_train == tokenizer.pad_token_id] = -100

In [40]:
tokenized_labs_val

tensor([[    0,  8346,     6,  ...,  -100,  -100,  -100],
        [    0,  1185,    32,  ...,  -100,  -100,  -100],
        [    0,   100,  1017,  ...,  -100,  -100,  -100],
        [    0,  2847,  6097,  ...,  -100,  -100,  -100],
        [    0, 44950, 46835,  ...,  -100,  -100,  -100]])

In [41]:
# Convert this so that examples are dynamically masked
class JigsawDataset(torch.utils.data.Dataset):
    def __init__(self, labels, rawdata):
        self.rawdata = rawdata
        self.labels = labels

    def __getitem__(self, idx):
        # Dynamically generate sources
        # item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # item['labels'] = cur_lab

        # Do detokenization to ensure that the tokenization matches up later
        item = tokenizer.encode_plus(
            detokenize(text_infill(self.rawdata[idx], tokenizer.mask_token)),
            max_length = args.max_source_length, 
            padding="max_length", 
            truncation=True, 
            return_tensors = "pt")
        
        # Necessary for squeezing purposes
        item = {key: torch.tensor(val[0]) for key, val in item.items()}
        item['labels'] = self.labels[idx]
        return item
        
    def __len__(self):
        return len(self.labels)

In [42]:
train_dataset = JigsawDataset(tokenized_labs_train, train_texts)
val_dataset = JigsawDataset(tokenized_labs_val, val_texts)

In [44]:
detokenize(text_infill(train_texts[0], tokenizer.mask_token))

item = tokenizer.encode_plus(
            detokenize(text_infill(train_texts[0], tokenizer.mask_token)),
            max_length = args.max_source_length, 
            padding="max_length", 
            truncation=True, 
            return_tensors = "pt")

In [45]:
class RobertaTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(
            input_ids = inputs.get("input_ids"), 
            attention_mask = inputs.get("attention_mask"),
            labels = inputs.get("labels")
            )
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        return (loss, outputs) if return_outputs else loss

    def evaluate(self, **kwargs):
        metrics = super().evaluate(**kwargs)
        # Example to see what the model outputs for mask token (see if it outputs something biased)
        inputs_masked = ["Asians are <mask>"]
        print("\n Generations: ", tokenizer.batch_decode(model.generate(tokenizer(inputs_masked, return_tensors = "pt").to('cuda').input_ids, do_sample=True, num_return_sequences = 5)),"\n")                                                                                                                                                                        
        return metrics

In [46]:
# Print out what the model looks like before we start training
print("Before training:")
inputs_masked = ["Asians are <mask>"]
print("\n Generations: ", tokenizer.batch_decode(model.generate(tokenizer(inputs_masked, return_tensors = "pt").to(device).input_ids, do_sample=True, num_return_sequences = 5)),"\n")   

Before training:


TypeError: The current model class (RobertaForMaskedLM) is not compatible with `.generate()`, as it doesn't have a language model head. Please use one of the following classes instead: {'RobertaForCausalLM'}

**After Training**

In [32]:
antiexpert_path = "CARC/anti-expert-model/bart-base_1e-06_0_32_jigsaw_full_30/checkpoint-40000"
expert_path = "CARC/expert-model/bart-base_2e-06_0_96_jigsaw_full_30/checkpoint-90000"

In [33]:
expert = BartForConditionalGeneration.from_pretrained(expert_path, forced_bos_token_id = tokenizer.bos_token_id).to(device)

In [28]:
antiexpert = BartForConditionalGeneration.from_pretrained(antiexpert_path, forced_bos_token_id = tokenizer.bos_token_id).to(device)

In [30]:
antiexpert.eval();

In [34]:
expert.eval();

In [31]:
# Print out what the model looks like before we start training
print("After training:")
inputs_masked = ["Asians are <mask>"]
print("\n Generations: ", tokenizer.batch_decode(antiexpert.generate(tokenizer(inputs_masked, return_tensors = "pt").to(device).input_ids, do_sample=True, num_return_sequences = 5)),"\n")   

After training:





 Generations:  ['</s><s>Asians are racist.</s>', '</s><s>Asians are stupid.</s>', '</s><s>Asians are stupid.</s>', '</s><s>Asians are racist.</s>', '</s><s>Asians are stupid.</s>'] 



In [35]:
# Print out what the model looks like before we start training
print("After training:")
inputs_masked = ["Asians are <mask>"]
print("\n Generations: ", tokenizer.batch_decode(expert.generate(tokenizer(inputs_masked, return_tensors = "pt").to(device).input_ids, do_sample=True, num_return_sequences = 5)),"\n")   

After training:





 Generations:  ['</s><s>Asians are not racist.</s>', '</s><s>Asians are welcome.</s><pad>', '</s><s>Asians are the problem.</s>', '</s><s>Asians are not.</s><pad>', '</s><s>Asians are a minority.</s>'] 



**Text Infill**

In [97]:
import nltk
import bisect

In [102]:
sentence = train_texts[2]
mask_token = tokenizer.mask_token
lam = 3

In [103]:
tokenized = np.array(nltk.tokenize.casual.casual_tokenize(sentence), dtype = "object")
masked_idcs = []

In [104]:
    # while (len(masked_idcs) / len(tokenized)) < thresh:
span_length = np.random.poisson(lam)

In [105]:
# while ((span_length > list_diffs(masked_idcs, len(tokenized))) or \
#     (span_length > max_span(len(masked_idcs), len(tokenized), thresh))):    
#     span_length = np.random.poisson(lam = lam)
    # print("Span length is too long, it is currently:", span_length)

print("tokenized is", tokenized)
print("masked idcs are", masked_idcs)
print("span length is", span_length)


tokenized is ['This' 'story' 'gets' 'more' 'ridiculous' 'by' 'the' 'hour' '!' 'And' ','
 'I' 'love' 'that' 'people' 'are' 'sending' 'these' 'guys' 'dildos' 'in'
 'the' 'mail' 'now' '.' 'But' '…' 'if' 'they' 'really' 'think' "there's"
 'a' 'happy' 'ending' 'in' 'this' 'for' 'any' 'of' 'them' ',' 'I' 'think'
 "they're" 'even' 'more' 'deluded' 'than' 'all' 'of' 'the' 'jokes' 'about'
 'them' 'assume' '.']
masked idcs are []
span length is 2


In [106]:
if span_length == 0:
    start_idx = np.random.randint(0, len(tokenized) + 1)
    while ((start_idx in masked_idcs) or (start_idx in (np.array(masked_idcs) + 1))):
        print("bad, start_idx is", start_idx)
        start_idx = np.random.randint(0, len(tokenized) + 1)
    
    # print("start idx is", start_idx)
    tokenized = np.insert(tokenized, start_idx, mask_token)
    bisect.insort(masked_idcs, start_idx)

else:
    while True:          
        start_idx = np.random.randint(0, len(tokenized) - span_length + 1)
        idcs = np.arange(start_idx, start_idx + span_length)
        
        for i in idcs:
            if i in masked_idcs or i in (np.array(masked_idcs) + 1):
                # print("bad i" , i)
                continue
        break
    
    for i in idcs:
        bisect.insort(masked_idcs, i)
        tokenized[i] = mask_token
            #print("idcs are", idcs)
    print("final mask ratio:",len(masked_idcs)/len(tokenized))

final mask ratio: 0.03508771929824561


# T5 Comparison

In [42]:
model_checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [53]:
tokenizer.pad_token_id

0

In [52]:
tokenizer(inputs_masked, return_tensors = "pt")

{'input_ids': tensor([[ 6578,     7,    33, 32099,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [55]:
# Print out what the model looks like before we start training
print("After training:")
inputs_masked = ["Asians are <mask>"]
print("\n Generations: ", tokenizer.batch_decode(model.generate(tokenizer(inputs_masked, return_tensors = "pt").to(device).input_ids, do_sample=True, num_return_sequences = 5)),"\n")   

After training:

 Generations:  ['<pad> Die Asien sind die <unk> maske></s><pad>', '<pad><extra_id_0> s in the west are <unk> mask>.</s>', '<pad><extra_id_0> s</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '<pad></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '<pad><extra_id_0> s have never lived in Canada.</s><pad><pad><pad><pad>'] 



# DistilRoberta Comparison

In [26]:
from transformers import RobertaTokenizer, RobertaForMaskedLM

# Assuming you've already loaded the model and tokenizer
model_name = "distilroberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForMaskedLM.from_pretrained(model_name)

In [27]:
sequence = f"Asians are {tokenizer.mask_token}" # "The world will end in <mask>"

input_seq = tokenizer.encode(sequence, return_tensors='pt') # tensor([[0, 133, 232, 40, 253, 11, 50264, 2]])
mask_token_index = torch.where(input_seq == tokenizer.mask_token_id)[1] # (tensor([0]), tensor([6])) - we only want the the 2nd dimension


In [28]:
token_logits = model(input_seq).logits
masked_token_logits = token_logits[0, mask_token_index, :]

top_5_tokens = torch.topk(masked_token_logits, 1, dim=1).indices[0].tolist()

In [29]:
for token in top_5_tokens:
    print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))

Asians are  dying


# Test with Models from Hugging Face

In [11]:
torch.load("anti-expert-args.bin")

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=1000,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=,
fsdp_config=None,
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_ev

In [12]:
torch.load("expert-args.bin")

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=5000,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=,
fsdp_config=None,
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_ev