# pseudocode/what I need to write

- I load the dataset with erroneous labels
- process the dataset by applying a template and tokenizing it with target tokens being the labels
    - make sure padding/truncation and batching is done correctly, so target token is really the label
- feed these encodings into the pytorch Trainer somehow
- compute metrics:
    - f1, accuracy on test
    - f1, accuracy on erroneous subset of test
    - f1, accuracy on non-erroneous subset of test
- incorporate wandb

- convert this notebook into a python file so I can run more of these experiments more efficiently

In [109]:
from transformers import AutoTokenizer
import torch

model_name = "gpt2"
ds_name = "imdb"
template = "{}\n\nIs the above review positive or negative?\n\n"
verbalizers = ["negative", "positive"]

max_length = 1024
lr = 1e-3
num_epochs = 50
batch_size = 8

device = "cuda"


In [111]:
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers import default_data_collator

# load dataset
first, second = ds_name.split(":") if ":" in ds_name else (ds_name, None)
ds = load_dataset(first, second)
ds["train"] = ds["train"].shuffle()
ds["test"] = ds["test"].shuffle()

ds = DatasetDict({
    "train": ds["train"].select(range(1000)),
    "validation": ds["train"].select(range(1000, 1500)),
    "test": ds["test"].select(range(1000))
})

Found cached dataset imdb (/mnt/ssd-2/hf_cache/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 725.62it/s]
Running tokenizer on dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1304 > 1024). Running this sequence through the model will result in indexing errors


1000
1306


Running tokenizer on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]              

500
3110


Running tokenizer on dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]           

1000
1369


                                                                                          

In [127]:
# instantiate tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# templatize and tokenize examples
def tokenize_examples(examples):
    batch_size = len(examples["text"])
    print(batch_size)

    # apply template to each example
    texts = [template.format(text) for text in examples["text"]]
    targets = [verbalizers[label] for label in examples["label"]]
    
    # tokenize inputs and targets
    inputs = tokenizer(texts)
    labels = tokenizer(targets)

    # concatenate inputs and labels
    for i in range(batch_size):
        sample_input_ids = inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        # print(i, sample_input_ids, label_input_ids)
        # be careful that the correct whitespace is between the two parts
        inputs["input_ids"][i] = sample_input_ids + label_input_ids
        # when a label is -100, the corresponding loss is ignored
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        # 1 means attend to the token
        inputs["attention_mask"][i] = [1] * len(inputs["input_ids"][i])
    print(max([len(input_ids) for input_ids in inputs["input_ids"]]))

    # pad everything to max_length and convert to tensors
    for i in range(batch_size):
        sample_input_ids = inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
        inputs["input_ids"][i] = torch.tensor(inputs["input_ids"][i][:max_length])
        inputs["attention_mask"][i] = torch.tensor(inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
        
    inputs["labels"] = labels["input_ids"]
    return inputs

def tokenize_eval_examples(examples):
    # similar to tokenize_examples, but without the label

    batch_size = len(examples["text"])

    # apply template to each example
    inputs = [template.format(text) for text in examples["text"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs)
    
    # pad everything to max_length and convert to tensors
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
    
    out_dict = model_inputs
    out_dict["labels"] = torch.tensor(examples["label"])
    return out_dict

Using pad_token, but it is not set yet.


In [128]:

# train
train_encodings = ds.map(
    tokenize_examples,
    batched=True,
    num_proc=1,
    remove_columns=ds["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = train_encodings["train"]

train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)

# validation and test
eval_encodings = ds.map(
    tokenize_eval_examples,
    batched=True,
    num_proc=1,
    remove_columns=ds["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

eval_dataset = eval_encodings["validation"]
test_dataset = eval_encodings["test"]

eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
test_dataloader = DataLoader(test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
print(next(iter(eval_dataloader)))
print(next(iter(test_dataloader)))

Running tokenizer on dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1304 > 1024). Running this sequence through the model will result in indexing errors


1000
1306


Running tokenizer on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]              

500
3110


Running tokenizer on dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]           

1000
1369


                                                                                          

{'input_ids': tensor([[50256, 50256, 50256,  ...,  4633,    30,   628],
        [50256, 50256, 50256,  ...,  4633,    30,   628],
        [50256, 50256, 50256,  ...,  4633,    30,   628],
        ...,
        [50256, 50256, 50256,  ...,  4633,    30,   628],
        [50256, 50256, 50256,  ...,  4633,    30,   628],
        [50256, 50256, 50256,  ...,  4633,    30,   628]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]]), 'labels': tensor([0, 0, 0, 0, 1, 1, 1, 1])}
{'input_ids': tensor([[50256, 50256, 50256,  ...,  4633,    30,   628],
        [50256, 50256, 50256,  ...,  4633,    30,   628],
        [50256, 50256, 50256,  ...,  4633,    30,   628],
        ...,
        [50256, 50256, 50256,  ...,  4633,    30,   628],
        [50256, 50256, 50256,  ...,  4633,    30,   628],
        [50256, 50256, 50



In [112]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, PeftType

peft_config = LoraConfig(
    peft_type=PeftType.LORA, task_type=TaskType.CAUSAL_LM,
    inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

model = AutoModelForCausalLM.from_pretrained(model_name)
model = get_peft_model(model, peft_config)
model = model.to(device)
model.print_trainable_parameters()



trainable params: 294912 || all params: 124734720 || trainable%: 0.23643136409814364


In [158]:
# define metrics

def logits_to_text(logits):
    ids = torch.argmax(logits[:, -1, :], dim=-1)
    return ids_to_text(ids)

def ids_to_text(ids):
    return tokenizer.batch_decode(ids, skip_special_tokens=True)
    
from sklearn.metrics import accuracy_score, f1_score

In [164]:
model.eval()
preds = []
labels = []

for batch in tqdm(eval_dataloader):
    with torch.no_grad():
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"])
        logits = outputs.logits
        text_preds = logits_to_text(logits)

        ps = [p == verbalizers[1] for p in text_preds]
        labs = batch["labels"].tolist()

        preds.extend(ps)
        labels.extend(labs)
print(text_preds)
print(preds)
print(labels)
print(f1_score(labels, preds, average="micro"))
print(accuracy_score(labels, preds))


100%|██████████| 63/63 [00:15<00:00,  4.17it/s]

['positive', 'negative', 'negative', 'positive']
[False, False, False, False, True, False, True, True, True, False, True, False, False, True, True, True, False, False, True, True, True, False, False, False, True, False, True, False, False, True, False, False, False, False, False, True, False, True, True, False, True, False, False, True, True, False, True, True, True, True, False, True, True, False, False, False, False, False, True, False, True, False, False, True, False, True, True, True, True, False, True, True, False, False, False, False, True, True, True, True, True, False, True, True, False, True, True, True, False, True, False, False, False, False, True, True, True, False, False, True, True, False, False, True, False, False, False, False, True, True, True, False, False, False, True, True, True, False, True, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, True, True, True, False, True, Fa




In [165]:
print(text_preds)
print(preds)
print(labels)
print(f1_score(labels, preds, average="micro"))
print(accuracy_score(labels, preds))


['positive', 'negative', 'negative', 'positive']
[False, False, False, False, True, False, True, True, True, False, True, False, False, True, True, True, False, False, True, True, True, False, False, False, True, False, True, False, False, True, False, False, False, False, False, True, False, True, True, False, True, False, False, True, True, False, True, True, True, True, False, True, True, False, False, False, False, False, True, False, True, False, False, True, False, True, True, True, True, False, True, True, False, False, False, False, True, True, True, True, True, False, True, True, False, True, True, True, False, True, False, False, False, False, True, True, True, False, False, True, True, False, False, True, False, False, False, False, True, True, True, False, False, False, True, True, True, False, True, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, True, True, True, False, True, Fa

In [119]:
from tqdm import tqdm
from torch.optim import AdamW

weight_decay = 0.01
# only the LORA parameters should be updated
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=lr, weight_decay=weight_decay)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print("Epoch {} loss: {}".format(epoch, total_loss / len(train_dataloader)))
    # TODO: evaluate on validation set

100%|██████████| 125/125 [01:16<00:00,  1.64it/s]


Epoch 0 loss: 0.8394299149513245


100%|██████████| 125/125 [01:16<00:00,  1.64it/s]


Epoch 1 loss: 0.2956538796424866


100%|██████████| 125/125 [01:16<00:00,  1.64it/s]


Epoch 2 loss: 0.2336866855621338


100%|██████████| 125/125 [01:16<00:00,  1.64it/s]


Epoch 3 loss: 0.2190418392419815


100%|██████████| 125/125 [01:16<00:00,  1.64it/s]


Epoch 4 loss: 0.17433752119541168


100%|██████████| 125/125 [01:16<00:00,  1.64it/s]


Epoch 5 loss: 0.16745330393314362


 82%|████████▏ | 103/125 [01:02<00:13,  1.63it/s]

In [None]:
# TODO: save model

In [123]:
model.eval()
i = 89
# inputs = tokenizer(f'{text_column} : {dataset["test"][i]["Tweet text"]} Label : ', return_tensors="pt")
inputs = tokenizer(template.format(ds["test"][i]["text"]), return_tensors="pt").to(device)
print(ds["test"][i]["text"])
print(inputs)

with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


This film by Friðrik Þór, director of Children of Nature, is powerful. It has great music by Sigurrós and good acting. It shows how sad insanity as a disease can be. There are many good jokes but the humor is dark. If that is not a problem then you should see this film. Note though this is not a comedy but a drama.
{'input_ids': tensor([[ 1212,  2646,   416, 19480, 27214, 12602,  6184,   252, 10205,    81,
            11,  3437,   286,  8990,   286, 10362,    11,   318,  3665,    13,
           632,   468,  1049,  2647,   416, 21984,   333,    81, 10205,    82,
           290,   922,  7205,    13,   632,  2523,   703,  6507, 30949,   355,
           257,  4369,   460,   307,    13,  1318,   389,   867,   922, 14532,
           475,   262, 14733,   318,  3223,    13,  1002,   326,   318,   407,
           257,  1917,   788,   345,   815,   766,   428,  2646,    13,  5740,
           996,   428,   318,   407,   257, 10997,   475,   257, 10512,    13,
           198,   198,  3792,   262, 

1024

In [52]:
len(processed_datasets["train"][0]["labels"])

1024

In [53]:
processed_datasets["train"][0]["input_ids"]

[50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,


In [56]:
processed_datasets["train"][0]["labels"][-100:]

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 24561,
 50256]

In [62]:
tokenizer.decode(processed_datasets["train"][0]["input_ids"][-400:])

'<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext

In [63]:
processed_datasets["train"][0]["attention_mask"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
