In [1]:
from datasets import load_dataset
import numpy as np
import torch
import pprint
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device}")

Using cpu


In [2]:
# Load ARC-Challenge and keep only the questions that list exactly four answer labels.
dataset = load_dataset("ai2_arc", "ARC-Challenge").filter(
    lambda example: len(example["choices"]["label"]) == 4
)

In [3]:
from transformers import AutoTokenizer
# Checkpoint whose vocabulary is used to tokenize both prompts and ground-truth sentences.
TOKENIZER_CHECKPOINT = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize_function(examples):
    """Build one prompt and one ground-truth sentence per question and tokenize both.

    Each prompt is the question followed by " Choose one from the following: "
    and the choices rendered as "<label>. <text>" joined by "; ". Any number of
    choices is supported. Each ground-truth sentence is
    "The correct answer is: <answerKey>. <text of that choice>".

    Args:
        examples: A batch in column format (dict of lists). Reads the "id",
            "question", "choices" (each entry holding parallel "label" and
            "text" lists) and "answerKey" columns.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of prompts,
        with an added "labels" entry holding the "input_ids" of the tokenized
        ground-truth sentences.

    Raises:
        ValueError: If a question's answerKey is not one of its choice labels.
    """
    prompts = []
    gt_sentences = []
    for i in range(len(examples["id"])):
        question = examples["question"][i]
        choice_labels = examples["choices"][i]["label"]
        choice_texts = examples["choices"][i]["text"]

        # Render every available choice instead of assuming exactly four.
        options = "; ".join(
            "{}. {}".format(label, text)
            for label, text in zip(choice_labels, choice_texts)
        )
        prompts.append(question + " Choose one from the following: " + options)

        # Look the answer text up by label; labels are not guaranteed to be positional.
        answer_key = examples["answerKey"][i]
        answer_text = choice_texts[choice_labels.index(answer_key)]
        gt_sentences.append("The correct answer is: {}. {}".format(answer_key, answer_text))

    # Tokenize prompts first, then attach the tokenized targets as labels.
    tokenized_examples = tokenizer(prompts)
    tokenized_examples["labels"] = tokenizer(gt_sentences)["input_ids"]
    return tokenized_examples

def show_one(example):
    '''Print one example: the question, each labelled choice, and the answer key.

    The choices are printed with the labels stored in the example itself, so the
    "Ground truth" line always refers to a label that appears in the list above
    it, and any number of choices is supported.

    Args:
        example: A single row (dict) with "question", "choices" (holding
            parallel "label" and "text" lists) and "answerKey".
    '''
    print(f"Question: {example['question']}")
    choices = example['choices']
    for label, text in zip(choices['label'], choices['text']):
        print(f"  {label} - {text}")
    print(f"Ground truth: {example['answerKey']}")


In [6]:
# Tokenize the first three training questions as a quick check of the preprocessing.
examples = dataset["train"][:3]
features = tokenize_function(examples)
print(features.keys())

# Show the first question in its raw, human-readable form.
print('= = = Intuitive Demo = = =')
show_one(dataset["train"][0])

# Decode the token ids back to text to see exactly what was encoded.
print('\n= = = Processed Format = = =')
for prompt_ids, answer_ids in zip(features["input_ids"][:3], features["labels"][:3]):
    print("-->Prompt:", tokenizer.decode(prompt_ids), sep="\n")
    print("-->Ground truth answer:", tokenizer.decode(answer_ids), sep="\n")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
= = = Intuitive Demo = = =
Question: George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?
  A - dry palms
  B - wet palms
  C - palms covered with oil
  D - palms covered with lotion
Ground truth: A

= = = Processed Format = = =
-->Prompt:
[CLS] George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat? Choose one from the following : A. dry palms ; B. wet palms ; C. palms covered with oil ; D. palms covered with lotion [SEP]
-->Ground truth answer:
[CLS] The correct answer is : A. dry palms [SEP]
-->Prompt:
[CLS] Which of the following statements best explains why magnets usually stick to a refrigerator door? Choose one from the following : A. The refrigerator door is smooth. ; B. The refrigerator door contains iron. ; C. The refrigerator door is a good conductor. ; D. The refrigerator door has electric wires in it. [SEP]


In [7]:
# Tokenize every split in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Drop every column that is not listed in accepted_keys, in a single call.
accepted_keys = ["input_ids", "attention_mask", "labels"]
unwanted_columns = [
    name
    for name in tokenized_datasets['train'].features.keys()
    if name not in accepted_keys
]
tokenized_datasets = tokenized_datasets.remove_columns(unwanted_columns)

# Have the remaining columns returned as torch tensors.
tokenized_datasets.set_format("torch")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 1117/1117 [00:00<00:00, 5022.91 examples/s]
Map: 100%|██████████| 1165/1165 [00:00<00:00, 3619.34 examples/s]
Map: 100%|██████████| 295/295 [00:00<00:00, 3512.03 examples/s]


In [None]:
from transformers import AutoModelForMaskedLM, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader

# The model has to exist before the collator is built, because it is passed to the
# collator below; defining it afterwards makes this cell fail on a fresh kernel.
# NOTE(review): the architecture is an assumption. A BERT masked-LM head matches the
# "bert-base-cased" tokenizer and accepts labels shaped like input_ids, which is what
# the training loop feeds it. Confirm this is the intended model before relying on it.
model = AutoModelForMaskedLM.from_pretrained("bert-base-cased").to(device)

collator = DataCollatorForSeq2Seq(tokenizer, model, padding="max_length")

batch_size = 2

# Train on a fixed, seeded subset of at most 1000 shuffled examples; the min() keeps
# the cell working if the filtered split ever holds fewer rows than that.
train_split = tokenized_datasets["train"].shuffle(seed=123)
train_dataset = train_split.select(range(min(1000, len(train_split))))
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, collate_fn=collator)

# Validation and test splits are used in full and in their stored order.
eval_dataset = tokenized_datasets["validation"]
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=collator)
test_dataset = tokenized_datasets["test"]
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collator)

In [None]:
## Training Loop
from torch.optim import AdamW
from tqdm import tqdm

# Put the weights on the same device the batches are moved to below, and do it
# before the optimizer captures the parameters.
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 1
model.train()
for epoch in tqdm(range(num_epochs)):
    # Plain Python float: summing the loss tensors themselves would keep every
    # step's autograd graph alive for the whole epoch.
    epoch_loss = 0.0
    for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Shape sanity check, printed once instead of on every batch.
        if epoch == 0 and step == 0:
            print(f"Input IDs shape: {input_ids.shape}")
            print(f"Attention mask shape: {attention_mask.shape}")

        # Reconstruction-style objective: the targets are the inputs themselves.
        labels = input_ids.clone()
        # Exclude padding from the loss. -100 is the ignore index used by Hugging Face
        # model losses. NOTE(review): assumes `model` is a Hugging Face model; confirm.
        labels[attention_mask == 0] = -100

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"epoch {epoch+1} loss", epoch_loss)