In [1]:
!pip install datasets -q
!pip install transformers -q
!pip install accelerate -U -q
!pip install torch torchvision torchaudio -q

In [2]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForMultipleChoice, TrainingArguments, Trainer
from datetime import datetime

project = "sciq"
dataset = load_dataset(project)
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
})

In [3]:
# Number of examples in the dataset
num_examples = len(dataset["train"])

# Print the dataset size
print(f"Number of examples in the 'sciq' dataset: {num_examples}")

# Display some sample data
sample_data = dataset["train"][0]  # You can choose any index
print("Sample Data:")
print(sample_data)


Number of examples in the 'sciq' dataset: 11679
Sample Data:
{'question': 'What type of organism is commonly used in preparation of foods such as cheese and yogurt?', 'distractor3': 'viruses', 'distractor1': 'protozoa', 'distractor2': 'gymnosperms', 'correct_answer': 'mesophilic organisms', 'support': 'Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.'}


In [4]:
base_model_name = "bert-large-uncased"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForMultipleChoice.from_pretrained(base_model_name)

def preprocess(example):
    # Combine question with each distractor and correct answer
    choices = [example['distractor1'], example['distractor2'], example['distractor3'], example['correct_answer']]
    combined_texts = [example['question'] + " " + choice for choice in choices]
    
    # Encode combined texts
    encoded_choices = tokenizer(combined_texts, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    
    return {
        'input_ids': encoded_choices['input_ids'],
        'attention_mask': encoded_choices['attention_mask'],
        'label': torch.tensor(3)  # 3 because the correct answer is always the last option in our format
    }

import random

def preprocess(example):
    choices = [example['distractor1'], example['distractor2'], example['distractor3'], example['correct_answer']]
    
    # Shuffle choices with their indices
    zipped_choices = list(zip(choices, range(4)))  # zip with original indices
    random.shuffle(zipped_choices)
    shuffled_choices, original_indices = zip(*zipped_choices)
    
    # Determine label after shuffling
    label = original_indices.index(3)  # Find the new position of the correct answer
    
    combined_texts = [example['question'] + " " + choice for choice in shuffled_choices]
    
    # Encode combined texts
    encoded_choices = tokenizer(combined_texts, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    
    return {
        'input_ids': encoded_choices['input_ids'],
        'attention_mask': encoded_choices['attention_mask'],
        'label': torch.tensor(label)
    }


train_data = dataset["train"]
validation_data = dataset["validation"]

encoded_train_data = train_data.map(preprocess, remove_columns=train_data.column_names)
encoded_validation_data = validation_data.map(preprocess, remove_columns=validation_data.column_names)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
preprocess(sample_data)

{'input_ids': tensor([[  101,  2054,  2828,  1997, 15923,  2003,  4141,  2109,  1999,  7547,
           1997,  9440,  2107,  2004,  8808,  1998, 10930, 27390,  2102,  1029,
          15053,  6844,  2050,   102,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

In [6]:
train_data, encoded_train_data

(Dataset({
     features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
     num_rows: 11679
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'label'],
     num_rows: 11679
 }))

In [7]:
!pip install -q wandb -U

import wandb, os
wandb.login()

wandb_project = "sciq-finetune"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mabhijoy-sar[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

In [9]:
run_name = base_model_name + "-" + wandb_project

In [10]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='steps',
    eval_steps=100,
    logging_steps=50,
    save_strategy='epoch',
    learning_rate=3e-5,  # Slightly increased learning rate
    per_device_train_batch_size=16,  # Increased train batch size
    per_device_eval_batch_size=16,   # Increased eval batch size
    gradient_accumulation_steps=2,  # Use gradient accumulation
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs',
    do_eval=True,
    report_to="wandb",
    run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    # If you choose to include a learning rate scheduler:
    lr_scheduler_type="linear",
    warmup_ratio=0.1  # Warm up over 10% of total steps
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_data,
    eval_dataset=encoded_validation_data
)

In [12]:
trainer.train()

Step,Training Loss,Validation Loss
100,1.3325,1.278223
200,1.1535,1.033543
300,0.9988,0.890164
400,0.8422,0.826168
500,0.7077,0.805956
600,0.6922,0.837888
700,0.6715,0.762519
800,0.369,0.870426
900,0.3457,0.843378
1000,0.3987,0.822918


TrainOutput(global_step=1095, training_loss=0.7403869498265933, metrics={'train_runtime': 2332.4925, 'train_samples_per_second': 15.021, 'train_steps_per_second': 0.469, 'total_flos': 3.2651968852417536e+16, 'train_loss': 0.7403869498265933, 'epoch': 3.0})

In [13]:
test_data = dataset["test"]
encoded_test_data = test_data.map(preprocess, remove_columns=test_data.column_names)

In [14]:
trainer.eval_dataset = encoded_test_data
results = trainer.evaluate()

In [15]:
print(results)

{'eval_loss': 0.8196806311607361, 'eval_runtime': 20.1356, 'eval_samples_per_second': 49.663, 'eval_steps_per_second': 3.129, 'epoch': 3.0}


In [16]:
predictions = trainer.predict(encoded_test_data)
predictions

PredictionOutput(predictions=array([[-1.4230529 , -6.4143987 , -6.7155604 ,  0.8424742 ],
       [-5.562239  , -7.2469797 , -3.7093894 ,  4.5314064 ],
       [-0.51601464, -1.2431774 , -6.3135324 ,  4.4964776 ],
       ...,
       [-6.3059225 , -4.8675466 ,  2.5583303 ,  5.1088743 ],
       [-7.419259  ,  3.008205  , -7.409169  ,  3.1992912 ],
       [-3.996625  ,  4.8065557 ,  3.4317608 ,  3.787773  ]],
      dtype=float32), label_ids=array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 

In [20]:
from collections import Counter

prediction_counts = Counter(predicted_labels)
print(prediction_counts)

Counter({3: 705, 1: 105, 2: 98, 0: 92})


In [17]:
predicted_labels = predictions.predictions.argmax(axis=1)

In [18]:
import wandb

wandb.init()

# Create a table
table = wandb.Table(columns=["Question", "Possible Answers", "Correct Answer", "Predicted Answer"])

for idx, example in enumerate(test_data):
    question = example["question"]
    possible_answers = [example["distractor1"], example["distractor2"], example["distractor3"], example["correct_answer"]]
    correct_answer = example["correct_answer"]
    predicted_answer = possible_answers[predicted_labels[idx]]
    
    table.add_data(question, possible_answers, correct_answer, predicted_answer)

# Log the table
wandb.log({"questions_predictions": table})

wandb.finish()


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▅▃▂▂▂▁▂▂▂▂
eval/runtime,▂▄▁▂▁▁▂▁▂▃█
eval/samples_per_second,▇▅█▇██▇█▇▆▁
eval/steps_per_second,▇▆█▇██▇█▇▆▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train/learning_rate,▄███▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▁▁
train/loss,██▇▆▆▅▅▄▃▃▄▃▃▃▂▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.81968
eval/runtime,20.1356
eval/samples_per_second,49.663
eval/steps_per_second,3.129
train/epoch,3.0
train/global_step,1095.0
train/learning_rate,0.0
train/loss,0.3518
train/total_flos,3.2651968852417536e+16
train/train_loss,0.74039


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112459988887874, max=1.0…

VBox(children=(Label(value='0.327 MB of 0.327 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [21]:
import random

def shuffle_choices(example):
    choices = [example['distractor1'], example['distractor2'], example['distractor3'], example['correct_answer']]
    correct_idx = 3
    zipped_choices = list(zip(choices, range(4)))  # zip with original indices
    random.shuffle(zipped_choices)
    shuffled_choices, original_indices = zip(*zipped_choices)
    
    combined_texts = [example['question'] + " " + choice for choice in shuffled_choices]
    encoded_choices = tokenizer(combined_texts, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    
    return {
        'input_ids': encoded_choices['input_ids'],
        'attention_mask': encoded_choices['attention_mask'],
        'label': torch.tensor(original_indices.index(correct_idx))
    }

encoded_test_data_shuffled = test_data.map(shuffle_choices, remove_columns=test_data.column_names)
predictions_shuffled = trainer.predict(encoded_test_data_shuffled)
predicted_labels_shuffled = predictions_shuffled.predictions.argmax(axis=1)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [22]:
prediction_counts_shuffled = Counter(predicted_labels_shuffled)
print(prediction_counts_shuffled)


Counter({3: 269, 1: 260, 2: 237, 0: 234})


In [48]:
single_example = test_data[1]  # This picks the first example from the test dataset


In [49]:
# encoded_single_example = preprocess(single_example)
encoded_single_example = shuffle_choices(single_example)

In [50]:
from torch.utils.data import DataLoader

dataset_single = [encoded_single_example]
loader = DataLoader(dataset_single, batch_size=1)

In [53]:
model.eval()  # Set the model to evaluation mode

with torch.no_grad():
    for batch in loader:
        input_ids = batch["input_ids"].to(model.device)
        attention_mask = batch["attention_mask"].to(model.device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_label = logits.argmax(dim=1).item()


In [54]:
choices = [
    single_example["distractor1"],
    single_example["distractor2"],
    single_example["distractor3"],
    single_example["correct_answer"]
]

print("Question:", single_example["question"])
print("\nChoices:")
for idx, choice in enumerate(choices):
    print(f"{idx}. {choice}")

print("\nModel's Prediction:", choices[predicted_label])

correct = predicted_label == 3  # Since the correct answer is always at index 3 in your setup
print("\nWas the model correct?", correct)


Question: What term in biotechnology means a genetically exact copy of an organism?

Choices:
0. adult
1. male
2. phenotype
3. clone

Model's Prediction: phenotype

Was the model correct? False
