In [None]:
!pip install torch datasets

In [None]:
!pip install accelerate -U

In [None]:

! pip install -U transformers

In [1]:
import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments, GPT2Config
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Load tokenizer and model
configuration = GPT2Config()
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2ForSequenceClassification.from_pretrained("gpt2-medium", num_labels=5)
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# # Load dataset
# dataset = load_dataset("math_qa", split="validation")  # use train split when finetuning

In [4]:
# Load the dataset and split it into training and validation sets
full_dataset = load_dataset("math_qa", split="validation") # change the split later
train_size = 0.9
train_dataset, eval_dataset = full_dataset.train_test_split(train_size=train_size).values()

In [5]:
# Tokenize and preprocess the dataset
def preprocess_data(examples):
    # inputs = [problem + " Options: " + options for problem, options in zip(examples['Problem'], examples['options'])]
    inputs = [problem + " Formula: " + formula + " Options: " + options
              for problem, formula, options in zip(examples['Problem'], examples['annotated_formula'], examples['options'])]
    labels = [ord(correct_option.lower()) - ord('a') for correct_option in examples['correct']]  # 'a' -> 0, 'b' -> 1, etc.
    return {'input_ids': tokenizer(inputs, truncation=True)['input_ids'], 'labels': labels}

# tokenized_dataset = dataset.map(preprocess_data, batched=True)
tokenized_train_dataset = train_dataset.map(preprocess_data, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_data, batched=True)
# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/4027 [00:00<?, ? examples/s]

Map:   0%|          | 0/448 [00:00<?, ? examples/s]

In [73]:
# # Tokenize and preprocess the dataset
# def preprocess_data(examples):
#     inputs = [problem + " Options: " + options for problem, options in zip(examples['Problem'], examples['options'])]
#     labels = [ord(correct_option.lower()) - ord('a') for correct_option in examples['correct']]
#     return {'input_ids': tokenizer(inputs, truncation=True)['input_ids'], 'labels': labels}
# tokenized_dataset = dataset.map(preprocess_data, batched=True)
# # Data collator
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=False  # Masked Language Model not used in sequence classification
# )

Map:   0%|          | 0/4475 [00:00<?, ? examples/s]

In [21]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",           # Output directory for model checkpoints
    num_train_epochs=3,               # Number of training epochs
    per_device_train_batch_size=2,   # Batch size for training
    per_device_eval_batch_size=2,    # Batch size for evaluation
    warmup_steps=500,                 # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # Weight decay rate
    logging_dir='./logs',             # Directory for storing logs
    evaluation_strategy="epoch",       # Evaluate each epoch
    learning_rate=5e-5,               # Learning rate
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
)

In [7]:
# check tokenized dataset
for i in range(5):
    sample = tokenized_train_dataset[i]
    print(f"Sample {i}:")
    print("Tokenized Input IDs:", sample['input_ids'])
    print("Length of Tokenized Input:", len(sample['input_ids']))
    print("Label:", sample['labels'])
    print("\n")
for i in range(5):
    sample = tokenized_eval_dataset[i]
    print(f"Sample {i}:")
    print("Tokenized Input IDs:", sample['input_ids'])
    print("Length of Tokenized Input:", len(sample['input_ids']))
    print("Label:", sample['labels'])
    print("\n")

Sample 0:
Tokenized Input IDs: [1169, 1720, 286, 734, 25175, 3146, 318, 8699, 2624, 764, 543, 318, 262, 4833, 286, 262, 734, 3146, 5633, 19639, 25, 19862, 17034, 7, 3695, 2624, 8, 18634, 25, 257, 1267, 8699, 837, 275, 1267, 8257, 837, 269, 1267, 9193, 837, 288, 1267, 10111, 837, 304, 1267, 5214]
Length of Tokenized Input: 48
Label: 2


Sample 1:
Tokenized Input IDs: [64, 1728, 14153, 38298, 5732, 422, 2026, 1528, 284, 3126, 1528, 262, 640, 1022, 2672, 9262, 8794, 319, 281, 7593, 4038, 764, 416, 644, 1411, 318, 262, 640, 1022, 9262, 8794, 3220, 416, 1262, 262, 38298, 5633, 19639, 25, 29162, 7, 7146, 485, 7, 7266, 83, 974, 7, 1899, 11, 2026, 828, 2026, 828, 1500, 62, 3064, 8, 18634, 25, 257, 1267, 1679, 4064, 837, 275, 1267, 4747, 352, 1220, 513, 4064, 837, 269, 1267, 2026, 4064, 837, 288, 1267, 7930, 362, 1220, 513, 4064, 837, 304, 1267, 1160, 4064]
Length of Tokenized Input: 90
Label: 4


Sample 2:
Tokenized Input IDs: [10919, 318, 262, 1551, 1271, 286, 24438, 19867, 2672, 284, 23000, 

In [22]:
torch.cuda.empty_cache()

In [24]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.6184,1.608475
2,1.558,1.62207
3,1.3216,1.886897


TrainOutput(global_step=6042, training_loss=1.5108282050720234, metrics={'train_runtime': 2382.8167, 'train_samples_per_second': 5.07, 'train_steps_per_second': 2.536, 'total_flos': 2520365865947136.0, 'train_loss': 1.5108282050720234, 'epoch': 3.0})

In [25]:
# Save the model
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [28]:
# Run the baseline with fine-tuned model

# Load the dataset
dataset_baseline = load_dataset("math_qa", split="train")

# Load the fine-tuned tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt2')
model = GPT2ForSequenceClassification.from_pretrained('./fine_tuned_gpt2', num_labels=5)
model.eval()
tokenizer.pad_token = tokenizer.eos_token

# Prediction function
def predict(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    return torch.argmax(logits, dim=1)

answer_mapping = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}

# Evaluate predictions
correct = 0
num_example = 100
for i in range(num_example):
    prompt = dataset_baseline[i]["Problem"] + " " + dataset_baseline[i]["options"]
    prediction_index = predict(prompt).item()
    prediction_label = answer_mapping[prediction_index]
    print(prediction_label)
    correct_answer = dataset_baseline[i]['correct']
    if prediction_label == correct_answer:
        correct += 1

accuracy = correct / num_example
print(f"Accuracy: {accuracy:.2f}")


b
b
c
e
d
a
b
b
e
b
a
b
b
b
b
a
a
e
b
c
a
a
b
c
b
c
c
b
b
a
e
e
c
b
e
e
e
d
c
b
e
a
b
b
b
b
e
c
b
c
c
b
b
d
e
a
b
a
c
c
b
a
c
b
e
a
b
b
c
a
c
a
e
a
a
d
c
d
a
a
c
b
c
c
c
e
b
d
d
b
d
c
a
a
a
c
c
d
b
c
Accuracy: 0.24
