In [None]:
!pip install transformers datasets torch

In [20]:
# Prompt: Problem + options
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("math_qa", split="validation")

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2ForSequenceClassification.from_pretrained('gpt2-medium', num_labels=5)
model.eval()
tokenizer.pad_token = tokenizer.eos_token
def predict(text):
    # inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=tokenizer.model_max_length)
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    return torch.argmax(logits, dim=1)
answer_mapping = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}

correct = 0
num_example = 100
for i in range(num_example):
    prompt = dataset[i]["Problem"] + " " + dataset[i]["options"]
    prediction_index = predict(prompt).item()
    prediction_label = answer_mapping[prediction_index]
    # print(prediction_label)
    correct_answer = dataset[i]['correct']
    if prediction_label == correct_answer:
        correct += 1

accuracy = correct / num_example
print(f"Accuracy: {accuracy:.2f}")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.21


In [15]:
# Prompt: Problem + Rationale + Options
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("math_qa", split="validation")

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2ForSequenceClassification.from_pretrained('gpt2-medium', num_labels=5)
model.eval()
tokenizer.pad_token = tokenizer.eos_token
def predict(text):
    # inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=tokenizer.model_max_length)
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    return torch.argmax(logits, dim=1)
answer_mapping = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}

correct = 0
num_example = 100
for i in range(num_example):
    prompt = dataset[i]["Problem"] + " " + dataset[i]["Rationale"] + " " + dataset[i]["options"]
    prediction_index = predict(prompt).item()
    prediction_label = answer_mapping[prediction_index]
    # print(prediction_label)
    correct_answer = dataset[i]['correct']
    if prediction_label == correct_answer:
        correct += 1

accuracy = correct / num_example
print(f"Accuracy: {accuracy:.2f}")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.21


In [19]:
# Prompt: Annotated_formula + Options
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("math_qa", split="validation")

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2ForSequenceClassification.from_pretrained('gpt2-medium', num_labels=5)
model.eval()
tokenizer.pad_token = tokenizer.eos_token
def predict(text):
    # inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=tokenizer.model_max_length)
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    return torch.argmax(logits, dim=1)
answer_mapping = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}

correct = 0
num_example = 100
for i in range(num_example):
    prompt = dataset[i]["annotated_formula"] + " " + dataset[i]["options"]
    prediction_index = predict(prompt).item()
    prediction_label = answer_mapping[prediction_index]
    # print(prediction_label)
    correct_answer = dataset[i]['correct']
    if prediction_label == correct_answer:
        correct += 1

accuracy = correct / num_example
print(f"Accuracy: {accuracy:.2f}")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.19


In [18]:
# Prompt: Problem + Annotated_formla + options
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("math_qa", split="validation")

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2ForSequenceClassification.from_pretrained('gpt2-medium', num_labels=5)
model.eval()
tokenizer.pad_token = tokenizer.eos_token
def predict(text):
    # inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=tokenizer.model_max_length)
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    return torch.argmax(logits, dim=1)
answer_mapping = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}

correct = 0
num_example = 100
for i in range(num_example):
    prompt = dataset[i]["Problem"] + " " + dataset[i]["annotated_formula"] + " " + dataset[i]["options"]
    prediction_index = predict(prompt).item()
    prediction_label = answer_mapping[prediction_index]
    # print(prediction_label)
    correct_answer = dataset[i]['correct']
    if prediction_label == correct_answer:
        correct += 1

accuracy = correct / num_example
print(f"Accuracy: {accuracy:.2f}")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.23
