In [6]:
%%capture
# Environment setup: install Unsloth and the libraries it is used with.
# The %%capture cell magic above hides the pip output of this cell.
import os, re
# Colab is detected by searching for "COLAB_" in the concatenated environment-variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Keep the leading run of digits and dots of the torch version string
    # (e.g. "2.8.0" out of "2.8.0+cu126").
    import torch; v = re.match(r"[0-9.]{3,}", torch.__version__).group(0)
    # Pick the xformers pin from that torch version.
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # --no-deps installs the listed packages without pulling in their dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Pinned versions of transformers and trl are installed in both environments.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [7]:
from unsloth import FastLanguageModel
import torch

# Pull the Llama 3.1 8B base checkpoint and its tokenizer from the
# Hugging Face Hub through Unsloth's loader, with 4-bit loading enabled.
# dtype=None leaves the dtype choice to the loader — TODO confirm the
# selected dtype against the Unsloth banner printed below.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=True,
    dtype=None,
    max_seq_length=1024,
    model_name="unsloth/Meta-Llama-3.1-8B",
)

==((====))==  Unsloth 2025.11.1: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [13]:
from datasets import load_dataset

# Load the training split and shuffle it once with a fixed seed so the
# index-based slices below are reproducible.
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="train").shuffle(seed=42)

# Rows 0..26999 are used for training (27K examples).
train_dataset = dataset.select(range(0, 27000))
# Rows 27000..29999 are used for validation (3K examples).
eval_dataset = dataset.select(range(27000, 30000))

In [12]:
# End-of-sequence marker taken from the tokenizer; format_instruction
# appends it directly after the label of every training text.
EOS_TOKEN = tokenizer.eos_token

def format_instruction(example):
    """Render one dataset row as a single supervised training text.

    Reads the "question", "solution", "answer" and "is_correct" entries of
    ``example`` and substitutes them into the fixed grading prompt. The
    label is written as the last line and is followed immediately by the
    module-level ``EOS_TOKEN``.

    Args:
        example: Mapping that provides the four keys listed above.

    Returns:
        A dict with the single key "text" holding the rendered prompt.
    """
    question, solution, answer, is_correct = (
        example[field]
        for field in ("question", "solution", "answer", "is_correct")
    )

    text = f"""You are a great mathematician and you are tasked with finding if a solution to a given maths question is correct or not. Your response should be 'True' if the solution is correct, otherwise 'False'. Below is the question, the proposed solution, and the final answer.

Question:
{question}

Proposed Solution:
{solution}

Proposed Final Answer:
{answer}

Evaluation Criteria:
1. Solution approach is logically sound
2. Computational steps are accurate
3. The proposed final answer is correct

Based on this analysis, your response is:
{is_correct}{EOS_TOKEN}"""
    return dict(text=text)

In [14]:
# Run format_instruction over both splits (training first, then validation)
# so each row gains the rendered "text" field used for fine-tuning.
train_dataset, eval_dataset = (
    split.map(format_instruction) for split in (train_dataset, eval_dataset)
)

Map:   0%|          | 0/27000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [11]:
# Turn on gradient checkpointing before the LoRA adapters are attached.
model.gradient_checkpointing_enable()

# Projection layers that receive LoRA adapters: the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
lora_targets = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the model with rank-16 LoRA adapters (alpha 32, no dropout, no bias terms).
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_targets,
    r=16,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
)

Unsloth 2025.11.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [17]:
from transformers import TrainingArguments
from trl import SFTTrainer

# Use bf16 mixed precision when the GPU reports support for it, fp16 otherwise.
bf16_available = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    # Output location and logging
    output_dir="./outputs",
    report_to="none",
    logging_steps=50,
    # Batching: 8 sequences per device, accumulated over 4 steps
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    # Optimisation schedule
    learning_rate=2e-5,
    warmup_steps=500,
    num_train_epochs=2,
    # Evaluate and checkpoint on the same 500-step cadence
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    # Reload the checkpoint with the lowest validation loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Mixed precision
    bf16=bf16_available,
    fp16=not bf16_available,
)

# Supervised fine-tuning over the rendered "text" field of both splits.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=1024,
)

In [18]:
# Launch fine-tuning with the trainer built in the previous cell. As the last
# expression of the cell, the returned TrainOutput is displayed below.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 27,000 | Num Epochs = 2 | Total steps = 1,688
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Step,Training Loss,Validation Loss
500,0.602,0.584554
1000,0.5713,0.561869
1500,0.5483,0.550565


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=1688, training_loss=0.6379733605407426, metrics={'train_runtime': 9007.1143, 'train_samples_per_second': 5.995, 'train_steps_per_second': 0.187, 'total_flos': 1.3994164043354604e+18, 'train_loss': 0.6379733605407426, 'epoch': 2.0})

In [22]:
def predict_is_correct(question, solution, answer):
    """Ask the fine-tuned model whether a proposed solution is correct.

    Builds the same grading prompt that format_instruction uses for training
    (without the label), generates at most 8 new tokens with the module-level
    ``model`` and ``tokenizer``, and reads the verdict from the generated
    continuation only.

    Args:
        question: Text of the maths question.
        solution: Text of the proposed solution.
        answer: Proposed final answer.

    Returns:
        bool: True when the first non-empty generated line contains "true"
        (case-insensitive), otherwise False (including when nothing was
        generated).
    """
    prompt = f"""You are a great mathematician and you are tasked with finding if a solution to a given maths question is correct or not. Your response should be 'True' if the solution is correct, otherwise 'False'. Below is the question, the proposed solution, and the final answer.

Question:
{question}

Proposed Solution:
{solution}

Proposed Final Answer:
{answer}

Evaluation Criteria:
1. Solution approach is logically sound
2. Computational steps are accurate
3. The proposed final answer is correct

Based on this analysis, your response is:
"""

    # NOTE(review): prompts longer than 1024 tokens are truncated here; which
    # end is dropped depends on the tokenizer's configuration — confirm that
    # the trailing instruction survives for long solutions.
    inputs = tokenizer(
        [prompt],
        return_tensors = "pt",
        truncation = True,
        max_length = 1024
    ).to("cuda")
    # NOTE(review): no sampling arguments are passed, so decoding follows the
    # model's generation config — confirm it is deterministic, or pass
    # do_sample=False explicitly.
    outputs = model.generate(
        **inputs,
        max_new_tokens = 8,
        use_cache = True
    )

    # generate() returns the prompt tokens followed by the new tokens. Decode
    # only the new ones so the verdict cannot be read from prompt text and a
    # newline emitted after the verdict cannot hide it.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    # The verdict is the first non-empty line of the continuation.
    lines = completion.strip().splitlines()
    verdict = lines[0] if lines else ""
    return "true" in verdict.lower()

In [20]:
# 100 rows of the shuffled training split (indices 30000-30099); they lie
# outside both the training range (0-26999) and the validation range (27000-29999).
small_test_dataset = dataset.select(range(30000, 30100))

In [23]:
from tqdm import tqdm
import pandas as pd

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Count the held-out rows whose predicted verdict equals the gold label.
correct_count = sum(
    predict_is_correct(row["question"], row["solution"], row["answer"]) == row["is_correct"]
    for row in tqdm(small_test_dataset)
)

print(f"Accuracy: {correct_count / len(small_test_dataset)}")

100%|██████████| 100/100 [00:22<00:00,  4.54it/s]

Accuracy: 0.79





In [24]:
# Load the "test" split of the same dataset; the predictions made on it are written to the submission file.
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")

In [26]:
from tqdm import tqdm
import pandas as pd

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# One boolean verdict per test row, in dataset order.
predictions = [
    predict_is_correct(row["question"], row["solution"], row["answer"])
    for row in tqdm(test_dataset)
]

# Write the submission file: IDs are the row positions 0..N-1.
submission = pd.DataFrame(
    {"ID": range(len(predictions)), "is_correct": predictions}
)
submission.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")

100%|██████████| 10000/10000 [37:17<00:00,  4.47it/s]

Submission file created: submission.csv



