## Env

In [None]:
# %%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch
    match = re.match(r"[0-9.]{3,}", str(torch.version))
    if match:
        v = match.group(0)
        xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
        !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
        !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
        !pip install --no-deps unsloth
    else:
        print("Warning: Could not parse torch version. Installing default xformers version.")
        !pip install --no-deps bitsandbytes accelerate xformers peft trl triton cut_cross_entropy unsloth_zoo
        !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
        !pip install --no-deps unsloth

!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting xformers
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting trl
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.10.13-py3-none-any.whl.metadata (32 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl (117.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m117.

In [None]:

import os, torch, numpy as np
os.environ.setdefault("BITSANDBYTES_NOWELCOME", "1")

import unsloth

from typing import Dict, Any
from dataclasses import dataclass

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    Trainer,
)
from transformers.trainer_callback import EarlyStoppingCallback

from peft import LoraConfig, get_peft_model, TaskType

print("Torch:", torch.__version__, "CUDA:", torch.version.cuda, "BF16:", torch.cuda.is_bf16_supported())
device = "cuda" if torch.cuda.is_available() else "cpu"
device


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Torch: 2.8.0+cu126 CUDA: 12.6 BF16: True


'cuda'

## Config

In [None]:
# Base checkpoint to fine-tune and the per-example token budget used when
# tokenizing.
base_model_name = "unsloth/Meta-Llama-3.1-8B"
max_seq_length = 1024

# LoRA adapter settings (rank / alpha / dropout) and the optimisation
# schedule (per-device batch size, gradient-accumulation steps, learning
# rate, step budget), kept as plain dicts and read by later cells.
lora_cfg = {"r": 16, "alpha": 32, "dropout": 0.05}
hp_cfg = {"bsz": 4, "gas": 8, "lr": 2e-4, "max_steps": 2000}

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
use_bf16 = torch.cuda.is_bf16_supported()


# 4-bit NF4 quantization with double quantization; the compute dtype follows
# the bf16 capability detected above. No config object is built when 4-bit
# loading is switched off.
USE_4BIT = True
if USE_4BIT:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=USE_4BIT,
        bnb_4bit_compute_dtype=torch.bfloat16 if use_bf16 else torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )
else:
    bnb_config = None

## Model



In [None]:

# Tokenizer: fall back to EOS as the pad token when the checkpoint defines
# none, and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Two-label sequence classifier on top of the base checkpoint. `dtype` replaces
# the deprecated `torch_dtype` keyword (the installed transformers version
# warns about it when this cell runs).
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=2,
    problem_type="single_label_classification",
    trust_remote_code=True,
    dtype=torch.bfloat16 if use_bf16 else torch.float16,
    device_map="auto" if USE_4BIT else None,
    quantization_config=bnb_config if USE_4BIT else None,
)
# The classification head pools the last non-padding token of each row, so the
# model config must agree with the tokenizer about which id is padding;
# otherwise batches larger than one cannot be scored reliably.
model.config.pad_token_id = tokenizer.pad_token_id

# Attach LoRA adapters to every attention and MLP projection.
target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

peft_config = LoraConfig(
    r=lora_cfg["r"],
    lora_alpha=lora_cfg["alpha"],
    lora_dropout=lora_cfg["dropout"],
    bias="none",
    task_type=TaskType.SEQ_CLS,
    target_modules=target_modules,
)
model = get_peft_model(model, peft_config)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/947 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at unsloth/Meta-Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Data


In [None]:
from datasets import load_dataset


# Download the competition data and keep handles on both official splits.
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
train_ds, test_ds = dataset["train"], dataset["test"]

# Shuffle once with a fixed seed, then carve the first `val_size` rows
# (10% of the data, capped at 2000) off as validation; the remainder is the
# training subset.
train_full = train_ds.shuffle(seed=42)
n_total = len(train_full)
val_size = min(2000, int(0.1 * n_total))
val_ds = train_full.select(range(val_size))
train_ds_small = train_full.select(range(val_size, n_total))

print(train_ds_small[0].keys())
print("Train:", len(train_ds_small), "Val:", len(val_ds), "Test:", len(test_ds))

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

dict_keys(['question', 'is_correct', 'answer', 'solution'])
Train: 998000 Val: 2000 Test: 10000


In [None]:
# Prompt shown to the classifier. Only the question and the candidate solution
# are inserted; the dataset's "answer" field is deliberately not part of it.
INFER_TEMPLATE = (
    "You are a verifier. Decide if the provided solution to the math question is correct.\n"
    "Respond with a binary decision (0 = incorrect, 1 = correct).\n\n"
    "Question:\n{question}\n\n"
    "Solution:\n{solution}\n"
)

def build_input_text(ex: Dict[str, Any]) -> str:
    """Render one example into the verifier prompt.

    Args:
        ex: Mapping that may contain "question" and "solution". Missing keys
            are treated as empty strings; any other keys (e.g. "answer") are
            ignored.

    Returns:
        INFER_TEMPLATE filled with the stringified, whitespace-stripped
        question and solution.
    """
    q = str(ex.get("question", "")).strip()
    s = str(ex.get("solution", "")).strip()
    return INFER_TEMPLATE.format(question=q, solution=s)

def preprocess_batch(batch: Dict[str, Any]) -> Dict[str, Any]:
    """Tokenize a batch of examples and attach integer class labels.

    Args:
        batch: Column-oriented batch with "question", "solution", "answer"
            and "is_correct" lists.

    Returns:
        The tokenizer output (unpadded, truncated to ``max_seq_length``) with
        an added "labels" list holding 1 for truthy ``is_correct`` values and
        0 otherwise.
    """
    rows = zip(batch["question"], batch["solution"], batch["answer"])
    prompts = []
    for question, solution, answer in rows:
        prompts.append(
            build_input_text({"question": question, "solution": solution, "answer": answer})
        )

    # Padding is left to the data collator, so sequences stay ragged here.
    encoded = tokenizer(
        prompts,
        truncation=True,
        max_length=max_seq_length,
        padding=False,
        return_tensors=None,
    )

    encoded["labels"] = [int(bool(flag)) for flag in batch["is_correct"]]
    return encoded

# Tokenize the training subset that EXCLUDES the validation rows. Mapping the
# full `train_ds` here would put every validation example into the training
# data as well, leaking labels and inflating the reported eval accuracy.
train_tokenized = train_ds_small.map(preprocess_batch, batched=True, remove_columns=train_ds_small.column_names)
val_tokenized   = val_ds.map(preprocess_batch,   batched=True, remove_columns=val_ds.column_names)

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            arg-max over the last axis of ``logits``.

    Returns:
        ``{"accuracy": <fraction of predictions equal to the labels>}``.
    """
    scores, gold = eval_pred
    hits = scores.argmax(axis=-1) == gold
    return {"accuracy": hits.mean().item()}

In [None]:
# Batches are padded at collation time (tokenization above leaves sequences
# unpadded); padded lengths are rounded up to a multiple of 8.
collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)


### **Trainer Setup**


In [None]:
# Training schedule. Evaluation and checkpointing share the same 200-step
# cadence so that `load_best_model_at_end` can restore the checkpoint with the
# highest validation accuracy.
args = TrainingArguments(
    output_dir="outputs/seqcls_lora",
    per_device_train_batch_size=hp_cfg["bsz"],
    per_device_eval_batch_size=hp_cfg["bsz"],
    gradient_accumulation_steps=hp_cfg["gas"],
    learning_rate=hp_cfg["lr"],
    max_steps=hp_cfg["max_steps"],
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    weight_decay=0.01,
    # Exactly one mixed-precision mode is enabled, chosen by hardware support.
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=50,
    report_to="none",
    save_total_limit=2,

    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    seed=42,
)

# Stop once validation accuracy has failed to improve for 3 consecutive
# evaluations.
callbacks = [EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)]

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=collator,
    # `tokenizer=` is deprecated on Trainer and triggers a warning when this
    # cell runs; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=callbacks,
)




  trainer = Trainer(


In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()


Step,Training Loss,Validation Loss,Accuracy
200,5.0107,0.523983,0.7805
400,3.6979,0.411523,0.813
600,4.0036,0.36689,0.832
800,3.4621,0.409401,0.8205
1000,2.7779,0.546455,0.793
1200,3.0341,0.331986,0.8635
1400,2.6732,0.314161,0.859
1600,2.3458,0.286122,0.8805
1800,2.3553,0.271252,0.8875
2000,2.3245,0.268516,0.8905


TrainOutput(global_step=2000, training_loss=3.264457836151123, metrics={'train_runtime': 9008.3292, 'train_samples_per_second': 7.105, 'train_steps_per_second': 0.222, 'total_flos': 1.1143243099039334e+18, 'train_loss': 3.264457836151123, 'epoch': 0.064})


 ### **Inference and Evaluation**

In [None]:
# Score the current model on the trainer's eval dataset and print the
# resulting metrics dict.
metrics = trainer.evaluate()
print("Eval metrics:", metrics)


Eval metrics: {'eval_loss': 0.268516480922699, 'eval_accuracy': 0.8905, 'eval_runtime': 101.374, 'eval_samples_per_second': 19.729, 'eval_steps_per_second': 4.932, 'epoch': 0.064}


### **Generate Submission File**

In [None]:
import torch, pandas as pd
from tqdm import tqdm


# Prompts are built with the same `build_input_text` / `INFER_TEMPLATE` used
# for training. They are intentionally NOT redefined here: rebinding
# `build_input_text` to a different signature would break `preprocess_batch`
# if the tokenization cell were ever re-run.

# Make sure dropout (including LoRA dropout) is disabled even when this cell
# is run directly after training without the evaluation cell.
model.eval()

predictions = []
for ex in tqdm(test_ds):
    text = build_input_text(ex)

    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_seq_length).to(model.device)
    with torch.no_grad():
        logits = model(**enc).logits        # [1, 2]
    pred_id = int(logits.argmax(dim=-1).item())   # 0 or 1
    predictions.append(bool(pred_id))             # 0->False, 1->True

# IDs are the row positions of the test split, in iteration order.
submission = pd.DataFrame({"ID": range(len(predictions)), "is_correct": predictions})
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv with", len(predictions), "rows")


100%|██████████| 10000/10000 [26:44<00:00,  6.23it/s]

Saved submission.csv with 10000 rows





# SAVE THE MODEL TO DRIVE AND RUN INFERENCE
Add code to save the model checkpoint to Google Drive, load the model from the checkpoint, and generate the final submission CSV file.

## Mount google drive

### Subtask:
Mount Google Drive to save the model checkpoint.


**Reasoning**:
Mount Google Drive to save the model checkpoint.



In [None]:
# Mount Google Drive at /content/drive so the checkpoint can be written there.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Save model checkpoint

### Subtask:
Save the trained model checkpoint to the specified path in Google Drive.


**Reasoning**:
Define the save path and save the model and tokenizer to Google Drive.



In [None]:
import os

# Destination folder on the mounted Drive for the fine-tuned artifacts.
save_path = "/content/drive/MyDrive/llama3_8b_math_verifier_checkpoint"

# Make sure the folder exists before anything is written into it.
os.makedirs(save_path, exist_ok=True)

# Persist the model first, then the tokenizer, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Model checkpoint and tokenizer saved to: {save_path}")

Model checkpoint and tokenizer saved to: /content/drive/MyDrive/llama3_8b_math_verifier_checkpoint


## Load model from checkpoint

### Subtask:
Load the model from the saved checkpoint.


**Reasoning**:
Load the model and tokenizer from the saved checkpoint path in Google Drive and prepare the model for inference.



In [None]:
from peft import PeftModel

# Define the path where the model checkpoint was saved in Google Drive
save_path = "/content/drive/MyDrive/llama3_8b_math_verifier_checkpoint"

# The checkpoint was written by `save_pretrained` on a PEFT-wrapped
# *sequence classifier*, so it has to be restored onto the same kind of base
# model. A causal-LM loader would build the wrong architecture, and the names
# `FastLanguageModel`, `dtype` and `load_in_4bit` are not defined anywhere in
# this notebook.
tokenizer = AutoTokenizer.from_pretrained(save_path, use_fast=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Rebuild the quantized base classifier with the same settings used for
# training.
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=2,
    problem_type="single_label_classification",
    dtype=torch.bfloat16 if use_bf16 else torch.float16,
    device_map="auto" if USE_4BIT else None,
    quantization_config=bnb_config if USE_4BIT else None,
)
# The classification head needs to know which token id is padding.
base_model.config.pad_token_id = tokenizer.pad_token_id

# Attach the saved adapter weights to the base classifier.
model = PeftModel.from_pretrained(base_model, save_path)

# Inference only: disable dropout.
model.eval()

print(f"Model and tokenizer loaded from: {save_path}")

## Generate submission file

### Subtask:
Generate the submission CSV file using the loaded model.


**Reasoning**:
Generate the submission CSV file by iterating through the test dataset, generating predictions using the loaded model, and saving the results to a pandas DataFrame.



In [None]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

# Load the official test set
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")

# The fine-tuned model is a two-class sequence classifier, not a text
# generator: predictions come from its logits, not from `generate()` plus
# string parsing. The prompt must also be the exact template the model was
# trained on (`INFER_TEMPLATE`); a different wording would put the inputs out
# of distribution.
model.eval()

predictions = []

# Loop through the test dataset and classify each example
for example in tqdm(test_dataset):
    prompt = INFER_TEMPLATE.format(
        question=str(example["question"]).strip(),
        solution=str(example["solution"]).strip(),
    )
    # Truncate exactly as during training so over-long solutions cannot
    # exhaust GPU memory, and follow the model's own device placement.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    ).to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits   # [1, 2]

    # Class 1 means "solution is correct".
    predictions.append(bool(logits.argmax(dim=-1).item()))

# Create the submission DataFrame
submission = pd.DataFrame({
    'ID': range(len(predictions)),
    'is_correct': predictions
})

# Save the DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully!")
print("You can now download this file and submit it to the Kaggle competition.")