In [None]:
import re

def clean_text(t):
    t = re.sub(r"```.*?```", "", t, flags=re.S)
    t = re.sub(r"\\boxed\{(.*?)\}", r"\1", t)
    t = re.sub(r"\s+", " ", t.strip())
    return t

## Mount google drive

### Subtask:
Mount Google Drive to save the model checkpoint.


**Reasoning**:
Mount Google Drive to save the model checkpoint.



In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Save model checkpoint

### Subtask:
Save the trained model checkpoint to the specified path in Google Drive.


**Reasoning**:
Define the save path and save the model and tokenizer to Google Drive.



In [12]:
import os

# Define the path to save the model checkpoint in Google Drive
save_path = "/content/drive/MyDrive/llama3_8b_math_verifier_checkpoint"

# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# Save the model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model checkpoint and tokenizer saved to: {save_path}")

Model checkpoint and tokenizer saved to: /content/drive/MyDrive/llama3_8b_math_verifier_checkpoint


## Load model from checkpoint

### Subtask:
Load the model from the saved checkpoint.


**Reasoning**:
Load the model and tokenizer from the saved checkpoint path in Google Drive and prepare the model for inference.



In [None]:
# Define the path where the model checkpoint was saved in Google Drive
save_path = "/content/drive/MyDrive/llama3_8b_math_verifier_checkpoint"

# Load the model and tokenizer from the saved path
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = save_path,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Prepare the loaded model for faster inference
FastLanguageModel.for_inference(model)

print(f"Model and tokenizer loaded from: {save_path}")

==((====))==  Unsloth 2025.10.8: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model and tokenizer loaded from: /content/drive/MyDrive/llama3_8b_math_verifier_checkpoint


## Generate submission file

### Subtask:
Generate the submission CSV file using the loaded model.


**Reasoning**:
Generate the submission CSV file by iterating through the test dataset, generating predictions using the loaded model, and saving the results to a pandas DataFrame.



In [None]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

# Load the official test set
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")
predictions = []

# Create the prompt template for inference (no answer included)
inference_prompt = """You are a great mathematician and you are tasked with finding if a solution to a given maths question is correct or not. Your response should be 'True' if the solution is correct, otherwise 'False'. Below is the Question and Solution.
Question:
{}
Solution:
{}
Output:
"""

# A simple function to parse 'True' or 'False' from the model's raw output
def parse_output(response_text):
    # Find the text after "Output:"
    output_part = response_text.split("Output:\n")[-1]
    # Check if "True" is in that part, case-insensitively
    if 'true' in output_part.lower():
        return True
    return False

# Loop through the test dataset and generate a prediction for each example
for example in tqdm(test_dataset):
    question = example["question"]
    solution = example["solution"]

    # Format the prompt
    prompt = inference_prompt.format(question, str(solution))
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Generate the prediction
    outputs = model.generate(**inputs, max_new_tokens=8, use_cache=True)
    response_text = tokenizer.batch_decode(outputs)[0]

    # Parse the prediction and add it to our list
    prediction = parse_output(response_text)
    predictions.append(prediction)

# Create the submission DataFrame
submission = pd.DataFrame({
    'ID': range(len(predictions)),
    'is_correct': predictions
})

# Save the DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully!")
print("You can now download this file and submit it to the Kaggle competition.")

100%|██████████| 10000/10000 [1:33:05<00:00,  1.79it/s]


Submission file 'submission.csv' created successfully!
You can now download this file and submit it to the Kaggle competition.





In [None]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.10.12-py3-none-any.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.10.13 (from unsloth)
  Downloading unsloth_zoo-2025.10.13-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl!=0.19.0,<=0.23.0,>=0.18.2 (from unsl

In [None]:
!pip install trl
!pip install transformers



In [None]:
# ============================================================
#  DL-Fall-25 Kaggle Contest — Llama3-8B SF
# ============================================================

!pip install -q unsloth transformers peft accelerate bitsandbytes datasets

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch, re, numpy as np
from datasets import concatenate_datasets



dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="train").shuffle(seed=42)

def clean_text(t):
    if not isinstance(t, str):
        return ""
    t = re.sub(r"```.*?```", "", t, flags=re.S)
    t = re.sub(r"\\boxed\{(.*?)\}", r"\1", t)
    t = re.sub(r"\s+", " ", t.strip())
    return t

# Reasoning-aware prompt template (no label leakage)
training_prompt = """You are a careful mathematician.
Examine the following question, its proposed solution, and decide if the answer is correct.
Explain your reasoning briefly, then respond with 'True' or 'False'.

Question:
{}

Proposed Answer and Solution:
{}

Your reasoning:
[think step by step]

Final answer:"""

def format_prompt(example):
    q = clean_text(example["question"])
    a = clean_text(example.get("answer", ""))
    s = clean_text(example["solution"])
    # merge answer + solution for clarity
    merged_sol = f"{a}\n\n{s}".strip()
    correct = bool(example["is_correct"])
    label_text = "True" if correct else "False"
    prompt = training_prompt.format(q, merged_sol).strip()
    return {"prompt": prompt, "label_text": label_text, "label": int(correct)}

dataset = dataset.map(format_prompt)

# ------------------------------------------------------------
#  Optional class balancing
# ------------------------------------------------------------
labels = np.array(dataset["label"])
n_pos, n_neg = np.sum(labels==1), np.sum(labels==0)
if abs(n_pos - n_neg) > 0.1 * len(labels):
    pos = dataset.filter(lambda e: e["label"] == 1)
    neg = dataset.filter(lambda e: e["label"] == 0)
    n_min = min(len(pos), len(neg))
    dataset = concatenate_datasets([pos.select(range(n_min)),
                                    neg.select(range(n_min))]).shuffle(seed=42)

# ------------------------------------------------------------
#  Split train / validation
# ------------------------------------------------------------
split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"].select(range(0, min(50_000, len(split["train"]))))
val_dataset   = split["test"]

print(f"Train: {len(train_dataset)} | Validation: {len(val_dataset)}")
print("\nExample prompt:\n", train_dataset[0]["prompt"])
print("\nLabel:", train_dataset[0]["label_text"])



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.7/348.7 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m273.6/273.6 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Train: 50000 | Validation: 80000

Example prompt:
 You are a careful mathematician.
Examine the following question, its proposed solution, and decide if the answer is correct.
Explain your reasoning briefly, then respond with 'True' or 'False'.

Question:
Jon runs a website where he gets paid for every person who visits. He gets paid $0.10 for every person who visits. Each hour he gets 50 visits. His website operates 24 hours a day. How many dollars does he make in a 30 day month?

Proposed Answer and Solution:
3600

Let's solve this problem using Python code. <llm-code> # let's solve for how many dollars Jon will make in 1 day first visits_per_hour = 50 dollars_per_visit = 0.10 dollars_in_one_day = visits_per_hour * dollars_per_visit * 24 # now that we know dollars in one day, let's multiply it by 30 days dollars_per_month = dollars_in_one_day * 30 dollars_per_month </llm-code> <llm-code-output> 3600.0 </llm-code-output> Thus Jon makes 3600 dollars every month.

Your reasoning:
[think

In [None]:
!pip install -q transformers unsloth  peft accelerate bitsandbytes datasets

In [None]:

from unsloth import FastLanguageModel
model_name = "unsloth/Meta-Llama-3.1-8B"

max_seq_length = 1024     # safe context length for Colab
dtype = torch.bfloat16    # more stable gradients than fp16
load_in_4bit = True       # use 4-bit QLoRA for efficiency


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    device_map="auto",
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    #llm_int8_enable_fp32_cpu_offload=True,  # allows partial CPU offload
)

# ------------------------------------------------------------
# Tokenizer configuration
# ------------------------------------------------------------
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

# LoRA config
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=64,
    lora_dropout=0.05,
    use_gradient_checkpointing=True,  # optional, saves memory
)

model.print_trainable_parameters()

print("Trainable parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))



MAX_LEN = 1024

def tokenize_function(examples):
    texts = [p + " " + y + tokenizer.eos_token for p, y in zip(examples["prompt"], examples["label_text"])]
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LEN,
        padding="max_length",   # ensures same length
    )
    # for causal LM, labels = input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


train_tokenized = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
val_tokenized   = val_dataset.map(tokenize_function, batched=True, remove_columns=val_dataset.column_names)




training_args = TrainingArguments(
    output_dir="./llama3_sft",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    eval_strategy="steps",
    eval_steps=10000,
    save_steps=10000,
    logging_steps=200,
    learning_rate=3e-4,
    warmup_ratio=0.1,
    num_train_epochs=1,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    bf16=True,
    gradient_checkpointing=True,
    save_total_limit=2,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
)

# ============================================================
# 5. Training
# ============================================================
trainer.train()



from tqdm import tqdm

def predict_correctness(dataset, max_new_tokens=5):
    preds = []
    for ex in tqdm(dataset):
        prompt = ex["prompt"] + " "
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output = model.generate(**inputs, max_new_tokens=max_new_tokens)
        pred_text = tokenizer.decode(output[0], skip_special_tokens=True)
        # take only the generated suffix after the prompt
        gen = pred_text[len(prompt):].strip().split()[0]
        gen = gen.replace(".", "").capitalize()
        preds.append("True" if "true" in gen else "False")
    return preds

# Evaluate on a small validation subset
val_subset = val_dataset.select(range(100))
preds = predict_correctness(val_subset)
true_labels = val_subset["label_text"]
acc = np.mean([p == t for p, t in zip(preds, true_labels)])
print(f"\nValidation Accuracy (subset of 100): {acc:.3f}")

# ============================================================
# 7. Submission Generation
# ============================================================

# Load Kaggle test split
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")

def format_test(example):
    q = clean_text(example["question"])
    a = clean_text(example.get("answer", ""))
    s = clean_text(example["solution"])
    merged = f"{a}\n\n{s}".strip()
    prompt = training_prompt.format(q, merged).strip()
    return {"prompt": prompt}

test_dataset = test_dataset.map(format_test)
preds = predict_correctness(test_dataset)

import pandas as pd
submission = pd.DataFrame({
    "ID": range(len(preds)),
    "is_correct": preds
})
submission.to_csv("submission.csv", index=False)
print("\nSaved submission.csv ✓")


==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196
Trainable parameters: 41943040


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1 | Total steps = 3,125
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss


 96%|█████████▌| 96/100 [00:32<00:01,  2.99it/s]


IndexError: list index out of range

In [None]:
# Prepare model for fast inference
FastLanguageModel.for_inference(model)

# Inference prompt aligned with True/False labels
inference_prompt = """You are a great mathematician and you are tasked with finding if a solution to a given maths question is correct or not.
Your response should be 'True' if the solution is correct, otherwise 'False'.
Below is the Question, Proposed Answer, and the detailed Reasoning.

Question:
{}

Proposed Answer:
{}

Reasoning:
{}

Output:
"""

# Select a sample from validation set
example = val_dataset[10]
question = example["question"]
answer = example["answer"]
solution = example["solution"]

# Format inference prompt
prompt = inference_prompt.format(question, answer, solution)

# Tokenize input
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate the model's prediction
outputs = model.generate(**inputs, max_new_tokens=8, use_cache=True)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract only the model-generated completion
prediction = decoded[len(prompt):].strip()

# Normalize and map any variant forms
if "true" in prediction.lower():
    pred_label = "True"
elif "false" in prediction.lower():
    pred_label = "False"
else:
    pred_label = "False" if "no" in prediction.lower() else "True"

# Display result
print("#### QUESTION ####")
print(question)
print("\n#### PROPOSED ANSWER ####")
print(answer)
print("\n#### REASONING ####")
print(solution)
print("\n#### MODEL'S PREDICTION ####")
print(pred_label)
print("\n#### CORRECT ANSWER ####")
print(example["is_correct"])

#### QUESTION ####
If
\[\sin x + \cos x + \tan x + \cot x + \sec x + \csc x = 7,\]then find $\sin 2x.$

#### PROPOSED ANSWER ####
3 \sqrt{57}/2

#### REASONING ####
Since $\sec x$ is equivalent to $1/\cos x$, $\csc x$ is equivalent to $1/\sin x$, and $\cot x$ is equivalent to $1/\tan x$, the expression becomes:

\[\sin x + \cos x + \frac{1}{\sin x} + \frac{1}{\cos x} + \frac{1}{\sin x} + \frac{1}{\cos x} = 7\]

\[\sin x + \cos x + \frac{2}{\sin x} + \frac{2}{\cos x} = 7\]

Using the quadratic formula, we can solve for $\sin x$ and $\cos x$ and find their respective values.

\[\sin x = \frac{11 + \sqrt{57}}{4} \quad \text{and} \quad \cos x = \frac{11 - \sqrt{57}}{4}\]

To find $\sin 2x$, we use the following identity:

\[\sin 2x = 2 \sin x \cos x\]

Plugging in the values of $\sin x$ and $\cos x$ and simplifying, we get:

\[\sin 2x = \frac{3 \sqrt{57}}{2}\]

So the answer is $\boxed{3 \sqrt{57}/2}$.

#### MODEL'S PREDICTION ####
False

#### CORRECT ANSWER ####
False


In [None]:
import pandas as pd
from tqdm import tqdm

# Load the official test set
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")
predictions = []

# A simple function to parse 'True' or 'False' from the model's raw output
def parse_output(response_text):
    # Find the text after "Output:"
    output_part = response_text.split("Output:\n")[-1]
    # Check if "True" is in that part, case-insensitively
    if 'true' in output_part.lower():
        return True
    return False



In [10]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
import torch

# Load test split
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")


EOS_TOKEN = tokenizer.eos_token

inference_prompt = """You are a careful mathematician.
Examine the following question, proposed answer, and solution.
Decide if the answer is correct. Respond with 'True' or 'False' only.

Question:
{}

Proposed Answer:
{}

Solution:
{}

Final answer:"""

def predict_is_correct(example):
    question = example["question"]
    answer   = example.get("answer", "")
    solution = example.get("solution", "")

    prompt = inference_prompt.format(question, answer, solution)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=5, do_sample=False)

    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    gen_part = response_text[len(prompt):].strip()


    if not gen_part:
        return "False"

    first_token = gen_part.split()[0].lower()
    if "true" in first_token:
        return "True"
    elif "false" in first_token:
        return "False"
    else:
        # fallback heuristic: check entire text
        if "true" in gen_part.lower():
            return "True"
        elif "false" in gen_part.lower():
            return "False"
        else:
            return "False"

# Generate predictions
predictions = []
for i, ex in enumerate(tqdm(test_dataset, desc="Generating predictions")):
    pred = predict_is_correct(ex)
    predictions.append(pred)

# Create submission DataFrame
submission = pd.DataFrame({
    "ID": range(len(predictions)),
    "is_correct": predictions
})

# Save to CSV
submission.to_csv("submission.csv", index=False)
print("\n submission.csv created successfully!")
print(submission.head())


Generating predictions: 100%|██████████| 10000/10000 [46:51<00:00,  3.56it/s]


 submission.csv created successfully!
   ID is_correct
0   0      False
1   1      False
2   2      False
3   3      False
4   4      False



