In [3]:
# %%capture
# (left commented out: enabling it as the first line of the cell would hide the long pip log)
# This cell will take time
# Step 1: install the PyPI release of Unsloth together with its dependencies.
!pip install unsloth
# Also get the latest nightly Unsloth!
# Step 2: remove the PyPI build and reinstall straight from the GitHub repository.
# NOTE(review): neither install pins a version, so re-running later may resolve different packages.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting unsloth
  Downloading unsloth-2024.11.7-py3-none-any.whl.metadata (59 kB)
Collecting unsloth-zoo>=2024.11.1 (from unsloth)
  Downloading unsloth_zoo-2024.11.5-py3-none-any.whl.metadata (16 kB)
Collecting torch>=2.4.0 (from unsloth)
  Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.28.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.8.14-py3-none-any.whl.metadata (8.4 kB)
Collecting transformers>=4.46.1 (from unsloth)
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datas

In [4]:
from unsloth import FastLanguageModel
import torch
# Settings shared by every FastLanguageModel.from_pretrained call in this notebook.
max_seq_length = 2048 # Maximum sequence length; also passed to the SFT trainer below
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # False = no 4-bit quantization; set True to reduce memory usage

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the base Llama 3.1 8B checkpoint and its tokenizer through Unsloth,
# using the sequence length, dtype and quantization flag configured above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.381 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Downloading shards: 100%|██████████| 4/4 [01:21<00:00, 20.29s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.14s/it]


In [6]:
# Download and load the competition dataset from the Hugging Face Hub.

from datasets import load_dataset
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
# Display the DatasetDict to see its splits, column names and row counts.
dataset

Generating train split: 100%|██████████| 1000000/1000000 [00:03<00:00, 309070.39 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:00<00:00, 293735.22 examples/s]


DatasetDict({
    train: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 1000000
    })
    test: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 10000
    })
})

In [7]:
# Wrap the base model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down). `model` is rebound to the
# PEFT-wrapped model, so re-running this cell requires reloading the base model first.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # LoRA rank. Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # Enable rank-stabilized LoRA
    loftq_config = None, # LoftQ is not used
)

Unsloth 2024.11.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [8]:
import numpy as np
from datasets import Dataset
def create_train_val_split(dataset, train_size=50000, val_size=2000, seed=42):
    """
    Create disjoint, Alpaca-formatted train and validation splits.

    The rows are chosen from one seeded permutation of the dataset indices:
    the first ``train_size`` indices form the training split and the next
    ``val_size`` indices form the validation split, so the two never overlap.
    Both splits are then mapped through the module-level ``convert_to_alpaca``
    function, which must be defined before this function is called.

    Args:
        dataset: The original dataset (a ``datasets.Dataset``)
        train_size: Number of samples for training
        val_size: Number of samples for validation
        seed: Random seed for reproducibility

    Returns:
        train_dataset: Dataset with train_size samples
        val_dataset: Dataset with val_size samples

    Raises:
        ValueError: If a size is negative, or if ``train_size + val_size``
            exceeds the number of rows in ``dataset`` (previously this
            silently produced short or empty splits).
    """
    # Get total number of samples
    total_samples = len(dataset)

    if train_size < 0 or val_size < 0:
        raise ValueError(
            f"train_size and val_size must be non-negative, got {train_size} and {val_size}"
        )
    if train_size + val_size > total_samples:
        raise ValueError(
            f"Requested {train_size} train + {val_size} validation samples, "
            f"but the dataset only has {total_samples} rows"
        )

    # A private RandomState yields the same permutation that
    # np.random.seed(seed) + np.random.shuffle(...) would, but without
    # reseeding the process-wide NumPy generator as a hidden side effect.
    rng = np.random.RandomState(seed)
    all_indices = np.arange(total_samples)
    rng.shuffle(all_indices)

    # Select indices for train and validation (adjacent, non-overlapping slices)
    train_indices = all_indices[:train_size]
    val_indices = all_indices[train_size:train_size + val_size]

    # Dataset.select keeps the original feature schema and avoids copying the
    # chosen rows through an intermediate Python dict.
    train_dataset = dataset.select(train_indices)
    val_dataset = dataset.select(val_indices)

    # Convert datasets using the Alpaca format (adds the "text" column)
    train_dataset = train_dataset.map(convert_to_alpaca, batched=True)
    val_dataset = val_dataset.map(convert_to_alpaca, batched=True)

    print(f"Original dataset size: {total_samples}")
    print(f"Training dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(val_dataset)}")

    return train_dataset, val_dataset

In [13]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an expert mathematician tasked with verifying mathematical solutions. Carefully analyze the given question and answer, then determine if the answer is correct. Respond ONLY with 'True' if the answer is correct, or 'False' if it's incorrect.

### Input:
Question: {}
Answer: {}

### Response:
{}"""


def convert_to_alpaca(examples):
    """
    Render a batch of rows into Alpaca-style prompt strings.

    Args:
        examples: Batched mapping with equally indexed "question", "answer"
            and "is_correct" columns.

    Returns:
        dict with a single "text" key holding one filled-in prompt per row;
        the response slot contains ``str(is_correct)`` for that row.
    """
    question_col = examples["question"]
    answer_col = examples["answer"]
    # Labels are stringified so they can be dropped into the response slot.
    label_col = map(str, examples["is_correct"])
    rendered = [
        alpaca_prompt.format(q_text, a_text, label)
        for q_text, a_text, label in zip(question_col, answer_col, label_col)
    ]
    return {"text": rendered}
    
# Convert datasets
# Draw 100k training and 5k validation rows from the train split, and format
# the full test split with the same prompt template.
ds = dataset['train']
train_dataset, val_dataset = create_train_val_split(ds, train_size=100_000, val_size=5000)
test_dataset = dataset['test'].map(convert_to_alpaca, batched=True)

# Save datasets
# NOTE(review): only the train and test sets are written; val_dataset is not persisted.
train_dataset.to_json("math_train.json")
test_dataset.to_json("math_test.json")

Map: 100%|██████████| 100000/100000 [00:00<00:00, 110949.15 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 111757.51 examples/s]


Original dataset size: 1000000
Training dataset size: 100000
Validation dataset size: 5000


Creating json from Arrow format: 100%|██████████| 100/100 [00:01<00:00, 59.97ba/s]
Creating json from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 61.65ba/s]


15408217

In [14]:
# Peek at the first row of the training split as a sanity check.
train_dataset[0]

{'question': 'Let $\\mathbf{a} = \\begin{pmatrix} 3 \\\\ 1 \\end{pmatrix}$ and $\\mathbf{b} = \\begin{pmatrix} -5 \\\\ 2 \\end{pmatrix}.$  Find the area of the triangle with vertices $\\mathbf{0},$ $\\mathbf{a},$ and $\\mathbf{b}.$',
 'is_correct': True,
 'answer': '5.5',
 'solution': "We can find an area of parallelogram given vectors' lengths and angle between them.\n<llm-code>\nimport math\n\n# area of parallelogram\ndef area(vec1, vec2):\n    dot_product = math.sqrt(vec1[0]**2 + vec1[1]**2) * math.sqrt(vec2[0]**2 + vec2[1]**2)\n    cos_alpha = (vec1[0] * vec2[0] + vec1[1] * vec2[1]) / (dot_product)\n    angle = math.acos(cos_alpha)\n    parallelogram_area = dot_product * math.sin(angle)\n    return parallelogram_area / 2\n\n# define vectors a and b\na = [3, 1]\nb = [-5, 2]\n\n# find the area of the parallelogram\nparallelogram_area = area(a, b)\n\n# print the area\nprint(parallelogram_area)\n</llm-code>\n<llm-code-output>\n5.500000000000001\n</llm-code-output>\nSo the answer is $\\

In [15]:
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig
from unsloth import is_bfloat16_supported

# Define the training arguments (initial values)
training_args = SFTConfig(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # 8 * 2 = 16 examples per optimizer step
    warmup_ratio=0.03,
    max_steps=5000,  # When set, this overrides num_train_epochs
    learning_rate=3e-4,  # Start with a conservative learning rate
    lr_scheduler_type="cosine",
    logging_steps=10,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in transformers.
    eval_strategy="steps",
    eval_steps=100,  # Evaluate on the validation split every 100 steps
    save_steps=100,  # Write a checkpoint to output_dir every 100 steps
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    gradient_checkpointing=True,
    optim="adamw_8bit",
    # group_by_length=True,
    output_dir="outputs",
    report_to="none",  # Replace with "wandb" if you want tracking
)

# Initialize the trainer
# Both splits are reshuffled with a fixed seed; the trainer reads the
# pre-formatted prompt from the "text" column and does not pack sequences.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset.shuffle(seed=42),
    eval_dataset=val_dataset.shuffle(seed=42),
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=training_args,
    packing=False,
)

Map: 100%|██████████| 100000/100000 [00:13<00:00, 7602.24 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 8523.23 examples/s]
Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [16]:
# Run fine-tuning. trainer_stats is only bound if train() returns normally;
# an interrupted run leaves it unassigned.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 2
\        /    Total batch size = 16 | Total steps = 5,000
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
100,0.5467,0.576411
200,0.5889,0.581045
300,0.5637,0.562463
400,0.586,0.550477
500,0.5327,0.536628
600,0.5581,0.522176
700,0.522,0.517041
800,0.528,0.49884
900,0.4797,0.488832
1000,0.4861,0.471368


KeyboardInterrupt: 

In [17]:
# Sample inference data point
# Rebind test_dataset to the raw test split (it previously held the Alpaca-formatted copy).
test_dataset = dataset['test']

# Use the validation row at index 1 as a quick sanity-check example.
sample_ques = val_dataset['question'][1]
sample_ans = val_dataset['answer'][1]

In [18]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Build the prompt with an empty response slot so the model fills in the verdict.
input_prompt = alpaca_prompt.format(
        sample_ques, # ques
        sample_ans, # given answer
        "", # output - leave this blank for generation! The LLM will generate whether it is True or False
    )

print("Input Prompt:\n", input_prompt)
inputs = tokenizer(
[
    input_prompt
], return_tensors = "pt").to("cuda")

input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1] # index 1 is the prompt length in tokens; index 0 is the batch dimension
outputs = model.generate(**inputs, max_new_tokens = 2, use_cache = True)
# you can get the whole generated text by uncommenting the below line
# text_generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Decode only the tokens produced after the prompt.
response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
response

Input Prompt:
 Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an expert mathematician tasked with verifying mathematical solutions. Carefully analyze the given question and answer, then determine if the answer is correct. Respond ONLY with 'True' if the answer is correct, or 'False' if it's incorrect.

### Input:
Question: The perimeter of a particular square and the circumference of a particular circle are equal. What is the ratio of the area of the square to the area of the circle? Express your answer as a common fraction in terms of $\pi$.
Answer: \frac{\pi^2}{4}

### Response:



['False\n\n']

In [19]:
# Write the adapter weights/config and the tokenizer files to ./lora_model.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [20]:
# Bundle the saved adapter directory and the step-3600 checkpoint into model.zip.
# NOTE(review): the checkpoint path is hard-coded; it must exist under outputs/ for this to succeed.
!zip -r model.zip lora_model/ outputs/checkpoint-3600/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: lora_model/ (stored 0%)
  adding: lora_model/README.md (deflated 66%)
  adding: lora_model/adapter_model.safetensors (deflated 7%)
  adding: lora_model/adapter_config.json (deflated 54%)
  adding: lora_model/tokenizer_config.json (deflated 96%)
  adding: lora_model/special_tokens_map.json (deflated 71%)
  adding: lora_model/tokenizer.json (deflated 85%)
  adding: outputs/checkpoint-3600/ (stored 0%)
  adding: outputs/checkpoint-3600/README.md (deflated 66%)
  adding: outputs/checkpoint-3600/adapter_model.safetensors (deflated 7%)
  adding: outputs/checkpoint-3600/adapter_config.json (deflated 54%)
  adding: outputs/checkpoint-3600/tokenizer_config.json (deflated 96%)
  adding: outputs/checkpoint-3600/special_tokens_map.json (deflated 71%)
  adding: outputs/checkpoint-3600/tokenizer.json (deflated 85%)
  adding: outputs/checkpoint-3600/training_args.bin (deflated 51%)
  adding: outputs/checkpoint-3600/optimizer.pt (deflated 13%)
  adding: outputs/checkpoint-3600/scheduler.pt (

In [6]:
# Unpack a previously exported adapter archive into lora_model_3/.
# NOTE(review): model_3.zip is not created anywhere in the visible notebook (the zip cell writes model.zip);
# it has to be supplied separately before this cell is run.
!unzip model_3.zip

Archive:  model_3.zip
   creating: lora_model_3/
  inflating: lora_model_3/adapter_model.safetensors  
  inflating: lora_model_3/README.md  
  inflating: lora_model_3/tokenizer.json  
  inflating: lora_model_3/adapter_config.json  
  inflating: lora_model_3/tokenizer_config.json  
  inflating: lora_model_3/special_tokens_map.json  


In [21]:
import torch
from tqdm.auto import tqdm
import pandas as pd

In [22]:
def fast_inference(model, tokenizer, test_dataset, batch_size=32, max_length=512):
    """
    Batched True/False inference over a HuggingFace dataset.

    Each row's ``question`` and ``answer`` fields are rendered into the
    verification prompt, the model greedily generates up to two new tokens,
    and the decoded text is mapped to a boolean.

    Args:
        model: Causal LM to run; it is switched to Unsloth inference mode.
        tokenizer: Tokenizer paired with ``model``. Its padding and truncation
            sides are set to "left" for the duration of the call and restored
            afterwards. If it has no pad token, the EOS token is used.
        test_dataset: Dataset whose rows expose ``question`` and ``answer``.
        batch_size: Number of prompts generated per forward pass.
        max_length: Maximum prompt length in tokens; longer prompts are
            truncated from the left so the answer and response header survive.

    Returns:
        pandas.DataFrame with columns ``ID`` (0-based row position) and
        ``is_correct`` (True when the generated text starts with "true",
        case-insensitively).
    """
    # Enable fast inference
    FastLanguageModel.for_inference(model)

    # Create dataloader using HF features; order is preserved so IDs line up with rows
    dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    # The prompt must end exactly like the text seen during fine-tuning:
    # `alpaca_prompt` places the label on the line after "### Response:", so the
    # trailing newline is part of the prompt, not something the model should emit.
    prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an expert mathematician tasked with verifying mathematical solutions. Carefully analyze the given question and answer, then determine if the answer is correct. Respond ONLY with 'True' if the answer is correct, or 'False' if it's incorrect.

### Input:
Question: {question}
Answer: {answer}

### Response:
"""

    # Batched generation with a decoder-only model needs left padding: new
    # tokens are appended after the last column, so every prompt must end there
    # for the slice at input_ids.shape[1] to isolate the generated tokens.
    # Left truncation keeps the end of the prompt (answer + response header).
    saved_padding_side = tokenizer.padding_side
    saved_truncation_side = tokenizer.truncation_side
    tokenizer.padding_side = "left"
    tokenizer.truncation_side = "left"
    if tokenizer.pad_token is None:
        # padding=True cannot work without a pad token
        tokenizer.pad_token = tokenizer.eos_token

    predictions = []
    try:
        with torch.no_grad():
            for batch in tqdm(dataloader):
                # Create prompts using HF dataset fields
                batch_prompts = [
                    prompt_template.format(question=q, answer=a)
                    for q, a in zip(batch['question'], batch['answer'])
                ]

                # Tokenize
                inputs = tokenizer(
                    batch_prompts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_length
                ).to(model.device)

                # Generate (greedy, two tokens are enough for "True"/"False")
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=2,
                    num_beams=1,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )

                # Decode only the newly generated tokens
                batch_preds = tokenizer.batch_decode(
                    outputs[:, inputs['input_ids'].shape[1]:],
                    skip_special_tokens=True
                )
                predictions.extend([pred.strip() for pred in batch_preds])
    finally:
        # Leave the caller's tokenizer configured as we found it
        tokenizer.padding_side = saved_padding_side
        tokenizer.truncation_side = saved_truncation_side

    # Create submission DataFrame; prefix match tolerates a trailing token
    # after the verdict (only two tokens are generated).
    submission_df = pd.DataFrame({
        'ID': range(len(predictions)),
        'is_correct': [pred.lower().startswith('true') for pred in predictions]
    })

    return submission_df

In [10]:
from unsloth import FastLanguageModel
# Reload the fine-tuned adapter from the unzipped lora_model_3/ directory.
# NOTE(review): max_seq_length, dtype and load_in_4bit are not defined in this cell;
# they must already exist in the kernel (see the configuration cell near the top).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model_3", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

==((====))==  Unsloth 2024.11.6: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A10. Max memory: 22.184 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.6. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.11.6 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
       

In [23]:
# Switch to inference mode (fast_inference also does this internally) and
# generate predictions for every row of test_dataset.
# NOTE(review): despite the printed message, no metrics are computed in this cell.
FastLanguageModel.for_inference(model)
print("Running inference and calculating metrics...")
results_df = fast_inference(
    model, 
    tokenizer, 
    test_dataset,
    batch_size=16
)

Running inference and calculating metrics...


100%|██████████| 625/625 [04:52<00:00,  2.13it/s]


In [24]:
# Write the ID / is_correct predictions to submission.csv without the DataFrame index.
results_df.to_csv("submission.csv", index=False)