In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM


model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # ✅ TinyLlama model (1.1B)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
from huggingface_hub import login

login()  # Will prompt for your token


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset

# Load the GSM8K dataset (default config is 'main')
dataset = load_dataset("gsm8k", "main")  # or "default" if you prefer


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [4]:
def generate_gsm8k_prompt(example, tokenizer, split="train"):
    question = example["question"]

    r1_prefix = [
        {
            "role": "system",
            "content": "You are a helpful assistant. You first think about the reasoning process step by step and then provide the user with an answer."
        },
        {
            "role": "user",
            "content": f"{question} Please show your reasoning inside <think> </think> tags and your final answer inside <answer> </answer> tags."
        },
        {
            "role": "assistant",
            "content": "Let me solve this step by step.\n<think>"
        }
    ]

    return {
        "prompt": tokenizer.apply_chat_template(r1_prefix, tokenize=False, continue_final_message=True),
        "target": example["answer"],
        "question": question,
        "split": split
    }


In [11]:
from datasets import load_dataset, concatenate_datasets

train_subset = dataset["train"].select(range(200))

# Map the prompt generator to the train split
train_dataset = dataset["train"].map(
    lambda x: generate_gsm8k_prompt(x, tokenizer),
    desc="Formatting train prompts"
)

# Optional: Concatenate with test split
# test_dataset = dataset["test"].map(lambda x: generate_gsm8k_prompt(x, tokenizer), desc="Formatting test prompts")
# full_dataset = concatenate_datasets([train_dataset, test_dataset])
full_dataset = train_dataset

# Convert to pandas for saving
df = full_dataset.to_pandas()

# Save to JSONL file
df.to_json("gsm8k_formatted.json", orient="records", lines=True)


In [12]:
# Print the first processed example
print(train_dataset[0])


{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72', 'prompt': '<|system|>\nYou are a helpful assistant. You first think about the reasoning process step by step and then provide the user with an answer.</s>\n<|user|>\nNatalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? Please show your reasoning inside <think> </think> tags and your final answer inside <answer> </answer> tags.</s>\n<|assistant|>\nLet me solve this step by step.\n<think>', 'target': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72', 'split': 'train'}


In [14]:
import json

# Load the first example
with open("gsm8k_formatted.json", "r") as f:
    first_line = json.loads(f.readline())

prompt = first_line["prompt"]
print("📨 Prompt:\n", prompt)


📨 Prompt:
 <|system|>
You are a helpful assistant. You first think about the reasoning process step by step and then provide the user with an answer.</s>
<|user|>
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? Please show your reasoning inside <think> </think> tags and your final answer inside <answer> </answer> tags.</s>
<|assistant|>
Let me solve this step by step.
<think>


In [15]:
import torch

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Run inference
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode only the completion
response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

print("\n🤖 Model Response:\n", response)



🤖 Model Response:
 
- Natalia sold clips to 48 of her friends in April.
- Then she sold half as many clips in May.

- How many clips did Natalia sell altogether in April and May?

- Let's say she sold 36 clips in April and 24 in May.

- So, she sold a total of 60 clips in both April and May.

- Now, let's calculate the number of clips she sold in April.

- Let's say she sold 18 clips in April.



In [16]:
def format_reward_func(prompt: str, completion: str, example: dict[str, str]) -> float:
    import re

    try:
        reward = 0.0
        completion_lower = completion.lower()

        # ✅ Reasoning present?
        reasoning_keywords = [
            "step", "first", "then", "calculate", "next", "approach", "let's", "we get"
        ]
        if any(kw in completion_lower for kw in reasoning_keywords):
            reward += 2.0

        # ✅ Uses <think> and <answer> tags properly
        if "<think>" in completion and "</think>" in completion:
            reward += 1.0
        if "<answer>" in completion and "</answer>" in completion:
            reward += 1.0

        # ✅ Correct final answer?
        correct_answer_match = re.search(r"answer is (\d+)", example["answer"].lower())
        model_answer_match = re.search(r"<answer>\s*(.*?)\s*</answer>", completion)

        if correct_answer_match and model_answer_match:
            correct = correct_answer_match.group(1).strip()
            model = model_answer_match.group(1).strip()

            # Allow numeric match (you can make this more flexible with eval)
            if correct == model:
                reward += 5.0

        return reward

    except Exception:
        return 0.0


In [22]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
from trl import GRPOTrainer, GRPOConfig
import pandas as pd


# ✅ Apply LoRA
model = prepare_model_for_kbit_training(model)
# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=16,
#     target_modules=["q_proj", "v_proj"],
#     lora_dropout=0.1,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# ✅ Load your pandas DataFrame (already prepared)
# hf_dataset = Dataset.from_pandas(dataset)

# ✅ GRPOConfig


# grpo_config = GRPOConfig(
#     output_dir="./qwen1.5-countdown-grpo",
#     per_device_train_batch_size=2,
#     gradient_accumulation_steps=4,
#     num_train_epochs=1,
#     learning_rate=5e-6,
#     logging_steps=2,
#     save_steps=0,
#     warmup_steps=0,
#     bf16=False,                              # ✅ Set False if Colab doesn’t support BF16
#     fp16=True,
#     remove_unused_columns=False,
#     max_prompt_length=256,
#     max_completion_length=64,
#     num_generations=2
# )

grpo_config = GRPOConfig(
    output_dir="./qwen1.5-countdown-grpo",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=5e-6,
    logging_steps=4,                    # ✅ More frequent logging
    report_to="none",                   # ✅ Disable W&B logging for now
    save_steps=10,                      # Optional
    bf16=False,
    fp16=True,
    remove_unused_columns=False,
    max_prompt_length=256,
    max_completion_length=64,
    num_generations=2                  # ✅ Reduce to 1 for faster runs
)


def combined_reward_fn_factory(dataset):
    def reward_fn(prompts, completions, **kwargs):
        rewards = []
        for i, (prompt, completion) in enumerate(zip(prompts, completions)):
            try:
                example = dataset[i]

                format_reward = format_reward_func(prompt, completion, example)
                # equation_reward = equation_reward_func(prompt, completion, example)

                # total_reward = format_reward + equation_reward
                total_reward = format_reward
                rewards.append(total_reward)

            except Exception as e:
                print(f"[WARN] Reward function failed at index {i}: {e}")
                rewards.append(0.0)
        # print(rewards)
        return rewards

    return reward_fn

# def combined_reward_fn_factory(dataset):
#     def reward_fn(prompts, completions, **kwargs):
#         # ✅ Skip format_reward_func and equation_reward_func for now
#         print(f"[DEBUG] Scoring {len(prompts)} completions...")  # Optional logging
#         return [1.0 for _ in prompts]  # Return fixed reward
#     return reward_fn

reward_fn = combined_reward_fn_factory(train_dataset)






In [23]:
# ✅ Initialize GRPOTrainer
trainer = GRPOTrainer(
    model=model,
    args=grpo_config,
    train_dataset=train_dataset,
    reward_funcs=reward_fn,
)



trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
4,-0.0008
8,-0.0017
12,-0.001
16,-0.0001
20,-0.0003
24,0.0008
28,-0.0002
32,0.0006
36,0.0003
40,0.0004


TrainOutput(global_step=467, training_loss=0.0010722211618933676, metrics={'train_runtime': 3145.4944, 'train_samples_per_second': 2.376, 'train_steps_per_second': 0.148, 'total_flos': 0.0, 'train_loss': 0.0010722211618933676})

In [24]:
import torch

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Run inference
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode only the completion
response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

print("\n🤖 Model Response:\n", response)



🤖 Model Response:
 
First, let's think about the reasoning process step by step.

Step 1: Calculate the number of clips sold by Natalia in April and May.
Step 2: Calculate the total number of clips sold by Natalia throughout the month of April and May.
Step 3: Multiply the total number of clips sold by April and May by the percentage of sales in April and May.
Step 4: Divide the result by 48 to get the average number of clips sold per month.
Step 5: Add 48 clips to
