In [None]:
import os
# Set the UNSLOTH_RETURN_LOGITS flag in the process environment.
# NOTE(review): the original comment described this as "using GPU memory efficiently";
# going by the variable name it asks Unsloth to return logits from the forward pass,
# which is not a memory optimisation — confirm against the Unsloth docs that it is needed.
# It is set in the first cell so it is already in os.environ when `unsloth` is imported below.
os.environ["UNSLOTH_RETURN_LOGITS"] = "1"

In [None]:
# !pip install uv   # uncomment and run once if the `uv` CLI is not installed yet
# Cell 1: Installs
# We need 'trl' for GRPO and 'unsloth' for the model
!uv pip install unsloth vllm
# --no-deps: install these packages themselves without pulling in their dependencies
!uv pip install --no-deps trl peft accelerate bitsandbytes
# xformers is removed here; the model-loading cell additionally blocks it via sys.modules
!uv pip uninstall xformers
!uv pip install -q datasets
!uv pip install -q pandas
!uv pip install -q tensorboard
# Version range is quoted so the shell does not treat '<' / '>' as redirections
!uv pip install -q -U "huggingface-hub>=0.34.0,<1.0"



In [None]:
# !uv venv
# !uv pip install unsloth

In [None]:
# !source .venv/bin/activate

In [None]:
# !uv pip install torch torchvision
# !uv pip install "transformers>=5.0.0rc1"

In [None]:
import sys
import os
# Fix for xformers/triton incompatibility.
# A `None` entry in sys.modules makes every later `import xformers` raise ImportError,
# so libraries imported below fall back to their non-xformers code path.
# This must run BEFORE `unsloth` is imported. (Plain dict assignment cannot fail, so the
# former try/bare-except wrapper was removed.)
sys.modules['xformers'] = None

# Cell 2: Load Model (Gemma 3 4B)
from unsloth import FastLanguageModel
import torch

max_seq_length = 4096 # Reasoning needs space!
dtype = None          # passed through unchanged; presumably lets Unsloth pick the dtype — TODO confirm
load_in_4bit = True   # load the bitsandbytes 4-bit quantised weights

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit", # Or "unsloth/gemma-3-4b-it" if 4bit not up yet
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Enable PEFT (LoRA): wrap the base model with rank-16 adapters on the attention
# and MLP projection layers listed below.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    use_gradient_checkpointing = "unsloth", # Critical for 12GB VRAM
    random_state = 3407,  # fixed seed for reproducible adapter initialisation
)

In [None]:
# Cell 3: The HICRA Logic (Strategic Grams)

# These are the "thinking words" the paper identified. 
# When the model uses these, it is "planning".
import re

STRATEGIC_GRAMS = [
    "first i need to", "let's look at", "alternatively", "wait",
    "but i'm not sure", "let's see if", "notice that",
    "the final answer is", "let's assume", "we can conclude",
    "implies that", "to solve this", "break it down",
    "suppose that", "checking the", "recall that"
]

# Planning-bonus bookkeeping: each distinct strategic gram found adds
# PLANNING_BONUS_PER_GRAM, and the total is clipped at PLANNING_BONUS_CAP so the
# bonus can never outweigh the 1.0 correctness reward.
PLANNING_BONUS_PER_GRAM = 0.1
PLANNING_BONUS_CAP = 0.5


def _completion_to_text(completion):
    """Flatten one completion into plain text.

    Accepts the shapes a GRPO reward function can receive:
      * a plain string (standard prompt format),
      * a list of chat messages ``[{"role": ..., "content": ...}]``
        (conversational prompt format),
      * message content given as a list of parts ``[{"type": "text", "text": ...}]``.
    Anything else is converted with ``str()``; ``None`` becomes ``""``.
    """
    if completion is None:
        return ""
    if isinstance(completion, str):
        return completion
    if isinstance(completion, dict):
        # Chat message -> its "content"; content part -> its "text".
        key = "content" if "content" in completion else "text"
        return _completion_to_text(completion.get(key))
    if isinstance(completion, (list, tuple)):
        pieces = (_completion_to_text(item) for item in completion)
        return "\n".join(piece for piece in pieces if piece)
    return str(completion)


def _answer_pattern(answer_text):
    """Build a regex that matches ``answer_text`` only as a standalone token.

    When the answer starts/ends with an alphanumeric character, the match must not
    be glued to a neighbouring word character, nor sit inside a longer number
    ("5" must not match in "15", "1.5", "5.2" or "1,500").
    ``answer_text`` must be non-empty.
    """
    prefix = r"(?<!\w)(?<!\d[.,])" if answer_text[0].isalnum() else ""
    suffix = r"(?!\w)(?![.,]\d)" if answer_text[-1].isalnum() else ""
    return prefix + re.escape(answer_text) + suffix


def correctness_reward_func(prompts, completions, answer, **kwargs):
    """
    Reward = 1.0 if the final answer is correct, 0.0 otherwise.
    This is the "Ground Truth" signal.

    Args:
        prompts: prompts for the batch (unused; part of the reward-function signature).
        completions: one completion per prompt, either strings or chat-message lists.
        answer: reference answers from the dataset's 'answer' column.
        **kwargs: any other dataset columns / trainer extras (ignored).

    Returns:
        list[float]: one reward per (completion, answer) pair.

    This is still a containment check, not exact answer extraction, but the answer
    must appear as a standalone token (case-sensitive). Missing or empty reference
    answers score 0.0 instead of trivially matching every completion.
    """
    rewards = []
    for completion, correct_ans in zip(completions, answer):
        expected = "" if correct_ans is None else str(correct_ans).strip()
        if not expected:
            rewards.append(0.0)
            continue
        text = _completion_to_text(completion)
        found = re.search(_answer_pattern(expected), text) is not None
        rewards.append(1.0 if found else 0.0)
    return rewards


def hicra_planning_reward_func(prompts, completions, **kwargs):
    """
    HICRA Proxy: Gives a small bonus for using 'Strategic Grams'.
    This encourages the model to 'think' before answering.

    Args:
        prompts: prompts for the batch (unused; part of the reward-function signature).
        completions: one completion per prompt, either strings or chat-message lists.
        **kwargs: dataset columns / trainer extras (ignored).

    Returns:
        list[float]: per-completion bonus in [0, PLANNING_BONUS_CAP].

    Each DISTINCT gram counts once (repeating a phrase earns nothing extra) and must
    match on word boundaries, so "await" does not count as "wait".
    """
    rewards = []
    for completion in completions:
        # Lower-case and normalise typographic apostrophes so "let’s" matches "let's".
        text = _completion_to_text(completion).lower().replace("\u2019", "'")
        hits = sum(
            1
            for gram in STRATEGIC_GRAMS
            if re.search(r"\b" + re.escape(gram) + r"\b", text)
        )
        # Cap the bonus so it doesn't game the system just by spamming words
        rewards.append(min(PLANNING_BONUS_PER_GRAM * hits, PLANNING_BONUS_CAP))
    return rewards

In [None]:
# Cell 4: Prepare Data for GRPO
from datasets import load_dataset

# Load the file you generated with the API script
dataset = load_dataset("json", data_files="reasoning_dataset.json", split="train")

# GRPO expects a specific format. We don't need a system prompt for simple math.
# It just needs 'prompt' and 'answer' (which we generated).
# Fail fast here rather than minutes later inside the trainer / reward functions.
required_columns = ("prompt", "answer")
missing_columns = [name for name in required_columns if name not in dataset.column_names]
if missing_columns:
    raise ValueError(
        f"reasoning_dataset.json is missing required column(s) {missing_columns}; "
        f"found columns: {dataset.column_names}"
    )
if len(dataset) == 0:
    raise ValueError("reasoning_dataset.json contains no training examples")

# Show one record as a sanity check of the schema.
print(dataset[0])

In [None]:
# Cell 5: Train with GRPO
from trl import GRPOTrainer, GRPOConfig

# Load the TensorBoard extension
%load_ext tensorboard

# Start TensorBoard pointing to your output directory
# (Make sure 'gemma-3-reasoning-output' matches the 'output_dir' in your GRPOConfig!)
%tensorboard --logdir gemma-3-reasoning-output

# Configuration for 12GB VRAM
training_args = GRPOConfig(
    output_dir="gemma-3-reasoning-output",
    learning_rate=2e-5, # RL usually needs lower LR
    per_device_train_batch_size=1, # Keep small for VRAM
    gradient_accumulation_steps=8, 
    max_prompt_length=512,
    max_completion_length=512, # Allow it to think!
    num_generations=2, # Generate 2 answers per question to compare
    max_steps=200, # Quick run to test
    use_vllm=False,
    save_steps=50,
    logging_steps=1,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    report_to="tensorboard"
)

trainer = GRPOTrainer(
    model=model,
    reward_funcs=[correctness_reward_func, hicra_planning_reward_func],
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

# Start the RL Loop!
trainer.train()

In [None]:
# Continue training from step 60 to step 180
# NOTE(review): "step 60" looks like a leftover from an earlier run — the training cell
# in this notebook uses max_steps=200 and save_steps=50. If the newest checkpoint is
# already past step 180 this resume has nothing left to train; confirm the intended target.
trainer.args.max_steps = 180  # New target

# Resume from the last checkpoint
# (resume_from_checkpoint=True is expected to pick up the most recent checkpoint in the
# trainer's output_dir — see the Trainer.train documentation.)
trainer_stats = trainer.train(resume_from_checkpoint=True)

In [None]:
# Continue training up to step 270 (the previous resume cell targeted step 180)
trainer.args.max_steps = 270  # New target

# Resume from the last checkpoint
# (resume_from_checkpoint=True is expected to pick up the most recent checkpoint in the
# trainer's output_dir — see the Trainer.train documentation.)
# NOTE(review): the learning-rate schedule was created for the original max_steps;
# verify the LR logged after resuming is still non-zero.
trainer_stats = trainer.train(resume_from_checkpoint=True)

Set up the transformers inference API:

1. Adjusting Your Script for the Project
Here is the adjusted script. I have updated it to fit the Gemma-9B context and added a safety step to clear memory before merging (crucial on cloud GPUs to avoid crashing at the finish line).

You should append this to the end of your training notebook/script.

2. Important Step for HF Spaces
You must add your Hugging Face Token as a Secret in the Space settings, or the script won't be able to push the model.

Go to your Space -> Settings.

Scroll to "Variables and secrets".

Add a New Secret: HF_TOKEN -> [Paste your Write token].

In [None]:
import torch
import os
import gc
from huggingface_hub import login

# --- 0. LOCATE THE TRAINED ADAPTER ---
# Must match `output_dir` in the GRPOConfig used for training.
adapter_output_dir = "gemma-3-reasoning-output"


def _latest_checkpoint(output_dir):
    """Return the path of the highest-numbered `checkpoint-<step>` folder, or None.

    Only sub-directories whose name is `checkpoint-` followed by digits are considered,
    which is the naming the trainer uses when it saves every `save_steps` steps.
    """
    if not os.path.isdir(output_dir):
        return None
    candidates = []
    for entry in os.listdir(output_dir):
        prefix, _, step = entry.rpartition("-")
        if prefix == "checkpoint" and step.isdigit() and os.path.isdir(os.path.join(output_dir, entry)):
            candidates.append((int(step), entry))
    if not candidates:
        return None
    return os.path.join(output_dir, max(candidates)[1])


checkpoint_path = _latest_checkpoint(adapter_output_dir)
if checkpoint_path is None:
    # Check BEFORE freeing anything, so a wrong path never costs the in-memory model.
    raise FileNotFoundError(
        f"No 'checkpoint-<step>' folder found in '{adapter_output_dir}'. "
        "Train first, or point adapter_output_dir at your GRPOConfig output_dir."
    )

# --- 1. MEMORY CLEANUP (Crucial for Cloud) ---
# RL Training fills VRAM. We need to clear it before the heavy "Merge" step.
print("🧹 Cleaning up VRAM before merging...")
# Drop each name independently: previously a missing `batch` raised NameError inside a
# bare try/except, which silently skipped gc.collect() and empty_cache() altogether.
# The training-time model/tokenizer are dropped too — they are reloaded from disk below
# and would otherwise keep their VRAM during the reload.
for stale_name in ("trainer", "batch", "model", "tokenizer"):
    globals().pop(stale_name, None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# --- 2. RELOAD MODEL FOR MERGING ---
# Sometimes it's safer to reload the base model + adapter freshly to merge
# independent of the messy training state.
from unsloth import FastLanguageModel

print("🔄 Reloading model for clean merge...")
# Loading from the checkpoint folder makes Unsloth load the base model recorded in the
# adapter config AND apply the adapters, so the base model can never mismatch the one
# that was trained (this cell used to hard-code a Gemma-2-9B base for Gemma-3-4B adapters).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = checkpoint_path,
    max_seq_length = 4096,
    dtype = None,
    load_in_4bit = True,
)

# --- 3. LOGIN & PUSH ---
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # Not fatal: a cached `huggingface-cli login` may still authenticate the push.
    print("⚠️ No HF_TOKEN found! Check your Space 'Settings' -> 'Variables' to add it.")

# NOTE(review): this repo name says "Gemma-2-9B" but the notebook trains Gemma 3 4B —
# rename before pushing if that is not intended.
repo_name = "david-barnes/Gemma-2-9B-Reasoning-v1" # Your new repo name

print(f"⏳ Merging to 16-bit and Pushing to: {repo_name}...")

# This takes care of the de-quantization and merging in one go
model.push_to_hub_merged(
    repo_name,
    tokenizer,
    save_method = "merged_16bit", # 16-bit is best for sharing reasoning models
    token = hf_token
)

print("✅ Success! Your reasoning model is live.")

### 3. Configure LoRA:

Unsloth handles the target modules automatically (including the tricky gate_proj, up_proj, etc. that vanilla Peft requires you to list manually).

### Check where the model is stored

In [None]:
# Check where the model is cached
from huggingface_hub import hf_hub_download
from huggingface_hub.constants import HF_HUB_CACHE
import os

# Ask huggingface_hub for the cache directory it actually uses instead of assuming
# ~/.cache/huggingface/hub — the location changes when HF_HOME / HF_HUB_CACHE are set.
cache_dir = HF_HUB_CACHE
print(f"Model cache location: {cache_dir}")
print("\nContents:")
if os.path.isdir(cache_dir):
    # Sorted so the preview is stable between runs; only the first 10 entries are shown.
    for item in sorted(os.listdir(cache_dir))[:10]:
        print(f"  - {item}")
else:
    print("Cache directory not found yet")

# You can also set a custom cache location if you prefer
# (set it before huggingface_hub is first imported, otherwise it is not picked up):
# os.environ['HF_HOME'] = '/path/to/custom/cache'

## Apply QLora

Quick calculation:

700 records
Effective batch size = per_device_batch_size (2) × gradient_accumulation_steps (4) = 8
Steps per epoch = 700 / 8 = ~88 steps
So 60 steps = ~0.7 epochs - you haven't even completed one full pass through your data yet!

Recommendations:

| Epochs | Steps | Use Case |
|---|---|---|
| 1 | ~90 | Minimum - sees all data once |
| 2-3 | ~180-270 | Sweet spot for fine-tuning |
| 5+ | 440+ | Risk of overfitting |

Since your loss was still decreasing at step 60, you probably have room to train more. I'd suggest trying max_steps = 180 (about 2 epochs) for a good balance.

Watch for:

✅ Good sign: Loss continues decreasing smoothly
⚠️ Overfitting warning: Loss drops very low (<0.1) or starts fluctuating

### LOGIN TO HUB

When we push to the Hugging Face Hub, our local QLoRA adapter is merged with the base model we trained on, and the merged model is uploaded to the Hub.

In [None]:
import os
from huggingface_hub import login

# Authenticate against the Hugging Face Hub when a token is supplied via the environment.
hf_token = os.environ.get('HF_TOKEN')
if not hf_token:
    # No token available: nothing to log in with. Training can still run locally,
    # only pushing to the Hub is unavailable.
    print("No HF_TOKEN found. Proceeding with local training on local GPU...")
    print("Note: You won't be able to push models to HuggingFace Hub without authentication")
else:
    login(token=hf_token)
    print("Logged in with HF_TOKEN environment variable")

# Push Model to hub!

In [None]:
from unsloth import FastLanguageModel
import os
device = "cuda:0"  # NOTE(review): not referenced anywhere in this cell

# 1. CONFIGURATION
# Point this to the exact folder on your disk
# NOTE(review): "outputs/checkpoint-180", the repo name and the final message below look
# carried over from a different project — the GRPO run in this notebook writes to
# "gemma-3-reasoning-output". Confirm these values before running.
checkpoint_path = "outputs/checkpoint-180"
repo_name = "DataImaginations/ministral-3B-Beancount-v1" # Your Hugging Face repo
hf_token = os.getenv('HF_TOKEN')

# Fail fast with a clear message: a path that does not exist locally would otherwise be
# handed to the loader as if it were a model name.
if not os.path.isdir(checkpoint_path):
    raise FileNotFoundError(
        f"Checkpoint folder '{checkpoint_path}' does not exist. "
        "Set checkpoint_path to a real 'checkpoint-<step>' directory from your training run."
    )

# 2. LOAD SPECIFIC CHECKPOINT
# Unsloth is smart: if you point it to a folder, it loads the base model
# AND applies the adapters from that folder automatically.
print(f"📂 Loading checkpoint from {checkpoint_path}...")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = checkpoint_path,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True, # Keep True for fast loading (Unsloth handles the merge magic)
)

# 3. MERGE & PUSH
# This will de-quantize the base model, merge your checkpoint-180 adapters,
# and upload a clean 16-bit model to the Hub.
print(f"🚀 Merging and pushing to {repo_name}...")

model.push_to_hub_merged(
    repo_name,
    tokenizer,
    save_method = "merged_16bit", # options: "merged_4bit", "merged_16bit"
    token = hf_token  # None when HF_TOKEN is unset
)

print("✅ Done! Your Junior Accountant (Checkpoint 180) is live!")