In [None]:
import os
# Force unsloth to use the local GPU memory efficiently
os.environ["UNSLOTH_RETURN_LOGITS"] = "1"

In [None]:
!./setup_grpo_transformers.sh



[2mAudited [1m3 packages[0m [2min 5ms[0m[0m
[2K  [31m√ó[0m No solution found when resolving dependencies:                                  [0m
[31m  ‚ï∞‚îÄ‚ñ∂ [0mBecause 4-45 was not found in the package registry and you require 4-45,
[31m      [0mwe can conclude that your requirements are unsatisfiable.
[2K  [31m√ó[0m No solution found when resolving dependencies:                                  [0m
[31m  ‚ï∞‚îÄ‚ñ∂ [0mBecause 0-11 was not found in the package registry and you require 0-11,
[31m      [0mwe can conclude that your requirements are unsatisfiable.
[2K  [31m√ó[0m No solution found when resolving dependencies:                                  [0m
[31m  ‚ï∞‚îÄ‚ñ∂ [0mBecause 0-7 was not found in the package registry and you require 0-7,
[31m      [0mwe can conclude that your requirements are unsatisfiable.
[2K  [31m√ó[0m No solution found when resolving dependencies:                                  [0m
[31m  ‚ï∞‚îÄ‚ñ∂ [0mBecause 0-33 was n

### Local login, not for use with spaces

In [9]:
# Check where the model is cached
from huggingface_hub import hf_hub_download
import os

cache_dir = os.path.expanduser("~/.cache/huggingface/hub/")
print(f"Model cache location: {cache_dir}")
print("\nContents:")
if os.path.exists(cache_dir):
    for item in os.listdir(cache_dir)[:10]:  # Show first 10 items
        print(f"  - {item}")
else:
    print("Cache directory not found yet")

# You can also set a custom cache location if you prefer:
# os.environ['HF_HOME'] = '/path/to/custom/cache'

Model cache location: /home/user/.cache/huggingface/hub/

Contents:
  - version.txt


### Server-Side HF Login

In [None]:
import os
from huggingface_hub import login

# Login using your HF token
hf_token = os.getenv('HF_TOKEN')  # Try environment variable first

if hf_token:
    login(token=hf_token)
    print("‚úÖ Logged in with HF_TOKEN environment variable")
else:
    # If no env var, prompt for token (you'll need to paste it)
    login()
    print("‚úÖ Logged in interactively")

In [None]:
# !ssh -i ~/.ssh/id_ed25519 dataimaginations-heirarchical-reasoning@ssh.hf.space "echo 'export HF_TOKEN=' >> ~/.bashrc"

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import GRPOConfig, GRPOTrainer

# --- CONFIGURATION ---
MODEL_NAME = "google/gemma-3-4b-it"
output_dir = "gemma-3-reasoning-output"

# 1. Load Model in 4-bit (The "Unsloth" replacement)
print("‚è≥ Loading model in 4-bit...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, # Efficient on Ampere GPUs (A10G)
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation="flash_attention_2" # Optional: Faster if supported
)

‚è≥ Loading model in 4-bit...


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-3-4b-it.
401 Client Error. (Request ID: Root=1-6949d7ed-30aa135c591894357abfd08d;a08343e2-045f-4f25-a69f-7f50a671dc83)

Cannot access gated repo for url https://huggingface.co/google/gemma-3-4b-it/resolve/main/config.json.
Access to model google/gemma-3-4b-it is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
# 2. Prepare Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = "left" # CRITICAL for reasoning/generation steps!
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Cell 3: The HICRA Logic (Strategic Grams)

# These are the "thinking words" the paper identified. 
# When the model uses these, it is "planning".
STRATEGIC_GRAMS = [
    "first i need to", "let's look at", "alternatively", "wait", 
    "but i'm not sure", "let's see if", "notice that", 
    "the final answer is", "let's assume", "we can conclude",
    "implies that", "to solve this", "break it down", 
    "suppose that", "checking the", "recall that"
]

def correctness_reward_func(prompts, completions, answer, **kwargs):
    """
    Reward = 1.0 if the final answer is correct, 0.0 otherwise.
    This is the "Ground Truth" signal.
    """
    rewards = []
    for completion, correct_ans in zip(completions, answer):
        # Simple check: is the answer roughly in the text?
        # In a real system, you'd extract the number exactly.
        # For now, we check if the correct string appears in the output.
        if str(correct_ans) in completion:
            rewards.append(1.0)
        else:
            rewards.append(0.0)
    return rewards

def hicra_planning_reward_func(prompts, completions, **kwargs):
    """
    HICRA Proxy: Gives a small bonus for using 'Strategic Grams'.
    This encourages the model to 'think' before answering.
    """
    rewards = []
    for completion in completions:
        score = 0.0
        # Check for presence of planning words
        completion_lower = completion.lower()
        for gram in STRATEGIC_GRAMS:
            if gram in completion_lower:
                score += 0.1 # Small bonus for EACH planning step
        
        # Cap the bonus so it doesn't game the system just by spamming words
        rewards.append(min(score, 0.5)) 
    return rewards

In [None]:
# Cell 4: Prepare Data for GRPO
from datasets import load_dataset

# Load the file you generated with the API script
dataset = load_dataset("json", data_files="reasoning_dataset.json", split="train")

# GRPO expects a specific format. We don't need a system prompt for simple math.
# It just needs 'prompt' and 'answer' (which we generated).
print(dataset[0])

In [None]:
# Attach LoRA Adapters (PEFT)
print("üîó Attaching LoRA adapters...")
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
    lora_dropout=0.05,
    bias="none",
)
# We wrap the model manually so GRPO knows it's a PEFT model
model = get_peft_model(model, peft_config)

In [None]:
# Load the TensorBoard extension
%load_ext tensorboard

# Start TensorBoard pointing to your output directory
# (Make sure 'gemma-3-reasoning-output' matches the 'output_dir' in your GRPOConfig!)
%tensorboard --logdir gemma-3-reasoning-output

# Define Training Arguments (GRPO)
training_args = GRPOConfig(
    output_dir="gemma-3-reasoning-output",
    learning_rate=5e-6, # Lower LR for RL
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_prompt_length=512,
    max_completion_length=512, # The "Thinking" Space
    num_generations=4, # Rollouts per prompt
    max_steps=200, 
    save_steps=50,
    logging_steps=1,
    fp16=False,
    bf16=True, # Use bfloat16 on A10G
    report_to="tensorboard"
)

In [None]:
# 5. Initialize Trainer
# Note: We assume you still have your 'dataset' and reward functions from before
print("üöÄ Starting GRPO Trainer...")
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer, # Newer TRL uses 'processing_class' instead of 'tokenizer'
    reward_funcs=[correctness_reward_func, hicra_planning_reward_func],
    args=training_args,
    train_dataset=dataset,
)

In [None]:
# Train with GRPO
trainer.train()


In [None]:
# Continue training from step 60 to step 180
trainer.args.max_steps = 180  # New target

# Resume from the last checkpoint
trainer_stats = trainer.train(resume_from_checkpoint=True)

In [None]:
# Continue training from step 60 to step 180
trainer.args.max_steps = 270  # New target

# Resume from the last checkpoint
trainer_stats = trainer.train(resume_from_checkpoint=True)

Set up the transformers inference API:

1. Adjusting Your Script for the Project
Here is the adjusted script. I have updated it to fit the Gemma-9B context and added a safety step to clear memory before merging (crucial on cloud GPUs to avoid crashing at the finish line).

You should append this to the end of your training notebook/script.

2. Important Step for HF Spaces
You must add your Hugging Face Token as a Secret in the Space settings, or the script won't be able to push the model.

Go to your Space -> Settings.

Scroll to "Variables and secrets".

Add a New Secret: HF_TOKEN -> [Paste your Write token].

In [5]:
import torch
import os
import gc
from huggingface_hub import login

# --- 1. MEMORY CLEANUP (Crucial for Cloud) ---
# RL Training fills VRAM. We need to clear it before the heavy "Merge" step.
print("üßπ Cleaning up VRAM before merging...")
try:
    del trainer
    del batch
    gc.collect()
    torch.cuda.empty_cache()
except:
    pass

# --- 2. RELOAD MODEL FOR MERGING ---
# Sometimes it's safer to reload the base model + adapter freshly to merge
# independent of the messy training state.
from unsloth import FastLanguageModel

print("üîÑ Reloading model for clean merge...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-9b-it-bnb-4bit", # Your base model
    max_seq_length = 4096,
    dtype = None,
    load_in_4bit = True,
)

# Load the adapters you just trained
# Assuming your GRPOConfig output_dir was "gemma-reasoning-output"
# and the latest checkpoint is saved there.
from peft import PeftModel
model = PeftModel.from_pretrained(model, "gemma-reasoning-output/checkpoint-final") # Update path to your actual checkpoint folder!

# --- 3. LOGIN & PUSH ---
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    print("‚ö†Ô∏è No HF_TOKEN found! Check your Space 'Settings' -> 'Variables' to add it.")

repo_name = "david-barnes/Gemma-2-9B-Reasoning-v1" # Your new repo name

print(f"‚è≥ Merging to 16-bit and Pushing to: {repo_name}...")

# This takes care of the de-quantization and merging in one go
model.push_to_hub_merged(
    repo_name,
    tokenizer,
    save_method = "merged_16bit", # 16-bit is best for sharing reasoning models
    token = hf_token
)

print("‚úÖ Success! Your reasoning model is live.")

üßπ Cleaning up VRAM before merging...


ModuleNotFoundError: No module named 'unsloth'

### 3. Configure LoRA:

Unsloth handles the target modules automatically (including the tricky gate_proj, up_proj, etc. that vanilla Peft requires you to list manually).

### Check where the model is stored

In [6]:
# Check where the model is cached
from huggingface_hub import hf_hub_download
import os

cache_dir = os.path.expanduser("~/.cache/huggingface/hub/")
print(f"Model cache location: {cache_dir}")
print("\nContents:")
if os.path.exists(cache_dir):
    for item in os.listdir(cache_dir)[:10]:  # Show first 10 items
        print(f"  - {item}")
else:
    print("Cache directory not found yet")

# You can also set a custom cache location if you prefer:
# os.environ['HF_HOME'] = '/path/to/custom/cache'

Model cache location: /home/user/.cache/huggingface/hub/

Contents:
  - version.txt


## Apply QLora

Quick calculation:

700 records
Effective batch size = per_device_batch_size (2) √ó gradient_accumulation_steps (4) = 8
Steps per epoch = 700 / 8 = ~88 steps
So 60 steps = ~0.7 epochs - you haven't even completed one full pass through your data yet!

Recommendations:

Epochs |	Steps |	Use Case |
1 |	~90 |	Minimum - sees all data once |
2-3 |	~180-270|	Sweet spot for fine-tuning |
5+ |	440+ |	Risk of overfitting |

Since your loss was still decreasing at step 60, you probably have room to train more. I'd suggest trying max_steps = 180 (about 2 epochs) for a good balance.

Watch for:

‚úÖ Good sign: Loss continues decreasing smoothly
‚ö†Ô∏è Overfitting warning: Loss drops very low (<0.1) or starts fluctuating

### LOGIN TO HUB

When we push to HuggingFace Hub, it will merge our local QLoRa adaptor with the base model we used to train, on the hub.

In [None]:
import os
from huggingface_hub import login

# Try to login with token from environment variable
hf_token = os.getenv('HF_TOKEN')
if hf_token:
	login(token=hf_token)
	print("Logged in with HF_TOKEN environment variable") 
else:
	# Skip login for local training - you can still train without pushing to hub
	print("No HF_TOKEN found. Proceeding with local training on local GPU...")
	print("Note: You won't be able to push models to HuggingFace Hub without authentication")

# Push Model to hub!

In [None]:
from unsloth import FastLanguageModel
import os
device = "cuda:0"

# 1. CONFIGURATION
# Point this to the exact folder on your disk
checkpoint_path = "outputs/checkpoint-180" 
repo_name = "DataImaginations/ministral-3B-Beancount-v1" # Your Hugging Face repo
hf_token = os.getenv('HF_TOKEN')

# 2. LOAD SPECIFIC CHECKPOINT
# Unsloth is smart: if you point it to a folder, it loads the base model 
# AND applies the adapters from that folder automatically.
print(f"üìÇ Loading checkpoint from {checkpoint_path}...")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = checkpoint_path, 
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True, # Keep True for fast loading (Unsloth handles the merge magic)
)

# 3. MERGE & PUSH
# This will de-quantize the base model, merge your checkpoint-180 adapters, 
# and upload a clean 16-bit model to the Hub.
print(f"üöÄ Merging and pushing to {repo_name}...")

model.push_to_hub_merged(
    repo_name,
    tokenizer,
    save_method = "merged_16bit", # options: "merged_4bit", "merged_16bit"
    token = hf_token
)

print("‚úÖ Done! Your Junior Accountant (Checkpoint 180) is live!")