In [None]:
# Install Unsloth from the GitHub main branch, then xformers, TRL and PEFT.
# `--no-deps` on the second command installs only the named packages, without their dependencies.
# NOTE(review): a later cell runs an unpinned `uv pip install -q trl`, which can replace
# the `trl<0.9.0` pin chosen here — confirm which TRL version is actually intended.
!uv pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!uv pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft

In [None]:
!uv pip install -q -U bitsandbytes
# transformers is NOT installed by this cell: both candidate commands below are
# commented out (a pinned stable range and a plain upgrade). An earlier revision
# installed from 'git+https://github.com/huggingface/transformers.git'.
# !uv pip install -q -U "transformers>=4.36.0,<4.50.0"
# !uv pip install transformers -U
!uv pip install -q -U "peft>=0.7.0"
!uv pip install -q -U "accelerate>=0.25.0"
!uv pip install -q datasets
!uv pip install -q pandas
!uv pip install -q tensorboard
!uv pip install -q -U "huggingface-hub>=0.34.0,<1.0"
!uv pip install -q trl

In [None]:
# Install PyTorch and torchvision; no version is pinned here.
!uv pip install torch torchvision
# Disabled: opt in to the transformers 5.x release candidate.
# !uv pip install "transformers>=5.0.0rc1"

### Create Inference Harness

The next two cells create a simple inference harness that we will use for quick evals while reviewing our checkpoints. They have nothing to do with training the model, and we'll revisit them later.

---

The line `os.environ['TOKENIZERS_PARALLELISM'] = 'false'` is only there to silence a warning: Hugging Face tokenizers use multiple CPU cores by default.

When combined with PyTorch's DataLoader (which also uses multiprocessing), this can cause conflicts, so we set it to `'false'`.

In [1]:
import os

from IPython.display import HTML, display

# Switch off Hugging Face tokenizers parallelism through its environment variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Stylesheet that makes <pre> output wrap long lines instead of overflowing.
_PRE_WRAP_CSS = '''
<style>
  pre {
      white-space: pre-wrap;
  }
</style>
'''
display(HTML(_PRE_WRAP_CSS))

Set up the transformers inference API:

In [2]:
def get_completion(query: str, model, tokenizer) -> str:
    """Generate a sampled completion for ``query`` and return the decoded text.

    The query is wrapped in an "### Instruction: / ### Response:" prompt,
    tokenized, moved to the model's device and passed to ``model.generate``.

    Args:
        query: Instruction text substituted into the prompt template.
        model: Object exposing ``generate(**inputs, ...)``. Its ``device``
            attribute, when present, decides where the inputs are placed;
            otherwise the previous default ``"cuda:0"`` is used.
        tokenizer: Callable tokenizer that also provides ``batch_decode`` and
            ``eos_token_id``.

    Returns:
        The first sequence returned by ``generate``, decoded with
        ``tokenizer.batch_decode`` (special tokens are not stripped).
    """
    # Follow the model rather than assuming GPU 0, so the harness also works
    # when the model sits on another device.
    device = getattr(model, "device", "cuda:0")

    # NOTE(review): the training cell in this notebook formats examples with
    # tokenizer.apply_chat_template (system/user/assistant messages), not with
    # this "### Instruction / ### Response" layout — confirm which prompt
    # format the checkpoints under evaluation actually expect.
    prompt_template = """### Instruction:
{query}

### Response:
"""
    prompt = prompt_template.format(query=query)

    model_inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=True
    ).to(device)

    # EOS doubles as the padding token id for generation.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=1000,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.batch_decode(generated_ids)[0]

### 3. Model & Tokenizer loading 

We'll load the model using **QLoRA** quantization to reduce memory usage.
In full fine-tuning, by contrast,
the **AdamW** optimizer updates every weight matrix in the neural network.


We use FastLanguageModel here.

## Important: 

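I've set it to load the model in 4-bit mode (`load_in_4bit = True`), which fits comfortably on consumer GPUs while keeping high accuracy.

**TODO:** this note was written for the BF16 Reasoning model, but the cell below loads `unsloth/Ministral-3-3B-Instruct-2512`. Check whether these settings should change for the Instruct FP8 model.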

In [3]:
from unsloth import FastLanguageModel
import torch

# --- Loading configuration (these names are reused by later cells) ---
max_seq_length = 2048
dtype = None          # None = leave the dtype choice to the loader
load_in_4bit = True   # 4-bit loading, used here in place of a separate bnb config

# --- Load model and tokenizer together from Unsloth's pre-converted repo ---
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Ministral-3-3B-Instruct-2512",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.6: Fast Mistral patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 4070 SUPER. Num GPUs = 1. Max memory: 11.594 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:08<00:00,  4.33s/it]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


RuntimeError: Unsloth: The tokenizer is weirdly not loaded? Please check if there is one.

### 3. Configure LoRA:

Unsloth can handle the target modules for you (including the tricky `gate_proj`, `up_proj`, etc. that vanilla PEFT requires you to list manually); the cell below nevertheless lists them explicitly.

In [None]:
# Wrap the loaded model with LoRA adapters (rank 16, alpha 16, no dropout, no bias).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

### 4. Data Loading & Formatting (the "Junior Accountant" logic)

This is where we inject the "Junior Accountant" system prompt.

Each `prompt`/`response` record of the training JSON is rendered into the model's chat format with the tokenizer's chat template.

In [None]:
from datasets import load_dataset

# System prompt placed at the start of every training conversation.
system_prompt = """You are an expert accountant using Beancount syntax.
Instructions:
1. Analyze the transaction and the historical <context>.
2. FORMULATE A PLAN inside <plan> tags. Decide the high-level category (Asset, Liability, Income, Expense) and the double-entry logic.
3. EXECUTE THE PLAN inside <reasoning> tags. Verify the account name against history and confirm the math balances to zero.
4. WRITE THE CODE inside <entry> tags. Use strict Beancount syntax.
IMPORTANT: Output ONLY the raw XML."""


def formatting_prompts_func(examples):
    """Render a batch of prompt/response pairs into chat-template strings.

    Args:
        examples: Batched mapping with parallel ``'prompt'`` and ``'response'``
            sequences (flat records, one pair per example).

    Returns:
        ``{"text": [...]}`` holding one rendered conversation per input pair.
    """

    def render(user_text, assistant_text):
        # One three-turn conversation: system prompt, user input, expected answer.
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]
        # The answer is already part of the conversation, so no generation prompt.
        return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)

    pairs = zip(examples['prompt'], examples['response'])
    return {"text": [render(question, answer) for question, answer in pairs]}

# Load the flattened training set.
# The raw Label Studio export ("final_train.json") nests each record under
# 'data' / 'annotations', so it has no top-level 'prompt' / 'response' columns
# for formatting_prompts_func to read. Train on the flat file that the
# flattening cell writes ("ready_to_train.json") — run that cell first.
dataset = load_dataset("json", data_files="ready_to_train.json", split="train")
dataset = dataset.map(formatting_prompts_func, batched = True)

## Critical Check: JSON Format
Label Studio exports JSON in a nested format (inside `annotations`, `result`, etc.), but `load_dataset` is used here with a flat list of `{"prompt": "...", "response": "..."}` records.

Before training, run the cell below to flatten the Label Studio export (`final_train.json`) into `ready_to_train.json`:

In [None]:
import json
import re

input_file = "final_train.json"
output_file = "ready_to_train.json"


def _flatten_record(item):
    """Turn one Label Studio task into a flat ``{"prompt", "response"}`` record.

    Raises:
        KeyError, IndexError, TypeError: when the task does not have the
            expected nesting (missing keys, empty lists, wrong value types).
    """
    # 1. Prompt comes from the task data.
    prompt = item['data']['prompt']

    # 2. Response is read from 'annotations' (the reviewed version), not from
    #    'predictions': first annotation, first result, first text entry.
    response_text = item['annotations'][0]['result'][0]['value']['text'][0]

    # 3. Clean-up: drop whitespace after the colon that follows one of the five
    #    root account types, e.g. "Assets: Lloyds:Checking" -> "Assets:Lloyds:Checking".
    response_text = re.sub(r'(Assets|Liabilities|Expenses|Income|Equity):\s+', r'\1:', response_text)

    return {"prompt": prompt, "response": response_text}


print(f"📖 Reading {input_file}...")
# Explicit UTF-8 so the result does not depend on the platform's default encoding.
with open(input_file, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

flat_data = []
skipped_count = 0

for item in raw_data:
    try:
        flat_data.append(_flatten_record(item))
    except (KeyError, IndexError, TypeError):
        # Malformed record: count it and keep going. TypeError covers cases
        # such as a null 'annotations' field, which would otherwise abort the
        # whole conversion.
        skipped_count += 1

# 4. Save flattened file
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(flat_data, f, indent=2)

print(f"✅ Success! Processed {len(flat_data)} records.")
if skipped_count > 0:
    print(f"⚠️ Skipped {skipped_count} malformed records.")
print(f"💾 Saved to {output_file} - You are ready to train!")

### Check where the model is stored

In [None]:
# Check where the model is cached
from huggingface_hub import hf_hub_download
import os

# Default Hugging Face hub cache under the current user's home directory.
cache_dir = os.path.expanduser("~/.cache/huggingface/hub/")
print(f"Model cache location: {cache_dir}")
print("\nContents:")
if not os.path.exists(cache_dir):
    print("Cache directory not found yet")
else:
    # Only the first ten entries are listed to keep the output short.
    preview = os.listdir(cache_dir)[:10]
    for entry in preview:
        print(f"  - {entry}")

# A custom cache location can be chosen instead, e.g.:
# os.environ['HF_HOME'] = '/path/to/custom/cache'

## Apply QLora

In [None]:

from trl import SFTTrainer, SFTConfig  # Changed import
from unsloth import is_bfloat16_supported

# Build the trainer. All SFT-specific options (text field, sequence length,
# packing, preprocessing workers) are given once, inside SFTConfig. The previous
# version passed dataset_text_field / max_seq_length both to SFTTrainer and to
# SFTConfig; the trainer-level keywords are the older spelling, so they are
# dropped here to leave a single source of truth.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    args = SFTConfig(
        # --- dataset handling ---
        dataset_text_field = "text",      # column produced by formatting_prompts_func
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False,
        # --- batch size: 2 per device x 4 accumulation steps = 8 effective ---
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        # --- schedule ---
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        lr_scheduler_type = "linear",
        # --- precision: bf16 when supported, otherwise fp16 ---
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        # --- optimiser / bookkeeping ---
        optim = "adamw_8bit",
        weight_decay = 0.01,
        logging_steps = 1,
        seed = 3407,
        output_dir = "outputs",
    ),
)

# Start Training
trainer_stats = trainer.train()

Quick calculation:

- 700 records
- Effective batch size = `per_device_train_batch_size` (2) × `gradient_accumulation_steps` (4) = 8
- Steps per epoch = 700 / 8 ≈ 88 steps
- So 60 steps ≈ 0.7 epochs - you haven't even completed one full pass through your data yet!

Recommendations:

| Epochs | Steps | Use case |
|--------|-------|----------|
| 1 | ~90 | Minimum - sees all data once |
| 2-3 | ~180-270 | Sweet spot for fine-tuning |
| 5+ | 440+ | Risk of overfitting |

Since your loss was still decreasing at step 60, you probably have room to train more. I'd suggest trying max_steps = 180 (about 2 epochs) for a good balance.

Watch for:

- ✅ Good sign: loss continues decreasing smoothly
- ⚠️ Overfitting warning: loss drops very low (<0.1) or starts fluctuating

In [None]:
# Continue training from step 60 to step 180.
# The step budget is raised on the existing trainer's arguments before resuming.
trainer.args.max_steps = 180  # New target

# Resume from the last checkpoint
# NOTE(review): resume_from_checkpoint=True presumably needs a checkpoint saved
# under the trainer's output directory from the earlier run — confirm one exists.
trainer_stats = trainer.train(resume_from_checkpoint=True)

In [None]:
# Continue training from step 180 to step 270
# (assumes the previous cell, which trains up to step 180, has already been run).
trainer.args.max_steps = 270  # New target

# Resume from the last checkpoint
trainer_stats = trainer.train(resume_from_checkpoint=True)

### LOGIN TO HUB

When we push to the Hugging Face Hub, our local QLoRA adapter is first merged with the base model we trained on, and the merged model is then uploaded to the Hub.

In [None]:
import os
from huggingface_hub import login

# Authenticate against the Hugging Face Hub only when HF_TOKEN is set.
hf_token = os.getenv('HF_TOKEN')
if not hf_token:
    # No token: training can still run locally, only pushing is unavailable.
    print("No HF_TOKEN found. Proceeding with local training on local GPU...")
    print("Note: You won't be able to push models to HuggingFace Hub without authentication")
else:
    login(token=hf_token)
    print("Logged in with HF_TOKEN environment variable")

In [None]:
from unsloth import FastLanguageModel
import os
device = "cuda:0"

# 1. CONFIGURATION
# Point this to the exact folder on your disk
checkpoint_path = "outputs/checkpoint-180"
repo_name = "DataImaginations/ministral-3B-Beancount-v1" # Your Hugging Face repo
hf_token = os.getenv('HF_TOKEN')

# Fail fast with a clear message when the checkpoint folder is missing, instead
# of handing a non-existent local path to the model loader.
if not os.path.isdir(checkpoint_path):
    raise FileNotFoundError(f"Checkpoint folder not found: {checkpoint_path!r}")

# 2. LOAD SPECIFIC CHECKPOINT
# Pointing model_name at the checkpoint folder is meant to load the base model
# with the adapters saved in that folder applied.
print(f"📂 Loading checkpoint from {checkpoint_path}...")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = checkpoint_path,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True, # Keep True for fast loading
)

# 3. MERGE & PUSH
# save_method="merged_16bit" requests a merged 16-bit model for upload
# (the alternative noted originally is "merged_4bit").
print(f"🚀 Merging and pushing to {repo_name}...")

model.push_to_hub_merged(
    repo_name,
    tokenizer,
    save_method = "merged_16bit",
    token = hf_token
)

print("✅ Done! Your Junior Accountant (Checkpoint 180) is live!")

## ALL EDITS BELOW ARE PURELY ME IN PACKAGE HELL AFTER USING A MODEL SO NEW THAT TRANSFORMERS AND UNSLOTH DON'T MATCH

### IF YOU'RE USING THIS 1 WEEK+ AFTER 18/12/2025 YOU WON'T NEED THE BELOW (WHICH DIDN'T WORK ANYWAY)

---

In [None]:
# Remove the installed Unsloth packages before reinstalling another version.
# The command is issued twice; the second run repeats the first unchanged.
!uv pip uninstall unsloth unsloth_zoo
!uv pip uninstall unsloth unsloth_zoo  #

In [None]:
import unsloth
# This SHOULD fail with ModuleNotFoundError. If it doesn't, manual deletion is needed.

In [None]:
# Install dependencies FIRST: a fully pinned stack (only huggingface_hub has a lower bound).
!uv pip install "transformers==4.46.3" "peft==0.13.2" "trl==0.8.6" "accelerate==0.34.2" "huggingface_hub>=0.26.0" "bitsandbytes==0.44.1"
# Install Unsloth Stable (PyPI version, NOT git), pinned to a specific release.
!uv pip install "unsloth==2024.11.7"  # November stable release known to work with TR 4.46

In [None]:
# 1. Uninstall existing packages to prevent conflicts
!uv pip uninstall transformers peft trl unsloth accelerate
# 2. Install "Known Good" compatible versions (Late 2024 Stable Stack)
# - Transformers 4.46.3: Stable, works with Unsloth and HF Hub
# - PEFT 0.13.2: Compatible with Transformers 4.46
# - TRL 0.8.6: Compatible with Transformers 4.46
# - Accelerate 0.34.2: Stable backend
!uv pip install "transformers==4.46.3" "peft==0.13.2" "trl==0.8.6" "accelerate==0.34.2" "huggingface_hub>=0.26.0" "bitsandbytes==0.44.1"
# 3. Install Unsloth WITHOUT upgrading dependencies
# `--no-deps` keeps the versions pinned in step 2 in place
!uv pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --no-deps
# 4. A kernel restart is REQUIRED after this!
print("Please restart your kernel now!")

In [None]:
# Pin bitsandbytes to 0.44.1, the version used by the pinned stacks in this section.
!uv pip install "bitsandbytes==0.44.1"

In [None]:
# Upgrade bitsandbytes to the newest release. If run after the cell that pins
# 0.44.1, this replaces that pin.
!uv pip install -q -U bitsandbytes

# Push Model to hub!

In [None]:
import os
from huggingface_hub import login

# Authenticate against the Hugging Face Hub only when HF_TOKEN is set.
hf_token = os.getenv('HF_TOKEN')
if not hf_token:
    # No token: training can still run locally, only pushing is unavailable.
    print("No HF_TOKEN found. Proceeding with local training on local GPU...")
    print("Note: You won't be able to push models to HuggingFace Hub without authentication")
else:
    login(token=hf_token)
    print("Logged in with HF_TOKEN environment variable")

In [None]:
from transformers import AutoConfig, MistralConfig
# Best-effort attempt to alias the newer model-type strings to MistralConfig.
# Each type is tried on its own, so one failure no longer skips the other, and
# failures are reported instead of being silently swallowed.
# NOTE(review): AutoConfig.register is documented to reject a config class whose
# `model_type` differs from the name being registered, so with the stock
# MistralConfig these calls are expected to fail — confirm this cell is needed.
for _model_type in ("mistral3", "ministral3"):
    try:
        AutoConfig.register(_model_type, MistralConfig)
    except Exception as exc:
        print(f"Could not register {_model_type!r} with AutoConfig: {exc}")

In [None]:
from unsloth import FastLanguageModel
import os
device = "cuda:0"

# 1. CONFIGURATION
# Point this to the exact folder on your disk
checkpoint_path = "outputs/checkpoint-180"
repo_name = "DataImaginations/ministral-3B-Beancount-v1" # Your Hugging Face repo
hf_token = os.getenv('HF_TOKEN')

# Fail fast with a clear message when the checkpoint folder is missing, instead
# of handing a non-existent local path to the model loader.
if not os.path.isdir(checkpoint_path):
    raise FileNotFoundError(f"Checkpoint folder not found: {checkpoint_path!r}")

# 2. LOAD SPECIFIC CHECKPOINT
# Pointing model_name at the checkpoint folder is meant to load the base model
# with the adapters saved in that folder applied.
print(f"📂 Loading checkpoint from {checkpoint_path}...")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = checkpoint_path,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True, # Keep True for fast loading
)

# 3. MERGE & PUSH
# save_method="merged_16bit" requests a merged 16-bit model for upload
# (the alternative noted originally is "merged_4bit").
print(f"🚀 Merging and pushing to {repo_name}...")

model.push_to_hub_merged(
    repo_name,
    tokenizer,
    save_method = "merged_16bit",
    token = hf_token
)

print("✅ Done! Your Junior Accountant (Checkpoint 180) is live!")

In [None]:
# 1. Completely clean slate
!uv pip uninstall transformers unsloth peft trl accelerate bitsandbytes

# 2. Install the KNOWN WORKING stack from late 2024
!uv pip install "transformers==4.46.3" "peft==0.13.2" "trl==0.8.6" "accelerate==0.34.2" "bitsandbytes==0.44.1" "huggingface_hub>=0.26.0"
 
# 3. Install Unsloth WITHOUT letting it upgrade dependencies
!uv pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --no-deps

print("‚úÖ Done! RESTART YOUR KERNEL NOW before running any other cells!")

In [None]:
from unsloth import FastLanguageModel
import os

# Local checkpoint to merge, destination repo, and the token used for the upload.
checkpoint_path = "outputs/checkpoint-180"
repo_name = "DataImaginations/ministral-3B-Beancount-v1"
hf_token = os.getenv('HF_TOKEN')

# Fail fast with a clear message when the checkpoint folder is missing, instead
# of handing a non-existent local path to the model loader.
if not os.path.isdir(checkpoint_path):
    raise FileNotFoundError(f"Checkpoint folder not found: {checkpoint_path!r}")

# Load from the checkpoint folder in 4-bit mode.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=checkpoint_path,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Upload with save_method="merged_16bit" to the target repository.
model.push_to_hub_merged(
    repo_name,
    tokenizer,
    save_method="merged_16bit",
    token=hf_token
)

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Base model and tokenizer come from the same repository.
base_model_name = "unsloth/Ministral-3-3B-Instruct-2512"
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run on load — keep it only for repositories you trust.
_base_load_options = dict(
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(base_model_name, **_base_load_options)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Attach the adapter saved in the checkpoint folder, then fold it into the base weights.
model = PeftModel.from_pretrained(model, "outputs/checkpoint-180").merge_and_unload()

# Upload the merged model first, then the tokenizer, to the same repository.
for _artifact in (model, tokenizer):
    _artifact.push_to_hub("DataImaginations/ministral-3B-Beancount-v1")

In [None]:
# Optional (disabled): reinstall the libraries at their current versions first;
# no downgrade is needed for this path.
# !uv pip install -q peft transformers huggingface_hub torch

# Execute the standalone script merge_and_push.py (its contents are not part of this notebook).
%run merge_and_push.py

In [None]:
# Cell 1: Upgrade transformers (restart kernel after this!)
!uv pip install -U transformers peft huggingface_hub
print("‚úÖ Done! RESTART YOUR KERNEL NOW!")

In [None]:
# Cell 2: Run the merge by executing the standalone script merge_and_push.py
# (run this after the kernel restart requested by Cell 1).
%run merge_and_push.py