## Install Dependencies

In [1]:
%%capture
# 1. Force un-install potentially conflicting libraries first
!pip uninstall -y unsloth unsloth-zoo peft trl transformers

# 2. Install Unsloth and compatible dependencies
# We use the specific 'colab-new' tag which is stable for T4 environments like Kaggle/Colab
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# 3. Install other requirements without deps to prevent version overwrites
!pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes unsloth-zoo

## Import Libraries and Setup

In [2]:
import torch
from unsloth import FastLanguageModel
import json
import pandas as pd
from datasets import Dataset, concatenate_datasets
import os
import glob

# Check GPU capability
max_seq_length = 2048 # Auto supports RoPE Scaling
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage

print(f"GPU Model: {torch.cuda.get_device_name(0)}")

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.


2026-01-10 16:15:28.858062: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768061729.159071      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768061729.211692      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768061729.824900      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768061729.824939      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768061729.824942      24 computation_placer.cc:177] computation placer alr

ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
GPU Model: Tesla T4


## Load, Merge, and Count Data

In [3]:
# Define your specific file paths
file_paths = [
    "/kaggle/input/nusantara-law-corpus/Adagium/all.json",
    "/kaggle/input/nusantara-law-corpus/Glosarium-MA/all.json",
    "/kaggle/input/nusantara-law-corpus/KHPTSultra/all.json",
    "/kaggle/input/nusantara-law-corpus/LawDictionary/all.json",
    "/kaggle/input/nusantara-law-corpus/UUD/uud.json"
]

combined_data = []

# Iterate and load
for file_path in file_paths:
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                # Ensure data is a list of records
                if isinstance(data, list):
                    combined_data.extend(data)
                    print(f"Successfully loaded {len(data)} records from: {os.path.basename(file_path)}")
                else:
                    print(f"Warning: {file_path} format is not a list of records.")
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
    else:
        print(f"File not found: {file_path}")

# Convert to Pandas DataFrame first for easier handling
df = pd.DataFrame(combined_data)

# Print total count
print("-" * 30)
print(f"Total merged data points: {len(df)}")
print("-" * 30)

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(df)

Successfully loaded 89 records from: all.json
Successfully loaded 207 records from: all.json
Successfully loaded 144 records from: all.json
Successfully loaded 2456 records from: all.json
Successfully loaded 250 records from: uud.json
------------------------------
Total merged data points: 3146
------------------------------


## Load Model (Gemma 2 9B)

In [4]:
from unsloth import FastLanguageModel
import torch

# 1. Load Model & Tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-9b-bnb-4bit",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

# 2. Configure Adapters
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, 
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# 3. Define Formatting Function (Must be done AFTER tokenizer is loaded)
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must be defined here

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    contexts     = examples["context"]
    responses    = examples["response"]
    texts = []
    for instruction, context, response in zip(instructions, contexts, responses):
        # Must add EOS_TOKEN, otherwise generation will go on forever!
        text = alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

# 4. Apply Formatting
# Now we map the function to the dataset
dataset = dataset.map(formatting_prompts_func, batched = True)

# 5. Split Data (90% Train, 10% Test)
dataset_split = dataset.train_test_split(test_size=0.1)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

print(f"Success! Data formatted.")
print(f"Training Samples: {len(train_dataset)}")
print(f"Evaluation Samples: {len(eval_dataset)}")

==((====))==  Unsloth 2026.1.2: Fast Gemma2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

Unsloth 2026.1.2 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


Map:   0%|          | 0/3146 [00:00<?, ? examples/s]

Success! Data formatted.
Training Samples: 2831
Evaluation Samples: 315


## Configure QLoRA Adapters

In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Rank. 16 is a good balance. 32/64 if you have a lot of data/compute.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0, # 0 is optimized
    bias = "none",    # "none" is optimized
    use_gradient_checkpointing = "unsloth", # Crucial for long context/low VRAM
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth: Already have LoRA adapters! We shall skip this step.


## Training (SFTTrainer)

In [6]:
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# 1. Split data (90% train, 10% test)
dataset_split = dataset.train_test_split(test_size=0.1)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

# 2. Configure Training for ~6 Hours
sft_config = SFTConfig(
    output_dir = "outputs",
    max_seq_length = max_seq_length,
    dataset_text_field = "text",
    dataset_num_proc = 2,
    packing = False,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    
    # 1 Epoch ~= 354 steps. 
    # 6 Epochs ~= 2124 steps ~= 6 Hours of training
    num_train_epochs = 6,      
    learning_rate = 5e-5,      
    lr_scheduler_type = "cosine", 
    warmup_steps = 100,        
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    optim = "adamw_8bit",
    weight_decay = 0.01,
    seed = 3407,
    eval_strategy = "steps",
    eval_steps = 350,          
    save_steps = 350,          
    logging_steps = 10,        
    report_to = "none",
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,   
    args = sft_config,
)

# 3. Execute
print("Starting Training Run...")
trainer_stats = trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2831 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/315 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Starting Training Run...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,831 | Num Epochs = 6 | Total steps = 2,124
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 54,018,048 of 9,295,724,032 (0.58% trained)


Step,Training Loss,Validation Loss
350,0.7965,0.77762
700,0.6325,0.759988
1050,0.4578,0.823412
1400,0.3219,0.959484
1750,0.2341,1.123829
2100,0.1679,1.24077


Unsloth: Will smartly offload gradients to save VRAM!


Unsloth: Not an error, but Gemma2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


## Execute Training

In [7]:
trainer_stats = trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,831 | Num Epochs = 6 | Total steps = 2,124
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 54,018,048 of 9,295,724,032 (0.58% trained)


Step,Training Loss,Validation Loss
350,0.2874,1.119042
700,0.2418,1.14444
1050,0.1544,1.273487
1400,0.1327,1.368575
1750,0.1057,1.447787
2100,0.0949,1.49495


Unsloth: Will smartly offload gradients to save VRAM!


## Inference and Evaluation

In [8]:
import time

# 1. Switch to inference mode
FastLanguageModel.for_inference(model)

# 2. Define varied test cases from your dataset
test_prompts = [
    "Jelaskan prinsip 'Nullum delictum nulla poena sine praevia lege poenali'.",
    "Apa sanksi bagi pejabat yang menyalahgunakan wewenang?",
    "Jelaskan perbedaan antara hukum perdata dan hukum pidana."
]

# 3. Advanced Generation Function
def generate_response(prompt, context=""):
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                prompt,
                context,
                "", # Output is empty for generation
            )
        ],
        return_tensors="pt"
    ).to("cuda")

    start_time = time.time()
    
    # Advanced parameters for control
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,       
        use_cache=True,
        temperature=0.6,          # Lower (0.1-0.6) is better for factual/legal tasks
        top_k=50,                 
        top_p=0.9,                
        repetition_penalty=1.1,   
        do_sample=True            
    )
    
    end_time = time.time()
    
    decoded_output = tokenizer.batch_decode(outputs)[0]
    response = decoded_output.split("### Response:\n")[-1].replace(tokenizer.eos_token, "")
    
    # Calculate speed
    num_tokens = len(outputs[0])
    duration = end_time - start_time
    tokens_per_sec = num_tokens / duration
    
    return response, tokens_per_sec

# 4. Run Evaluation
print("=== Starting Advanced Evaluation ===\n")

for i, prompt in enumerate(test_prompts):
    print(f"Test Case {i+1}: {prompt}")
    response, speed = generate_response(prompt)
    print(f"Response:\n{response}")
    print(f"Speed: {speed:.2f} tokens/sec")
    print("-" * 50)

=== Starting Advanced Evaluation ===

Test Case 1: Jelaskan prinsip 'Nullum delictum nulla poena sine praevia lege poenali'.
Response:
Prinsip hukum pidana bahwa suatu tindakan baru dapat dipidana jika undang-undang yang mengatur tentang pidana dari tindakan tersebut telah ada sebelum tindakan dilakukan.
Speed: 12.47 tokens/sec
--------------------------------------------------
Test Case 2: Apa sanksi bagi pejabat yang menyalahgunakan wewenang?
Response:
Penyalahgunaan wewenang oleh pejabat publik dapat dipidana dengan penjara maksimal tiga tahun dan denda sebanyak 400 Upah Minimum Ulang tahun (Regulasi Pemerintah).
Speed: 21.64 tokens/sec
--------------------------------------------------
Test Case 3: Jelaskan perbedaan antara hukum perdata dan hukum pidana.
Response:
Hukum perdata mengatur hubungan warga negara dalam hal harta kekayaan, hubungan keluarga, dan kontrak. Pelanggaran hukum perdata biasanya tidak bersifat criminal dan tidak ditindak oleh penegak hukum. Di sisi lain, hukum

## Save the Model

In [9]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# 1. Login
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("hf_token")
login(hf_token)

repo_name = "bayhaqieee/gemma-2-9b-nlaw-gguf" 

print("=== STEP 1: Pushing GGUF to Hugging Face (This takes time & RAM) ===")
# We do this FIRST to ensure disk space is available for the merge process.
try:
    model.save_pretrained_gguf(
        repo_name, 
        tokenizer, 
        quantization_method = "q4_k_m",
        token = hf_token
    )
    print("GGUF Pushed to Hugging Face successfully!")
except Exception as e:
    print(f"GGUF Push Failed: {e}")

print("\n Pushing Adapters (LoRA) to Hugging Face ")
# This saves just the small learning files to your repo
model.push_to_hub(repo_name, token=hf_token)
tokenizer.push_to_hub(repo_name, token=hf_token)
print("Adapters Pushed to Hugging Face!")

=== STEP 1: Pushing GGUF to Hugging Face (This takes time & RAM) ===
Unsloth: Merging model weights to 16-bit format...


config.json: 0.00B [00:00, ?B/s]

GGUF Push Failed: Failed to save/merge model: Unsloth: Failed saving locally - no disk space left. Uploading can work luckily! Use .push_to_hub instead.

 Pushing Adapters (LoRA) to Hugging Face 


README.md:   0%|          | 0.00/548 [00:00<?, ?B/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Saved model to https://huggingface.co/bayhaqieee/gemma-2-9b-nlaw-gguf


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


Adapters Pushed to Hugging Face!


In [10]:
print("\nSaving Adapters Locally (Kaggle)")
local_folder = "gemma-2-9b-nlaw_adapter"
model.save_pretrained(local_folder)
tokenizer.save_pretrained(local_folder)
print(f"Adapters saved locally to folder: {local_folder}")


Saving Adapters Locally (Kaggle)
Adapters saved locally to folder: gemma-2-9b-nlaw_adapter
