In [2]:
%%capture
# Environment setup cell: installs Unsloth and the libraries it is used with here.
# The %%capture magic above keeps the pip output out of the notebook.
import os, re
# Colab is detected by searching for "COLAB_" in the concatenated environment-variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth  # Do this in local & cloud setups
else:
    # Reduce the installed torch version to its "major.minor" prefix and use it to pick
    # an xformers release; versions missing from the table fall back to 0.0.34.
    import torch; v = re.match(r'[\d]{1,}\.[\d]{1,}', str(torch.__version__)).group(0)
    xformers = 'xformers==' + {'2.10':'0.0.34','2.9':'0.0.33.post1','2.8':'0.0.32.post2'}.get(v, "0.0.34")
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    # --no-deps: install exactly these packages without resolving their dependencies.
    !pip install --no-deps unsloth_zoo bitsandbytes accelerate {xformers} peft trl triton unsloth
# These two pins run in both branches, after the installs above.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [3]:
from unsloth import FastLanguageModel
import torch

# Loading options. max_seq_length is reused later when the trainer is built.
max_seq_length = 2048  # Supports RoPE Scaling internally, so choose any!
dtype = None           # None for auto detection. Float16 for T4, V100, Bfloat16 for Ampere+
load_in_4bit = True    # Use 4bit quantization to reduce memory usage

# Load the pre-quantized Llama 3.1 8B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.1-8B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# LoRA hyper-parameters, named so the rank/alpha relationship is explicit.
lora_rank = 32  # Rank 32, chosen to capture Romanized Nepali phonetics
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
    "gate_proj", "up_proj", "down_proj",     # MLP projections
]

# Wrap the base model with LoRA adapters on every module listed above.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=lora_target_modules,
    lora_alpha=2 * lora_rank,  # alpha is kept at 2x the rank (64)
    lora_dropout=0,            # Supports any, but = 0 is optimized
    bias="none",               # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # 30% less VRAM
    random_state=3407,
    use_rslora=True,           # Rank stabilized LoRA for higher rank stability
    loftq_config=None,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2026-02-20 15:56:18.689372: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771602978.870744      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771602978.922721      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771602979.334134      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771602979.334173      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771602979.334176      55 computation_placer.cc:177] computation placer alr

🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.2.1: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

Unsloth 2026.2.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
from datasets import load_dataset

# Alpaca-style prompt template. It has three positional "{}" slots which the
# .format(...) calls in this notebook fill, in order, with the instruction,
# the input (extra context) and the response text.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token string read from the loaded tokenizer; the formatting
# function appends it to every training example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Render a batch of records into Alpaca-style training strings.

    Args:
        examples: Batched mapping holding parallel sequences under the keys
            "instruction", "input" and "output".

    Returns:
        A dict with the single key "text": one string per record, built by
        filling `alpaca_prompt` with the record's three fields and appending
        `EOS_TOKEN`.

    Raises:
        KeyError: If any of the three expected keys is missing.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must be appended, otherwise generation goes on forever.
    # The loop names avoid shadowing the `input` builtin.
    texts = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in rows
    ]
    return {"text": texts}

# 1. Load the 10k-row JSON dataset from the Kaggle input directory.
#    Change DATA_PATH if the file lives somewhere else.
DATA_PATH = "/kaggle/input/datasets/anandrimal/roman-nepali/final_cross_lingual_indic_10k.json"
raw_dataset = load_dataset("json", data_files={"train": DATA_PATH}, split="train")

# Fail early with a readable message if the file lacks a column that
# formatting_prompts_func reads, instead of a KeyError inside .map().
required_columns = {"instruction", "input", "output"}
missing_columns = required_columns - set(raw_dataset.column_names)
if missing_columns:
    raise ValueError(
        f"Dataset at {DATA_PATH} is missing required column(s): {sorted(missing_columns)}"
    )

# 2. Hold out 10% of the rows for evaluation; the seed keeps the split reproducible.
dataset_split = raw_dataset.train_test_split(test_size=0.1, seed=3407)

# 3. Add the formatted "text" column to both splits.
dataset = dataset_split.map(formatting_prompts_func, batched=True)

# 4. Expose the two splits under the names the trainer cell uses.
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print("âœ… Data loaded successfully!")
print(f"Training on: {len(train_dataset)} rows")
print(f"Evaluating on: {len(eval_dataset)} rows")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

âœ… Data loaded successfully!
Training on: 9000 rows
Evaluating on: 1000 rows


In [5]:
from trl import SFTConfig, SFTTrainer
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 where supported, otherwise fp16.
use_bf16 = is_bfloat16_supported()

# Training hyper-parameters, kept separate from the trainer wiring below.
training_args = SFTConfig(
    # Batching: 2 sequences per device, accumulated over 4 steps.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,

    # Schedule: a single pass over the training split (no max_steps cap),
    # with a 50-step warmup followed by cosine decay.
    num_train_epochs=1,
    warmup_steps=50,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",

    # Optimizer
    optim="adamw_8bit",
    weight_decay=0.01,

    # Precision
    fp16=not use_bf16,
    bf16=use_bf16,

    # Evaluation on the held-out split every 100 steps; loss logged every 10.
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=10,

    # Reproducibility and outputs
    seed=3407,
    output_dir="outputs",
    report_to="none",  # Change to "wandb" if you use Weights & Biases
)

# Supervised fine-tuning trainer over the pre-formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,   # training split
    eval_dataset=eval_dataset,     # held-out split
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Set to True if your sequences are short for 5x faster training
    args=training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=8):   0%|          | 0/9000 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=8):   0%|          | 0/1000 [00:00<?, ? examples/s]

🦥 Unsloth: Padding-free auto-enabled, enabling faster training.


In [6]:
def _bytes_to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to three decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


# Report the GPU 0 properties and the reserved-memory figure before training.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gib(gpu_stats.total_memory)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Run the fine-tuning job; the returned object carries the run metrics.
trainer_stats = trainer.train()

# Wall-clock training time, in seconds and in minutes.
train_seconds = trainer_stats.metrics['train_runtime']
print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds/60, 2)} minutes used for training.")

GPU = Tesla T4. Max memory = 14.563 GB.
6.766 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,000 | Num Epochs = 1 | Total steps = 1,125
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
100,1.9411,1.850617
200,1.7567,1.654052
300,1.5821,1.576673
400,1.5263,1.485602
500,1.3502,1.429227
600,1.3446,1.357514
700,1.3317,1.297412
800,1.2617,1.244819
900,1.2663,1.205169
1000,1.1444,1.181577


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


10949.5791 seconds used for training.
182.49 minutes used for training.


In [None]:
# Smoke-test the fine-tuned model on a Romanized Nepali prompt.
from transformers import TextStreamer

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Fill the template with an instruction only; the response slot stays empty
# so the model generates it.
prompt = alpaca_prompt.format(
    "Kripaya euta choto katha lekhnuho.",  # instruction (Write a short story)
    "",  # input (leave blank if not needed)
    "",  # output - left blank for generation
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Stream tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256)

In [14]:
# Export the model to GGUF with q8_0 (8-bit) quantization for an 8-bit comparison.
# Output goes to the relative directory "nepali_model_8bit". The recorded run of this
# cell stopped with a "no disk space left" RuntimeError right after the
# "Merging model weights to 16-bit format" message.
model.save_pretrained_gguf(
    "nepali_model_8bit", 
    tokenizer, 
    quantization_method = "q8_0" # The high-fidelity 8-bit standard
)

Unsloth: Merging model weights to 16-bit format...


RuntimeError: Failed to save/merge model: Unsloth: Failed saving locally - no disk space left. Uploading can work luckily! Use .push_to_hub instead.

In [None]:
import os
import warnings

# Target folder on Google Drive for the q8_0 GGUF export.
drive_mount_point = "/content/drive"
save_directory = "/content/drive/MyDrive/nepali_research_8bit"

# os.makedirs will create this path on the local disk if Drive is not mounted,
# so the export would silently land on ephemeral storage. Surface that case.
if not os.path.ismount(drive_mount_point):
    warnings.warn(
        f"{drive_mount_point} is not a mount point; {save_directory} will be "
        "created on the local disk instead of Google Drive. Mount Drive first "
        "if the export should persist."
    )
os.makedirs(save_directory, exist_ok=True)

# Merge the adapters, convert to GGUF and quantize to q8_0 inside save_directory.
model.save_pretrained_gguf(
    save_directory,
    tokenizer,
    quantization_method = "q8_0",
)

Unsloth: Merging model weights to 16-bit format...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|██████████| 4/4 [00:00<00:00, 7256.58it/s]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [01:47<00:00, 26.76s/it]


Unsloth: Merge process complete. Saved to `/content/drive/MyDrive/nepali_research_8bit`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: llama.cpp found in the system. Skipping installation.
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into f16 GGUF format.
This might take 3 minutes...


In [11]:
# Save the adapters and the tokenizer files side by side in the local folder
# "nepali_lora_model". The tokenizer call is the last expression, so the cell
# displays the tuple of tokenizer file paths it returns.
model.save_pretrained("nepali_lora_model")
tokenizer.save_pretrained("nepali_lora_model")

('nepali_lora_model/tokenizer_config.json',
 'nepali_lora_model/special_tokens_map.json',
 'nepali_lora_model/tokenizer.json')

In [12]:
# Export the model locally as a Q4_K_M-quantized GGUF into "nepali_model_gguf".
# The recorded run of this cell failed with a "no disk space left" RuntimeError.
model.save_pretrained_gguf("nepali_model_gguf", tokenizer, quantization_method="q4_k_m")

# If you prefer 8-bit quantization, you can use "q8_0" instead:
# model.save_pretrained_gguf("nepali_model_gguf", tokenizer, quantization_method="q8_0")

Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/942 [00:00<?, ?B/s]

RuntimeError: Failed to save/merge model: Unsloth: Failed saving locally - no disk space left. Uploading can work luckily! Use .push_to_hub instead.

In [13]:
# Merged save into "nepali_model_8bit" using save_method="merged_8bit".
# NOTE(review): this cell was previously described as a full 16-bit merge, which does
# not match the "merged_8bit" argument — confirm which is intended and that the
# installed Unsloth version accepts "merged_8bit". The directory name is also the one
# used by the local q8_0 GGUF export cell, so the two outputs would share a folder.
model.save_pretrained_merged("nepali_model_8bit", tokenizer, save_method="merged_8bit")

RuntimeError: Unsloth: Failed saving locally - no disk space left. Uploading can work luckily! Use .push_to_hub instead.