<a href="https://colab.research.google.com/github/DibiaCorp85/fine-tuning_nllb-200_600M/blob/main/_Fine_Tuning_En_Ig_LaTn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Install Dependencies**

In [2]:
!pip install --quiet chainlit pyngrok datasets transformers evaluate accelerate peft sacrebleu rouge_score bitsandbytes

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m85.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m


In [3]:
!pip install --quiet --upgrade fsspec datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m307.2/491.5 kB[0m [31m9.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.[0m[31m
[0m

## **Import Core Libraries**

In [4]:
import os
import requests
import random
import torch
from torch.optim import AdamW
from datasets import (load_dataset,
                      concatenate_datasets,
                      DatasetDict,
                      Dataset,
                      get_dataset_config_names,
                      Features,
                      ClassLabel,
                      Value,
                      Translation
                      )

from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    BitsAndBytesConfig,
)
from torch.utils.tensorboard import SummaryWriter
import evaluate
import pandas as pd
from sklearn.model_selection import train_test_split

from peft import (
    TaskType,
    LoraConfig,
    get_peft_model,
    PeftModel,
    PeftConfig,
)

from huggingface_hub import login
from google.colab import drive
import getpass
from pyngrok import conf, ngrok
import subprocess
import time

In [None]:
# Mount Google Drive to save model and logs
drive.mount('/content/drive', force_remount = True)
save_dir = "/content/drive/MyDrive/Colab Notebooks/NLLB_200/En-Ig_LaTn"
os.makedirs(save_dir, exist_ok = True)

Mounted at /content/drive


## **Load Datasets**

English-"Language" datasets are loaded from Hugging Face.

The following are the datasets are used:

* Opus100 containing the above listed languages paired with English language.

### **Opus100 Dataset**

In [None]:
login("hf access token key")

# List of desired target languages ISO codes(to pair with English Language)
target_language = {"ig" : "ibo"}

source_language = "en" # Fixed source language

desired_pairs = [f"{source_language}-{tgt}" for tgt in target_language]

# Fetch all configurations from Opus100
available_configs = get_dataset_config_names("opus100")

# Filter those that exist in Opus100
present_pairs = [pair for pair in desired_pairs if pair in available_configs]
missing_pairs = [pair for pair in desired_pairs if pair not in available_configs]

# Print results
print(" The En-Ig language pair is present in Opus100 dataset:")
for pair in present_pairs:
    print(f" - {pair} ({target_language[pair.split('-')[1]]})")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

 The En-Ig language pair is present in Opus100 dataset:
 - en-ig (ibo)


#### **Load dataset**

In [None]:
# English-Igbo
selected_language = ["ig"]

if "ig" in selected_language:
    try:
        opus_en_ig = load_dataset("opus100", "en-ig")
        print("English-Igbo language pair downloaded!")
    except Exception as e:
        print("Failed to download English-Igbo:", e)

test-00000-of-00001.parquet:   0%|          | 0.00/44.4k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/770k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/45.3k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1843 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/18415 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1843 [00:00<?, ? examples/s]

English-Igbo language pair downloaded!


## **Qualitative and Quantitative Examination of Datasets**

The following is a check-list to examine each dataset:

1. Splits
2. Features/columns
3. Format
4. Missing row
5. Check internal usaga of language pairs

### **Splits Check**

In [None]:
# Check splits
def print_split_info(name, dataset):
    print(f"\n📊 Dataset: {name}")

    if isinstance(dataset, DatasetDict):
        for split_name, split in dataset.items():
            print(f"  ➤ Split: {split_name} | Rows: {split.num_rows}")
    elif isinstance(dataset, Dataset):
        print(f"  ➤ Single split | Rows: {dataset.num_rows}")
    else:
        print("❌ Unrecognized dataset type.")

In [None]:
print_split_info("Opus100: English-Igbo Language Pair", opus_en_ig)


📊 Dataset: Opus100: English-Igbo Language Pair
  ➤ Split: test | Rows: 1843
  ➤ Split: train | Rows: 18415
  ➤ Split: validation | Rows: 1843


### **Features/Columns/Schema Check**

In [None]:
# Check schema/column names using just the train split
def print_dataset_features(datasets_with_names):
    """
    Prints the .features of multiple Hugging Face datasets with names.

    Args:
        datasets_with_names (list of tuples): List of (dataset, name) pairs.
    """
    for dataset, name in datasets_with_names:
        print(f"\n📘 Features for: {name}")
        print(dataset.features)

In [None]:
print_dataset_features([
    (opus_en_ig['train'], "Opus EN-IG")])


📘 Features for: Opus EN-IG
{'translation': Translation(languages=['en', 'ig'], id=None)}


### **NLLB-Format-Compatibility Check**

In [None]:
# The NLLB model expects the following format:

"""
{
  "translation": {
    "source language": "source sentence",
    "target language": "target sentence"
  }
}
"""

'\n{\n  "translation": {\n    "source language": "source sentence",\n    "target language": "target sentence"\n  }\n}\n'

#### **Recast Dataset to NLLB Format**

To recast your dataset to the NLLB format, you need to ensure two key things:

1. The translation feature uses a tuple of language codes, not a list.

2. This format aligns with what the NLLB tokenizer expects, especially for multilingual training.

In [None]:
# Define the correct NLLB-style features with a tuple
translation_features = Features({
    "translation": Translation(languages=("en", "ig"))  # <- Use tuple
})

# Recast all splits to match NLLB format
opus_en_ig = DatasetDict({
    split: ds.cast(translation_features)
    for split, ds in opus_en_ig.items()
})

Casting the dataset:   0%|          | 0/1843 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/18415 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1843 [00:00<?, ? examples/s]

In [None]:
# Verify format
print(opus_en_ig["train"].features)

{'translation': Translation(languages=('en', 'ig'), id=None)}


### **Check Missing Row**

In [None]:
def check_missing_rows_all_splits(dataset_dict, name, src_lang=None, tgt_lang=None):
    """
    Checks for missing or invalid rows across all splits in a DatasetDict.

    Args:
        dataset_dict (DatasetDict): The dataset with multiple splits.
        name (str): Dataset name for reporting.
        src_lang (str): Source language key (e.g., 'en').
        tgt_lang (str): Target language key (e.g., 'ig').
    """
    for split_name, split_dataset in dataset_dict.items():
        total = len(split_dataset)
        missing = 0

        for row in split_dataset:
            try:
                if "translation" in row:
                    trans = row["translation"]
                    src = trans.get(src_lang, "").strip() if src_lang else ""
                    tgt = trans.get(tgt_lang, "").strip() if tgt_lang else ""
                else:
                    src = row.get(src_lang, "").strip()
                    tgt = row.get(tgt_lang, "").strip()

                if not src or not tgt or len(src) <= 1 or len(tgt) <= 1:
                    missing += 1
            except Exception:
                missing += 1

        print(f"🔍 {name} ({split_name}): {missing} missing / {total} total rows")


In [None]:
# Run checks
check_missing_rows_all_splits(opus_en_ig, "Opus EN-IG", src_lang="en", tgt_lang="ig")

🔍 Opus EN-IG (test): 1843 missing / 1843 total rows
🔍 Opus EN-IG (train): 18415 missing / 18415 total rows
🔍 Opus EN-IG (validation): 1843 missing / 1843 total rows


## **Data Cleaning**

### **Clear Missing Rows**

In [None]:
from datasets import DatasetDict

def clean_missing_rows(dataset_dict, src_lang=None, tgt_lang=None):
    """
    Removes missing/invalid rows across all splits in a DatasetDict.

    Args:
        dataset_dict (DatasetDict): The dataset with splits (e.g. train, test, validation).
        src_lang (str): Source language key (e.g., 'en').
        tgt_lang (str): Target language key (e.g., 'ig').

    Returns:
        DatasetDict: Cleaned dataset with bad rows removed.
    """
    cleaned_splits = {}

    for split_name, split_dataset in dataset_dict.items():
        def is_valid(row):
            try:
                if "translation" in row:
                    src = row["translation"].get(src_lang, "").strip()
                    tgt = row["translation"].get(tgt_lang, "").strip()
                else:
                    src = row.get(src_lang, "").strip()
                    tgt = row.get(tgt_lang, "").strip()
                return bool(src and tgt and len(src) > 1 and len(tgt) > 1)
            except Exception:
                return False

        print(f"🧹 Cleaning split: {split_name}...")
        cleaned_split = split_dataset.filter(is_valid)
        cleaned_splits[split_name] = cleaned_split
        print(f"✅ {len(cleaned_split)} rows retained from {len(split_dataset)}")

    return DatasetDict(cleaned_splits)


In [None]:
opus_en_ig_clean = clean_missing_rows(opus_en_ig, src_lang="en", tgt_lang="ig")

🧹 Cleaning split: test...


Filter:   0%|          | 0/1843 [00:00<?, ? examples/s]

✅ 1839 rows retained from 1843
🧹 Cleaning split: train...


Filter:   0%|          | 0/18415 [00:00<?, ? examples/s]

✅ 18406 rows retained from 18415
🧹 Cleaning split: validation...


Filter:   0%|          | 0/1843 [00:00<?, ? examples/s]

✅ 1835 rows retained from 1843


### **Standardize Feature Schema: Confirmation**

In [None]:
# Define the correct NLLB-style features with a tuple
translation_features = Features({
    "translation": Translation(languages=("en", "ig"))  # <- Use tuple
})

# Recast all splits to match NLLB format
opus_en_ig_clean = DatasetDict({
    split: ds.cast(translation_features)
    for split, ds in opus_en_ig.items()
})

print(opus_en_ig_clean["train"].features)

Casting the dataset:   0%|          | 0/1843 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/18415 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1843 [00:00<?, ? examples/s]

{'translation': Translation(languages=('en', 'ig'), id=None)}


Everything looks fine

## **Save Dataset to Disc**

In [None]:
save_path = f"{save_dir}/CleanedIgboDataset"
opus_en_ig_clean.save_to_disk(save_path)
print(f"✅ Dataset saved to: {save_path}")

Saving the dataset (0/1 shards):   0%|          | 0/1843 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18415 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1843 [00:00<?, ? examples/s]

✅ Dataset saved to: /content/drive/MyDrive/Colab Notebooks/NLLB_200/En-Ig_LaTn/CleanedIgboDataset


## **Tokenization**

### **Load Tokenizer**

In [None]:
model_checkpoint = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.padding_side = "right" #  proper for attention mask alignment and decoder positioning for encoder-decoder model like NLLB

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

### **Preprocess**

In [None]:
opus_dataset = opus_en_ig_clean

# Isolate data splits
train_dataset = opus_dataset["train"]
val_dataset = opus_dataset["validation"]
test_dataset = opus_dataset["test"]


# Set the target language for NLLB tokenizer globally
tokenizer.src_lang = "eng_Latn"
tokenizer.tgt_lang = "ibo_Latn"

def preprocess(example):
    source = example.get("translation", {}).get("en", None)
    target = example.get("translation", {}).get("ig", None)

    if not source or not target:
        return {
            "input_ids": [],
            "attention_mask": [],
            "labels": []
            }

    # Add source language prefix for NLLB-style
    input_text = f">>ibo_Latn<< {source}"

    # Tokenize source and target using set lang codes
    model_inputs = tokenizer(
        input_text,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    target_inputs = tokenizer(
        target,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    model_inputs["labels"] = target_inputs["input_ids"]
    return model_inputs

In [None]:
# Map Preprocessing on all splits

train_tokenized = train_dataset.map(
    preprocess,
    remove_columns=["translation"],
    num_proc=4,  # Optional: use multiple processes
    desc="Tokenizing train set"
).filter(lambda example: example.get("labels") is not None)

val_tokenized = val_dataset.map(
    preprocess,
    remove_columns=["translation"],
    num_proc=4,
    desc="Tokenizing val set"
).filter(lambda example: example.get("labels") is not None)

test_tokenized = test_dataset.map(
    preprocess,
    remove_columns=["translation"],
    num_proc=4,
    desc="Tokenizing test set"
).filter(lambda example: example.get("labels") is not None)

Tokenizing train set (num_proc=4):   0%|          | 0/18415 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18415 [00:00<?, ? examples/s]

Tokenizing val set (num_proc=4):   0%|          | 0/1843 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1843 [00:00<?, ? examples/s]

Tokenizing test set (num_proc=4):   0%|          | 0/1843 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1843 [00:00<?, ? examples/s]

In [None]:
# Check a sample from tokenized data to confirm tokenization
print(train_tokenized[0])

{'input_ids': [256047, 20545, 256073, 57642, 42365, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [256047, 42365, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
# The strcture above is interpreted as:

"""
{
  'input_ids': [...],
  'attention_mask': [...],
  'labels': [...]
}

"""

"\n{\n  'input_ids': [...],\n  'attention_mask': [...],\n  'labels': [...]\n}\n\n"

In [None]:
print(train_tokenized)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 18415
})


## **Fine-Tune NLLB**

### **Configure BitsAndBytes**

In [None]:
# BitsAndBytes parameters
################################################################################
use_4bit = True # 4-bit precision on base model loading
bnb_4bit_compute_dtype = torch.float16 # compute datatype for 4-bit base model
bnb_4bit_quant_type = "nf4" # quantization type
use_nested_quant = False # activate nested quantization for 4-bit base models (double quantization)


bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint,
                                              device_map="auto",
                                              low_cpu_mem_usage=True,  # Explicitly set to avoid the warning
                                              quantization_config=bnb_config)

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

### **Test Model with Zero Shot Inferencing**

In [None]:
%%time

# 🌍 Define source & target languages (ISO 639-3 codes)
src_lang = "eng_Latn"
tgt_lang = "ibo_Latn"

# ✏️ Example input sentence in English
input_sentence = "The weather today is very pleasant."

# 🔡 Tokenize with language codes
inputs = tokenizer(
    input_sentence,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# ✨ Set language tokens
inputs["forced_bos_token_id"] = tokenizer.convert_tokens_to_ids(tgt_lang)
tokenizer.src_lang = src_lang

# 🔁 Run inference
with torch.no_grad():
    output_tokens = model.generate(
        **inputs,
        max_length=128,
        num_beams=4,
        early_stopping=True
    )

# 🗣️ Decode result
translated_text = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]
print(f"🔤 English: {input_sentence}")
print(f"🌍 Igbo: {translated_text}")

🔤 English: The weather today is very pleasant.
🌍 Igbo: Ihu igwe dị nnọọ mma taa.
CPU times: user 1.11 s, sys: 565 ms, total: 1.67 s
Wall time: 1.7 s


### **Setup Model with LoRA (PEFT**

In [None]:
# Set up LoRA config with target_modules
lora_config = LoraConfig(
    r = 8,  # Rank of the decomposition
    lora_alpha = 32,  # Scaling factor for LoRA updates
    lora_dropout = 0.05,  # Dropout rate for LoRA
    task_type = TaskType.SEQ_2_SEQ_LM,  # Sequence-to-sequence task
    bias = 'none',
    target_modules = ["q_proj", "v_proj", "k_proj", "o_proj"],  # Attention layers (query, value, key, output)
)

# Apply LoRA adapters to the model
model_with_lora = get_peft_model(model, lora_config)

In [None]:
# Check trainable parameters
model_with_lora.print_trainable_parameters()

trainable params: 1,769,472 || all params: 616,843,264 || trainable%: 0.2869


### **Prepare DataCollator**

In [None]:
# Instantiate the data collator for sequence-to-sequence tasks
data_collator = DataCollatorForSeq2Seq(tokenizer,
                                       model = model_with_lora,
                                       padding = True)

### **Define Seq2SeqTrainingArguments**

In [None]:
# Training arguments

training_args = Seq2SeqTrainingArguments(
    eval_strategy = "epoch",  # Evaluate after every epoch
    logging_dir = f"{save_dir}/logs",  # Directory for storing logs
    logging_strategy = "steps",  # Log every N steps
    logging_steps = 25,  # Log every 25 steps
    save_strategy = "epoch",  # Save model after every epoch
    save_total_limit = 3,  # Keep only the latest 3 checkpoints
    per_device_train_batch_size = 4,  # Batch size per device for training
    per_device_eval_batch_size = 4,  # Batch size per device for evaluation
    gradient_accumulation_steps = 2,  # Accumulate gradients for 2 steps before updating weights
    num_train_epochs = 3,  # Total number of epochs
    predict_with_generate = True,  # Predict with generate
    weight_decay = 0.01,  # Weight decay
    lr_scheduler_type = "linear",  # Linear learning rate scheduler
    optim = "paged_adamw_32bit",  # Optimizer to use
    learning_rate = 2e-5,  # Initial learning rate
    eval_steps = 500, # run validation every 500 steps
    fp16 = True,  # Use mixed precision training (not recommended for faster training on GPUs, especially A100 GPUs)
    load_best_model_at_end = True,  # Load the best model at the end based on evaluation metric
    metric_for_best_model = "eval_loss",  # Metric to monitor for the best model (e.g., BLEU score for translation)
    greater_is_better = False,  # Higher BLEU metric scores are better
    report_to = "none",  # Use TensorBoard for logging
    disable_tqdm = False,  # Enable or disable tqdm (progress bar)
    save_steps = 500,  # Save model checkpoints every 500 steps
    label_names = ["labels"],  # Name of the label column in the dataset
  )


### **Compute Metrics**

In [None]:
# metric = evaluate.load("sacrebleu")

# def compute_metrics(eval_preds):
#     preds, labels = eval_preds
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     result = metric.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
#     return {"bleu": result["score"]}


### **Training Setup**

In [None]:
trainer = Seq2SeqTrainer(
    model=model_with_lora,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized.select(range(min(len(val_tokenized), 2000))),
    data_collator=data_collator,
    #compute_metrics=compute_metrics  # compute BLEU
)

### **Train**

In [None]:
# Compute total training time
start_time = time.time()
print(f"Training starts at {start_time}")

trainer.train()

end_time = time.time()
print(f"Training ends at {end_time}")

total_seconds = end_time - start_time
hours = int(total_seconds // 3600)
minutes = int((total_seconds % 3600) // 60)
seconds = int(total_seconds % 60)

print(f"Training time: {hours}h {minutes}m {seconds}s")

Training starts at 1747760410.6678948


Epoch,Training Loss,Validation Loss
1,7.228,7.184656
2,7.1698,7.118298


Epoch,Training Loss,Validation Loss
1,7.228,7.184656
2,7.1698,7.118298
3,7.1149,7.103433


Training ends at 1747763873.3523066
Training time: 0h 57m 42s


## **Save Model and Tokenizer**

In [None]:
trainer.save_model(f"{save_dir}/En-Ig_FT_model")
tokenizer.save_pretrained(f"{save_dir}/En-Ig_FT_model")

('/content/drive/MyDrive/Colab Notebooks/NLLB_200/En-Ig_LaTn/En-Ig_FT_model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/NLLB_200/En-Ig_LaTn/En-Ig_FT_model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/NLLB_200/En-Ig_LaTn/En-Ig_FT_model/sentencepiece.bpe.model',
 '/content/drive/MyDrive/Colab Notebooks/NLLB_200/En-Ig_LaTn/En-Ig_FT_model/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/NLLB_200/En-Ig_LaTn/En-Ig_FT_model/tokenizer.json')

## **Push to Hugging Face**

In [None]:
!pip install huggingface_hub




In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `DibiaCorp` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenti

In [None]:
from huggingface_hub import HfApi, HfFolder
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Define repo details
repo_name = "drakensberg85/English-Igbo_NLLB_FT_model"
model_path = f"{save_dir}/En-Ig_FT_model"

# Upload using transformers
AutoModelForSeq2SeqLM.from_pretrained(model_path).push_to_hub(repo_name)
AutoTokenizer.from_pretrained(model_path).push_to_hub(repo_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/7.11M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/32.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/drakensberg85/English-Igbo_NLLB_FT_model/commit/eb3f4c25c6f66e2f0ac8a4745e3d760e58a55966', commit_message='Upload tokenizer', commit_description='', oid='eb3f4c25c6f66e2f0ac8a4745e3d760e58a55966', pr_url=None, repo_url=RepoUrl('https://huggingface.co/drakensberg85/English-Igbo_NLLB_FT_model', endpoint='https://huggingface.co', repo_type='model', repo_id='drakensberg85/English-Igbo_NLLB_FT_model'), pr_revision=None, pr_num=None)

## **TensorBoard Logging and Setup**

In [None]:
# writer = SummaryWriter(f"{save_dir}/logs")
# print("Training complete. View metrics using TensorBoard:")
# print(f"Run this in Colab terminal: tensorboard --logdir={save_dir}/logs")

In [None]:
# %reload_ext tensorboard
# %tensorboard --logdir "{/content/drive/MyDrive/Colab Notebooks/NLLB_200/En-Ig_LaTn/}logs"

## **Inference Check**

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Use the correct model path
model_path = f"{save_dir}/En-Ig_FT_model"

# Load tokenizer and model from local files
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)

# Send model to appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Example sentence for translation
english_sentence = "human"
source_sentence = f">>ibo_Latn<< {english_sentence}"

# Tokenize input and move to device
inputs = tokenizer(source_sentence, return_tensors="pt").to(device)

# Generate translation
with torch.no_grad():
    output = model.generate(**inputs, max_length=128, num_beams=5, early_stopping=True)

# Decode and print translation
igbo_translation = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"English: {english_sentence}")
print(f"Igbo: {igbo_translation}")


English: human
Igbo: Mmadụ


## **Incorporating Chainlit**

In [None]:
%%writefile app.py

import chainlit as cl
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch


# Load model & tokenizer
model_path = f"{save_dir}/En-Ig_FT_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

@cl.on_chat_start
async def start():
    await cl.Message(content="👋 Welcome! Type something in English and I'll translate it to Igbo!").send()

@cl.on_message
async def main(message: cl.Message):
    input_text = f">>ibo_Latn<< {message.content}"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Create an empty Chainlit message to stream into
    response = cl.Message(content="")
    await response.send()

    # Generate tokens step-by-step
    output_tokens = model.generate(
        **inputs,
        max_length=128,
        num_beams=1,  # Beam search disables streaming behavior
        do_sample=False,
        output_scores=False,
        return_dict_in_generate=True
    )

    # Stream tokens (you can simulate streaming with a short delay per chunk if needed)
    output_text = tokenizer.decode(output_tokens.sequences[0], skip_special_tokens=True)

    # Simulate streaming (token-by-token)
    for token in output_text.split():
        response.content += token + " "
        await response.update()

    # Final update
    await response.update()


Writing app.py
