# Phase 1

In [1]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
)
import torch
import os


In [2]:
# Load translated Taita-English dataset (assuming you've built it like before)
# If it's not yet built, run your previous preprocessing script first.
dataset = load_dataset("thinkKenya/kenyan-low-resource-language-data", "dav_swa")

# Load tokenizer and model (we'll fine-tune T5-small)
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Translate Swahili to English using Rogendo
rogendo_model_name = "Rogendo/sw-en"
rogendo_tokenizer = AutoTokenizer.from_pretrained(rogendo_model_name)
rogendo_model = AutoModelForSeq2SeqLM.from_pretrained(rogendo_model_name).to(device)
rogendo_model.eval()



MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(58905, 512, padding_idx=58904)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(58905, 512, padding_idx=58904)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [4]:
# Step 1: Translate Swahili to English using the Rogendo model
def translate_swa_to_en(batch):
    swahili_sentences = [item["swa"] for item in batch["translation"]]
    inputs = rogendo_tokenizer(swahili_sentences, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = rogendo_model.generate(**inputs, max_length=128)
    english_sentences = rogendo_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return {"english": english_sentences}

# Apply translation on the train split
translated_train = dataset["train"].map(translate_swa_to_en, batched=True, batch_size=16)

# Step 2: Build input-output pairs (Taita → English)
def build_input_target(example):
    return {
        "input_text": example["translation"]["dav"],   # Taita
        "target_text": example["english"]              # Translated English
    }

# Apply mapping
taita_en_dataset = translated_train.map(build_input_target)

# Keep only necessary columns
taita_en_dataset = taita_en_dataset.remove_columns(
    [col for col in taita_en_dataset.column_names if col not in ["input_text", "target_text"]]
)

# Preview a sample
print(taita_en_dataset[1])


Map:   0%|          | 0/21329 [00:00<?, ? examples/s]

Map:   0%|          | 0/21329 [00:00<?, ? examples/s]

{'input_text': 'Hata iji Wavika', 'target_text': 'When You Are at Hand'}


In [6]:
# Tokenize for training
def preprocess(example):
    inputs = tokenizer(example["input_text"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(example["target_text"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized = taita_en_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/21329 [00:00<?, ? examples/s]

In [7]:
# Set up training args with checkpointing
training_args = Seq2SeqTrainingArguments(
    output_dir="./checkpoints",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=5,
    save_total_limit=2,
    predict_with_generate=True,
    load_best_model_at_end=True,
    resume_from_checkpoint=True  # <<< This ensures it resumes if rerun
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_cummax_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()
    registered at /workspace/repositories/IPEX/pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /workspace/repositories/IPEX/pytorch/build/aten/src/ATen/RegisterCPU.cpp:30476
       new kernel: registered at /workspace/repositories/IPEX/ipex/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:2971 (function operator())


[2025-03-11 17:34:36,390] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to xpu (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import evaluate
import os


bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
ter = evaluate.load("ter")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    return {
        "bleu": bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])["score"],
        "rougeL": rouge.compute(predictions=decoded_preds, references=decoded_labels)["rougeL"],
        "ter": ter.compute(predictions=decoded_preds, references=decoded_labels)["score"],
    }

split = tokenized.train_test_split(test_size=0.1)
train_ds = split["train"]
eval_ds = split["test"]

eval_ds = eval_ds.select(range(100))

training_args = Seq2SeqTrainingArguments(
    output_dir="./taita_en_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    eval_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    generation_max_length=64,
    predict_with_generate=True,
    fp16=torch.cuda.is_available()
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


2025-03-11 17:35:11,768 - _logger.py - IPEX - INFO - Currently split master weight for xpu only support sgd


Epoch,Training Loss,Validation Loss,Bleu,Rougel,Ter
1,0.3065,0.282355,0.968597,0.150864,120.550162
2,0.2468,0.23786,2.34073,0.219025,114.239482
3,0.2246,0.225209,2.975336,0.245232,111.97411


2025-03-11 17:39:14,860 - absl - INFO - Using default tokenizer.
2025-03-11 17:43:28,734 - absl - INFO - Using default tokenizer.
2025-03-11 17:47:42,297 - absl - INFO - Using default tokenizer.


TrainOutput(global_step=7200, training_loss=0.29543087111579047, metrics={'train_runtime': 750.6268, 'train_samples_per_second': 76.72, 'train_steps_per_second': 9.592, 'total_flos': 1952139039473664.0, 'train_loss': 0.29543087111579047, 'epoch': 3.0})

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch # type: ignore

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = "Rogendo/sw-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

swahili_sentences = [
    "Habari za asubuhi.",
    "Jina langu ni Amina.",
    "Ninapenda kusoma vitabu.",
    "Tafadhali nisaidie kuelewa hii.",
    "Asante sana kwa msaada wako."
]

translation_pipeline = pipeline("translation", model=model, tokenizer=tokenizer, src_lang="sw", tgt_lang="en", device=0 if torch.cuda.is_available() else -1)

for sentence in swahili_sentences:
    result = translation_pipeline(sentence)
    print(f"Swahili: {sentence}")
    print(f"English : {result[0]['translation_text']}")
    print("-" * 40)


Using device: cpu


Device set to use cpu


Swahili: Habari za asubuhi.
English : Good morning news.
----------------------------------------
Swahili: Jina langu ni Amina.
English : My name is Amen.
----------------------------------------
Swahili: Ninapenda kusoma vitabu.
English : I love to read books.
----------------------------------------
Swahili: Tafadhali nisaidie kuelewa hii.
English : Please help me to understand this.
----------------------------------------
Swahili: Asante sana kwa msaada wako.
English : Thank you so much for your help.
----------------------------------------


In [15]:
metrics = trainer.evaluate()
print(metrics)

2025-03-11 18:13:26,871 - absl - INFO - Using default tokenizer.


{'eval_loss': 0.22520889341831207, 'eval_bleu': 2.975336211265933, 'eval_rougeL': 0.24518282834969118, 'eval_ter': 111.97411003236246, 'eval_runtime': 13.9611, 'eval_samples_per_second': 7.163, 'eval_steps_per_second': 1.791, 'epoch': 3.0}


In [None]:
from transformers import pipeline

translator = pipeline(
    "translation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

input_text = "Wavi wijali kulela wana."

result = translator(input_text, max_length=64)

print("Input:", input_text)
print("Translation:", result[0]['translation_text'])


Device set to use xpu:0


Input: Wavi wijali kulela wana.
Translation: Parents are quick to children.


In [None]:
texts = [
    "Wavi wijali kulela wana.",
    "Wana ni inosi kufuma kwa mlungu.",
    "Hata iji Wavika"
]


'''
Actual Translations of the texts list
[
    Parents care about the upbringing of children.
    Children are a gift from God.
    When you arrive
]
'''

for text in texts:
    result = translator(text, max_length=64)
    print(f"\nInput: {text}")
    print(f"Translation: {result[0]['translation_text']}")



Input: Wavi wijali kulela wana.
Translation: Parents are quick to children.

Input: Wana ni inosi kufuma kwa mlungu.
Translation: Children are a gift from God.

Input: Hata iji Wavika
Translation: Even if they're on the move
