In [None]:
import pandas as pd
import re

#load dataset

df = pd.read_excel('/content/drive/MyDrive/yoruba_dataset.xlsx')

def clean_text(text):

   # keep letters, spaces, basic punctuation
         return re.sub(r"[^a-zA-ZÀ-ž\s.,!?'-]", "", str(text))

print(df.head())

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/yoruba_dataset.xlsx'

In [None]:
#clean up column names by strippig whitespaces
df.columns = df.columns.str.strip()

#drop rows with missing values in yoruba or english
df.dropna(subset=['yoruba', 'Translation'], inplace=True)
print(df[['yoruba', 'Translation']].head())

In [None]:
#Convert to lowercase
df["yoruba"] = df["yoruba"].str.lower()
df["Translation"] = df["Translation"].str.lower()

#Preview after converting to lowercase
print("After converting to lowercase:")
print(df[["yoruba", "Translation"]].head())

In [None]:
#preview before removing duplicates
print("Number of rows before removing duplicates", df.shape[0])
print(df.head())

#Remove duplicates
df = df.drop_duplicates()

#preview after removing duplicates
print("Number of rows after removing duplicates", df.shape[0])
print(df.head())

In [None]:
#preview before removing short text
print("Number of rows before removing short text", df.shape[0])
print(df.head())

#Remove rows with short text
df = df[(df["yoruba"].str.len() > 2) & (df["Translation"].str.len() > 2)]

#preview after removing short text
print("Number of rows after removing short text", df.shape[0])
print(df.head())

In [None]:
#save cleaned dataset
df.to_excel("translation_dataset_cleaned.xlsx", index=False)

print("After cleaning:", df.shape)
print("Cleaned dataset saved as translation_dataset_cleaned.xlsx")

from google.colab import files
files.download("translation_dataset_cleaned.xlsx")

# NOW FOR THE SPLITTING of the dataset into training, validation, and test sets.

In [None]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split

# === 1. Load your dataset ===
# Example: Yoruba dataset with English and Yoruba columns
df = pd.read_excel("/content/translation_dataset_cleaned (2).xlsx")  # change to your file name

# Optional: view first few rows
print(df.head())

# === 2. Define train/validation/test ratios ===
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# === 3. Split the data ===
# First, split into train + temp (validation + test)
train_df, temp_df = train_test_split(df, test_size=(1 - train_ratio), random_state=42)

# Now split temp into validation and test sets
val_df, test_df = train_test_split(temp_df, test_size=test_ratio / (test_ratio + val_ratio), random_state=42)

# === 4. Check dataset sizes ===
print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(val_df)} samples")
print(f"Test set: {len(test_df)} samples")

# === 5. Save to new files ===
train_df.to_excel("train_dataset.xlsx", index=False)
val_df.to_excel("validation_dataset.xlsx", index=False)
test_df.to_excel("test_dataset.xlsx", index=False)

print("✅ Datasets successfully split and saved:")
print("- train_dataset.xlsx")
print("- validation_dataset.xlsx")
print("- test_dataset.xlsx")

                                              yoruba  \
0  láti ọdún tó kọjá ni ọ̀rọ̀ owó orí tuntun náà ...   
1  ṣáájú ni ilé aṣòfin àpapọ̀ ti buwọ́lu àbá owó ...   
2  kódà àwọn gómìnà kan tako ìgbésẹ̀ ìlànà owó or...   
3  nínú àtẹ̀jáde kan tí ìgbìmọ̀ tó ń rí sí ọ̀rọ̀ ...   
4  bákan náà ni agbẹnusọ bola tinubu, bayo onanug...   

                                         Translation  
0  since last year, the issue of the new tax has ...  
1  earlier, the national assembly had passed the ...  
2  in fact, some governors opposed the new tax po...  
3  in a statement released by the tax committee o...  
4  similarly, bola tinubu’s spokesperson, bayo on...  
Train set: 5380 samples
Validation set: 673 samples
Test set: 673 samples
✅ Datasets successfully split and saved:
- train_dataset.xlsx
- validation_dataset.xlsx
- test_dataset.xlsx



Train a translation model (English → Target language) using an Excel file.
Works directly in notebooks or Python scripts.

✅ Features:
- Reads Excel files directly
- Auto-detects English/Yoruba/Translation column names
- Compatible with both new and old Transformers versions
- Automatically saves model and tokenizer after training
- Disables wandb logging for clean output


Setup and Package Installation

This cell ensures a clean and compatible environment for training translation models:

Uninstalls old versions of key libraries and clears cache to prevent conflicts.

Upgrades pip to the latest version.

Installs specific versions of:

transformers → model training

accelerate → GPU/distributed support

datasets → data handling

sentencepiece → subword tokenization

sacrebleu → translation evaluation

peft → parameter-efficient fine-tuning

This guarantees stable, reproducible training in your notebook.

In [None]:
# Install the necessary packages with specific versions to avoid conflicts
# Clean previous installations
!pip uninstall -y transformers accelerate datasets sentencepiece sacrebleu peft
!pip cache purge

# Install compatible versions
!pip install -q --upgrade pip
!pip install -q transformers==4.40.0 accelerate==0.27.0 datasets sentencepiece sacrebleu==2.0.0 peft==0.10.0

Found existing installation: transformers 4.40.0
Uninstalling transformers-4.40.0:
  Successfully uninstalled transformers-4.40.0
Found existing installation: accelerate 0.11.0
Uninstalling accelerate-0.11.0:
  Successfully uninstalled accelerate-0.11.0
Found existing installation: datasets 3.0.1
Uninstalling datasets-3.0.1:
  Successfully uninstalled datasets-3.0.1
Found existing installation: sentencepiece 0.1.99
Uninstalling sentencepiece-0.1.99:
  Successfully uninstalled sentencepiece-0.1.99
Found existing installation: sacrebleu 2.0.0
Uninstalling sacrebleu-2.0.0:
  Successfully uninstalled sacrebleu-2.0.0
Files removed: 67 (20.3 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[33m  DEPRECATION: Building 'sentencepiece' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future 

Optional / Redundant Installation

This cell installs/upgrades a few key libraries (transformers, datasets, sacrebleu, sentencepiece) as a fallback.

Marked as redundant because the primary installation and version management is handled in the first setup cell.

Kept here temporarily to ensure the notebook runs if previous cells are skipped.

Can be safely removed once the environment is correctly set up.

In [None]:
# Redundant installation cell, will be removed or kept as-is if not the primary installation method
# Keeping as-is for now, but the primary installation fix is in the first cell.
# !pip install -U transformers==4.40.0 datasets sacrebleu sentencepiece

Clean Previous Installations

This cell ensures a fresh environment by:

Uninstalling old versions of key libraries (transformers, accelerate, tokenizers, sentencepiece).

Purging the pip cache to prevent conflicts during reinstallation.

✅ Useful when switching library versions or resolving package conflicts before reinstalling.

In [None]:
!pip uninstall -y transformers accelerate tokenizers sentencepiece
!pip cache purge

Install Specific Library Versions

This cell installs compatible versions of key libraries for stable training:

transformers==4.40.0 → model training and fine-tuning

datasets==3.0.1 → data handling and preprocessing

accelerate → GPU/distributed training support

sentencepiece → subword tokenization

sacrebleu → translation evaluation metrics

Ensures reproducibility and avoids version conflicts in your notebook.

In [None]:
# !pip install transformers==4.40.0 datasets==3.0.1 accelerate sentencepiece sacrebleu

Collecting transformers==4.40.0
  Downloading transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
Collecting datasets==3.0.1
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.0)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets==3.0.1)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0

Install Compatible Packages

This cell sets up the environment with specific library versions to ensure stable and reproducible training:

Upgrades pip to the latest version.

Installs:

transformers==4.40.0 → model training/fine-tuning

datasets==3.0.1 → dataset handling and preprocessing

accelerate==0.11.0 → GPU and distributed training support

sentencepiece==0.1.99 → subword tokenization

sacrebleu==2.0.0 → translation evaluation

pandas==2.2.2 → data manipulation

Ensures version compatibility and prevents conflicts in your notebook.

In [None]:
# Install the necessary packages with specific versions to avoid conflicts
!pip install -q --upgrade pip
!pip install -q transformers==4.40.0 datasets==3.0.1 accelerate==0.27.0 sentencepiece==0.1.99 sacrebleu==2.0.0 pandas==2.2.2 peft==0.10.0

Full-Data EN→YO Translation Training Pipeline

This cell performs full-dataset fine-tuning of the English → Yoruba MarianMT model using the following setup:

Configuration

Loads training and validation data from Excel.

Detects source (translation) and target (yoruba) columns.

Sets training hyperparameters for speed and memory efficiency (short sequences, gradient accumulation, mixed precision).

Model & Tokenizer

Loads Helsinki-NLP/opus-mt-en-mul.

Enables gradient checkpointing to save GPU memory.

Tokenization

Tokenizes the datasets with max_seq_length=64 for faster training.

Training

Uses Trainer with gradient accumulation to simulate a larger batch size.

Trains for 1 epoch (adjustable) on full dataset.

Saving

Saves both the model weights and tokenizer to the output_dir.

✅ Optimized for Colab GPU, full dataset, and accelerated training.

In [None]:
# ============================================
# 🔧 FIX for accelerate + transformers mismatch
# ============================================

!pip uninstall -y transformers accelerate
!pip install transformers==4.41.2 accelerate==0.30.1

# Optional: reinstall others for safety
!pip install -q sentencepiece==0.1.99 datasets==3.0.1 sacrebleu==2.3.1 pandas==2.2.2

import os
os.kill(os.getpid(), 9)  # ♻️ Restart runtime to apply versions


Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: accelerate 1.11.0
Uninstalling accelerate-1.11.0:
  Successfully uninstalled accelerate-1.11.0
Collecting transformers==4.41.2
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.30.1
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.2)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m91.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K   [90m━━━━

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/2.6 MB[0m [31m32.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentencepiece (setup.py) ... [?25l[?25hcanceled
Traceback (m

In [None]:
import torch

# 🔍 GPU check and setup
if torch.cuda.is_available():
    device = torch.device("cuda")
    gpu_name = torch.cuda.get_device_name(0)
    print(f"🚀 GPU detected: {gpu_name}")
else:
    device = torch.device("cpu")
    print("⚠️ GPU NOT available — switch to GPU for faster training!")
    print("👉 In Colab: Runtime > Change runtime type > Hardware accelerator > GPU")

# Show device for confirmation
print(f"💻 Using device: {device}")


🚀 GPU detected: Tesla T4
💻 Using device: cuda


In [None]:
# ============================================
# ⚡ FULL-DATA EN→YO TRANSLATION TRAINING PIPELINE (FAST)
# ============================================

import os
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    MarianTokenizer,
    MarianMTModel,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)

# =====================
# CONFIGURATION
# =====================
train_xlsx = "train_dataset.xlsx"
val_xlsx = "validation_dataset.xlsx"
model_name = "Helsinki-NLP/opus-mt-en-mul"
output_dir = "./translation_model_full"

# SPEED + FULL-DATA SETTINGS
num_train_epochs = 1          # keep 1 epoch for speed
per_device_batch_size = 8     # GPU-friendly batch size
max_seq_length = 64           # shorter sequences for speed
gradient_accumulation_steps = 4  # simulate bigger batch: 8*4=32

# ENVIRONMENT
os.environ["WANDB_DISABLED"] = "true"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"

# DEVICE
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"💻 Using device: {device.upper()}")

# =====================
# LOAD DATA
# =====================
print("📂 Loading Excel data...")
train_df = pd.read_excel(train_xlsx)
val_df = pd.read_excel(val_xlsx)

# CLEAN COLUMNS
train_df.columns = [c.strip().lower() for c in train_df.columns]
val_df.columns = [c.strip().lower() for c in val_df.columns]

# DETECT SOURCE AND TARGET
if "translation" in train_df.columns and "yoruba" in train_df.columns:
    src_col, tgt_col = "translation", "yoruba"
else:
    raise ValueError(f"❌ Could not detect columns. Found: {train_df.columns.tolist()}")

print(f"✅ Training samples: {len(train_df)}, Validation samples: {len(val_df)}")

# CREATE DATASETS
train_ds = Dataset.from_pandas(train_df.rename(columns={src_col:"src", tgt_col:"tgt"}))
val_ds   = Dataset.from_pandas(val_df.rename(columns={src_col:"src", tgt_col:"tgt"}))

# =====================
# LOAD MODEL & TOKENIZER
# =====================
print(f"🚀 Loading model: {model_name}")
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)
# model.gradient_checkpointing_enable()  # save GPU memory

# =====================
# TOKENIZATION FUNCTION
# =====================
def preprocess(batch):
    model_inputs = tokenizer(batch["src"], max_length=max_seq_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["tgt"], max_length=max_seq_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("🔡 Tokenizing datasets...")
train_tok = train_ds.map(preprocess, batched=True)
val_tok   = val_ds.map(preprocess, batched=True)

# DATA COLLATOR
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# =====================
# TRAINING ARGUMENTS
# =====================
training_args = TrainingArguments(
    output_dir=output_dir,
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=per_device_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    fp16=True,                  # mixed precision
    logging_steps=550,
    report_to="none"
)

# =====================
# TRAINER
# =====================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# =====================
# START TRAINING
# =====================
print("🏋️ Training model (full dataset, accelerated)...")
trainer.train()

# =====================
# SAVE MODEL
# =====================
print("💾 Saving model...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Done! Model saved to {output_dir}")

💻 Using device: CUDA
📂 Loading Excel data...
✅ Training samples: 5380, Validation samples: 673
🚀 Loading model: Helsinki-NLP/opus-mt-en-mul


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

🔡 Tokenizing datasets...


Map:   0%|          | 0/5380 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

Map:   0%|          | 0/673 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


🏋️ Training model (full dataset, accelerated)...


Step,Training Loss




💾 Saving model...
✅ Done! Model saved to ./translation_model_full


Download the Trained Model as a ZIP

This cell zips the trained model folder and downloads it to your local machine:

Specify the model folder (/content/translation_model_full) and the output ZIP file name.

Zip the folder using shutil.make_archive.

Download the ZIP via files.download().

⚠️ Note: Direct download may fail for large models (>1–2 GB). If so, consider uploading to Google Drive instead.

In [None]:
import shutil
from google.colab import files

# 1️⃣ Path to your saved model folder
model_folder = "/content/translation_model_full"
zip_file = "translation_model_full_epochs.zip"

# 2️⃣ Make a zip of the folder
shutil.make_archive(base_name=model_folder, format='zip', root_dir=model_folder)
print(f"✅ Model folder zipped as: {zip_file}")

# 3️⃣ Download the zip
files.download(zip_file)


# **Evaluation**

In [None]:
!pip install -q sacrebleu==2.3.1


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ============================================
# ⚡ EVALUATION: EN→YO TRANSLATION MODEL
# ============================================

import torch
import pandas as pd
from transformers import MarianTokenizer, MarianMTModel
import sacrebleu

# CONFIG
model_dir = "./translation_model_full"       # Path to your trained model
val_xlsx = "validation_dataset.xlsx"         # Validation file used earlier
max_seq_length = 64

# LOAD MODEL AND TOKENIZER
print("🚀 Loading trained model for evaluation...")
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model = MarianMTModel.from_pretrained(model_dir).to("cuda" if torch.cuda.is_available() else "cpu")

# LOAD VALIDATION DATA
val_df = pd.read_excel(val_xlsx)
val_df.columns = [c.strip().lower() for c in val_df.columns]

if "translation" in val_df.columns and "yoruba" in val_df.columns:
    src_col, tgt_col = "translation", "yoruba"
else:
    raise ValueError(f"❌ Columns not found. Found: {val_df.columns.tolist()}")

# TAKE A SMALL SAMPLE (optional, for faster evaluation)
val_sample = val_df.sample(200, random_state=42)  # adjust size as needed

# GENERATE TRANSLATIONS
print("🧠 Generating translations...")
inputs = val_sample[src_col].tolist()
references = val_sample[tgt_col].tolist()

translated = []
for text in inputs:
    batch = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_seq_length)
    batch = {k: v.to(model.device) for k, v in batch.items()}
    with torch.no_grad():
        generated = model.generate(**batch, max_length=max_seq_length)
    translated_text = tokenizer.decode(generated[0], skip_special_tokens=True)
    translated.append(translated_text)

# COMPUTE BLEU SCORE
print("📊 Evaluating with BLEU score...")
bleu = sacrebleu.corpus_bleu(translated, [references])
print(f"✅ BLEU Score: {bleu.score:.2f}")

# OPTIONAL: SHOW SAMPLE TRANSLATIONS
print("\n🔍 Sample Translations:")
for i in range(5):
    print(f"\nEN: {inputs[i]}")
    print(f"YO (pred): {translated[i]}")
    print(f"YO (ref):  {references[i]}")


🚀 Loading trained model for evaluation...




🧠 Generating translations...
📊 Evaluating with BLEU score...
✅ BLEU Score: 6.86

🔍 Sample Translations:

EN: stay updated with bbc news yoruba on whatsapp — join us there at the end of this podcast promotion.
alapini continued, saying: ‘we were constantly being cursed and insulted by people during the election campaign, especially online.
YO (pred): dúró sílẹ̀ pẹ̀lú bbc news yorùba lórí whatsapp — darapọ̀ mọ́ wa ní ìparí ìfojúsùn podcast yìí. alapinini ń bá a lọ, ó ní: ‘
YO (ref):  àjáàbalẹ̀ ìròyìn bbc news yorùbá lórí whatsapp rẹ darapọ̀ mọ́ wa nibì end of podcast promotion alapini tẹsiwaje pe ''epe rabandẹ lawọn eeyan n ṣẹ le wa lori lasiko ti a n ṣe ipolongo ibo naa papaa julọ lori ayelujara.

EN: as a child, like we see with most children in sub-saharan africa today, i regularly suffered from malaria.
YO (pred): bí ọmọdé, bí a ṣe rí pàápàá pàápàá pàápàá pẹ̀lú ọ̀pọ̀lọpọ̀ àwọn ọmọdé ní orílẹ̀-ẹ̀-saharan africa lónìí, mo
YO (ref):  gẹ́gẹ́ bí ọmọde, bí a ṣe ń rí pẹ̀lú ọ̀pọ̀lọpọ̀ ọmọ ní

In [None]:
# ============================================
# ⚡ EN→YO TRANSLATION MODEL EVALUATION
# ============================================

import torch
import pandas as pd
from transformers import MarianTokenizer, MarianMTModel
import sacrebleu
from datetime import datetime

# CONFIG
model_dir = "./translation_model_full"     # Your fine-tuned model path
val_xlsx = "validation_dataset.xlsx"               # Validation dataset
output_report = "evaluation_report.txt"
max_seq_length = 80
sample_size = None  # set to e.g. 200 for faster test runs

# DEVICE SETUP
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {device.upper()}")

# LOAD MODEL AND TOKENIZER
print("📦 Loading model and tokenizer...")
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model = MarianMTModel.from_pretrained(model_dir).to(device)
model.eval()

# LOAD VALIDATION DATA
print("📂 Loading validation data...")
val_df = pd.read_excel(val_xlsx)
val_df.columns = [c.strip().lower() for c in val_df.columns]

if "translation" in val_df.columns and "yoruba" in val_df.columns:
    src_col, tgt_col = "translation", "yoruba"
else:
    raise ValueError(f"❌ Expected columns 'translation' and 'yoruba'. Found: {val_df.columns.tolist()}")

# OPTIONALLY SAMPLE SUBSET
if sample_size:
    val_df = val_df.sample(sample_size, random_state=42)
print(f"✅ Loaded {len(val_df)} validation samples.")

# GENERATE TRANSLATIONS
print("🧠 Generating translations...")
inputs = val_df[src_col].tolist()
references = val_df[tgt_col].tolist()

translated = []
for i, text in enumerate(inputs):
    batch = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_seq_length)
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        generated = model.generate(**batch, max_length=max_seq_length, num_beams=5)
    translated_text = tokenizer.decode(generated[0], skip_special_tokens=True)
    translated.append(translated_text)
    if (i + 1) % 50 == 0:
        print(f"  → {i + 1}/{len(inputs)} sentences processed")

# COMPUTE BLEU SCORE
print("📊 Evaluating with BLEU score...")
bleu = sacrebleu.corpus_bleu(translated, [references])
print(f"✅ BLEU Score: {bleu.score:.2f}")

# SAVE REPORT
print("📝 Saving evaluation report...")
with open(output_report, "w", encoding="utf-8") as f:
    f.write(f"Evaluation Report — {datetime.now()}\n")
    f.write("="*60 + "\n\n")
    f.write(f"Model: {model_dir}\n")
    f.write(f"Validation samples: {len(inputs)}\n")
    f.write(f"BLEU Score: {bleu.score:.2f}\n\n")
    f.write("Sample Translations:\n")
    f.write("-"*60 + "\n")
    for i in range(min(10, len(inputs))):
        f.write(f"\nEN: {inputs[i]}\n")
        f.write(f"YO (pred): {translated[i]}\n")
        f.write(f"YO (ref):  {references[i]}\n")

print(f"✅ Done! Report saved to '{output_report}'")


🚀 Using device: CUDA
📦 Loading model and tokenizer...




📂 Loading validation data...
✅ Loaded 673 validation samples.
🧠 Generating translations...
  → 50/673 sentences processed
  → 100/673 sentences processed
  → 150/673 sentences processed
  → 200/673 sentences processed
  → 250/673 sentences processed
  → 300/673 sentences processed
  → 350/673 sentences processed
  → 400/673 sentences processed
  → 450/673 sentences processed
  → 500/673 sentences processed
  → 550/673 sentences processed
  → 600/673 sentences processed
  → 650/673 sentences processed
📊 Evaluating with BLEU score...
✅ BLEU Score: 6.47
📝 Saving evaluation report...
✅ Done! Report saved to 'evaluation_report.txt'


In [None]:
!pip install gradio==4.44.0 pillow

import gradio as gr
from transformers import MarianMTModel, MarianTokenizer
from PIL import Image, ImageDraw, ImageFont

# --- Load trained model ---
model_dir = "./yoruba_translator"
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model = MarianMTModel.from_pretrained(model_dir)

# --- Generate a simple text logo dynamically ---
logo = Image.new("RGB", (600, 120), color=(13, 71, 36))  # deep green background
draw = ImageDraw.Draw(logo)
try:
    font = ImageFont.truetype("DejaVuSans-Bold.ttf", 60)
except:
    font = ImageFont.load_default()
draw.text((40, 25), "🧠 YoruCentric Translator", font=font, fill=(255, 215, 0))  # gold text
logo_path = "yorucentric_logo.png"
logo.save(logo_path)

# --- Define translation function ---
def translate_text(text, direction):
    if not text.strip():
        return "⚠️ Please enter some text."

    input_text = text
    tokens = tokenizer([input_text], return_tensors="pt", padding=True)
    translated = model.generate(**tokens)
    output = tokenizer.decode(translated[0], skip_special_tokens=True)
    return output

# --- Gradio app interface ---
app = gr.Interface(
    fn=translate_text,
    inputs=[
        gr.Textbox(label="Enter Text", placeholder="Type or paste text here...", lines=4),
        gr.Radio(["English → Yoruba", "Yoruba → English"], label="Select Direction", value="English → Yoruba")
    ],
    outputs=gr.Textbox(label="Translated Output", lines=4),
    title="🧠 YoruCentric Translator",
    description=(
        "A bilingual English ↔ Yoruba neural machine translator trained using Helsinki-NLP MarianMT.\n\n"
        "🌍 *Bringing Yoruba language technology closer to the world.*"
    ),
    theme=gr.themes.Soft(primary_hue="green", secondary_hue="lime"),
    article=f"<center><img src='file/{logo_path}' width='400'><br><br><b>Developed by:</b> NLP Research Project Team</center>"
)

# --- Launch with public link ---
app.launch(share=True)




OSError: Can't load tokenizer for './yoruba_translator'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure './yoruba_translator' is the correct path to a directory containing all relevant files for a MarianTokenizer tokenizer.

In [9]:
!pip install gradio==4.44.0 pillow

import gradio as gr
from transformers import MarianMTModel, MarianTokenizer
from PIL import Image, ImageDraw, ImageFont

# --- Load trained model ---
model_dir = "./translation_model_full" # Corrected directory name
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model = MarianMTModel.from_pretrained(model_dir)

# --- Generate a simple text logo dynamically ---
logo = Image.new("RGB", (600, 120), color=(13, 71, 36))  # deep green background
draw = ImageDraw.Draw(logo)
try:
    font = ImageFont.truetype("DejaVuSans-Bold.ttf", 60)
except:
    font = ImageFont.load_default()
draw.text((40, 25), "🧠 YoruCentric Translator", font=font, fill=(255, 215, 0))  # gold text
logo_path = "yorucentric_logo.png"
logo.save(logo_path)

# --- Define translation function ---
def translate_text(text, direction):
    if not text.strip():
        return "⚠ Please enter some text."

    # Determine source and target languages based on direction
    if direction == "English → Yoruba":
        src_lang = "en"
        tgt_lang = "yor"
    elif direction == "Yoruba → English":
        # Note: The model was trained on EN->YO. For YO->EN, you might need a different model or fine-tune this one.
        # This implementation will attempt YO->EN using the EN->YO model, which may not be accurate.
        src_lang = "yor"
        tgt_lang = "en"
    else:
        return "Invalid translation direction selected."

    # Add language prefix for MarianMT
    input_text = f">>{tgt_lang}<< {text}"

    tokens = tokenizer([input_text], return_tensors="pt", padding=True)
    translated = model.generate(**tokens)
    output = tokenizer.decode(translated[0], skip_special_tokens=True)
    return output

# --- Gradio app interface ---
app = gr.Interface(
    fn=translate_text,
    inputs=[
        gr.Textbox(label="Enter Text", placeholder="Type or paste text here...", lines=4),
        gr.Radio(["English → Yoruba", "Yoruba → English"], label="Select Direction", value="English → Yoruba")
    ],
    outputs=gr.Textbox(label="Translated Output", lines=4),
    title="🧠 YoruCentric Translator",
    description=(
        "A bilingual English ↔ Yoruba neural machine translator trained using Helsinki-NLP MarianMT.\n\n"
        "🌍 Bringing Yoruba language technology closer to the world."
    ),
    theme=gr.themes.Soft(primary_hue="green", secondary_hue="lime"),
    article=f"<center><img src='file/{logo_path}' width='400'><br><br><b>Developed by:</b> NLP Research Project Team</center>"
)

# --- Launch with public link ---
app.launch(share=True)





Colab notebook detected. To show errors in colab notebook, set debug=True in launch()


--------


Running on public URL: https://ee956605530968ef31.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


