In [None]:
# Install the Hugging Face libraries into the kernel's environment.
# transformers and datasets are imported below; evaluate is installed but not imported in the visible cells.
%pip install transformers datasets evaluate


In [None]:
# PyTorch backend; imported further down to pick the training device.
%pip install torch

In [None]:
# accelerate is never imported directly in this notebook — presumably required by the Trainer; TODO confirm the minimum version.
%pip install "accelerate>=0.26.0"

In [None]:
# Upgrade pip itself inside the kernel's environment.
%pip install --upgrade pip

In [5]:
# Upgrade transformers to the newest release (unpinned); restart the kernel afterwards so the new version is the one imported.
%pip install --upgrade transformers --quiet

Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
# Switch off the Hugging Face tokenizers library's internal parallelism
# (commonly done to avoid fork-related warnings; set before any tokenizer is used).
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})


In [14]:
from datasets import load_dataset
import pandas as pd

# Raw parallel English/Igbo CSV files, one per split.
raw_csv_files = {
    "train": "../Dataset/processed/train.csv",
    "validation": "../Dataset/processed/val.csv",
    "test": "../Dataset/processed/test.csv",
}
dataset = load_dataset("csv", data_files=raw_csv_files)

def normalize_types(batch):
    """Coerce the ``english`` and ``igbo`` columns of a batch to trimmed strings.

    ``None`` entries become the empty string; every other value is passed
    through ``str`` and stripped of surrounding whitespace. A column that is
    absent from the batch is written back as an empty list. The batch is
    modified in place and returned.
    """
    def _as_clean_text(values):
        return ["" if value is None else str(value).strip() for value in values]

    for column in ("english", "igbo"):
        batch[column] = _as_clean_text(batch.get(column, []))
    return batch

dataset = dataset.map(normalize_types, batched=True, desc="normalize types and trim")

# Per-split data-quality report: empty cells, duplicated (english, igbo) pairs,
# and cells that begin with a stray double quote.
for split in ["train", "validation", "test"]:
    ds = dataset[split]
    df = ds.to_pandas()

    # Build each boolean mask once and reuse it for both the counts and the sample.
    blank_en = df["english"].eq("")
    blank_ig = df["igbo"].eq("")
    quoted_en = df["english"].str.startswith('"')
    quoted_ig = df["igbo"].str.startswith('"')

    total = len(df)
    empty_en = blank_en.sum()
    empty_ig = blank_ig.sum()
    dup = df.duplicated(subset=["english", "igbo"]).sum()
    starts_with_quote_en = quoted_en.sum()
    starts_with_quote_ig = quoted_ig.sum()
    print(f"{split}: rows={total} | empty_en={empty_en} | empty_ig={empty_ig} | dup={dup} | starts_quote_en={starts_with_quote_en} | starts_quote_ig={starts_with_quote_ig}")

    if not (empty_en or empty_ig or dup or starts_with_quote_en or starts_with_quote_ig):
        print("  OK — no obvious problems found.")
        continue

    print("  Sample problematic rows:")
    # keep=False flags every member of a duplicate group, not only the repeats.
    repeated = df.duplicated(subset=['english', 'igbo'], keep=False)
    bad = df[blank_en | blank_ig | repeated | quoted_en | quoted_ig]
    display(bad.head(10))


train: rows=8309 | empty_en=0 | empty_ig=0 | dup=0 | starts_quote_en=0 | starts_quote_ig=0
  OK — no obvious problems found.
validation: rows=1068 | empty_en=0 | empty_ig=0 | dup=0 | starts_quote_en=0 | starts_quote_ig=0
  OK — no obvious problems found.
test: rows=1069 | empty_en=0 | empty_ig=0 | dup=0 | starts_quote_en=0 | starts_quote_ig=0
  OK — no obvious problems found.


In [15]:
import pandas as pd
from datasets import Dataset, DatasetDict

def _read_split_csv(path):
    """Read one split CSV, discarding pandas' accidental 'Unnamed: 0' index column if present."""
    return pd.read_csv(path).drop(columns=['Unnamed: 0'], errors='ignore')


train_df = _read_split_csv("../Dataset/processed/train_tokenized_hf.csv")
val_df = _read_split_csv("../Dataset/processed/validation_tokenized_hf.csv")

# Rebinds `dataset`: from here on it holds only the train/validation splits read above.
dataset = DatasetDict(
    train=Dataset.from_pandas(train_df),
    validation=Dataset.from_pandas(val_df),
)


In [16]:
from transformers import PreTrainedTokenizerFast

def _load_custom_tokenizer(tokenizer_file):
    """Wrap a serialized tokenizer JSON file as a PreTrainedTokenizerFast with the shared special tokens."""
    return PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )


# One custom tokenizer per language, both configured with the same special tokens.
tokenizer_en = _load_custom_tokenizer("../tokenizers/english_tokenizer.json")
tokenizer_ig = _load_custom_tokenizer("../tokenizers/igbo_tokenizer.json")


In [17]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

def check_special_tokens(tokenizer, name):
    """Print the vocabulary id of every entry in ``special_tokens``.

    A token is reported as missing when the lookup yields no id at all
    (``None``), or when it falls back to the unknown-token id even though the
    token is not the unknown token itself.

    Args:
        tokenizer: object exposing ``convert_tokens_to_ids``, ``unk_token`` and
            ``unk_token_id``.
        name: label printed in the report header.
    """
    print(f"\nChecking tokenizer: {name}")
    unk_id = tokenizer.unk_token_id
    for token in special_tokens:
        token_id = tokenizer.convert_tokens_to_ids(token)
        # The lookup never returns -1: an absent token comes back as the unk id
        # (or as None when the tokenizer has no usable unk id).
        fell_back_to_unk = token_id == unk_id and token != tokenizer.unk_token
        if token_id is None or fell_back_to_unk:
            print(f"{token}: MISSING in vocab")
        else:
            print(f"{token}: ID = {token_id}")

# Run the same report for both custom tokenizers.
for _tok, _label in ((tokenizer_en, "English"), (tokenizer_ig, "Igbo")):
    check_special_tokens(_tok, _label)



Checking tokenizer: English
[UNK]: ID = 0
[PAD]: ID = 1
[CLS]: ID = 2
[SEP]: ID = 3
[MASK]: ID = 4

Checking tokenizer: Igbo
[UNK]: ID = 0
[PAD]: ID = 1
[CLS]: ID = 2
[SEP]: ID = 3
[MASK]: ID = 4


In [19]:
from transformers import AutoTokenizer

model_id = "facebook/nllb-200-distilled-600M"

# Fetch the pretrained tokenizer that belongs to the NLLB checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)
print("Tokenizer loaded successfully!")

# Smoke test: one short sentence, shown both as sub-word pieces and as ids
# (special tokens included in the ids).
sample_text = "Hello world!"
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.encode(sample_text, add_special_tokens=True)

print("Example tokens:", tokens)
print("Token IDs:", token_ids)


Tokenizer loaded successfully!
Example tokens: ['▁Hello', '▁world', '!']
Token IDs: [256047, 94124, 15697, 248203, 2]


In [20]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and seq2seq model are loaded from the same checkpoint.
checkpoint = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Make the embedding table exactly as long as the tokenizer's vocabulary.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)


M2M100ScaledWordEmbedding(256204, 1024, padding_idx=1)

In [21]:
# Truncation limits (in tokens) applied to source and target sequences during preprocessing.
MAX_SOURCE_LENGTH, MAX_TARGET_LENGTH = 128, 128


In [22]:
def preprocess(batch):
    src_texts = [str(x) if x is not None else "" for x in batch['english']]
    tgt_texts = [str(x) if x is not None else "" for x in batch['igbo']]

    inputs = tokenizer_en(
        src_texts,
        truncation=True,
        padding=False,
        max_length=MAX_SOURCE_LENGTH,
        return_attention_mask=True
    )

    labels = tokenizer_ig(
        tgt_texts,
        truncation=True,
        padding=False,
        max_length=MAX_TARGET_LENGTH
    )

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels['input_ids']
    }

cols_to_remove = [c for c in ['english','igbo','english_tokens','igbo_tokens'] if c in dataset['train'].column_names]

tokenized = dataset.map(
    preprocess,
    batched=True,
    remove_columns=cols_to_remove,
    desc="Tokenizing and formatting datasets"
)


Tokenizing and formatting datasets:   0%|          | 0/8309 [00:00<?, ? examples/s]

Tokenizing and formatting datasets:   0%|          | 0/1068 [00:00<?, ? examples/s]

In [23]:
sample = tokenized['train'][0]
print("English (decoded):", tokenizer_en.decode(sample['input_ids'], skip_special_tokens=True))
print("Igbo (decoded):", tokenizer_ig.decode(sample['labels'], skip_special_tokens=True))


English (decoded): why did you leave your former place of work ?
Igbo (decoded): gịnị mere i ji hapụ ebe ị na - arụ n ' oge mbu ?


In [24]:
import transformers, inspect
from transformers import Seq2SeqTrainingArguments

# Environment diagnostics: which transformers build is active and where
# Seq2SeqTrainingArguments is defined.
diagnostics = [
    ("transformers version:", transformers.__version__),
    ("transformers file:", transformers.__file__),
    ("Seq2SeqTrainingArguments module:", Seq2SeqTrainingArguments.__module__),
    ("Seq2SeqTrainingArguments file:", inspect.getsourcefile(Seq2SeqTrainingArguments)),
]
for label, value in diagnostics:
    print(label, value)

print("Seq2SeqTrainingArguments __init__ signature:")
print(inspect.signature(Seq2SeqTrainingArguments.__init__))


transformers version: 4.55.0
transformers file: /Users/bigdreams/Documents/langGpt/venv/lib/python3.13/site-packages/transformers/__init__.py
Seq2SeqTrainingArguments module: transformers.training_args_seq2seq
Seq2SeqTrainingArguments file: /Users/bigdreams/Documents/langGpt/venv/lib/python3.13/site-packages/transformers/training_args_seq2seq.py
Seq2SeqTrainingArguments __init__ signature:


In [25]:
# Training Arguments
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

# Hyperparameters for the run, gathered in one mapping so they are easy to scan.
# NOTE(review): max_steps=5 caps training at five optimizer steps; per the
# TrainingArguments documentation a positive max_steps overrides
# num_train_epochs, so the 15-epoch setting has no effect while it is set.
training_config = dict(
    # Where things are written
    output_dir='./results',
    logging_dir='./logs',
    # Evaluate and checkpoint once per epoch; keep at most three checkpoints
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=3,
    # Best-checkpoint selection: lower eval_loss wins
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    # Optimisation
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    num_train_epochs=15,
    max_steps=5,
    fp16=False,
    # Evaluation decodes with generate()
    predict_with_generate=True,
)
train_args = Seq2SeqTrainingArguments(**training_config)



In [35]:
import torch

# Use Apple's Metal backend (MPS) when available, otherwise fall back to the CPU.
device_name = "mps" if torch.backends.mps.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)


M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): M2M100ScaledWordEmbedding(256204, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): M2M100ScaledWordEmbedding(256204, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
       

In [26]:
# Confirm the mixed-precision flag that the training arguments ended up with.
fp16_enabled = train_args.fp16
print("FP16 enabled:", fp16_enabled)


FP16 enabled: False


In [None]:
# Batches are assembled by the seq2seq collator; padding happens here because
# preprocessing tokenizes with padding=False.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# `processing_class` replaces the deprecated `tokenizer` keyword of the Trainer
# (deprecated with removal announced for transformers 5.0). This notebook
# installs transformers unpinned, so the old keyword may stop working on a
# fresh environment.
trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['validation'],
    processing_class=tokenizer,
    data_collator=data_collator
)

trainer.train()