In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Pretrain mT5 with span-corruption denoising on OPUS-100 (Hugging Face dataset).

In [2]:
import transformers
print(transformers.__version__)

4.52.4


In [3]:
import logging
import random
from typing import List, Dict

import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

2025-08-31 09:15:58.581421: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756631758.779292      76 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756631758.832848      76 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# ---- Dataset processing ----
def flatten_parallel_to_monolingual(batch):
    out_texts = []
    translations = batch.get("translation", {})
    if isinstance(translations, dict):
        for lang, text in translations.items():
            if text:
                txt = text.strip()
                if len(txt) > 0:
                    out_texts.append({"text": txt, "lang": lang})
    elif isinstance(translations, list):
        for t in translations:
            if isinstance(t, dict):
                for lang, text in t.items():
                    if text:
                        txt = text.strip()
                        if len(txt) > 0:
                            out_texts.append({"text": txt, "lang": lang})
    return {"__flattened": out_texts}

In [5]:
# ---------- Config ----------
MODEL_NAME = "google/mt5-base"
HF_DATASET_NAME = "Helsinki-NLP/opus-100"
OUTPUT_DIR = "./mt5-opus100-denoise"
MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 128
NOISE_DENSITY = 0.15
MEAN_NOISE_SPAN_LENGTH = 3.0
TRAIN_SAMPLE_SIZE = 5000  # set small int for debugging
# ----------------------------

In [6]:
def prepare_dataset(split="train"):
    print("prepare datasets coming to method")
    logger.info(f"Loading dataset {HF_DATASET_NAME} split={split} ...")
    ds = load_dataset(HF_DATASET_NAME,'en-si' ,split=split, streaming=False)

    logger.info("Flattening parallel translations into monolingual sentences ...")
    ds = ds.map(flatten_parallel_to_monolingual, batched=False, remove_columns=ds.column_names)

    texts = []
    for item in ds:
        for d in item.get("__flattened", []):
            texts.append(d)
    mono = Dataset.from_list(texts)
    logger.info(f"Monolingual dataset size: {len(mono)}")

    if TRAIN_SAMPLE_SIZE:
        mono = mono.shuffle(seed=42).select(range(min(TRAIN_SAMPLE_SIZE, len(mono))))
        logger.info(f"Using sample size: {len(mono)}")
    return mono

In [7]:
# ---- Fixed T5 span corruption collator ----
class T5DenoisingCollator:
    def __init__(self, tokenizer, noise_density=0.15, mean_noise_span_length=3.0,
                 input_length=256, target_length=128):
        self.tokenizer = tokenizer
        self.noise_density = noise_density
        self.mean_noise_span_length = mean_noise_span_length
        self.input_length = input_length
        self.target_length = target_length
        self.pad_token_id = tokenizer.pad_token_id
        self.vocab_size = tokenizer.vocab_size
        self.sentinel_start = 0  # will map to <extra_id_0> later

    def __call__(self, examples):
        # Convert examples to token IDs
        input_ids_list = [e["input_ids"] for e in examples]
        batch_input_ids = []
        batch_labels = []

        for ids in input_ids_list:
            corrupted, labels = self._span_corrupt(ids)
            batch_input_ids.append(corrupted)
            batch_labels.append(labels)

        batch = {
            "input_ids": torch.tensor(batch_input_ids, dtype=torch.long),
            "labels": torch.tensor(batch_labels, dtype=torch.long),
        }

        # Attention mask
        batch["attention_mask"] = (batch["input_ids"] != self.pad_token_id).long()

        # --- DEBUG PRINTS ---
        # print("Example input_ids:", batch["input_ids"][0][:20])
        # print("Example labels   :", batch["labels"][0][:20])
        # print("Masked tokens in labels:", (batch["labels"][0] != -100).sum().item())

        return batch

    def _span_corrupt(self, tokens: List[int]):
        """T5-style span corruption"""
        tokens = [t for t in tokens if t != self.pad_token_id]
        num_to_mask = max(1, int(len(tokens) * self.noise_density))

        # Determine spans
        span_starts = []
        i = 0
        while i < num_to_mask:
            span_len = max(1, np.random.poisson(self.mean_noise_span_length))
            start = random.randint(0, max(0, len(tokens) - span_len))
            span_starts.append((start, min(len(tokens), start + span_len)))
            i += span_len

        # Sort spans
        span_starts = sorted(span_starts, key=lambda x: x[0])

        corrupted = []
        labels = []
        sentinel_id = self.tokenizer.convert_tokens_to_ids("<extra_id_0>")

        last_idx = 0
        current_sentinel = sentinel_id
        for start, end in span_starts:
            # Add unmasked tokens
            corrupted.extend(tokens[last_idx:start])
            # Add sentinel to input
            corrupted.append(current_sentinel)
            # Add sentinel + masked tokens to labels
            labels.append(current_sentinel)
            labels.extend(tokens[start:end])
            current_sentinel += 1  # next sentinel token
            last_idx = end

        # Add remaining tokens
        corrupted.extend(tokens[last_idx:])
        labels.append(self.pad_token_id)

        # Truncate / pad
        corrupted = corrupted[:self.input_length] + [self.pad_token_id] * max(0, self.input_length - len(corrupted))
        labels = labels[:self.target_length] + [-100] * max(0, self.target_length - len(labels))

        return corrupted, labels


In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
mono = prepare_dataset("train")

prepare datasets coming to method


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/155k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/65.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/153k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/979109 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/979109 [00:00<?, ? examples/s]

In [11]:
mono

Dataset({
    features: ['lang', 'text'],
    num_rows: 5000
})

In [12]:
print(mono[:10]) 

{'lang': ['si', 'en', 'si', 'en', 'si', 'si', 'en', 'en', 'si', 'en'], 'text': ['ඉතින්...', "Um... we'll have the next drink on the plane, okay?", 'හරි ඔයා දැන් ඔහුට කැමති නැද්ද ?', 'He got her...', 'අලුතෙන්ම වකුගඩුවක් ගන්න එක ලොකු වැඩක් බන්.', 'අනේ දෙවියනේ!', 'That is the coolest name ever!', 'Have this.', 'මයිල්ස්!', 'Anjali?']}


In [13]:
tokenized = mono.map(lambda ex: tokenizer(ex["text"], truncation=True,
                                              max_length=MAX_INPUT_LENGTH),
                         batched=True, remove_columns=["text", "lang"])

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [14]:
tokenized

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 5000
})

In [15]:
print(tokenized[:10]) 

{'input_ids': [[3686, 35420, 302, 1], [3048, 302, 787, 277, 1578, 783, 287, 6844, 20561, 351, 287, 30438, 261, 259, 69381, 291, 1], [24277, 3165, 8965, 259, 29799, 3165, 6640, 1489, 16711, 1586, 6114, 17318, 38803, 259, 291, 1], [1669, 5666, 1001, 302, 1], [58255, 234438, 75328, 259, 80583, 163816, 91298, 1939, 26171, 4258, 42285, 22928, 18827, 1939, 259, 176281, 260, 1], [2022, 17635, 7106, 32832, 17635, 309, 1], [7961, 339, 287, 16223, 861, 6535, 14049, 309, 1], [21201, 714, 260, 1], [259, 38524, 4858, 3775, 309, 1], [298, 204413, 291, 1]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1]]}


In [16]:
data_collator = T5DenoisingCollator(tokenizer, NOISE_DENSITY,
                                        MEAN_NOISE_SPAN_LENGTH,
                                        MAX_INPUT_LENGTH,
                                        MAX_TARGET_LENGTH)

In [17]:
training_args = Seq2SeqTrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        fp16=True,
        optim="adafactor",
        learning_rate=1e-4,
        warmup_ratio=0.01,
        num_train_epochs=1,
        logging_steps=500,
        save_steps=10000,
        save_total_limit=3,
        remove_unused_columns=False,
        report_to="none"
    )

In [18]:
trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

  trainer = Seq2SeqTrainer(


In [19]:
print("Start training ...")
trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Done.")

Start training ...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


Done.


In [20]:

from huggingface_hub import notebook_login
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

In [22]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:

# Step 2: Load your trained model + tokenizer (replace with your paths)
model = AutoModelForSeq2SeqLM.from_pretrained("./mt5-opus100-denoise")
tokenizer = AutoTokenizer.from_pretrained("./mt5-opus100-denoise")

In [26]:
# Step 3: Push to Hugging Face Hub
repo_name = "Eshan210352R/mt5-opus100-denoise-EN-SI"

model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print(f"✅ Model uploaded! Check it here: https://huggingface.co/{repo_name}")

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

✅ Model uploaded! Check it here: https://huggingface.co/Eshan210352R/mt5-opus100-denoise-EN-SI


## Perplexity Metric Evaluation

#### What is Perplexity?

- Perplexity (PPL) is a metric used to evaluate language models. It measures how well a probability model predicts a sample

In [80]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import math
import random
import numpy as np

# ---------------- Config ----------------
MODEL_PATH = "./mt5-opus100-denoise"  # pretrained model
HF_DATASET_NAME = "Helsinki-NLP/opus-100"
SRC_LANG = "en"
TGT_LANG = "si"
MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 128
BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_SAMPLES = 10  # subset for demo
NOISE_DENSITY = 0.15
MEAN_NOISE_SPAN_LENGTH = 3.0
# ----------------------------------------

# --- Load dataset ---
print("Loading dataset...")
ds = load_dataset(HF_DATASET_NAME, f"{SRC_LANG}-{TGT_LANG}", split="test")
ds = ds.select(range(NUM_SAMPLES))
texts = [ex["translation"][SRC_LANG] for ex in ds]
print(f"Loaded {len(texts)} examples.")
print("Sample:", texts[:2])

# --- Collator for T5 span corruption ---
class T5DenoisingCollator:
    def __init__(self, tokenizer, noise_density=0.15, mean_noise_span_length=3.0,
                 input_length=256, target_length=128):
        self.tokenizer = tokenizer
        self.noise_density = noise_density
        self.mean_noise_span_length = mean_noise_span_length
        self.input_length = input_length
        self.target_length = target_length
        self.pad_token_id = tokenizer.pad_token_id

    def __call__(self, texts):
        input_ids_list = []
        labels_list = []
        for text in texts:
            enc = self.tokenizer(text, truncation=True, max_length=self.input_length)
            corrupted, labels = self._span_corrupt(enc["input_ids"])
            input_ids_list.append(corrupted)
            labels_list.append(labels)

        batch = {
            "input_ids": torch.tensor(input_ids_list, dtype=torch.long).to(DEVICE),
            "labels": torch.tensor(labels_list, dtype=torch.long).to(DEVICE),
        }
        batch["attention_mask"] = (batch["input_ids"] != self.pad_token_id).long()
        return batch

    def _span_corrupt(self, tokens):
        tokens = [t for t in tokens if t != self.pad_token_id]
        num_to_mask = max(1, int(len(tokens) * self.noise_density))
        spans = []
        i = 0
        while i < num_to_mask:
            span_len = max(1, np.random.poisson(self.mean_noise_span_length))
            start = random.randint(0, max(0, len(tokens)-span_len))
            spans.append((start, min(len(tokens), start+span_len)))
            i += span_len
        spans = sorted(spans, key=lambda x: x[0])

        corrupted = []
        labels = []
        sentinel_id = self.tokenizer.convert_tokens_to_ids("<extra_id_0>")
        current_sentinel = sentinel_id
        last_idx = 0
        for start, end in spans:
            corrupted.extend(tokens[last_idx:start])
            corrupted.append(current_sentinel)
            labels.append(current_sentinel)
            labels.extend(tokens[start:end])
            current_sentinel += 1
            last_idx = end
        corrupted.extend(tokens[last_idx:])
        labels.append(self.pad_token_id)

        # Pad/truncate
        corrupted = corrupted[:self.input_length] + [self.pad_token_id] * max(0, self.input_length-len(corrupted))
        labels = labels[:self.target_length] + [-100] * max(0, self.target_length-len(labels))
        return corrupted, labels

# --- Load model & tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(DEVICE)
model.eval()

collator = T5DenoisingCollator(tokenizer, NOISE_DENSITY, MEAN_NOISE_SPAN_LENGTH,
                               MAX_INPUT_LENGTH, MAX_TARGET_LENGTH)

# --- Evaluate PPL and print sample predictions ---
def evaluate(model, tokenizer, texts, collator, batch_size=4, max_samples_print=5):
    nlls = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch = collator(batch_texts)
        with torch.no_grad():
            outputs = model(**batch)  # labels already in batch
            nlls.append(outputs.loss.item())

            # Print predictions for first batch
            if i == 0:
                preds = model.generate(batch["input_ids"], attention_mask=batch["attention_mask"],
                                       max_length=MAX_TARGET_LENGTH)
                decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
                decoded_labels = tokenizer.batch_decode([[t for t in l if t != -100] for l in batch["labels"]],
                                                        skip_special_tokens=True)
                print("\n--- Sample Reconstructions ---")
                for src, lab, pred in zip(batch_texts[:max_samples_print],
                                          decoded_labels[:max_samples_print],
                                          decoded_preds[:max_samples_print]):
                    print(f"Source   : {src}")
                    print(f"Target   : {lab}")
                    print(f"Predicted: {pred}")
                    print("-----------")

    avg_nll = sum(nlls)/len(nlls)
    ppl = math.exp(avg_nll)
    return ppl

print("\nEvaluating pretrained model...")
ppl = evaluate(model, tokenizer, texts, collator, batch_size=BATCH_SIZE)
print(f"\nPerplexity: {ppl:.2f}")


Loading dataset...
Loaded 10 examples.
Sample: ['Because I believed in destiny and fate..', 'You will get know slowly.']

Evaluating pretrained model...

--- Sample Reconstructions ---
Source   : Because I believed in destiny and fate..
Target   : d in
Predicted: <extra_id_0>..
-----------
Source   : You will get know slowly.
Target   : know slowly
Predicted: <extra_id_0>.
-----------
Source   : Buckle up, guys. I am increasing the speed.
Target   : , <0x00> 
Predicted: <extra_id_0><0x00>s.
-----------
Source   : I'm going to the bathroom again.
Target   : m going
Predicted: <extra_id_0> again.
-----------

Perplexity: 45727738347.61
