In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.5.1
    Uninstalling fsspec-2025.5.1:
      Successfully uninstalled fsspec-2025.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud

In [3]:
import logging
import random
from typing import List, Dict
import numpy as np
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    pipeline
)
import torch
from tqdm import tqdm
import evaluate

2025-09-01 04:44:00.108387: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756701840.270886      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756701840.319640      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:

# ---------- Config ----------
MODEL_NAME = "google/mt5-base"
HF_DATASET_NAME = "Helsinki-NLP/opus-100"
OUTPUT_DIR = "./mt5-opus100-denoise-final"
 # Change this

MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 128
NOISE_DENSITY = 0.15
MEAN_NOISE_SPAN_LENGTH = 3.0
TRAIN_SAMPLE_SIZE = 1000 # Increased for better training
VAL_SAMPLE_SIZE = 250
LANGUAGE_PAIRS = ['en-si']  # Multiple languages
# ----------------------------

In [5]:
def prepare_datasets():
    """Prepare training and validation datasets from multiple language pairs"""
    train_datasets = []
    val_datasets = []
    
    for lang_pair in LANGUAGE_PAIRS:
        try:
            print(f"Loading {lang_pair} dataset...")
            dataset = load_dataset(HF_DATASET_NAME, lang_pair)
            
            # Process train split
            train_mono = convert_to_monolingual(dataset['train'], lang_pair)
            if TRAIN_SAMPLE_SIZE:
                train_mono = train_mono.select(range(min(TRAIN_SAMPLE_SIZE, len(train_mono))))
            train_datasets.append(train_mono)
            
            # Process validation split if available
            if 'validation' in dataset:
                val_mono = convert_to_monolingual(dataset['validation'], lang_pair)
                if VAL_SAMPLE_SIZE:
                    val_mono = val_mono.select(range(min(VAL_SAMPLE_SIZE, len(val_mono))))
                val_datasets.append(val_mono)
                
        except Exception as e:
            print(f"Failed to load {lang_pair}: {e}")
    
    # Combine all datasets
    train_dataset = concatenate_datasets(train_datasets)
    val_dataset = concatenate_datasets(val_datasets) if val_datasets else None
    
    print(f"Final training dataset size: {len(train_dataset)}")
    if val_dataset:
        print(f"Final validation dataset size: {len(val_dataset)}")
    
    return train_dataset, val_dataset

In [6]:
def convert_to_monolingual(dataset, lang_pair):
    """Convert parallel corpus to monolingual format"""
    src_lang, tgt_lang = lang_pair.split('-')
    
    def process_example(example):
        outputs = []
        if example['translation'].get(src_lang):
            outputs.append({
                "text": example['translation'][src_lang].strip(),
                "lang": src_lang
            })
        if example['translation'].get(tgt_lang):
            outputs.append({
                "text": example['translation'][tgt_lang].strip(),
                "lang": tgt_lang
            })
        return outputs
    
    all_texts = []
    for example in tqdm(dataset, desc=f"Processing {lang_pair}"):
        all_texts.extend(process_example(example))
    
    return Dataset.from_list(all_texts)

In [7]:
class T5SpanCorruptionCollator:
    def __init__(self, tokenizer, noise_density=0.15, mean_noise_span_length=3.0,
                 input_length=256, target_length=128):
        self.tokenizer = tokenizer
        self.noise_density = noise_density
        self.mean_noise_span_length = mean_noise_span_length
        self.input_length = input_length
        self.target_length = target_length
        self.pad_token_id = tokenizer.pad_token_id
        self.sentinel_start_id = tokenizer.convert_tokens_to_ids("<extra_id_0>")
        
    def __call__(self, examples):
        # Get input_ids from tokenized examples
        input_ids_list = [ex["input_ids"] for ex in examples]
        
        corrupted_batch = []
        labels_batch = []
        
        for input_ids in input_ids_list:
            corrupted, labels = self.corrupt_spans(input_ids)
            corrupted_batch.append(corrupted)
            labels_batch.append(labels)
        
        # Pad sequences
        corrupted_batch = self.tokenizer.pad(
            {"input_ids": corrupted_batch},
            padding=True,
            max_length=self.input_length,
            return_tensors="pt",
        )["input_ids"]
        
        labels_batch = self.tokenizer.pad(
            {"input_ids": labels_batch},
            padding=True,
            max_length=self.target_length,
            return_tensors="pt",
        )["input_ids"]
        
        # Replace padding token id with -100 for labels
        labels_batch[labels_batch == self.tokenizer.pad_token_id] = -100
        
        return {
            "input_ids": corrupted_batch,
            "labels": labels_batch,
            "attention_mask": (corrupted_batch != self.tokenizer.pad_token_id).long(),
        }
    
    def corrupt_spans(self, input_ids):
        """Apply span corruption to a sequence"""
        # Remove padding tokens
        input_ids = [id for id in input_ids if id != self.pad_token_id]
        num_tokens = len(input_ids)
        
        # Calculate number of tokens to mask
        num_noise_tokens = int(round(num_tokens * self.noise_density))
        if num_noise_tokens == 0:
            num_noise_tokens = 1  # Ensure at least one token is masked
        
        # Create mask (0=keep, 1=mask)
        mask = np.zeros(num_tokens, dtype=int)
        mask_indices = np.random.choice(
            num_tokens, size=num_noise_tokens, replace=False
        )
        mask[mask_indices] = 1
        
        # Group consecutive masked tokens into spans
        spans = []
        i = 0
        while i < num_tokens:
            if mask[i] == 0:
                i += 1
                continue
            j = i
            while j < num_tokens and mask[j] == 1:
                j += 1
            spans.append((i, j))
            i = j
        
        # Build corrupted sequence and labels
        corrupted = []
        labels = []
        sentinel_id = self.sentinel_start_id
        prev = 0
        
        for start, end in spans:
            # Add non-masked tokens
            corrupted.extend(input_ids[prev:start])
            # Add sentinel token
            corrupted.append(sentinel_id)
            # Add masked tokens to labels with sentinel
            labels.append(sentinel_id)
            labels.extend(input_ids[start:end])
            # Update sentinel ID and prev pointer
            sentinel_id += 1
            prev = end
        
        # Add remaining tokens
        corrupted.extend(input_ids[prev:])
        
        # Truncate if needed
        corrupted = corrupted[:self.input_length]
        labels = labels[:self.target_length]
        
        return corrupted, labels

In [8]:
def evaluate_model(model, tokenizer, eval_dataset, data_collator, num_examples=5):
    """Evaluate the model on validation set and compare with original"""
    print("Evaluating model...")
    
    # Load metrics
    bleu_metric = evaluate.load("bleu")
    rouge_metric = evaluate.load("rouge")
    
    # Create text generation pipeline
    text2text = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
    
    # Select random examples
    eval_examples = eval_dataset.shuffle().select(range(min(num_examples, len(eval_dataset))))
    
    results = []
    for example in eval_examples:
        # Create corrupted input
        corrupted = data_collator.corrupt_spans(tokenizer.encode(example["text"]))[0]
        corrupted_text = tokenizer.decode(corrupted, skip_special_tokens=False)
        
        # Generate reconstruction
        output = text2text(corrupted_text, max_length=MAX_TARGET_LENGTH, num_beams=1)
        reconstructed = output[0]["generated_text"]
        
        results.append({
            "original": example["text"],
            "corrupted": corrupted_text,
            "reconstructed": reconstructed
        })
    
    # Calculate metrics
    references = [res["original"] for res in results]
    predictions = [res["reconstructed"] for res in results]
    
    bleu_score = bleu_metric.compute(predictions=predictions, references=references)
    rouge_score = rouge_metric.compute(predictions=predictions, references=references)
    
    print(f"BLEU Score: {bleu_score['bleu']:.4f}")
    print(f"ROUGE-L Score: {rouge_score['rougeL']:.4f}")
    
    # Print examples
    for i, res in enumerate(results[:3]):
        print(f"\nExample {i+1}:")
        print(f"Original: {res['original']}")
        print(f"Corrupted: {res['corrupted']}")
        print(f"Reconstructed: {res['reconstructed']}")
    
    return bleu_score, rouge_score, results

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
train_dataset, val_dataset = prepare_datasets()

Loading en-si dataset...


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/155k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/65.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/153k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/979109 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Processing en-si: 100%|██████████| 979109/979109 [00:26<00:00, 36632.27it/s]
Processing en-si: 100%|██████████| 2000/2000 [00:00<00:00, 40736.43it/s]

Final training dataset size: 1000
Final validation dataset size: 250





In [11]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding=False,  # We'll handle padding in the collator
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
        return_attention_mask=False,
    )

tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["lang"]  # Keep "text" for debugging if needed
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
print("Dataset features:", tokenized_train.features)
print("First example:", tokenized_train[0])

Dataset features: {'text': Value(dtype='string', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}
First example: {'text': 'Boone!', 'input_ids': [44509, 405, 309, 1]}


In [13]:
if val_dataset:
    print("comes to this")
    tokenized_val = val_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=val_dataset.column_names,
    )

comes to this


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [14]:
data_collator = T5SpanCorruptionCollator(
        tokenizer,
        NOISE_DENSITY,
        MEAN_NOISE_SPAN_LENGTH,
        MAX_INPUT_LENGTH,
        MAX_TARGET_LENGTH
    )

In [15]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    warmup_ratio=0.01,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=10,
    eval_steps=10,
    save_total_limit=3,
    predict_with_generate=True,  # Important for seq2seq models
    remove_unused_columns=False,
    report_to="none",
    eval_strategy="steps" if val_dataset else "no",  # Changed from evaluation_strategy to eval_strategy
    load_best_model_at_end=True if val_dataset else False,
    metric_for_best_model="eval_loss",  
    greater_is_better=False,  
)

In [16]:
# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val if val_dataset else None,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
    

  trainer = Seq2SeqTrainer(


In [17]:
# Train model
print("Starting training..........")
train_result = trainer.train()
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Starting training..........


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
10,28.9517,22.961605
20,27.7437,21.929482
30,25.3516,24.85342




RuntimeError: [enforce fail at inline_container.cc:626] . unexpected pos 3872159424 vs 3872159312