In [42]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
!pip install sentencepiece
!pip install sacrebleu
!pip install evaluate
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datasets import DatasetDict, Dataset
import os
import torch
import random
from tqdm import tqdm
import sacrebleu
import sentencepiece as spm
from torch.utils.data import DataLoader
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
from transformers import TrainerCallback
import random
import csv
from datetime import datetime
from transformers import (MBart50TokenizerFast, MBart50Tokenizer, 
                    MBartConfig, MBartForConditionalGeneration, DataCollatorForSeq2Seq)

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2

In [43]:
# Load the parallel corpus (adjust the path for your environment) and keep only
# the two columns used below: 'phones' (model input) and 'hi_text' (target text).
df = pd.read_csv(
    "/kaggle/input/ugce-phone-to-hindi/phone_20k_en2hi_translation.csv"
).loc[:, ["phones", "hi_text"]]

In [44]:
# Make sure the folder that will hold the tokenizer artifacts exists
# (exist_ok=True makes this a no-op when it is already there).
os.makedirs(name="tokenizer", exist_ok=True)

In [45]:
# Write every 'phones' string and its 'hi_text' counterpart, one per line, into
# a single text file; this file is the input of the SentencePiece trainer.
with open("tokenizer/all_text.txt", mode="w", encoding="utf-8") as corpus_file:
    for phones, hi_text in zip(df["phones"], df["hi_text"]):
        corpus_file.write(f"{phones}\n")
        corpus_file.write(f"{hi_text}\n")

In [46]:
# Train a 4000-piece BPE SentencePiece model on the combined corpus.
# The artifacts are written next to the corpus using the 'tokenizer/mbart' prefix.
# Reserved ids: unknown=0, padding=1, begin-of-sentence=2, end-of-sentence=3.
spm_options = dict(
    input="tokenizer/all_text.txt",
    model_prefix="tokenizer/mbart",
    model_type="bpe",
    vocab_size=4000,
    character_coverage=1.0,  # keep every character seen in the corpus
    unk_id=0,
    pad_id=1,
    bos_id=2,
    eos_id=3,
)
spm.SentencePieceTrainer.Train(**spm_options)

In [47]:
# Wrap the trained SentencePiece model in a (slow) MBart-50 tokenizer.
# The model file is addressed with the same relative location the trainer wrote
# to (model_prefix='tokenizer/mbart'), so this cell no longer depends on the
# Kaggle-specific /kaggle/working directory being the current directory.
# The language codes mirror the ones configured on the fast tokenizer, so both
# tokenizer objects agree on the source and target language.
tokenizer = MBart50Tokenizer(
    vocab_file="tokenizer/mbart.model",
    src_lang="en_XX",
    tgt_lang="hi_IN",
)

# Save the tokenizer to a directory so it can be reloaded with from_pretrained().
# Left as the last expression so the list of written files is displayed.
tokenizer.save_pretrained("tokenizer/")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/sentencepiece.bpe.model',
 'tokenizer/added_tokens.json')

In [48]:
# Language codes for this task: inputs are tagged 'en_XX', targets 'hi_IN'.
src_lang, tgt_lang = "en_XX", "hi_IN"

# Reload the tokenizer saved above as a fast tokenizer and attach the
# source/target language codes to it.
fast_tokenizer = MBart50TokenizerFast.from_pretrained("tokenizer/")
fast_tokenizer.src_lang = src_lang
fast_tokenizer.tgt_lang = tgt_lang

In [49]:
# Extra marker tokens to register as special tokens on the fast tokenizer.
# NOTE(review): by default this call presumably replaces the tokenizer's existing
# additional_special_tokens list rather than extending it - confirm that is intended.
special_tokens = dict(
    additional_special_tokens=["<pad>", "<eos>", "<sos>", "<cls>", "<phoneme>", "<hi_IN>"],
)

# The call is the last expression so the number of newly added tokens is displayed.
fast_tokenizer.add_special_tokens(special_tokens_dict=special_tokens)

5

In [50]:
# Configuration of a small MBart model that is trained from scratch on the
# custom vocabulary.
#
# Two fixes compared to the previous version of this cell:
#   * `num_heads` is not an MBartConfig parameter, so the intended 8 attention
#     heads were silently ignored and the library default was used instead.
#     The heads are now set through encoder_attention_heads /
#     decoder_attention_heads.
#   * mBART-50 style generation starts the decoder with the EOS token and forces
#     the target language code as the first generated token. The language code
#     therefore goes into forced_bos_token_id rather than decoder_start_token_id.
config = MBartConfig(
    vocab_size=len(fast_tokenizer),  # size of the custom vocabulary (incl. added tokens)
    d_model=512,  # model (embedding) dimension
    encoder_layers=6,  # number of encoder layers
    decoder_layers=6,  # number of decoder layers
    encoder_ffn_dim=2048,  # encoder feed-forward size
    decoder_ffn_dim=2048,  # decoder feed-forward size
    encoder_attention_heads=8,  # attention heads per encoder layer
    decoder_attention_heads=8,  # attention heads per decoder layer
    max_position_embeddings=512,  # maximum sequence length
    dropout=0.1,  # dropout rate for regularization
    activation_function="gelu",  # activation used in the feed-forward blocks
    pad_token_id=fast_tokenizer.pad_token_id,
    eos_token_id=fast_tokenizer.eos_token_id,
    bos_token_id=fast_tokenizer.bos_token_id,
    decoder_start_token_id=fast_tokenizer.eos_token_id,
    forced_bos_token_id=fast_tokenizer.lang_code_to_id["hi_IN"],
)

In [51]:
# Build a randomly initialised model from the config, then make sure the
# embedding matrix has one row per tokenizer entry. The resize call is the last
# expression so the resulting embedding module is displayed.
model = MBartForConditionalGeneration(config)
model.resize_token_embeddings(new_num_tokens=len(fast_tokenizer))

MBartScaledWordEmbedding(4058, 512, padding_idx=1)

In [52]:
def tokenize_function(example, max_length=128):
    """Tokenize one dataset row into model inputs and labels.

    The source text (``example["phones"]``) is encoded as the model input and
    the target text (``example["hi_text"]``) is passed as ``text_target`` so the
    tokenizer encodes it in target mode, i.e. with the target language code that
    was configured on ``fast_tokenizer`` instead of the source language code.
    ``fast_tokenizer`` is used because it is the tokenizer whose ``src_lang`` /
    ``tgt_lang`` are set and whose vocabulary the model embeddings were sized for.

    Args:
        example: Mapping with the keys ``"phones"`` and ``"hi_text"``.
        max_length: Length that inputs and labels are truncated / padded to.

    Returns:
        The tokenizer output, containing ``input_ids``, ``attention_mask`` and
        ``labels``.
    """
    # NOTE(review): labels are still padded with the pad token id (not -100), so
    # padded positions presumably contribute to the loss. Masking them here would
    # require the decoding code elsewhere in the notebook to handle -100 first.
    return fast_tokenizer(
        example["phones"],
        text_target=example["hi_text"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

In [53]:
# Convert the pandas frame into a Hugging Face Dataset so it can be mapped/split.
dataset = Dataset.from_pandas(df=df)

In [54]:
# Tokenize the dataset one example at a time and drop the raw text columns,
# leaving only the tensors-to-be produced by tokenize_function.
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=["phones", "hi_text"],
    batched=False,
)

Map:   0%|          | 0/18636 [00:00<?, ? examples/s]

In [55]:
# Batch collator for sequence-to-sequence training, built from the fast tokenizer
# and the model.
data_collator = DataCollatorForSeq2Seq(fast_tokenizer, model=model)

In [56]:
# Display the tokenized dataset (its feature names and number of rows).
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 18636
})

In [58]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# NOTE(review): the Seq2SeqTrainer cell passes its own learning_rate (2e-5) and
# num_train_epochs (10) and is not handed this optimizer, so `epochs` and
# `optimizer` look unused by it - confirm before relying on these values.
epochs = 5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [63]:
# Metric object that compute_metrics uses to score the decoded predictions.
bleu_metric = evaluate.load(path="bleu")

# Compute the BLEU score
def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with BLEU.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays. When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        ``{"bleu": <score>}`` as computed by ``bleu_metric``.
    """
    predictions, labels = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is used as an "ignore" filler when sequences are padded for the loss
    # or gathered across batches; it is not a real token id and cannot be
    # decoded, so swap it for the pad token (removed again by
    # skip_special_tokens=True).
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        text.strip() for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip() for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # The metric expects a list of reference lists (one list per prediction).
    references = [[label] for label in decoded_labels]

    result = bleu_metric.compute(predictions=decoded_preds, references=references)
    return {"bleu": result["bleu"]}

# Hold out 10% of the tokenized examples for evaluation (fixed seed so the split
# is the same on every run).
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Print dataset sizes
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart50-hi",
    eval_strategy="epoch",         # Evaluate at the end of each epoch
    save_strategy="epoch",         # Save a checkpoint at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=1,            # Keep only the most recent checkpoint
    num_train_epochs=10,
    predict_with_generate=True,    # Score evaluation with generated sequences
    # Without an explicit limit, evaluation-time generation falls back to the
    # model's default maximum length, which is much shorter than the 128-token
    # labels and would truncate predictions before BLEU is computed.
    generation_max_length=128,
    # Mixed precision needs a GPU; mirror the CUDA guard used elsewhere in the
    # notebook so this cell also runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

# Create the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define a custom callback for inference after each epoch
class InferenceCallback(TrainerCallback):
    """Trainer callback that translates a fixed sample of eval rows after every epoch.

    A random subset of ``eval_dataset`` is chosen once at construction time. After
    each epoch the same source texts are re-translated, printed next to their
    references and collected in ``self.outputs``; when training ends the collected
    rows are written to a timestamped CSV file.

    Args:
        tokenizer: Tokenizer used to decode the dataset rows, encode the sample
            texts and decode the generated ids.
        model: Model whose ``generate`` method produces the predictions.
        eval_dataset: Dataset whose rows provide ``input_ids`` and ``labels``.
        num_samples: Number of rows to sample; capped at the dataset size.
        output_file: Base name of the CSV file; a timestamp is inserted before
            the extension when the file is written.
    """

    def __init__(self, tokenizer, model, eval_dataset, num_samples=10, output_file="inference_samples.csv"):
        self.tokenizer = tokenizer
        self.model = model
        self.eval_dataset = eval_dataset
        # random.sample raises ValueError when asked for more items than exist,
        # so never request more samples than the dataset holds.
        self.num_samples = max(0, min(num_samples, len(self.eval_dataset)))
        self.output_file = output_file
        self.sample_indices = random.sample(range(len(self.eval_dataset)), self.num_samples)

        # Decode input_ids and labels from the dataset
        self.sample_texts = [self._decode(self.eval_dataset[i]["input_ids"]) for i in self.sample_indices]
        self.references = [self._decode(self.eval_dataset[i]["labels"]) for i in self.sample_indices]

        self.outputs = []

    def _decode(self, token_ids):
        """Decode one id sequence to text, skipping negative filler ids such as -100."""
        valid_ids = [token_id for token_id in token_ids if token_id >= 0]
        return self.tokenizer.decode(valid_ids, skip_special_tokens=True)

    def on_epoch_end(self, args, state, control, **kwargs):
        """Generate, print and record predictions for the sampled texts."""
        if not self.sample_texts:
            return control

        # Prepare inputs and generate predictions
        inputs = self.tokenizer(
            self.sample_texts, padding=True, truncation=True, return_tensors="pt"
        ).to(self.model.device)

        # Switch to eval mode so dropout does not perturb generation, then put
        # the model back into whatever mode it was in before this hook ran.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                generated_ids = self.model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
        finally:
            self.model.train(was_training)
        predictions = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Save the generated outputs
        for src, ref, pred in zip(self.sample_texts, self.references, predictions):
            self.outputs.append({
                "epoch": state.epoch,
                "source": src,
                "reference": ref,
                "prediction": pred
            })

        # Console output of the predictions
        print(f"\n--- Inference After Epoch {state.epoch:.1f} ---")
        for src, pred, ref in zip(self.sample_texts, predictions, self.references):
            print(f"Source: {src}")
            print(f"Predicted: {pred}")
            print(f"Reference: {ref}")
            print("------")
        print("\n")

        return control

    def on_train_end(self, args, state, control, **kwargs):
        """Write all collected samples to ``<output_file stem>_<timestamp><ext>``."""
        keys = ["epoch", "source", "reference", "prediction"]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Honour the configured output_file; with the default name this yields
        # the same "inference_samples_<timestamp>.csv" file as before.
        stem, ext = os.path.splitext(self.output_file)
        output_csv = f"{stem}_{timestamp}{ext or '.csv'}"

        with open(output_csv, "w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(self.outputs)

        print(f"Inference samples saved to {output_csv}")

# Callback that prints and records sample translations; it samples 10 rows of
# the evaluation split.
inference_callback = InferenceCallback(
    tokenizer=tokenizer,
    model=model,
    eval_dataset=eval_dataset,
    num_samples=10,
)

# Put the model on the GPU when one is available, otherwise keep it on the CPU.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# Assemble the trainer from the model, data splits, collator, metric function
# and the inference callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[inference_callback],
)

Train dataset size: 16772
Eval dataset size: 1864


  trainer = Seq2SeqTrainer(


In [64]:
# Start training with the trainer configured above.
trainer.train()

<IPython.core.display.Javascript object>

KeyboardInterrupt: 