In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:
data_path = '/kaggle/input/umc005/bible'

def load_corpus(file_path):
    file_path = os.path.join(data_path, file_path)
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()

train_en = load_corpus('train.en')
train_ur = load_corpus('train.ur')
dev_en = load_corpus('dev.en')
dev_ur = load_corpus('dev.ur')
test_en = load_corpus('test.en')
test_ur = load_corpus('test.ur')
en_corpus = train_en + test_en + dev_en
ur_corpus = train_ur + test_ur + dev_ur

assert len(train_en) == len(train_ur), "Training data misaligned!"
assert len(dev_en) == len(dev_ur), "Validation data misaligned!"
assert len(test_en) == len(test_ur), "Test data misaligned!"
print("Train Dataset Size:", len(train_en))
print("Test Dataset Size:", len(test_en))
print("Dev Dataset Size:", len(dev_en))

Train Dataset Size: 7400
Test Dataset Size: 257
Dev Dataset Size: 300


In [3]:
max_en_vocab = 6000
max_ur_vocab = 7100
en_seq_len = 65 # 70
ur_seq_len = 79 # 84 
import re

def clean_urdu(text):
    # Remove non-Urdu characters (keeping only Urdu script characters and spaces)
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  # Match Urdu characters and spaces
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = text.strip()  # Remove leading/trailing spaces
    text = 'START ' + text + ' END'  # Add start/end tokens
    return text

def clean_english(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = text.strip()  # Remove leading/trailing spaces
    return text

def clean_corpus(corpus, lang='en'):
    if lang == 'en': return [clean_english(t) for t in corpus]
    elif lang == 'ur': return [clean_urdu(t) for t in corpus]

train_en = clean_corpus(train_en,'en')
train_ur = clean_corpus(train_ur,'ur')
val_en = clean_corpus(dev_en,'en')
val_ur = clean_corpus(dev_ur,'ur')

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
import numpy as np
import torch

def create_dataset(en_texts, ur_texts):
    return Dataset.from_dict({
        "translation": [
            {"en": en, "ur": ur} 
            for en, ur in zip(en_texts, ur_texts)
        ]
    })

# Create train and validation datasets
train_dataset = create_dataset(train_en, train_ur)
val_dataset = create_dataset(val_en, val_ur)

# Load tokenizer and model
model_name = "Helsinki-NLP/opus-mt-en-ur"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Preprocessing function
def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["ur"] for ex in examples["translation"]]
    
    model_inputs = tokenizer(
        inputs, 
        max_length=128, 
        truncation=True, 
        padding="max_length",
        return_tensors="pt"
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=128,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names
)
tokenized_val = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=val_dataset.column_names
)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.91M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Map:   0%|          | 0/7400 [00:00<?, ? examples/s]



Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [5]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',              # output directory
    num_train_epochs=3,                  # number of training epochs
    per_device_train_batch_size=8,       # batch size for training
    per_device_eval_batch_size=8,        # batch size for evaluation
    warmup_steps=500,                    # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                   # weight decay to avoid overfitting
    logging_dir='./logs',                # directory for storing logs
    logging_steps=10,                    # log every 10 steps
    evaluation_strategy="epoch",         # evaluation strategy to use
    save_strategy="epoch",               # save checkpoint every epoch
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Create trainer
trainer = Seq2SeqTrainer(
    model=model,                         # the model to train
    args=training_args,                  # training arguments
    train_dataset=tokenized_train,       # training dataset
    eval_dataset=tokenized_val,          # evaluation dataset
)

# Fine-tune the model
trainer.train()

from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
)

# Initialize the Trainer
trainer = Trainer(
    train_dataset=train_data,            # training dataset
    eval_dataset=val_data,               # evaluation dataset
)

# Start fine-tuning
trainer.train()
model.save_pretrained("./en-ur-finetuned")
tokenizer.save_pretrained("./en-ur-finetuned")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113761977777711, max=1.0…

Epoch,Training Loss,Validation Loss
1,0.4693,0.559828
2,0.4262,0.542593
3,0.3267,0.545793


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


TrainOutput(global_step=2775, training_loss=0.5220938926129728, metrics={'train_runtime': 378.0473, 'train_samples_per_second': 58.723, 'train_steps_per_second': 7.34, 'total_flos': 752543701401600.0, 'train_loss': 0.5220938926129728, 'epoch': 3.0})

In [12]:
# Load saved model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("./en-ur-finetuned")
model = AutoModelForSeq2SeqLM.from_pretrained("./en-ur-finetuned")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available(): model = model.cuda()


# Function to translate new text
def translate(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text.replace("START ", "").replace(" END", "")

# Example usage
test_text = "What are you doing today?"
translation = translate(test_text)
print(f"English: {test_text}")
print(f"Urdu: {translation}")

English: What are you doing today?
Urdu: آج تم کیا کر رہے ہو ؟ ۔
