In [43]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/combined-2/parallel-corpus.csv


In [2]:
import pandas as pd
df=pd.read_csv('/kaggle/input/combined-2/parallel-corpus.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)
df.head()

Unnamed: 0,English,Urdu
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


In [3]:
df.isna().sum()

English    0
Urdu       0
dtype: int64

In [None]:
import os
import re
from datasets import Dataset
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, TrainingArguments, Trainer
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
os.environ["WANDB_API_KEY"] = user_secrets.get_secret("WANDB_API_KEY")

df = df.sample(frac=0.7, random_state=42).reset_index(drop=True)

dataset = Dataset.from_pandas(df)

tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.tgt_lang = "ur"

max_length = 128
def preprocess_function(examples):
    inputs = [ex.strip() for ex in examples["English"]]
    targets = [ex.strip() for ex in examples["Urdu"]]

    inputs = [re.sub(r"[^\w\s<>'\".,?!]+", "", text) for text in inputs]
    targets = [re.sub(r"[^\w\s<>'\".,?!]+", "", text) for text in targets]

    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["English", "Urdu"])

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

checkpoint_dir = "/kaggle/working/checkpoints"

training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    per_device_train_batch_size=4,
    run_name="mbart50-urdu-finetune",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",      
    save_total_limit=2,         
    logging_steps=50,
    push_to_hub=False,
    logging_first_step=True,
    report_to=["tensorboard"]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

last_checkpoint = None
if os.path.isdir(checkpoint_dir):
    checkpoints = [ckpt for ckpt in os.listdir(checkpoint_dir) if ckpt.startswith("checkpoint-")]
    if checkpoints:
        checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]))
        last_checkpoint = os.path.join(checkpoint_dir, checkpoints[-1])
        print(f"Resuming from checkpoint: {last_checkpoint}")

trainer.train(resume_from_checkpoint=last_checkpoint)

trainer.save_model("/kaggle/working/fine_tuned_mbart50")


2025-04-20 12:10:13.260232: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745151013.736157      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745151013.852038      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Map:   0%|          | 0/37899 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]



Step,Training Loss
1,11.3836
50,7.626
100,1.7412
150,0.2337
200,0.2018
250,0.2523
300,0.2383
350,0.1962
400,0.2002
450,0.2087




In [7]:
import os
for dirname, _, filenames in os.walk('/kaggle/outputs'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/outputs/fine_tuned_mbart50/generation_config.json
/kaggle/outputs/fine_tuned_mbart50/training_args.bin
/kaggle/outputs/fine_tuned_mbart50/config.json
/kaggle/outputs/fine_tuned_mbart50/model.safetensors
/kaggle/outputs/checkpoint-14214/generation_config.json
/kaggle/outputs/checkpoint-14214/optimizer.pt
/kaggle/outputs/checkpoint-14214/training_args.bin
/kaggle/outputs/checkpoint-14214/config.json
/kaggle/outputs/checkpoint-14214/model.safetensors
/kaggle/outputs/checkpoint-9476/trainer_state.json
/kaggle/outputs/checkpoint-9476/generation_config.json
/kaggle/outputs/checkpoint-9476/optimizer.pt
/kaggle/outputs/checkpoint-9476/rng_state.pth
/kaggle/outputs/checkpoint-9476/training_args.bin
/kaggle/outputs/checkpoint-9476/config.json
/kaggle/outputs/checkpoint-9476/scheduler.pt
/kaggle/outputs/checkpoint-9476/model.safetensors
/kaggle/outputs/checkpoint-4738/trainer_state.json
/kaggle/outputs/checkpoint-4738/generation_config.json
/kaggle/outputs/checkpoint-4738/optimizer.pt
/ka

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "ur_PK"

model_path = "/kaggle/outputs/fine_tuned_mbart50"
fine_tuned_model = MBartForConditionalGeneration.from_pretrained(model_path)

def translate_english_to_urdu(text):
    encoded_input = tokenizer(text, return_tensors="pt", max_length=128, truncation=True, padding=True)

    generated_tokens = fine_tuned_model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.lang_code_to_id["ur_PK"], 
        max_length=128
    )

    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    return translated_text



English: it is working
Urdu: یہ کام کر رہا ہے


In [12]:
english_sentence = "Abdul Moiz is a good boy"
urdu_translation = translate_english_to_urdu(english_sentence)
print(f"English: {english_sentence}")
print(f"Urdu: {urdu_translation}")

English: Abdul Moiz is a good boy
Urdu: عبدالمیز ایک اچھا لڑکا ہے
