# Imports

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Loading the Dataset


In [None]:

# Read the English-Urdu sentence pairs from the Excel workbook
dataset_path = '/content/english_to_urdu_dataset.xlsx'  # Replace with your dataset file
data = pd.read_excel(dataset_path)

# Give the two columns descriptive names
data.columns = ['source_text', 'target_text']

# Show the loaded frame as the cell output
data

Unnamed: 0,source_text,target_text
0,the book of the generation of jesus christ th...,یسوع مسیح ابن داود ابن ابرہام کا نسب نامہ
1,abraham begat isaac and isaac begat jacob an...,ابراہام سے اضحاق پیدا ہوا اور اضحاق سے یعقوب پ...
2,and judas begat phares and zara of thamar and...,اور یہوداہ سے فارص اور زارح تمر سے پیدا ہوئے ا...
3,and aram begat aminadab and aminadab begat na...,اور رام سے عمینداب پیدا ہوا اور عمینداب سے نحس...
4,and salmon begat booz of rachab and booz bega...,اور سلمون سے بوعز راحب سے پیدا ہوا اور بوعز سے...
...,...,...
9098,Children who are sixteen years old or younger ...,سولہ سال سے کم عمر کے بچے ٹھیٹر میں داخل نہیں ...
9099,She borrowed the book from him many years ago ...,اس نے اس سے یہ کتاب کئی سال پہلے ادھار لی تھی ...
9100,She asked him to not quit his job because they...,اس نے اسے اپنی نوکری چھوڑنے سے منع کیا کیونکہ ...
9101,Tom would've liked to attend Mary's party. Unf...,ٹام میری کی پارٹی میں شامل تو ہونا چاہتا تھا م...


In [None]:
# Work with a fixed-size, seeded random subset of the sentence pairs.
# The sample size is capped at the number of available rows because
# `DataFrame.sample` raises ValueError when asked for more rows than exist
# (sampling without replacement), which would break the notebook as soon as
# a smaller dataset file is plugged in.
sample_size = min(6000, len(data))
data = data.sample(n=sample_size, random_state=42).reset_index(drop=True)
data.shape

(6000, 2)

In [None]:
# Per-column count of missing values
data.isnull().sum(axis=0)

Unnamed: 0,0
source_text,0
target_text,0


In [None]:
# Make sure both text columns hold string values
for text_column in ("source_text", "target_text"):
    data[text_column] = data[text_column].astype(str)

# Splits

In [None]:
# Hold out 10% of the sentence pairs for validation (seeded split)
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset format.
# `preserve_index=False` stops the shuffled pandas index from being stored as
# an extra `__index_level_0__` feature next to the two text columns.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
val_dataset = Dataset.from_pandas(val_data, preserve_index=False)

In [None]:
# Display the training split's feature names and row count
train_dataset

Dataset({
    features: ['source_text', 'target_text', '__index_level_0__'],
    num_rows: 5400
})

# Tokenization

In [None]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the pretrained mBART-50 checkpoint
checkpoint_name = "facebook/mbart-large-50"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [None]:
# Length, in token ids, of the longest encoded source sentence
input_max_len = max(len(tokenizer.encode(sentence)) for sentence in data['source_text'])
input_max_len

86

In [None]:
# Length, in token ids, of the longest encoded target sentence
output_max_len = max(len(tokenizer.encode(sentence)) for sentence in data['target_text'])
output_max_len

97

In [None]:

# Tokenization function
def preprocess_function(examples):
    """Tokenize a batch of English/Urdu sentence pairs for seq2seq training.

    Args:
        examples: A batch mapping with the keys ``'source_text'`` (English
            sentences) and ``'target_text'`` (Urdu sentences), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the source sentences (padded/truncated to
        128 tokens) with an added ``"labels"`` entry holding the target token
        ids, where padding positions are replaced by ``-100``.
    """
    # Set the translation direction here so the result does not depend on a
    # later cell having configured the tokenizer first.
    tokenizer.src_lang = "en_XX"
    tokenizer.tgt_lang = "ur_PK"

    # `text_target` makes the tokenizer encode the Urdu side in target mode,
    # so the labels carry the Urdu language code instead of the source one.
    model_inputs = tokenizer(
        examples['source_text'],
        text_target=examples['target_text'],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Replace label padding with -100, the index the loss ignores; otherwise
    # the model is also trained (and evaluated) on predicting pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

# Tokenize each split, then expose only the model inputs as PyTorch tensors
tensor_columns = ["input_ids", "attention_mask", "labels"]
tokenized_splits = []
for split in (train_dataset, val_dataset):
    split = split.map(preprocess_function, batched=True)
    split.set_format(type="torch", columns=tensor_columns)
    tokenized_splits.append(split)
train_dataset, val_dataset = tokenized_splits

Map:   0%|          | 0/5400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [None]:
# Display the first tokenized training example (the tensor columns selected by set_format)
train_dataset[0]

{'input_ids': tensor([250004,   1284,   3229,   1836,  18822,     71,  44171,  21392,  19542,
          59207,     70,   8966,  50509,    214,     70,  60097,   3815,    111,
           2355,    136,     70,   9351,    111,     55,   7921,  96950,   1836,
           3542,    961,   6328,  29367,  15044,    453,    136,  24793,      2,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1

# Load Pretrained Sequence-to-Sequence Model

In [None]:
# Mount Google Drive at /content/drive; the checkpoint and model directories
# used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Drive folders for training checkpoints and for the final saved model
results_dir = "/content/drive/MyDrive/results"
model_dir = "/content/drive/MyDrive/my_trans-model"

# Create both folders up front; existing folders are left untouched
for directory in (results_dir, model_dir):
    os.makedirs(directory, exist_ok=True)

In [None]:
from transformers import Seq2SeqTrainingArguments

from transformers import AutoModelForSeq2SeqLM

# AutoModelForSeq2SeqLM is a class in the Hugging Face Transformers library that automatically loads a pre-trained sequence-to-sequence model.
# It is used for tasks like machine translation, summarization, and other text generation tasks, where both the encoder and decoder are trained to process input and generate output sequences.

# Load a pre-trained sequence-to-sequence model
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")

# Translation direction: English source, Urdu target
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "ur_PK"

# mBART-50 formats text as `[lang_code] X [eos]`: generation keeps the
# checkpoint's own decoder start token (EOS) and forces the target-language
# code as the first generated token. Overriding `decoder_start_token_id` with
# the language code is the convention of the older mBART checkpoints, so the
# Urdu code is registered as the forced BOS token instead.
urdu_token_id = tokenizer.lang_code_to_id["ur_PK"]  # Urdu token
model.generation_config.forced_bos_token_id = urdu_token_id


# Seq2SeqTrainingArguments is a class in Hugging Face's Transformers library designed specifically for training sequence-to-sequence models.
# It provides various training configurations such as batch size, number of epochs, evaluation strategy, and output directory, optimized for tasks like translation, summarization, or text generation.

# Fine-tuning configuration: checkpoints go to `results_dir`, and the model is
# evaluated and saved once per epoch.
training_args = Seq2SeqTrainingArguments(
    output_dir=results_dir,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    predict_with_generate=True,
    generation_max_length=128,
    # Without this, an installed `wandb` package makes the trainer stop and ask
    # for an API key interactively, which blocks an unattended "Run all".
    # Set e.g. report_to="wandb" to opt back in to experiment tracking.
    report_to="none",
)


from transformers import Seq2SeqTrainer

# Bundle the model, the training configuration, both splits and the tokenizer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

# Run fine-tuning; the value returned by train() is shown as the cell output
trainer.train()


  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,1.7808,1.433966
2,1.2246,0.44963
3,0.2897,0.425998




TrainOutput(global_step=2025, training_loss=0.911448951061861, metrics={'train_runtime': 3206.0943, 'train_samples_per_second': 5.053, 'train_steps_per_second': 0.632, 'total_flos': 4388437780070400.0, 'train_loss': 0.911448951061861, 'epoch': 3.0})

# Save Model

In [None]:


# Make sure the target folder exists before writing to it
os.makedirs(model_dir, exist_ok=True)

# Write the tokenizer files and the fine-tuned model into the same folder
tokenizer.save_pretrained(model_dir)
trainer.save_model(model_dir)

# Read both back from disk; the translation cells use these reloaded copies
loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)
loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

print("Model and tokenizer reloaded successfully!")


['checkpoint-675', 'checkpoint-1350', 'checkpoint-2025', 'config.json', 'generation_config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'sentencepiece.bpe.model', 'tokenizer.json', 'training_args.bin']
Model and tokenizer reloaded successfully!


# Translation System

In [None]:
def translate_text(text, max_length=128, num_beams=4):
    """Translate ``text`` with the reloaded fine-tuned model and tokenizer.

    Args:
        text: The sentence to translate.
        max_length: Token limit used both for truncating the input and for
            the generated output (defaults to 128).
        num_beams: Beam width for beam search (defaults to 4).

    Returns:
        The decoded translation as a string, with special tokens removed.
    """
    # Tokenize input
    inputs = loaded_tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
    # Keep the input tensors on the same device as the model so the function
    # also works when the model has been moved to a GPU.
    inputs = inputs.to(loaded_model.device)
    # Generate translation; unpacking forwards the attention mask together
    # with the input ids instead of dropping it.
    outputs = loaded_model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    # Decode the best sequence back to text
    return loaded_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage: translate one English sentence and print the result
text_to_translate = "The book of the generation of Jesus Christ"
translated_text = translate_text(text_to_translate)
print("Translated text:", translated_text)


Translated text: مسیح یسوع کے زمانہ کی کتاب ۔


In [None]:
# Example usage: translate a second, shorter sentence and print the result
text_to_translate = "We appreciate you"
translated_text = translate_text(text_to_translate)
print("Translated text:", translated_text)

Translated text: ہم تمہیں بہت تسلّی دیتے ہیں ۔


In [None]:
# English sentences used to spot-check the fine-tuned model
texts_to_translate = [
    "Children who are sixteen years old or younger are not allowed in the theater",
    "She borrowed the book from him many years ago",
    "She asked him to not quit his job because they needed the money",
    "Tom would've liked to attend Mary's party, unfortunately, he couldn't",
    "When you meet someone for the first time, be careful about your impressions"
]
# `map` is lazy, so each sentence is translated right before its pair is printed
for sentence, translated_text in zip(texts_to_translate, map(translate_text, texts_to_translate)):
    print(f"Original: {sentence}")
    print(f"Translated: {translated_text}\n")

Original: Children who are sixteen years old or younger are not allowed in the theater
Translated: سولہ سال سے کم عمر کے بچے تیاح میں داخل نہیں ہوتے ۔

Original: She borrowed the book from him many years ago
Translated: اس نے اس سے کتاب کئی برس پہلے طلب کی تھی ۔

Original: She asked him to not quit his job because they needed the money
Translated: اس نے اس سے کہا کہ اپنے کام کو نہ چھوڑیں کیونکہ وہ پیسے چاہ رہے تھے ۔

Original: Tom would've liked to attend Mary's party, unfortunately, he couldn't
Translated: ٹام نے مریم کی پارٹی میں شرکت کرنا چاہا مگر اس نے نہ کی ۔

Original: When you meet someone for the first time, be careful about your impressions
Translated: جب تم پہلی دفعہ کسی سے ملے ہو، اپنے خیالات پر غور کرو ۔

