In [None]:
# Cell 1: Install required libraries
!pip install transformers[sentencepiece] datasets evaluate sacrebleu -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Cell 2
from google.colab import files
import os
import zipfile
import io

# --- 1. Upload the zip file ---
print("Please upload your .zip file (e.g., quran_dataset.zip)")
uploaded = files.upload()

# Check if a file was uploaded
if not uploaded:
    print("\nNo file was uploaded. Please run the cell again.")
else:
    # Get the name of the uploaded zip file (assuming only one file)
    zip_file_name = list(uploaded.keys())[0]

    # --- 2. Define where to extract the data ---
    extract_path = '/content/quran_data'

    # Create the extraction directory if it doesn't exist
    os.makedirs(extract_path, exist_ok=True)

    # --- 3. Unzip the file ---
    print(f"\nUnzipping '{zip_file_name}' to '{extract_path}'...")

    # Read the uploaded file's content
    zip_data = uploaded[zip_file_name]

    # Use zipfile to extract from the in-memory data
    with zipfile.ZipFile(io.BytesIO(zip_data), 'r') as z:
        z.extractall(extract_path)

    print(f"Successfully unzipped. Your data is now in: {extract_path}")
    print("\nFiles in the directory:")
    !ls -l {extract_path}

Please upload your .zip file (e.g., quran_dataset.zip)


Saving umc005-corpus.zip to umc005-corpus.zip

Unzipping 'umc005-corpus.zip' to '/content/quran_data'...
Successfully unzipped. Your data is now in: /content/quran_data

Files in the directory:
total 8
drwxr-xr-x 2 root root 4096 Nov  2 17:21 bible
drwxr-xr-x 2 root root 4096 Nov  2 17:21 quran


In [None]:
# List the top level of the extraction directory to locate the parallel
# English (.en) / Urdu (.ur) text files.
!ls -l /content/quran_data

# In the recorded run this level contains only the 'bible' and 'quran'
# sub-directories, so the text files have to be looked up inside them.

total 8
drwxr-xr-x 2 root root 4096 Nov  2 17:21 bible
drwxr-xr-x 2 root root 4096 Nov  2 17:21 quran


In [None]:

# Inspect the 'quran' sub-directory to find the train/dev/test .en and .ur files.
!ls -l /content/quran_data/quran

total 7460
-rw-r--r-- 1 root root   16857 Nov  2 17:21 dev.en
-rw-r--r-- 1 root root   23958 Nov  2 17:21 dev.ur
-rw-r--r-- 1 root root 1182350 Nov  2 17:21 Quran-EN
-rw-r--r-- 1 root root 1757854 Nov  2 17:21 Quran-UR
-rw-r--r-- 1 root root 1747185 Nov  2 17:21 Quran-UR-normalized
-rw-r--r-- 1 root root   16825 Nov  2 17:21 test.en
-rw-r--r-- 1 root root   25086 Nov  2 17:21 test.ur
-rw-r--r-- 1 root root 1148668 Nov  2 17:21 train.en
-rw-r--r-- 1 root root 1698141 Nov  2 17:21 train.ur


In [None]:
# Recursive listing of the whole extraction directory, sub-directories included.
!ls -lR /content/quran_data

/content/quran_data:
total 8
drwxr-xr-x 2 root root 4096 Nov  2 17:21 bible
drwxr-xr-x 2 root root 4096 Nov  2 17:21 quran

/content/quran_data/bible:
total 6384
-rw-r--r-- 1 root root  979699 Nov  2 17:21 Bible-EN
-rw-r--r-- 1 root root 1600335 Nov  2 17:21 Bible-UR
-rw-r--r-- 1 root root 1479638 Nov  2 17:21 Bible-UR-normalized
-rw-r--r-- 1 root root   42702 Nov  2 17:21 dev.en
-rw-r--r-- 1 root root   66972 Nov  2 17:21 dev.ur
-rw-r--r-- 1 root root   40662 Nov  2 17:21 test.en
-rw-r--r-- 1 root root   60773 Nov  2 17:21 test.ur
-rw-r--r-- 1 root root  896335 Nov  2 17:21 train.en
-rw-r--r-- 1 root root 1351893 Nov  2 17:21 train.ur

/content/quran_data/quran:
total 7460
-rw-r--r-- 1 root root   16857 Nov  2 17:21 dev.en
-rw-r--r-- 1 root root   23958 Nov  2 17:21 dev.ur
-rw-r--r-- 1 root root 1182350 Nov  2 17:21 Quran-EN
-rw-r--r-- 1 root root 1757854 Nov  2 17:21 Quran-UR
-rw-r--r-- 1 root root 1747185 Nov  2 17:21 Quran-UR-normalized
-rw-r--r-- 1 root root   16825 Nov  2 17:21 t

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import os

# --- 1. Define the file paths based on your output ---
# Pre-split Quran files from the extracted archive: one sentence per line,
# with the .en and .ur files expected to be line-aligned (TODO confirm).
train_en_file = '/content/quran_data/quran/train.en'
train_ur_file = '/content/quran_data/quran/train.ur'
test_en_file  = '/content/quran_data/quran/test.en'
test_ur_file  = '/content/quran_data/quran/test.ur'

# --- 2. Helper function to load a pair of files into a Dataset ---
def load_parallel_dataset(en_path, ur_path):
    """Load two line-aligned text files into a Hugging Face ``Dataset``.

    Args:
        en_path: Path to the English file (one sentence per line).
        ur_path: Path to the Urdu file (one sentence per line).

    Returns:
        A ``Dataset`` with the columns ``'en'`` and ``'ur'``, or ``None``
        (after printing an error) when either file does not exist.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """
    # Check if files exist (English first, so the first missing file is reported)
    for path in (en_path, ur_path):
        if not os.path.exists(path):
            print(f"Error: File not found at {path}")
            return None

    def _read_lines(path):
        # 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark, which
        # would otherwise end up as '\ufeff' at the start of the first sentence.
        with open(path, 'r', encoding='utf-8-sig') as f:
            return f.read().splitlines()

    en_lines = _read_lines(en_path)
    ur_lines = _read_lines(ur_path)

    # A parallel corpus is only usable when both sides pair up line by line.
    if len(en_lines) != len(ur_lines):
        raise ValueError(
            f"Line count mismatch: {en_path} has {len(en_lines)} lines, "
            f"{ur_path} has {len(ur_lines)} lines"
        )

    # Create a pandas DataFrame
    df = pd.DataFrame({'en': en_lines, 'ur': ur_lines})

    # Convert the DataFrame to a Hugging Face Dataset object
    return Dataset.from_pandas(df)

# --- 3. Load the train and test datasets ---
train_dataset = load_parallel_dataset(train_en_file, train_ur_file)
test_dataset = load_parallel_dataset(test_en_file, test_ur_file)

# --- 4. Combine them into a single DatasetDict ---
# The rest of the notebook expects this 'raw_datasets' object
# with a 'train' key and a 'test' key.
# The loader signals failure with None, so compare against None explicitly
# rather than relying on the truthiness of the Dataset objects.
if train_dataset is not None and test_dataset is not None:
    raw_datasets = DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })

    print("Successfully loaded pre-split data:")
    print(raw_datasets)

    # Indexing row 0 is only valid when the split has at least one row.
    if len(raw_datasets['train']) > 0:
        print("\nExample from the training data:")
        print(raw_datasets['train'][0])
else:
    print("\nFailed to load datasets. Please check file paths.")

Successfully loaded pre-split data:
DatasetDict({
    train: Dataset({
        features: ['en', 'ur'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['en', 'ur'],
        num_rows: 200
    })
})

Example from the training data:
{'en': '\ufeffAll praise be to Allah alone , the Sustainer of all the worlds .', 'ur': '\ufeffسب تعریفیں اللہ ہی کے لئے ہیں جو تمام جہانوں کی پرورش فرمانے والا ہے ۔'}


In [None]:
# Cell 5a: Install recommended library
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m890.9/897.5 kB[0m [31m26.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
# Cell 5 (Corrected)
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-ur"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

print("Tokenizer loaded.")

# Example of tokenization
sample = raw_datasets['train'][0]
# Source-side (English) tokenization
inputs = tokenizer(sample['en'])
# Target-side (Urdu) tokenization: `text_target=` replaces the deprecated
# `as_target_tokenizer()` context manager.
labels = tokenizer(text_target=sample['ur'])

print(f"\nSample English: {sample['en']}")
print(f"Tokenized Inputs: {inputs['input_ids']}")
print(f"Sample Urdu: {sample['ur']}")
print(f"Tokenized Labels: {labels['input_ids']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Tokenizer loaded.

Sample English: ﻿All praise be to Allah alone , the Sustainer of all the worlds .
Tokenized Inputs: [306, 1343, 51, 15, 43, 1096, 73, 2, 3, 11086, 10, 120, 3, 2241, 73, 5, 0]
Sample Urdu: ﻿سب تعریفیں اللہ ہی کے لئے ہیں جو تمام جہانوں کی پرورش فرمانے والا ہے ۔
Tokenized Labels: [126, 1640, 238, 49, 79, 6, 89, 21, 33, 428, 2993, 12, 3550, 1905, 101, 8, 39, 0]




In [None]:
# Cell 6: Create Preprocessing Function
# Maximum number of tokens kept per source / target sentence (longer ones are truncated)
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of English/Urdu sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with the keys ``"en"`` (source sentences) and
            ``"ur"`` (target sentences).

    Returns:
        The tokenizer output for the English sentences with an added
        ``"labels"`` entry holding the token ids of the Urdu sentences.
    """
    # Tokenize the English (input) sentences
    model_inputs = tokenizer(
        examples["en"],
        max_length=max_input_length,
        truncation=True
    )

    # Tokenize the Urdu (target/label) sentences.
    # `text_target=` selects target-side tokenization and replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["ur"],
        max_length=max_target_length,
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Cell 7: Apply Preprocessing to the Entire Dataset
# .map() runs preprocess_function over every split; batched=True hands it
# a batch of examples per call instead of a single example.
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
)

print("\nTokenized dataset structure:", tokenized_datasets, sep="\n")

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]


Tokenized dataset structure:
DatasetDict({
    train: Dataset({
        features: ['en', 'ur', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['en', 'ur', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})


In [None]:
# Cell 8: Load the Pre-trained Model
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Pre-trained seq2seq model for the checkpoint named in `model_checkpoint`
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_checkpoint,
)

# Collator that pads each batch on the fly to the length that batch needs,
# which is cheaper than padding every sentence to the global maximum.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
# Cell 9: Define Evaluation Metric (BLEU)
import evaluate
import numpy as np

# Load the SacreBLEU metric through the 'evaluate' library; compute_metrics
# calls metric.compute() on the decoded predictions and references.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for the BLEU metric.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A tuple ``(predictions, references)``: a flat list of stripped
        predictions, and a list holding one single-element list per
        stripped reference.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        # Each prediction gets its own list of references (here exactly one).
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Compute the BLEU score for a batch of evaluation predictions.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple, in which case its first element is used.

    Returns:
        A dict ``{"bleu": <score>}`` with the score reported by ``metric``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Swap it for the pad token on BOTH sides before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predicted and ground-truth tokens back to text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process for BLEU
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute BLEU score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Cell 10a: Check version
# Print the installed transformers package metadata (version, location, dependencies).
!pip show transformers

Name: transformers
Version: 4.57.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.12/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers


In [None]:
# Cell 10 (Modified to disable wandb)
from transformers import Seq2SeqTrainingArguments

import torch  # needed here only to detect whether a CUDA GPU is available

# Hyper-parameters and run-time options for fine-tuning.
training_args = Seq2SeqTrainingArguments(
    output_dir="my_eng_ur_model",
    eval_strategy="epoch",             # evaluate once at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,        # generate full translations so BLEU can be computed
    fp16=torch.cuda.is_available(),    # fp16 mixed precision requires a CUDA GPU
    report_to="none",                  # disable external experiment loggers such as wandb
)

In [None]:
# Cell 11: Initialize the Trainer
from transformers import Seq2SeqTrainer

# Wire the model, training configuration, data and metric together.
# NOTE(review): the 'test' split doubles as the per-epoch evaluation set; the
# extracted corpus also ships dev.en/dev.ur, which could serve as a separate
# validation split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `processing_class` is the current name of the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Pass our BLEU function
)

  trainer = Seq2SeqTrainer(


In [None]:
# Cell 12: Start Training!
# Runs the fine-tuning loop configured on `trainer`; run time depends on the
# hardware of the session.
# Validation loss and BLEU are reported after each epoch (see the recorded output).
print("Starting training...")
trainer.train()
print("Training finished.")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Starting training...


Epoch,Training Loss,Validation Loss,Bleu
1,No log,1.186657,28.153249
2,0.997700,1.146453,30.048582
3,0.750300,1.145079,30.027187




Training finished.


In [None]:
# Cell 13: Quantitative Evaluation (BLEU Score)
# Run the final evaluation on the trainer's eval_dataset (the tokenized test set)
# and report the BLEU value stored under the 'eval_bleu' key.
print("Running final evaluation...")
eval_results = trainer.evaluate()
print("--- Evaluation Results ---")
print(f"  BLEU Score: {eval_results['eval_bleu']:.2f}")
print("--------------------------")

Running final evaluation...


--- Evaluation Results ---
  BLEU Score: 30.03
--------------------------


In [None]:
# Cell 14: Save Your Fine-Tuned Model
# Save the model and tokenizer into the same directory so both can later be
# loaded from that single path (the qualitative-test cell does this).
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")
print("Model saved to ./final_model")

Model saved to ./final_model


In [None]:
# Cell 15: Qualitative Analysis (Test with Examples)
# This is the most important part for your report.
from transformers import pipeline
import random

# Load a "pipeline" with your new fine-tuned model
translator = pipeline("translation", model="./final_model", tokenizer="./final_model")

print("\n--- Testing with random samples from the test set ---")

for i in range(5):
    # Pick a random sample from the test set.
    # randrange(n) yields 0..n-1; randint(0, n) could also return n itself,
    # which is one past the last valid index and raises IndexError.
    sample = raw_datasets["test"][random.randrange(len(raw_datasets["test"]))]

    input_text = sample['en']
    gold_translation = sample['ur']

    # Get the model's translation
    model_translation = translator(input_text)[0]['translation_text']

    print(f"\nSample {i+1}:")
    print(f"  ENGLISH:   {input_text}")
    print(f"  GOLD URDU: {gold_translation}")
    print(f"  MODEL URDU: {model_translation}")

print("\n--- Test with your own sentence ---")
my_sentence = "In the name of God, the Most Gracious, the Most Merciful."
my_translation = translator(my_sentence)[0]['translation_text']
print(f"  ENGLISH:   {my_sentence}")
print(f"  MODEL URDU: {my_translation}")

Device set to use cuda:0



--- Testing with random samples from the test set ---

Sample 1:
  ENGLISH:   And does not promote the cause of feeding the poor ( i . e . does not strive to end the economic exploitation of the poor and the needy ) .
  GOLD URDU: اور محتاج کو کھانا کھلانے کی ترغیب نہیں دیتا یعنی معاشرے سے غریبوں اور محتاجوں کے معاشی استحصال کے خاتمے کی کوشش نہیں کرتا ۔
  MODEL URDU: اور محتاج کو کھانا کھلانے کی ترغیب نہیں دیتا ۔

Sample 2:
  ENGLISH:   But he who is miser and disregards ( spending in the cause of Allah ) .
  GOLD URDU: اور جس نے بخل کیا اور راہ حق میں مال خرچ کرنے سے بے پروا رہا ۔
  MODEL URDU: اور وہ جس نے بخل کیا اور بے پرواہ بنا

Sample 3:
  ENGLISH:   By the sun and by its brightness .
  GOLD URDU: سورج کی قسم اور اس کی روشنی کی قسم ۔
  MODEL URDU: سورج کی قسم اور اس کی روشنی کی

Sample 4:
  ENGLISH:   Beseech : I seek refuge with the Lord Who brought ( the universe ) into existence with ( an ) explosion extremely fast .
  GOLD URDU: آپ عرض کیجئے کہ میں ایک دھماکے سے انتہائی تیزی