In [1]:
# Core
import os
import random
import numpy as np
import pandas as pd

# HuggingFace
from datasets import Dataset, ClassLabel

# Metrics (will be reused later)
from sklearn.metrics import accuracy_score, f1_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 42

random.seed(SEED)
np.random.seed(SEED)


In [4]:
DATASET_ROOT = r"C:\Users\areesa\Documents\Urdu_GLUE_xlm_roberta\data\raw\Urdu Paraphrasing\UPPC\UPPC Corpus"
DATA_DIR = os.path.join(DATASET_ROOT, "data")
LABEL_FILE = os.path.join(DATASET_ROOT, "all_files.txt")


In [5]:
print(len(os.listdir(DATA_DIR)))


160


In [6]:
from bs4 import BeautifulSoup

def extract_urdu_text(file_path):
    with open(file_path, encoding="utf-8") as f:
        raw = f.read()
    soup = BeautifulSoup(raw, "xml")
    doc = soup.find("UPPC_document")
    return doc.get_text().strip()


In [7]:
doc_texts = {}

for fname in os.listdir(DATA_DIR):
    doc_texts[fname] = extract_urdu_text(os.path.join(DATA_DIR, fname))

print("Documents loaded:", len(doc_texts))


Documents loaded: 160


In [8]:
pairs = []

with open(LABEL_FILE, encoding="utf-8") as f:
    lines = f.readlines()

for line in lines:
    f1, f2, label = line.strip().split(",")

    pairs.append({
        "sentence1": doc_texts[f1],
        "sentence2": doc_texts[f2],
        "label": 1 if label == "P" else 0
    })

df = pd.DataFrame(pairs)


In [9]:
print(df.shape)
print(df["label"].value_counts())
df.head()


(140, 3)
label
1    75
0    65
Name: count, dtype: int64


Unnamed: 0,sentence1,sentence2,label
0,چودھری رحمت علی 16 نومبر1897 کو مشرقی پنجاب ...,چودھر ی رحمت علی 16 نومبر 1897ء کو ہوشیارپور ک...,0
1,تقریباً 25 سال کی عمر میں آپ صلی اللہ علیہ و آ...,حضرت محمد دیناوی تاریخ میں اہم ترین شخصیت کے ط...,0
2,لیاقت علی خان پاکستان کے پہلے وزیراعظم تھے۔ آپ...,پاکستان کے پہلے وزیر اعظم نواب لیاقت علی خان م...,0
3,مرزا غالب 1797- 1869 اردو زبان کے سب سے بڑے شا...,1797ء سے 1869ء تک کے دور میں مرزا غالب اردو زب...,0
4,ٹیپو سلطان 10 نومبر1750~ 4 مئی 1799 ہندوستان م...,تاریخ کا وہ عظیم نام جس کا نام سنتے ہی اس کے د...,0


In [10]:
df.to_csv("uppc_paraphrase_pairs.csv", index=False)


In [11]:
# Convert DataFrame → HuggingFace Dataset
from datasets import Dataset, ClassLabel

hf_dataset = Dataset.from_pandas(df)
hf_dataset



Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 140
})

In [12]:
# Cast label to ClassLabe
label_feature = ClassLabel(
    num_classes=2,
    names=["not_paraphrase", "paraphrase"]
)

hf_dataset = hf_dataset.cast_column("label", label_feature)


Casting the dataset: 100%|██████████████████████████████████████████████████████████████| 140/140 [00:00<00:00, 70005.07 examples/s]


In [13]:
hf_dataset.features


{'sentence1': Value('string'),
 'sentence2': Value('string'),
 'label': ClassLabel(names=['not_paraphrase', 'paraphrase'])}

In [14]:
# Zero-Shot Dataset
zero_shot_dataset = hf_dataset


In [15]:
# 16-Shot Dataset
df_pos = df[df["label"] == 1]   # paraphrase
df_neg = df[df["label"] == 0]   # not paraphrase

print(len(df_pos), len(df_neg))


75 65


In [16]:
# Take FIRST 16 from each class
df_16shot = pd.concat([
    df_pos.iloc[:16],
    df_neg.iloc[:16]
]).reset_index(drop=True)

df_16shot["label"].value_counts()


label
1    16
0    16
Name: count, dtype: int64

In [17]:
# Convert to HF Dataset
hf_16shot = Dataset.from_pandas(df_16shot)
hf_16shot = hf_16shot.cast_column("label", label_feature)

hf_16shot


Casting the dataset: 100%|████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 15959.30 examples/s]


Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 32
})

In [18]:
# Remaining data for evaluation
df_remaining = df.drop(df_16shot.index).reset_index(drop=True)

hf_16shot_eval = Dataset.from_pandas(df_remaining)
hf_16shot_eval = hf_16shot_eval.cast_column("label", label_feature)

print(len(hf_16shot), len(hf_16shot_eval))


Casting the dataset: 100%|██████████████████████████████████████████████████████████████| 108/108 [00:00<00:00, 96585.25 examples/s]

32 108





In [19]:
# 80 / 20 Stratified Split
hf_80_20 = hf_dataset.train_test_split(
    test_size=0.2,
    seed=SEED,
    stratify_by_column="label"
)

hf_80_20


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 112
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 28
    })
})

In [20]:
from collections import Counter

print("Train labels:", Counter(hf_80_20["train"]["label"]))
print("Test labels:", Counter(hf_80_20["test"]["label"]))


Train labels: Counter({1: 60, 0: 52})
Test labels: Counter({1: 15, 0: 13})


In [21]:
# Load Tokenizer (mBERT)
from transformers import AutoTokenizer

MODEL_NAME = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [22]:
# Define Tokenization Function (Sentence-Pair)
MAX_LEN = 128

def tokenize_function(batch):
    return tokenizer(
        batch["sentence1"],
        batch["sentence2"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN
    )


In [23]:
# Tokenize All Datasets
# Zero-shot
tokenized_zero = zero_shot_dataset.map(tokenize_function, batched=True)


Map: 100%|████████████████████████████████████████████████████████████████████████████████| 140/140 [00:00<00:00, 577.23 examples/s]


In [24]:
# 16-shot
tokenized_16_train = hf_16shot.map(tokenize_function, batched=True)
tokenized_16_eval  = hf_16shot_eval.map(tokenize_function, batched=True)


Map: 100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 438.62 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:00<00:00, 576.62 examples/s]


In [25]:
# 80 / 20
tokenized_80_train = hf_80_20["train"].map(tokenize_function, batched=True)
tokenized_80_test  = hf_80_20["test"].map(tokenize_function, batched=True)


Map: 100%|████████████████████████████████████████████████████████████████████████████████| 112/112 [00:00<00:00, 540.16 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 425.32 examples/s]


In [26]:
# Set Torch Format (Trainer-Ready)
columns = ["input_ids", "attention_mask", "label"]

tokenized_zero.set_format(type="torch", columns=columns)
tokenized_16_train.set_format(type="torch", columns=columns)
tokenized_16_eval.set_format(type="torch", columns=columns)
tokenized_80_train.set_format(type="torch", columns=columns)
tokenized_80_test.set_format(type="torch", columns=columns)


In [27]:
tokenized_16_train[0]


{'label': tensor(1),
 'input_ids': tensor([   101,    818,  16351,  63764,  10278,    773,  86131,  10502,  21732,
          10250, 101278,  45987, 100595,  13244,  81780,  10278,  38755,  10691,
          19216,  18779, 105449,  10673,  84801,  10691,  75399,  96786,  13185,
            788,  11145,  52437,  13437,  10916,  12427,  24104,  30745,  25306,
          53065,  14634,    769,  24728,  10278,  47889,  10691,    829,  13437,
          29315,  27226,    837,  56744,  15974,  65479,  52874,  12574,  12427,
            788,  60312,  11086,  11689,  29145,  11076,  13141,  12427,  38784,
          53789,    102,    818,  16351,  63764,  10278,    773,  86131,  10502,
          21732,  38755,  10691,  19216,  18779, 105449,  10673,  84801,  10691,
          12427, 108754,    788,  11145,  52437,  13437,  10916,  12427,  30745,
          25306,  53065,  14634,    769,  24728,  10278,  47889,  10691,    829,
          13437,  10250, 101278,  12549,  12611,  13244,  29315,  27226,   

In [28]:
# mBERT: Zero-Shot Experiment
# Load mBERT Model (FRESH)
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Freeze Encoder (ZERO-SHOT RULE)
for param in model.base_model.parameters():
    param.requires_grad = False


In [30]:
sum(p.requires_grad for p in model.parameters())


2

In [32]:
# TrainingArguments (STANDARD TEMPLATE)
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results/uppc_mbert_zero",
    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    num_train_epochs=20,
    weight_decay=0.01,

    warmup_ratio=0.1,
    lr_scheduler_type="linear",

    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,

    metric_for_best_model="f1",
    greater_is_better=True,

    fp16=True,
    max_grad_norm=1.0,

    report_to="none",
    seed=SEED
)


In [34]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }


In [35]:
# Trainer (Zero-Shot)
from transformers import Trainer

trainer_zero = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_zero,
    eval_dataset=tokenized_zero,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer_zero = Trainer(


In [36]:
trainer_zero.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7238,0.725291,0.464286,0.0
2,0.7157,0.723087,0.464286,0.0
3,0.7223,0.720176,0.464286,0.0
4,0.7265,0.717557,0.464286,0.0
5,0.7154,0.715224,0.464286,0.0
6,0.7037,0.713149,0.464286,0.0
7,0.718,0.71137,0.464286,0.0
8,0.7147,0.709881,0.464286,0.0
9,0.7101,0.708807,0.464286,0.0
10,0.7045,0.707799,0.464286,0.0




TrainOutput(global_step=100, training_loss=0.7109502410888672, metrics={'train_runtime': 17.993, 'train_samples_per_second': 155.616, 'train_steps_per_second': 5.558, 'total_flos': 184177738752000.0, 'train_loss': 0.7109502410888672, 'epoch': 20.0})

In [37]:
zero_results = trainer_zero.evaluate()
zero_results


{'eval_loss': 0.7252912521362305,
 'eval_accuracy': 0.4642857142857143,
 'eval_f1': 0.0,
 'eval_runtime': 0.0963,
 'eval_samples_per_second': 1453.526,
 'eval_steps_per_second': 51.912,
 'epoch': 20.0}

In [38]:
# mBERT: 16-Shot Fine-Tuning
# Load a FRESH mBERT Model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
sum(p.requires_grad for p in model.parameters())


201

In [40]:
# TrainingArguments
from transformers import TrainingArguments

training_args_16 = TrainingArguments(
    output_dir="./results/uppc_mbert_16shot",
    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    num_train_epochs=20,
    weight_decay=0.01,

    warmup_ratio=0.1,
    lr_scheduler_type="linear",

    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,

    metric_for_best_model="f1",
    greater_is_better=True,

    fp16=True,
    max_grad_norm=1.0,

    report_to="none",
    seed=SEED
)


In [41]:
# Trainer (16-Shot)
from transformers import Trainer

trainer_16 = Trainer(
    model=model,
    args=training_args_16,
    train_dataset=tokenized_16_train,
    eval_dataset=tokenized_16_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer_16 = Trainer(


In [42]:
trainer_16.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6857,0.70449,0.351852,0.166667
2,0.6953,0.686569,0.638889,0.715328
3,0.679,0.663963,0.694444,0.815642
4,0.6702,0.663963,0.694444,0.815642
5,0.6737,0.647936,0.694444,0.819672
6,0.6718,0.638247,0.694444,0.819672
7,0.6637,0.632883,0.722222,0.829545
8,0.6246,0.635023,0.712963,0.812121
9,0.6172,0.636497,0.703704,0.786667
10,0.5857,0.630697,0.703704,0.777778


TrainOutput(global_step=20, training_loss=0.5405887603759766, metrics={'train_runtime': 50.9825, 'train_samples_per_second': 12.553, 'train_steps_per_second': 0.392, 'total_flos': 42097768857600.0, 'train_loss': 0.5405887603759766, 'epoch': 20.0})

In [43]:
results_16 = trainer_16.evaluate()
results_16



{'eval_loss': 0.6328825354576111,
 'eval_accuracy': 0.7222222222222222,
 'eval_f1': 0.8295454545454546,
 'eval_runtime': 0.0729,
 'eval_samples_per_second': 1481.548,
 'eval_steps_per_second': 54.872,
 'epoch': 20.0}

In [44]:
# mBERT: 80 / 20 Full Fine-Tuning
# Load a FRESH mBERT Model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
sum(p.requires_grad for p in model.parameters())


201

In [46]:
# TrainingArguments (80/20)
from transformers import TrainingArguments

training_args_80 = TrainingArguments(
    output_dir="./results/uppc_mbert_80_20",
    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    num_train_epochs=20,
    weight_decay=0.01,

    warmup_ratio=0.1,
    lr_scheduler_type="linear",

    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,

    metric_for_best_model="f1",
    greater_is_better=True,

    fp16=True,
    max_grad_norm=1.0,

    report_to="none",
    seed=SEED
)


In [47]:
# Trainer (80/20)
from transformers import Trainer

trainer_80 = Trainer(
    model=model,
    args=training_args_80,
    train_dataset=tokenized_80_train,
    eval_dataset=tokenized_80_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer_80 = Trainer(


In [48]:
trainer_80.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7031,0.691197,0.464286,0.285714
2,0.7022,0.681431,0.571429,0.7
3,0.6939,0.688215,0.535714,0.697674
4,0.6859,0.699611,0.535714,0.697674
5,0.6692,0.703387,0.535714,0.697674
6,0.6688,0.647112,0.571429,0.714286
7,0.6502,0.679827,0.571429,0.714286
8,0.6084,0.642665,0.642857,0.722222
9,0.5395,0.632725,0.642857,0.6875
10,0.4709,0.593951,0.714286,0.75


TrainOutput(global_step=80, training_loss=0.4121783971786499, metrics={'train_runtime': 55.8601, 'train_samples_per_second': 40.1, 'train_steps_per_second': 1.432, 'total_flos': 147342191001600.0, 'train_loss': 0.4121783971786499, 'epoch': 20.0})

In [49]:
results_80 = trainer_80.evaluate()
results_80


{'eval_loss': 0.6965670585632324,
 'eval_accuracy': 0.7857142857142857,
 'eval_f1': 0.8333333333333334,
 'eval_runtime': 0.0198,
 'eval_samples_per_second': 1412.75,
 'eval_steps_per_second': 50.455,
 'epoch': 20.0}