In [1]:
# Imports 
# Core
import os
import random
import numpy as np
import pandas as pd

# HuggingFace
from datasets import Dataset, ClassLabel

# Metrics (will be reused)
from sklearn.metrics import accuracy_score, f1_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fix Random Seed
SEED = 42

random.seed(SEED)
np.random.seed(SEED)


In [3]:
# Dataset Paths (UPPC)
DATASET_ROOT = r"C:\Users\areesa\Documents\Urdu_GLUE_xlm_roberta\data\raw\Urdu Paraphrasing\UPPC\UPPC Corpus"
DATA_DIR = os.path.join(DATASET_ROOT, "data")
LABEL_FILE = os.path.join(DATASET_ROOT, "all_files.txt")


In [4]:
len(os.listdir(DATA_DIR)) 


160

In [5]:
# XML Text Extraction Function
from bs4 import BeautifulSoup

def extract_urdu_text(file_path):
    with open(file_path, encoding="utf-8") as f:
        raw = f.read()
    soup = BeautifulSoup(raw, "xml")
    doc = soup.find("UPPC_document")
    return doc.get_text().strip()


In [6]:
# Load All Documents
doc_texts = {}

for fname in os.listdir(DATA_DIR):
    doc_texts[fname] = extract_urdu_text(os.path.join(DATA_DIR, fname))

print("Documents loaded:", len(doc_texts))


Documents loaded: 160


In [7]:
# Build the Paraphrase DataFrame
pairs = []

with open(LABEL_FILE, encoding="utf-8") as f:
    lines = f.readlines()

for line in lines:
    f1, f2, label = line.strip().split(",")

    pairs.append({
        "sentence1": doc_texts[f1],
        "sentence2": doc_texts[f2],
        "label": 1 if label == "P" else 0
    })

df = pd.DataFrame(pairs)


In [8]:
print(df.shape)
print(df["label"].value_counts())
df.head()


(140, 3)
label
1    75
0    65
Name: count, dtype: int64


Unnamed: 0,sentence1,sentence2,label
0,چودھری رحمت علی 16 نومبر1897 کو مشرقی پنجاب ...,چودھر ی رحمت علی 16 نومبر 1897ء کو ہوشیارپور ک...,0
1,تقریباً 25 سال کی عمر میں آپ صلی اللہ علیہ و آ...,حضرت محمد دیناوی تاریخ میں اہم ترین شخصیت کے ط...,0
2,لیاقت علی خان پاکستان کے پہلے وزیراعظم تھے۔ آپ...,پاکستان کے پہلے وزیر اعظم نواب لیاقت علی خان م...,0
3,مرزا غالب 1797- 1869 اردو زبان کے سب سے بڑے شا...,1797ء سے 1869ء تک کے دور میں مرزا غالب اردو زب...,0
4,ٹیپو سلطان 10 نومبر1750~ 4 مئی 1799 ہندوستان م...,تاریخ کا وہ عظیم نام جس کا نام سنتے ہی اس کے د...,0


In [9]:
# Convert to HuggingFace Dataset
hf_dataset = Dataset.from_pandas(df)
hf_dataset


Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 140
})

In [10]:
# Cast Label to ClassLabel
label_feature = ClassLabel(
    num_classes=2,
    names=["not_paraphrase", "paraphrase"]
)

hf_dataset = hf_dataset.cast_column("label", label_feature)


Casting the dataset: 100%|██████████████████████████████████████████████████████████████| 140/140 [00:00<00:00, 70013.42 examples/s]


In [11]:
hf_dataset.features


{'sentence1': Value('string'),
 'sentence2': Value('string'),
 'label': ClassLabel(names=['not_paraphrase', 'paraphrase'])}

In [12]:
# Zero-Shot Dataset
zero_shot_dataset = hf_dataset


In [13]:
# 16-Shot Dataset
df_pos = df[df["label"] == 1]   # paraphrase
df_neg = df[df["label"] == 0]   # not paraphrase

print(len(df_pos), len(df_neg))


75 65


In [14]:
# Take FIRST 16 per class
df_16shot = pd.concat([
    df_pos.iloc[:16],
    df_neg.iloc[:16]
]).reset_index(drop=True)

df_16shot["label"].value_counts()


label
1    16
0    16
Name: count, dtype: int64

In [15]:
# Convert to HF Dataset
hf_16shot = Dataset.from_pandas(df_16shot)
hf_16shot = hf_16shot.cast_column("label", label_feature)

hf_16shot


Casting the dataset: 100%|███████████████████████████████████████████████████████████████████████████| 32/32 [00:00<?, ? examples/s]


Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 32
})

In [16]:
# Remaining data for evaluation
df_remaining = df.drop(df_16shot.index).reset_index(drop=True)

hf_16shot_eval = Dataset.from_pandas(df_remaining)
hf_16shot_eval = hf_16shot_eval.cast_column("label", label_feature)

print(len(hf_16shot), len(hf_16shot_eval))


Casting the dataset: 100%|██████████████████████████████████████████████████████████████| 108/108 [00:00<00:00, 80674.06 examples/s]

32 108





In [17]:
# 80 / 20 Stratified Split
hf_80_20 = hf_dataset.train_test_split(
    test_size=0.2,
    seed=SEED,
    stratify_by_column="label"
)

hf_80_20


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 112
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 28
    })
})

In [18]:
from collections import Counter

print("Train labels:", Counter(hf_80_20["train"]["label"]))
print("Test labels:", Counter(hf_80_20["test"]["label"]))


Train labels: Counter({1: 60, 0: 52})
Test labels: Counter({1: 15, 0: 13})


In [19]:
# Load XLM-R Tokenizer
from transformers import AutoTokenizer

MODEL_NAME = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [20]:
# Define Tokenization Function
MAX_LEN = 128

def tokenize_function(batch):
    return tokenizer(
        batch["sentence1"],
        batch["sentence2"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN
    )


In [21]:
# Tokenize All Datasets
# Zero-shot
tokenized_zero = zero_shot_dataset.map(tokenize_function, batched=True)


Map: 100%|████████████████████████████████████████████████████████████████████████████████| 140/140 [00:00<00:00, 860.84 examples/s]


In [22]:
# 16-shot
tokenized_16_train = hf_16shot.map(tokenize_function, batched=True)
tokenized_16_eval  = hf_16shot_eval.map(tokenize_function, batched=True)


Map: 100%|█████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 1044.37 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:00<00:00, 966.78 examples/s]


In [23]:
# 80 / 20
tokenized_80_train = hf_80_20["train"].map(tokenize_function, batched=True)
tokenized_80_test  = hf_80_20["test"].map(tokenize_function, batched=True)


Map: 100%|████████████████████████████████████████████████████████████████████████████████| 112/112 [00:00<00:00, 895.49 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 778.02 examples/s]


In [24]:
# Set Torch Format
columns = ["input_ids", "attention_mask", "label"]

tokenized_zero.set_format(type="torch", columns=columns)
tokenized_16_train.set_format(type="torch", columns=columns)
tokenized_16_eval.set_format(type="torch", columns=columns)
tokenized_80_train.set_format(type="torch", columns=columns)
tokenized_80_test.set_format(type="torch", columns=columns)


In [25]:
tokenized_16_train[0]


{'label': tensor(1),
 'input_ids': tensor([     0, 208300,  91990,   8286,    611, 112591,   1819,  14773,    554,
         173991,  21345,    216,  69222,   2437,    870,  46467,  34957,    216,
           7482,   3239,    904,   5086,  32276,  10252,    317,   1541,  69300,
          18900,   4914,  71598,  64170,    140,  31517,    216, 105003,  11712,
         151090, 166486,  36455,  15368,    778,   1541, 220202,    504,   9564,
            288,    715,   1541,  20096,  31975,  91542,   7778,  22407, 131335,
           1901,    498,  35498,    431,   6708,  29000,  11917,  96071, 121379,
              2,      2, 208300,  91990,   8286,  21345,    216,  69222,   2437,
            870,  46467,  34957,    216,   1541, 181839,   5086,  32276,  10252,
            317,   1541,  18900,   4914,  71598,  64170,    140,  31517,    216,
         105003,    611, 112591, 117430,   3088,    554,  11712, 151090,  15368,
            778, 166486,  36455,   1541,  82287,    907,    504,   9564,   

In [26]:
# Load XLM-RoBERTa-Large
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    num_labels=2
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Freeze Encoder (Zero-Shot Rule)
for param in model.base_model.parameters():
    param.requires_grad = False


In [28]:
sum(p.requires_grad for p in model.parameters())


4

In [30]:
# TrainingArguments (Zero-Shot)
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results/uppc_xlmr_zero",
    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    num_train_epochs=20,
    weight_decay=0.01,

    warmup_ratio=0.1,
    lr_scheduler_type="linear",

    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,

    metric_for_best_model="f1",
    greater_is_better=True,

    fp16=True,
    max_grad_norm=1.0,

    report_to="none",
    seed=SEED
)


In [32]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }


In [33]:
# Trainer (Zero-Shot)
from transformers import Trainer

trainer_zero = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_zero,
    eval_dataset=tokenized_zero,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer_zero = Trainer(


In [34]:
# Zero-Shot
trainer_zero.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7184,0.700541,0.464286,0.0
2,0.7083,0.694789,0.464286,0.0
3,0.689,0.691643,0.535714,0.697674
4,0.6944,0.69036,0.535714,0.697674
5,0.7068,0.689914,0.535714,0.697674
6,0.7099,0.689701,0.535714,0.697674
7,0.6908,0.689481,0.535714,0.697674
8,0.6807,0.689289,0.535714,0.697674
9,0.6986,0.689101,0.535714,0.697674
10,0.6893,0.688993,0.535714,0.697674


TrainOutput(global_step=100, training_loss=0.6954454016685486, metrics={'train_runtime': 52.0471, 'train_samples_per_second': 53.797, 'train_steps_per_second': 1.921, 'total_flos': 652351954329600.0, 'train_loss': 0.6954454016685486, 'epoch': 20.0})

In [35]:
xlmr_zero_results = trainer_zero.evaluate()
xlmr_zero_results


{'eval_loss': 0.6916434168815613,
 'eval_accuracy': 0.5357142857142857,
 'eval_f1': 0.6976744186046512,
 'eval_runtime': 0.202,
 'eval_samples_per_second': 693.039,
 'eval_steps_per_second': 24.751,
 'epoch': 20.0}

In [36]:
# 16-Shot Fine-Tuning
# Load a FRESH XLM-R Model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    num_labels=2
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
sum(p.requires_grad for p in model.parameters())


393

In [38]:
# TrainingArguments (16-Shot)
from transformers import TrainingArguments

training_args_16 = TrainingArguments(
    output_dir="./results/uppc_xlmr_16shot",
    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    num_train_epochs=20,
    weight_decay=0.01,

    warmup_ratio=0.1,
    lr_scheduler_type="linear",

    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,

    metric_for_best_model="f1",
    greater_is_better=True,

    fp16=True,
    max_grad_norm=1.0,

    report_to="none",
    seed=SEED
)


In [39]:
# Trainer (16-Shot)
from transformers import Trainer

trainer_16 = Trainer(
    model=model,
    args=training_args_16,
    train_dataset=tokenized_16_train,
    eval_dataset=tokenized_16_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer_16 = Trainer(


In [40]:
trainer_16.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7108,0.640024,0.694444,0.819672
2,0.6959,0.658922,0.694444,0.819672
3,0.7423,0.685045,0.62963,0.733333
4,0.6566,0.702203,0.453704,0.351648
5,0.7395,0.710497,0.407407,0.255814
6,0.6153,0.738483,0.37963,0.192771
7,0.6287,0.755769,0.435185,0.314607
8,0.564,0.725108,0.546296,0.514851
9,0.5953,0.725108,0.546296,0.514851
10,0.5984,0.681505,0.685185,0.721311


TrainOutput(global_step=20, training_loss=0.587109375, metrics={'train_runtime': 227.9017, 'train_samples_per_second': 2.808, 'train_steps_per_second': 0.088, 'total_flos': 149109018132480.0, 'train_loss': 0.587109375, 'epoch': 20.0})

In [41]:
xlmr_16_results = trainer_16.evaluate()
xlmr_16_results


{'eval_loss': 0.5745601058006287,
 'eval_accuracy': 0.8611111111111112,
 'eval_f1': 0.9006622516556292,
 'eval_runtime': 0.1975,
 'eval_samples_per_second': 546.723,
 'eval_steps_per_second': 20.249,
 'epoch': 20.0}

In [42]:
# 80 / 20 Full Fine-Tuning
# Load a FRESH XLM-R Model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    num_labels=2
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
sum(p.requires_grad for p in model.parameters())


393

In [47]:
# TrainingArguments (80/20)
from transformers import TrainingArguments

training_args_80 = TrainingArguments(
    output_dir="./results/uppc_xlmr_80_20",
    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    num_train_epochs=20,
    weight_decay=0.01,

    warmup_ratio=0.1,
    lr_scheduler_type="linear",

    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,

    metric_for_best_model="f1",
    greater_is_better=True,

    fp16=True,
    max_grad_norm=1.0,

    report_to="none",
    seed=SEED
)


In [48]:
# Trainer (80/20)
from transformers import Trainer

trainer_80 = Trainer(
    model=model,
    args=training_args_80,
    train_dataset=tokenized_80_train,
    eval_dataset=tokenized_80_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer_80 = Trainer(


In [49]:
# Train (80/20)
trainer_80.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7303,0.710153,0.464286,0.0
2,0.6905,0.694318,0.535714,0.697674
3,0.7421,0.74711,0.535714,0.697674
4,0.7211,0.661482,0.535714,0.697674
5,0.6652,0.645194,0.821429,0.848485
6,0.5682,0.650532,0.785714,0.833333
7,0.555,0.505079,0.892857,0.903226
8,0.5159,0.520469,0.892857,0.896552
9,0.4817,0.504669,0.857143,0.882353
10,0.4133,0.623469,0.857143,0.882353


TrainOutput(global_step=80, training_loss=0.45747569054365156, metrics={'train_runtime': 589.6714, 'train_samples_per_second': 3.799, 'train_steps_per_second': 0.136, 'total_flos': 521881563463680.0, 'train_loss': 0.45747569054365156, 'epoch': 20.0})

In [50]:
# Final Evaluation
xlmr_80_results = trainer_80.evaluate()
xlmr_80_results


{'eval_loss': 0.5050789713859558,
 'eval_accuracy': 0.8928571428571429,
 'eval_f1': 0.9032258064516129,
 'eval_runtime': 1.0023,
 'eval_samples_per_second': 27.936,
 'eval_steps_per_second': 0.998,
 'epoch': 20.0}