In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_colwidth", 200)


In [2]:
# Define file paths
train_path = r"C:\Users\areesa\Documents\Urdu_GLUE_xlm_roberta\data\raw\U-STS-B\Final_STSB_train.csv"
dev_path   = r"C:\Users\areesa\Documents\Urdu_GLUE_xlm_roberta\data\raw\U-STS-B\Final_dev_translated.csv"
test_path  = r"C:\Users\areesa\Documents\Urdu_GLUE_xlm_roberta\data\raw\U-STS-B\STSB-test_urdu-translated - Final.tsv"


In [3]:
# Load TRAIN dataset
train_df = pd.read_csv(train_path)


In [4]:
# Inspect
print("Train shape:", train_df.shape)
print("Train columns:", train_df.columns.tolist())
train_df.head(3)


Train shape: (5749, 4)
Train columns: ['index', 'score', 'sentence1', 'sentence2']


Unnamed: 0,index,score,sentence1,sentence2
0,0,5.0,ایک طیارہ اڑان بھر رہا ہے ۔,ایک ہوائی جہاز اڑان بھر رہا ہے ۔
1,1,3.8,ایک آدمی بڑی بانسری بجا رہا ہے ۔,ایک آدمی بانسری بجا رہا ہے ۔
2,2,3.8,ایک آدمی پیزا پر کٹا ہوا پنیر پھیلا رہا ہے ۔,ایک آدمی بغیر پکے ہوئے پیزا پر کٹا ہوا پنیر پھیلا رہا ہے ۔


In [5]:
# Load DEV dataset
dev_df = pd.read_csv(dev_path)


In [6]:
print("Dev shape:", dev_df.shape)
print("Dev columns:", dev_df.columns.tolist())
dev_df.head(3)


Dev shape: (1501, 3)
Dev columns: ['sentence1', 'sentence2', 'scores']


Unnamed: 0,sentence1,sentence2,scores
0,سخت ٹوپی والا آدمی ناچ رہا ہے ۔,سخت ٹوپی پہنے ہوئے آدمی رقص کر رہا ہے ۔,5.0
1,ایک چھوٹا بچہ گھوڑے پر سوار ہے ۔,ایک بچہ گھوڑے پر سوار ہے ۔,4.75
2,ایک آدمی سانپ کو ماؤس کھلا رہا ہے ۔,آدمی سانپ کو چوہا کھلا رہا ہے ۔,5.0


In [7]:
# Load TEST dataset
test_df = pd.read_csv(
    test_path,
    sep="\t",
    engine="python",
    on_bad_lines="skip"
)


In [8]:
# Inspect
print("Test shape:", test_df.shape)
print("Test columns:", test_df.columns.tolist())
test_df.head(3)


Test shape: (1354, 5)
Test columns: [' ', 'sentence1', 'sentence2', 'sentence1_urdu', 'sentence2_urdu']


Unnamed: 0,Unnamed: 1,sentence1,sentence2,sentence1_urdu,sentence2_urdu
0,0,A girl is styling her hair.,A girl is brushing her hair.,ایک لڑکی اپنے بالوں کو اسٹائل کر رہی ہے ۔,ایک لڑکی اپنے بالوں کو برش کر رہی ہے ۔
1,1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,مردوں کا ایک گروہ ساحل پر فٹ بال کھیل رہا ہے ۔,لڑکوں کا ایک گروہ ساحل پر فٹ بال کھیل رہا ہے ۔
2,2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,ایک عورت دوسری عورت کے ٹخنے کو ناپ رہی ہے ۔,ایک عورت دوسری عورت کے ٹخنے کی پیمائش کرتی ہے ۔


In [9]:
# Identify Urdu + score columns
train_df.head(1)
dev_df.head(1)
test_df.head(1)


Unnamed: 0,Unnamed: 1,sentence1,sentence2,sentence1_urdu,sentence2_urdu
0,0,A girl is styling her hair.,A girl is brushing her hair.,ایک لڑکی اپنے بالوں کو اسٹائل کر رہی ہے ۔,ایک لڑکی اپنے بالوں کو برش کر رہی ہے ۔


In [10]:
# Clean & standardize ALL splits
# Clean TRAIN
train_df = train_df[["sentence1", "sentence2", "score"]]

print(train_df.shape)
print(train_df.columns.tolist())
train_df.head(2)


(5749, 3)
['sentence1', 'sentence2', 'score']


Unnamed: 0,sentence1,sentence2,score
0,ایک طیارہ اڑان بھر رہا ہے ۔,ایک ہوائی جہاز اڑان بھر رہا ہے ۔,5.0
1,ایک آدمی بڑی بانسری بجا رہا ہے ۔,ایک آدمی بانسری بجا رہا ہے ۔,3.8


In [11]:
# Clean DEV (rename scores → score)
dev_df = dev_df.rename(columns={"scores": "score"})
dev_df = dev_df[["sentence1", "sentence2", "score"]]

print(dev_df.shape)
print(dev_df.columns.tolist())
dev_df.head(2)


(1501, 3)
['sentence1', 'sentence2', 'score']


Unnamed: 0,sentence1,sentence2,score
0,سخت ٹوپی والا آدمی ناچ رہا ہے ۔,سخت ٹوپی پہنے ہوئے آدمی رقص کر رہا ہے ۔,5.0
1,ایک چھوٹا بچہ گھوڑے پر سوار ہے ۔,ایک بچہ گھوڑے پر سوار ہے ۔,4.75


In [12]:
# Clean TEST
test_df = test_df.rename(columns={
    "sentence1_urdu": "sentence1",
    "sentence2_urdu": "sentence2"
})

test_df = test_df[["sentence1", "sentence2"]]

print(test_df.shape)
print(test_df.columns.tolist())
test_df.head(2)


(1354, 4)
['sentence1', 'sentence1', 'sentence2', 'sentence2']


Unnamed: 0,sentence1,sentence1.1,sentence2,sentence2.1
0,A girl is styling her hair.,ایک لڑکی اپنے بالوں کو اسٹائل کر رہی ہے ۔,A girl is brushing her hair.,ایک لڑکی اپنے بالوں کو برش کر رہی ہے ۔
1,A group of men play soccer on the beach.,مردوں کا ایک گروہ ساحل پر فٹ بال کھیل رہا ہے ۔,A group of boys are playing soccer on the beach.,لڑکوں کا ایک گروہ ساحل پر فٹ بال کھیل رہا ہے ۔


In [13]:
# Final sanity checks
print("Train:", train_df.shape, train_df.columns.tolist())
print("Dev:  ", dev_df.shape, dev_df.columns.tolist())
print("Test: ", test_df.shape, test_df.columns.tolist())

print("Train score range:", train_df["score"].min(), train_df["score"].max())
print("Dev score range:  ", dev_df["score"].min(), dev_df["score"].max())

print("\nMissing values:")
print("Train:\n", train_df.isnull().sum())
print("Dev:\n", dev_df.isnull().sum())
print("Test:\n", test_df.isnull().sum())


Train: (5749, 3) ['sentence1', 'sentence2', 'score']
Dev:   (1501, 3) ['sentence1', 'sentence2', 'score']
Test:  (1354, 4) ['sentence1', 'sentence1', 'sentence2', 'sentence2']
Train score range: 0.0 5.0
Dev score range:   0.0 5.0

Missing values:
Train:
 sentence1    0
sentence2    0
score        0
dtype: int64
Dev:
 sentence1    0
sentence2    0
score        1
dtype: int64
Test:
 sentence1    0
sentence1    0
sentence2    0
sentence2    0
dtype: int64


In [14]:
# ISSUE 1: TEST has duplicate columns
# FIX
test_df = test_df.loc[:, ~test_df.columns.duplicated()]


In [15]:
print(test_df.shape)
print(test_df.columns.tolist())


(1354, 2)
['sentence1', 'sentence2']


In [16]:
# ISSUE 2: DEV has 1 missing score value
# FIX
dev_df = dev_df.dropna(subset=["score"]).reset_index(drop=True)


In [17]:
print(dev_df.isnull().sum())
print("Dev score range:", dev_df["score"].min(), dev_df["score"].max())


sentence1    0
sentence2    0
score        0
dtype: int64
Dev score range: 0.0 5.0


In [18]:
# SANITY CHECK
print("Train:", train_df.shape, train_df.columns.tolist())
print("Dev:  ", dev_df.shape, dev_df.columns.tolist())
print("Test: ", test_df.shape, test_df.columns.tolist())

print("Train score range:", train_df["score"].min(), train_df["score"].max())
print("Dev score range:  ", dev_df["score"].min(), dev_df["score"].max())

print("\nMissing values:")
print("Train:\n", train_df.isnull().sum())
print("Dev:\n", dev_df.isnull().sum())
print("Test:\n", test_df.isnull().sum())


Train: (5749, 3) ['sentence1', 'sentence2', 'score']
Dev:   (1500, 3) ['sentence1', 'sentence2', 'score']
Test:  (1354, 2) ['sentence1', 'sentence2']
Train score range: 0.0 5.0
Dev score range:   0.0 5.0

Missing values:
Train:
 sentence1    0
sentence2    0
score        0
dtype: int64
Dev:
 sentence1    0
sentence2    0
score        0
dtype: int64
Test:
 sentence1    0
sentence2    0
dtype: int64


# STS-B Regression (Zero-Shot Setup)- mBERT

In [19]:
# 8.1 — Imports
import torch
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from scipy.stats import pearsonr


  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# 8.2 — Convert pandas → HuggingFace Dataset
train_hf = Dataset.from_pandas(train_df)
dev_hf   = Dataset.from_pandas(dev_df)


In [21]:
# Sanity check:
train_hf
dev_hf


Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 1500
})

In [22]:
# 8.3 — Load tokenizer
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [23]:
# 8.4 — Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        padding="max_length",
        max_length=128
    )


In [24]:
train_tokenized = train_hf.map(tokenize_function, batched=True)
dev_tokenized   = dev_hf.map(tokenize_function, batched=True)


Map: 100%|████████████████████████████████████████████████████████████████████████████| 5749/5749 [00:00<00:00, 20261.28 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 24005.31 examples/s]


In [25]:
# 8.5 — Rename label column for Trainer
train_tokenized = train_tokenized.rename_column("score", "labels")
dev_tokenized   = dev_tokenized.rename_column("score", "labels")

train_tokenized.set_format("torch")
dev_tokenized.set_format("torch")


In [26]:
# 8.6 — Load regression model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    problem_type="regression"
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# 8.7 — Freeze encoder (ZERO-SHOT rule)
for param in model.base_model.parameters():
    param.requires_grad = False


In [28]:
# 8.8 — Define Pearson correlation metric
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()
    pearson_corr = pearsonr(predictions, labels)[0]
    return {"pearson": pearson_corr}


In [30]:
# 8.9 — Training arguments
training_args = TrainingArguments(
    output_dir="./sts_zero_shot",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [31]:
# 8.10 — Create Trainer (NO training yet)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=dev_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [32]:
# 9.1 — Train
trainer.train()


Epoch,Training Loss,Validation Loss,Pearson
1,10.3304,7.040989,-0.014261
2,7.6084,5.523219,0.039863
3,6.0227,4.548022,0.050169
4,4.7335,3.976195,0.055306
5,4.2272,3.662323,0.059185
6,4.0245,3.480135,0.062448
7,3.7269,3.339381,0.06573
8,3.4885,3.22535,0.068771
9,3.4592,3.119877,0.071714
10,3.2861,3.031608,0.074541


TrainOutput(global_step=3600, training_loss=4.036747055053711, metrics={'train_runtime': 92.1656, 'train_samples_per_second': 1247.538, 'train_steps_per_second': 39.06, 'total_flos': 7563059380055040.0, 'train_loss': 4.036747055053711, 'epoch': 20.0})

In [33]:
zero_shot_results = trainer.evaluate()
zero_shot_results


{'eval_loss': 2.6775097846984863,
 'eval_pearson': 0.09071057289838791,
 'eval_runtime': 0.7308,
 'eval_samples_per_second': 2052.574,
 'eval_steps_per_second': 64.314,
 'epoch': 20.0}

# 16-Shot STS-B (mBERT)

In [34]:
# 10.1 — Create the 16-shot training subset
train_16_df = train_df.iloc[:16].reset_index(drop=True)

print(train_16_df.shape)
train_16_df.head()


(16, 3)


Unnamed: 0,sentence1,sentence2,score
0,ایک طیارہ اڑان بھر رہا ہے ۔,ایک ہوائی جہاز اڑان بھر رہا ہے ۔,5.0
1,ایک آدمی بڑی بانسری بجا رہا ہے ۔,ایک آدمی بانسری بجا رہا ہے ۔,3.8
2,ایک آدمی پیزا پر کٹا ہوا پنیر پھیلا رہا ہے ۔,ایک آدمی بغیر پکے ہوئے پیزا پر کٹا ہوا پنیر پھیلا رہا ہے ۔,3.8
3,تین آدمی شطرنج کھیل رہے ہیں ۔,دو آدمی شطرنج کھیل رہے ہیں ۔,2.6
4,ایک آدمی سیلو کھیل رہا ہے ۔,بیٹھا ہوا آدمی سیلو کھیل رہا ہے ۔,4.25


In [35]:
# 10.2 — Convert to HuggingFace Dataset
train_16_hf = Dataset.from_pandas(train_16_df)


In [36]:
# 10.3 — Tokenize
train_16_tokenized = train_16_hf.map(tokenize_function, batched=True)


Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<?, ? examples/s]


In [37]:
train_16_tokenized = train_16_tokenized.rename_column("score", "labels")
train_16_tokenized.set_format("torch")


In [38]:
# 10.4 — Reload model (fresh) for 16-shot
model_16 = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    problem_type="regression"
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# 10.5 — UNFREEZE encoder
for param in model_16.parameters():
    param.requires_grad = True


In [40]:
# 10.6 — Training arguments
training_args_16 = TrainingArguments(
    output_dir="./sts_16_shot_mbert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [41]:
# 10.7 — Trainer for 16-shot
trainer_16 = Trainer(
    model=model_16,
    args=training_args_16,
    train_dataset=train_16_tokenized,
    eval_dataset=dev_tokenized,   # SAME dev set
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer_16 = Trainer(


In [42]:
# 10.8 — Train (16-shot)
trainer_16.train()


Epoch,Training Loss,Validation Loss,Pearson
1,No log,6.601616,0.119207
2,No log,6.601616,0.119207
3,No log,6.601616,0.119207
4,No log,6.059515,0.397516
5,No log,5.662451,0.466566
6,No log,5.314486,0.488529
7,No log,5.000742,0.493885
8,No log,4.719841,0.491761
9,No log,4.469366,0.482875
10,10.583900,4.255556,0.46653


TrainOutput(global_step=20, training_loss=8.640917587280274, metrics={'train_runtime': 86.763, 'train_samples_per_second': 3.688, 'train_steps_per_second': 0.231, 'total_flos': 21048695439360.0, 'train_loss': 8.640917587280274, 'epoch': 20.0})

In [43]:
results_16 = trainer_16.evaluate()
results_16


{'eval_loss': 5.000741958618164,
 'eval_pearson': 0.49388471245765686,
 'eval_runtime': 0.7525,
 'eval_samples_per_second': 1993.422,
 'eval_steps_per_second': 62.461,
 'epoch': 20.0}

# 80/20 Fine-Tuning (mBERT)

In [44]:
# 11.1 — Create the 80/20 split (fixed seed)
from sklearn.model_selection import train_test_split

train_80_df, dev_20_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

print(train_80_df.shape, dev_20_df.shape)


(4599, 3) (1150, 3)


In [45]:
# 11.2 — Convert to HuggingFace Datasets
from datasets import Dataset

train_80_hf = Dataset.from_pandas(train_80_df.reset_index(drop=True))
dev_20_hf   = Dataset.from_pandas(dev_20_df.reset_index(drop=True))


In [46]:
# 11.3 — Tokenize (same function, unchanged)
train_80_tok = train_80_hf.map(tokenize_function, batched=True)
dev_20_tok   = dev_20_hf.map(tokenize_function, batched=True)

train_80_tok = train_80_tok.rename_column("score", "labels")
dev_20_tok   = dev_20_tok.rename_column("score", "labels")

train_80_tok.set_format("torch")
dev_20_tok.set_format("torch")


Map: 100%|████████████████████████████████████████████████████████████████████████████| 4599/4599 [00:00<00:00, 23742.22 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████| 1150/1150 [00:00<00:00, 21997.46 examples/s]


In [47]:
# 11.4 — Load a fresh regression model
model_80 = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=1,
    problem_type="regression"
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# 11.5 — Unfreeze encoder (full fine-tuning)
for param in model_80.parameters():
    param.requires_grad = True


In [49]:
# 11.6 — Training arguments
training_args_80 = TrainingArguments(
    output_dir="./sts_80_20_mbert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [50]:
# 11.7 — Trainer (80/20)
trainer_80 = Trainer(
    model=model_80,
    args=training_args_80,
    train_dataset=train_80_tok,
    eval_dataset=dev_20_tok,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer_80 = Trainer(


In [51]:
# 11.8 — Train (80/20)
trainer_80.train()


Epoch,Training Loss,Validation Loss,Pearson
1,1.9313,0.836753,0.821926
2,0.7111,0.614985,0.849619
3,0.4371,0.634708,0.852527
4,0.3271,0.65723,0.851789
5,0.223,0.632309,0.85119
6,0.1938,0.598278,0.856694
7,0.149,0.632974,0.851911
8,0.1276,0.60695,0.853182
9,0.1192,0.606476,0.854701
10,0.099,0.608529,0.855614


TrainOutput(global_step=2880, training_loss=0.2213785116871198, metrics={'train_runtime': 374.1292, 'train_samples_per_second': 245.851, 'train_steps_per_second': 7.698, 'total_flos': 6050184395351040.0, 'train_loss': 0.2213785116871198, 'epoch': 20.0})

In [52]:
results_80 = trainer_80.evaluate()
results_80


{'eval_loss': 0.5914236903190613,
 'eval_pearson': 0.8570648431777954,
 'eval_runtime': 0.5718,
 'eval_samples_per_second': 2011.053,
 'eval_steps_per_second': 62.955,
 'epoch': 20.0}