In [1]:
# Imports
import pandas as pd
import numpy as np
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

from scipy.stats import pearsonr


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the CLEANED datasets 
train_path = r"C:\Users\areesa\Documents\Urdu_GLUE_xlm_roberta\data\raw\U-STS-B\Final_STSB_train.csv"
dev_path   = r"C:\Users\areesa\Documents\Urdu_GLUE_xlm_roberta\data\raw\U-STS-B\Final_dev_translated.csv"
test_path  = r"C:\Users\areesa\Documents\Urdu_GLUE_xlm_roberta\data\raw\U-STS-B\STSB-test_urdu-translated - Final.tsv"

train_df = pd.read_csv(train_path)
dev_df   = pd.read_csv(dev_path)

test_df = pd.read_csv(
    test_path,
    sep="\t",
    engine="python",
    on_bad_lines="skip"
)


In [3]:
# Clean TRAIN
train_df = train_df[["sentence1", "sentence2", "score"]]


In [4]:
# Clean DEV
dev_df = dev_df.rename(columns={"scores": "score"})
dev_df = dev_df[["sentence1", "sentence2", "score"]]


In [5]:
# Clean TEST (Urdu only)
test_df = test_df.rename(columns={
    "sentence1_urdu": "sentence1",
    "sentence2_urdu": "sentence2"
})

test_df = test_df[["sentence1", "sentence2"]]


In [6]:
# FINAL sanity check
print("Train:", train_df.shape)
print("Dev:  ", dev_df.shape)
print("Test: ", test_df.shape)

print("Train score range:", train_df["score"].min(), train_df["score"].max())
print("Dev score range:  ", dev_df["score"].min(), dev_df["score"].max())


Train: (5749, 3)
Dev:   (1501, 3)
Test:  (1354, 4)
Train score range: 0.0 5.0
Dev score range:   0.0 5.0


In [7]:
dev_df = dev_df.dropna(subset=["score"]).reset_index(drop=True)


In [8]:
print(dev_df.shape)
print("Dev score range:", dev_df["score"].min(), dev_df["score"].max())


(1500, 3)
Dev score range: 0.0 5.0


In [10]:
test_df = test_df.loc[:, ~test_df.columns.duplicated()]


In [11]:
print(test_df.shape)
print(test_df.columns.tolist())


(1354, 2)
['sentence1', 'sentence2']


In [12]:
print("Train:", train_df.shape)
print("Dev:  ", dev_df.shape)
print("Test: ", test_df.shape)

print("Train score range:", train_df["score"].min(), train_df["score"].max())
print("Dev score range:  ", dev_df["score"].min(), dev_df["score"].max())


Train: (5749, 3)
Dev:   (1500, 3)
Test:  (1354, 2)
Train score range: 0.0 5.0
Dev score range:   0.0 5.0


# Zero-Shot STS-B (XLM-RoBERTa-large)
Encoder frozen

Only the regression head is trained

Metric: Pearson correlation

Same protocol you used for mBERT

In [13]:
# 9.1 — Convert pandas → HuggingFace Dataset
train_hf = Dataset.from_pandas(train_df)
dev_hf   = Dataset.from_pandas(dev_df)


In [19]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")


In [20]:
def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        padding="max_length",
        max_length=128
    )


In [21]:
# 9.2 — Tokenization

train_tokenized = train_hf.map(tokenize_function, batched=True)
dev_tokenized   = dev_hf.map(tokenize_function, batched=True)

train_tokenized = train_tokenized.rename_column("score", "labels")
dev_tokenized   = dev_tokenized.rename_column("score", "labels")

train_tokenized.set_format("torch")
dev_tokenized.set_format("torch")


Map: 100%|████████████████████████████████████████████████████████████████████████████| 5749/5749 [00:00<00:00, 25446.55 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 28629.54 examples/s]


In [22]:
train_tokenized.features


{'sentence1': Value('string'),
 'sentence2': Value('string'),
 'labels': Value('float64'),
 'input_ids': List(Value('int32')),
 'attention_mask': List(Value('int8'))}

In [23]:
# Load XLM-RoBERTa-large regression model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    num_labels=1,
    problem_type="regression"
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Freeze encoder (ZERO-SHOT rule)
for param in model.base_model.parameters():
    param.requires_grad = False


In [25]:
# Pearson correlation metric 
from scipy.stats import pearsonr

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()
    return {"pearson": pearsonr(predictions, labels)[0]}


In [26]:
# Training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./sts_zero_shot_xlmr",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [27]:
# Create Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=dev_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [28]:
trainer.train()


Epoch,Training Loss,Validation Loss,Pearson
1,5.0257,2.239072,0.212298
2,2.4017,2.29523,0.244176
3,2.2332,2.270091,0.260579
4,2.2147,2.32013,0.275881
5,2.17,2.317713,0.291385
6,2.1438,2.262391,0.309983
7,2.1009,2.260291,0.318689
8,2.1329,2.348855,0.326886
9,2.0539,2.329639,0.335138
10,2.0765,2.268384,0.339312


TrainOutput(global_step=3600, training_loss=2.1896127319335936, metrics={'train_runtime': 244.7764, 'train_samples_per_second': 469.735, 'train_steps_per_second': 14.707, 'total_flos': 2.678827652660736e+16, 'train_loss': 2.1896127319335936, 'epoch': 20.0})

In [29]:
zero_shot_results = trainer.evaluate()
zero_shot_results


{'eval_loss': 2.28651762008667,
 'eval_pearson': 0.37374383211135864,
 'eval_runtime': 1.9712,
 'eval_samples_per_second': 760.939,
 'eval_steps_per_second': 23.843,
 'epoch': 20.0}

# 16-Shot STS-B (XLM-RoBERTa-large)

In [30]:
# 10.1 — Create the 16-shot subset (from TRAIN)
train_16_df = train_df.iloc[:16].reset_index(drop=True)

print(train_16_df.shape)
train_16_df.head()


(16, 3)


Unnamed: 0,sentence1,sentence2,score
0,ایک طیارہ اڑان بھر رہا ہے ۔,ایک ہوائی جہاز اڑان بھر رہا ہے ۔,5.0
1,ایک آدمی بڑی بانسری بجا رہا ہے ۔,ایک آدمی بانسری بجا رہا ہے ۔,3.8
2,ایک آدمی پیزا پر کٹا ہوا پنیر پھیلا رہا ہے ۔,ایک آدمی بغیر پکے ہوئے پیزا پر کٹا ہوا پنیر پھ...,3.8
3,تین آدمی شطرنج کھیل رہے ہیں ۔,دو آدمی شطرنج کھیل رہے ہیں ۔,2.6
4,ایک آدمی سیلو کھیل رہا ہے ۔,بیٹھا ہوا آدمی سیلو کھیل رہا ہے ۔,4.25


In [31]:
# 10.2 — Convert to HuggingFace Dataset
from datasets import Dataset

train_16_hf = Dataset.from_pandas(train_16_df)


In [32]:
# 10.3 — Tokenize
train_16_tokenized = train_16_hf.map(tokenize_function, batched=True)

train_16_tokenized = train_16_tokenized.rename_column("score", "labels")
train_16_tokenized.set_format("torch")


Map: 100%|█████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 3903.49 examples/s]


In [33]:
# 10.4 — Load a fresh XLM-R regression model
model_16 = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    num_labels=1,
    problem_type="regression"
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# 10.5 — UNFREEZE the encoder
for param in model_16.parameters():
    param.requires_grad = True


In [36]:
# 10.6 — Training arguments
training_args_16 = TrainingArguments(
    output_dir="./sts_16shot_xlmr",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [37]:
# 10.7 — Create Trainer (16-Shot XLM-R)
trainer_16 = Trainer(
    model=model_16,
    args=training_args_16,
    train_dataset=train_16_tokenized,
    eval_dataset=dev_tokenized,   # SAME dev set
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer_16 = Trainer(


In [38]:
# 10.8 — Train (16-Shot)
trainer_16.train()


Epoch,Training Loss,Validation Loss,Pearson
1,No log,8.445621,0.159961
2,No log,8.445621,0.159961
3,No log,8.445621,0.159961
4,No log,8.445621,0.159961
5,No log,6.87951,0.196064
6,No log,3.509936,0.083651
7,No log,3.509936,0.083651
8,No log,2.267016,0.021762
9,No log,2.261277,0.086164
10,10.480400,3.074676,-0.044184


TrainOutput(global_step=20, training_loss=6.243128108978271, metrics={'train_runtime': 1139.8922, 'train_samples_per_second': 0.281, 'train_steps_per_second': 0.018, 'total_flos': 74554257162240.0, 'train_loss': 6.243128108978271, 'epoch': 20.0})

In [39]:
results_16 = trainer_16.evaluate()
results_16


{'eval_loss': 6.879510402679443,
 'eval_pearson': 0.19606448709964752,
 'eval_runtime': 24.4321,
 'eval_samples_per_second': 61.395,
 'eval_steps_per_second': 1.924,
 'epoch': 20.0}

# 80/20 STS-B (XLM-RoBERTa-large)

In [40]:
# 11.1 — Create 80/20 split (fixed seed)
from sklearn.model_selection import train_test_split

train_80_df, dev_20_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

print(train_80_df.shape, dev_20_df.shape)


(4599, 3) (1150, 3)


In [41]:
# 11.2 — Convert to HuggingFace Datasets
from datasets import Dataset

train_80_hf = Dataset.from_pandas(train_80_df.reset_index(drop=True))
dev_20_hf   = Dataset.from_pandas(dev_20_df.reset_index(drop=True))



In [42]:
# 11.3 — Tokenize
train_80_tok = train_80_hf.map(tokenize_function, batched=True)
dev_20_tok   = dev_20_hf.map(tokenize_function, batched=True)

train_80_tok = train_80_tok.rename_column("score", "labels")
dev_20_tok   = dev_20_tok.rename_column("score", "labels")

train_80_tok.set_format("torch")
dev_20_tok.set_format("torch")


Map: 100%|████████████████████████████████████████████████████████████████████████████| 4599/4599 [00:00<00:00, 29534.86 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████| 1150/1150 [00:00<00:00, 24416.22 examples/s]


In [43]:
# 11.4 — Load fresh XLM-R-large model
model_80 = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    num_labels=1,
    problem_type="regression"
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
# 11.5 — Unfreeze encoder
for param in model_80.parameters():
    param.requires_grad = True


In [45]:
# 11.6 — Training arguments
training_args_80 = TrainingArguments(
    output_dir="./sts_80_20_xlmr",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [46]:
# 11.7 — Trainer (80/20)
trainer_80 = Trainer(
    model=model_80,
    args=training_args_80,
    train_dataset=train_80_tok,
    eval_dataset=dev_20_tok,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer_80 = Trainer(


In [47]:
trainer_80.train()


Epoch,Training Loss,Validation Loss,Pearson
1,2.2208,0.666134,0.862522
2,0.7838,0.768497,0.862898
3,0.5305,0.677332,0.873607
4,0.4606,0.71575,0.878062
5,0.3221,0.535631,0.881885
6,0.2861,0.485023,0.882931
7,0.2264,0.497422,0.887247
8,0.1802,0.486552,0.885232
9,0.1675,0.529561,0.885179
10,0.1343,0.52848,0.883658


TrainOutput(global_step=2880, training_loss=0.2735405760506789, metrics={'train_runtime': 18576.9747, 'train_samples_per_second': 4.951, 'train_steps_per_second': 0.155, 'total_flos': 2.142968929307136e+16, 'train_loss': 0.2735405760506789, 'epoch': 20.0})

In [48]:
results_80 = trainer_80.evaluate()
results_80


{'eval_loss': 0.4726274311542511,
 'eval_pearson': 0.8914019465446472,
 'eval_runtime': 40.0434,
 'eval_samples_per_second': 28.719,
 'eval_steps_per_second': 0.899,
 'epoch': 20.0}