In [None]:
# !sed -i 's/transformers.generation_utils/transformers.generation/' /usr/local/lib/python3.12/dist-packages/openprompt/pipeline_base.py

In [1]:
# Set device
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
# ==============================
# Imports
# ==============================
import pandas as pd
import torch
import random
import numpy as np
from collections import Counter
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt.prompts import SoftVerbalizer
from openprompt.prompts import AutomaticVerbalizer
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt import PromptForClassification, PromptDataLoader
from torch.optim import AdamW
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [3]:


# ==============================
# Set random seeds for reproducibility
# ==============================
# üí° Added this block to ensure consistent results across runs
seed = 20
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
# ==============================
# Step 1: Training data (3-shot / few-shot)
# ==============================
train_dataset = [
    #Politics
    InputExample(guid=0, text_a="ÿ®ŸÑŸà⁄Üÿ≥ÿ™ÿßŸÜ €æ ÿßŸäÿ±ÿßŸÜŸä ÿ≥ÿ±ÿ≤ŸÖŸäŸÜ ÿ™ÿßŸÜ ŸªŸäŸáÿ± ÿ≠ŸÖŸÑŸàÿå Ÿæÿß⁄™ ŸÅŸàÿ¨ ÿ¨ÿß 4 ÿ¨ŸàÿßŸÜ ÿ¥ŸáŸäÿØ", label=0),
    InputExample(guid=1, text_a="ÿ≥Ÿæÿ±ŸäŸÖ ⁄™Ÿàÿ±ŸΩ €æ ŸÇÿ±ÿ¢ŸÜ Ÿæÿß⁄™ ÿ¨Ÿä ÿ≠ÿßŸÅÿ∏ ⁄©Ÿä 20 Ÿàÿß⁄åŸà ŸÜŸÖÿ®ÿ± ⁄èŸä⁄ª ÿ®ÿßÿ®ÿ™ Ÿæÿß⁄ªŸÖÿ±ÿßÿØŸä ŸÜŸàŸΩŸäÿ≥ €æ ÿ¨ÿ≥ŸΩÿ≥ ŸÇÿßÿ∂Ÿä ŸÅÿßÿ¶ÿ≤ ÿπŸäÿ≥ŸäŸ∞ ÿ¨Ÿä ÿ≥ÿ±ÿ®ÿ±ÿßŸáŸä €æ 3 ÿ±⁄™ŸÜŸä ÿÆÿßÿµ ÿ®ÿ¶ŸÜ⁄Ü ŸÅŸäÿµŸÑŸà ÿ¨ÿßÿ±Ÿä ⁄™ŸäŸà", label=0),
    InputExample(guid=2, text_a="Ÿàÿßÿ∂ÿ≠ ÿ±ŸáŸä ÿ™Ÿá ⁄™Ÿäÿ±ŸàŸÑ ÿ¶Ÿä ŸÜŸá Ÿæÿ± ⁄™Ÿäÿ™ÿ±ŸäŸÜ ÿ¶Ÿä ÿπŸàÿ±ÿ™ŸÜ ÿß⁄≥Ÿà⁄ªŸä ÿ¢ŸÖÿ±Ÿä⁄™Ÿä ÿµÿØÿ± ÿ™Ÿä ÿ¨ŸÜÿ≥Ÿä ⁄èÿß⁄çÿßÿ¶Ÿä €æ ŸÖŸÑŸàÿ´ Ÿáÿ¨⁄ª ÿ¨ÿß ÿßŸÑÿ≤ÿßŸÖ ŸÑ⁄≥ÿßŸäÿß ÿ¢ŸáŸÜ.", label=0),
    InputExample(guid=3, text_a="ŸæŸäÿßÿØŸÑ ŸÇÿßŸÅŸÑŸä ÿ¨Ÿä ⁄Äÿ±ŸæŸàÿ± ÿ¢ÿ¨ŸäÿßŸÜ ⁄™ÿ±Ÿä ŸÖŸøŸÜ ⁄ØŸÑŸÜ ÿ¨Ÿä Ÿàÿ±⁄©ÿß ⁄™ÿ¶Ÿä Ÿàÿ¶Ÿä.", label=0),

    #Sports
    InputExample(guid=4, text_a="ŸÜŸäŸæÿßŸÑ ÿ¨Ÿä ⁄™ÿ±⁄™ŸäŸΩ ŸÖŸä⁄Ü ⁄äŸä ÿßŸäŸÑ ÿßŸäÿ≥ ŸÖŸäŸø⁄ä ÿ™ÿ≠ÿ™ ŸàŸÜ ⁄äŸä €æ 9 Ÿà⁄™ŸäŸΩŸÜ ÿ™Ÿä ŸäŸà ÿßŸä ÿßŸä ÿ¨Ÿä ŸΩŸäŸÖ ⁄©Ÿä ÿ¥⁄™ÿ≥ÿ™ ⁄èÿ¶Ÿä ⁄á⁄èŸä ÿ¢ŸáŸä", label=1),
    InputExample(guid=5, text_a="ŸáŸà⁄èÿßŸÜŸáŸÜ Ÿªÿ¶Ÿä ÿ≥ŸäŸÖŸä ŸÅÿßÿ¶ŸÜŸÑ €æ Ÿàÿ±Ÿä ÿ¢ÿ±ŸäÿßŸÜÿß ÿ≥Ÿäÿ®ÿßŸÑŸäŸÜ⁄™ÿß ŸäŸàŸÜÿßŸÜ ÿ¨Ÿä ŸÖÿßÿ±Ÿäÿß ÿ≥⁄™ÿßÿ±Ÿä ⁄©Ÿä ÿ¥⁄™ÿ≥ÿ™ ⁄èÿ¶Ÿä ŸÅÿßÿ¶ŸÜŸÑ €æ ÿ¨⁄≥ŸáŸá Ÿ∫ÿßŸáŸä ÿ¢ŸáŸäÿå Ÿáÿß⁄ª ŸÅÿßÿ¶ŸÜŸÑ €æ ÿ¢ÿ±ŸäÿßŸÜÿß ÿ≥Ÿäÿ®ÿßŸÑŸäŸÜ⁄™ÿß €Ω ÿßŸäŸÑÿßŸÜÿß ÿ±Ÿäÿ®ÿß⁄™ŸÜÿß ÿ¢ŸÖŸáŸàŸÜ ÿ≥ÿßŸÖŸáŸàŸÜ ŸøŸäŸÜÿØŸäŸàŸÜ.", label=1),
    InputExample(guid=6, text_a="ÿ¨⁄èŸáŸÜ ÿ™⁄æ ÿ≥ÿ±ŸäŸÑŸÜ⁄™ÿß ŸàŸäŸÖŸÜ ÿ¨Ÿä ŸΩŸäŸÖ 19 ÿßŸàŸàÿ± €æ ÿ±⁄≥Ÿà 3 Ÿà⁄™ŸäŸΩŸÜ ÿ¨Ÿä ŸÜŸÇÿµÿßŸÜ ÿ™Ÿä ŸáÿØŸÅ ŸæŸàÿ±Ÿà ⁄™ÿ±Ÿä ⁄™ÿßŸÖŸäÿßÿ®Ÿä ŸÖÿß⁄ªŸä Ÿàÿ±ÿ™Ÿäÿå ÿ¨ŸÜ €æ ⁄ÜŸÖÿßÿ±Ÿä ÿßÿ™ÿßŸæŸΩŸà 33ÿå Ÿáÿ±ÿ¥ŸäŸøÿß ÿ≥ŸÖÿßÿ±ÿßŸà⁄™ÿ±ŸÖÿß 29ÿå ⁄™ŸàŸäÿ¥ÿß ⁄äŸÑŸáÿßÿ±Ÿä 20 ÿ±ŸÜÿ≥ŸÜ ÿ≥ÿßŸÜ ŸÜŸÖÿßŸäÿßŸÜ ÿ±ŸáŸäŸàŸÜ", label=1),
    InputExample(guid=7, text_a="⁄™ÿ±ÿ≥ŸΩŸäÿßŸÜŸà ÿ±ŸàŸÜÿßŸÑ⁄äŸà 2023ÿπ ÿØŸàÿ±ÿßŸÜ 38 ⁄ØŸàŸÑ ⁄™ÿ±Ÿä ÿ≥⁄ÄŸÜŸä ⁄©ÿßŸÜ ÿß⁄≥ŸäÿßŸÜ ŸÜ⁄™ÿ±Ÿä ŸàŸäŸà", label=1),

    #Crime
    InputExample(guid=8, text_a="Ÿæÿ± ⁄Ø⁄èŸäŸÑ ŸÇŸàŸÖŸÜ ÿ¨Ÿä Ÿá⁄™ ÿ®ŸäÿßŸÜ €æ ⁄ÜŸäŸà ŸàŸäŸà ÿ¢ŸáŸä ÿ™Ÿá ŸÖŸàÿ™ ÿ¨Ÿä ÿ≥ÿ≤ÿß ŸÖÿß⁄ªŸäŸÜÿØ⁄ôŸÜ ÿ¨Ÿà ÿßÿµŸÑ ÿßŸÜ⁄Ø ÿßŸÜ ⁄©ÿßŸÜ Ÿà⁄åŸä⁄™ ÿ¢ŸáŸä.", label=2),
    InputExample(guid=9, text_a="Ÿá⁄™ ÿπŸàÿ±ÿ™ ⁄©Ÿä ÿ≥ŸÜÿØÿ≥ ⁄åŸäÿ°Ÿé ÿ≥ŸÖŸäÿ™ ⁄èŸàŸáÿßÿ±Ÿä ÿßÿ∫Ÿàÿß ⁄™ÿ±Ÿä ŸàŸäÿß.", label=2),
    InputExample(guid=10, text_a="Ÿæÿ±⁄èŸäŸáŸä ŸÖŸä⁄äŸäÿß ŸÖŸàÿ¨ÿ® ÿßŸÜ ⁄≥ÿßŸÑŸáŸá ÿ¨Ÿà ÿßŸÜ⁄™ÿ¥ÿßŸÅ Ÿá⁄™ ÿßŸäÿ±ÿßŸÜŸä ŸÜÿßÿ¶ÿ® Ÿàÿ≤Ÿäÿ± ŸäŸàŸÜÿ≥ ŸæŸÜÿßŸáŸä ⁄™ŸäŸà ÿ¢ŸáŸäÿå ÿ¨ŸÜŸáŸÜ ⁄ÜŸäŸà ÿ¢ŸáŸä ÿ™Ÿá ÿßŸäÿ±ÿßŸÜ €æ ÿßÿ≥⁄™ŸàŸÑ ÿ¨Ÿä ⁄áŸà⁄™ÿ±ŸäŸÜ ⁄©Ÿä ÿ≤Ÿáÿ± ⁄èŸÜŸà ŸàŸäŸà", label=2),
    InputExample(guid=11, text_a="⁄≥⁄ôŸáŸä ÿÆÿØÿßÿ®ÿÆÿ¥ ⁄©Ÿàÿ≥Ÿà €æ ŸÜŸäŸÜ⁄Øÿ±Ÿä ÿ¨Ÿä ŸÇÿ™ŸÑ ÿ¨Ÿà ÿßŸÑÿ≤ÿßŸÖ.", label=2),
]


In [5]:
# ==============================
# Define Classes
# ==============================
classes = ["Politics", "Sports", "Crime"]
label_map = {"Politics": 0, "Sports": 1, "Crime":2}

# # Step 1: Use load_plm with 'roberta' to get the correct WrapperClass
_, _, _, WrapperClass = load_plm("roberta", "roberta-base")  # Just to get the wrapper

# # Step 2: Manually load XLM-RoBERTa model/tokenizer
model_name = "xlm-roberta-large"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
plm = XLMRobertaForMaskedLM.from_pretrained(model_name)

# ==============================
# Load Pretrained Language Model (mBERT)
# ==============================
# plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-multilingual-cased")


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 98f030bb-7054-457d-884d-ac51d686432e)')' thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].


In [6]:

# ==============================
# Define Prompt Template (Manual)
# ==============================
template = ManualTemplate(
    text = '{"placeholder":"text_a"} ŸáŸä ÿ¨ŸÖŸÑÿß ÿ¢ŸáŸä {"mask"}.', #sindhi
    tokenizer=tokenizer,
)


# automatic verbalizer
verbalizer = AutomaticVerbalizer(
    tokenizer=tokenizer,
    classes=classes,
    num_candidates=1000,
    label_word_num_per_class=5,
)

In [7]:



# ==============================
# Create Prompt Model
# ==============================
prompt_model = PromptForClassification(
    template=template,
    plm=plm,
    verbalizer=verbalizer
)

# ==============================
# DataLoader for Training
# ==============================
train_loader = PromptDataLoader(
    dataset=train_dataset,
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=4,
    shuffle=True  # Even though shuffled, reproducibility is preserved by seeding
)

# ==============================
# Fine-Tuning the Prompt Model
# ==============================
prompt_model.train()
optimizer = AdamW(prompt_model.parameters(), lr=1e-5)

for epoch in range(5):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        logits = prompt_model(batch)
        loss = torch.nn.CrossEntropyLoss()(logits, batch['label'])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")

# ==============================
# Load Evaluation Dataset
# ==============================
df = pd.read_csv(r"C:\Users\stdFurqan\Desktop\Sindhi (politics)\split_30.csv")
eval_dataset = [
    InputExample(guid=i, text_a=row['News Headlines'], label=label_map[row['Aspect Category']])
    for i, row in df.iterrows()
]


eval_loader = PromptDataLoader(
    dataset=eval_dataset,
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=8,
    shuffle=False
)

# ==============================
# Evaluate Model
# ==============================
prompt_model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in eval_loader:
        logits = prompt_model(batch)
        preds = torch.argmax(logits, dim=-1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch['label'].cpu().tolist())

# ==============================
# Print Classification Report
# ==============================
print("\nüìä Fine-tuned Classification Report:")
print(classification_report(all_labels, all_preds, target_names=classes, digits=4))


tokenizing: 12it [00:00, 1997.21it/s]


Epoch 1 Loss: 5.3057
Epoch 2 Loss: 3.4287
Epoch 3 Loss: 3.8526
Epoch 4 Loss: 3.6253
Epoch 5 Loss: 3.7148


tokenizing: 4967it [00:01, 3867.86it/s]



üìä Fine-tuned Classification Report:
              precision    recall  f1-score   support

    Politics     0.4531    0.3260    0.3792      2282
      Sports     0.2412    0.3149    0.2732      1245
       Crime     0.2935    0.3465    0.3178      1440

    accuracy                         0.3292      4967
   macro avg     0.3293    0.3291    0.3234      4967
weighted avg     0.3537    0.3292    0.3348      4967

