In [None]:
# !sed -i 's/transformers.generation_utils/transformers.generation/' /usr/local/lib/python3.12/dist-packages/openprompt/pipeline_base.py

In [1]:
# Set device
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
import torch

if torch.cuda.is_available():
    device = "cuda"
    gpu_name = torch.cuda.get_device_name(0)
    print(f"‚úÖ Using GPU: {gpu_name}")
else:
    device = "cpu"
    print("‚ö†Ô∏è CUDA not available. Using CPU instead.")

print(f"üñ•Ô∏è Device set to: {device}")

‚úÖ Using GPU: NVIDIA GeForce RTX 4080 SUPER
üñ•Ô∏è Device set to: cuda


In [3]:
# ==============================
# Imports
# ==============================
import pandas as pd
import torch
import random
import numpy as np
from collections import Counter
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt.prompts import SoftVerbalizer
from openprompt.prompts import AutomaticVerbalizer
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt import PromptForClassification, PromptDataLoader
from torch.optim import AdamW
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [4]:


# ==============================
# Set random seeds for reproducibility
# ==============================
# üí° Added this block to ensure consistent results across runs
seed = 20
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:

# Step 1: Training data 4 shot inltk news urdu
train_dataset = [
    # entertainment 4shots
    InputExample(guid=0, text_a="ÿß€å⁄©ÿ™ÿß ⁄©ŸæŸàÿ± ŸÜ€í ÿ®ŸÜÿßÿ¶€å €ÅŸÜÿØŸàÿ≥ÿ™ÿßŸÜ ⁄©€å ÿ≥ÿ® ÿ≥€í ÿ®ŸàŸÑ⁄à Ÿà€åÿ® ÿ≥€åÿ±€åÿ≤ ÿß€å⁄©ÿ≥ ÿß€å⁄©ÿ≥ ÿß€å⁄©ÿ≥ ÿå €å€Åÿß⁄∫ ÿØ€å⁄©⁄æ€å⁄∫ Ÿπÿ±€åŸÑÿ±", label=0),
    InputExample(guid=1, text_a="ŸÅŸàŸπŸà ÿ¥ŸàŸπ ⁄©ÿ±ÿßŸÜÿß Ÿæ⁄ëÿß ÿ≥ÿßÿ±€Å ÿπŸÑ€å ÿÆÿßŸÜ ⁄©Ÿà ÿ®€íÿ≠ÿØ ŸÖ€ÅŸÜ⁄Øÿßÿå ÿ≥Ÿàÿ¥ŸÑ ŸÖ€å⁄à€åÿß ÿµÿßÿ±ŸÅ€åŸÜ ŸÜ€í ÿ¨ŸÖ ⁄©ÿ± ⁄©€åÿß Ÿπÿ±ŸàŸÑ", label=0),
    InputExample(guid=2, text_a="ÿπÿßŸÖÿ± ÿÆÿßŸÜ ⁄©€å Ÿàÿ¨€Å ÿ≥€í ⁄Ø⁄æŸÜŸπŸà⁄∫ ÿ®ÿßÿ™⁄æ ÿ±ŸàŸÖ ŸÖ€å⁄∫ ÿ®€åŸπ⁄æ ⁄©ÿ± ÿ±Ÿàÿ¶€å €å€Å ÿßÿØÿß⁄©ÿßÿ±€Åÿå ÿ≥ŸÑŸÖÿßŸÜ ÿÆÿßŸÜ ŸÜ€í ÿ®⁄Üÿß€åÿß", label=0),
    InputExample(guid=3, text_a="ÿß€åÿ¥ÿß ÿßŸÖÿ®ÿßŸÜ€å Ÿæÿ±€å Ÿà€å⁄àŸÜ⁄Ø : ÿ¨ÿ® ÿßŸæŸÜ€í ' €Å€åÿ±Ÿà ' ÿ≥€í ŸÖŸÑŸÜ€í ⁄©€åŸÑÿ¶€í ŸàÿØ€åÿß ÿ®ŸÑŸÜ ŸÜ€í ⁄©€å ÿßÿ≥ŸÖÿ±ÿ™€å ÿß€åÿ±ÿßŸÜ€å ÿ≥€í ÿØÿ±ÿÆŸàÿßÿ≥ÿ™", label=0),


    # Cricket 4 shot
    InputExample(guid=4, text_a="ÿßŸìÿ≥Ÿπÿ±€åŸÑ€åÿß ⁄©€í ÿÆŸÑÿßŸÅ Ÿæ€ÅŸÑ€í Ÿπ€åÿ≥Ÿπ ⁄©€åŸÑÿ¶€í Ÿπ€åŸÖ ÿßŸÜ⁄à€åÿß ⁄©ÿß ÿßÿπŸÑÿßŸÜ ÿå ÿßŸÖ€åÿ¥ ÿßŸàÿ± ÿ¨⁄à€åÿ¨€Å ÿ®ÿß€Åÿ± ÿå ŸÖ⁄Øÿ± ÿßÿ≥ ÿ®⁄ë€í ⁄©⁄æŸÑÿß⁄ë€å ⁄©Ÿà ŸÖŸÑ€å ÿ¨⁄Ø€Å", label=1),
    InputExample(guid=5, text_a="Ÿæÿß⁄©ÿ≥ÿ™ÿßŸÜ€å ⁄©Ÿà⁄Ü ŸÖ⁄©€å ÿ¢ÿ±ÿ™⁄æÿ±ŸÜ€í ÿ®ÿßÿ®ÿ±ÿßÿπÿ∏ŸÖ ⁄©€å ⁄©ÿßÿ±⁄©ÿ±ÿØ⁄Ø€å ÿßŸàÿ±ŸÖÿ≥ÿ™ŸÇÿ®ŸÑ ÿ≥€í ŸÖÿ™ÿπŸÑŸÇ ⁄©€Å€å €å€Å ÿ®ÿßÿ™", label=1),
    InputExample(guid=6, text_a="Ÿæÿß⁄©ÿ≥ÿ™ÿßŸÜ€å Ÿπ€åŸÖ ⁄©Ÿà ŸÑ⁄Øÿß ÿ®⁄ëÿß ÿ¨⁄æŸπ⁄©ÿß ÿå ŸÜ€åŸàÿ≤€å ŸÑ€åŸÜ⁄à ⁄©€í ÿÆŸÑÿßŸÅ ÿ≥€åÿ±€åÿ≤ ÿ≥€í ÿ®⁄æ€å ÿ®ÿß€Åÿ± €ÅŸàÿ≥⁄©ÿ™ÿß €Å€í €å€Å ÿ®⁄ëÿß ⁄©⁄æŸÑÿß⁄ë€å", label=1),
    InputExample(guid=7, text_a="⁄Ø€åŸÜÿØ ÿ®ÿßÿ≤€å ⁄©€í ÿ®ÿπÿØ ÿ®ŸÑ€í ÿ®ÿßÿ≤€å ŸÖ€å⁄∫ ÿ®⁄æ€å ÿ±ÿßÿ¥ÿØ ÿÆÿßŸÜ ⁄©ÿß ÿØ⁄æŸÖÿß⁄©€Å ÿå ÿ™€åŸÜ ⁄Ø€åŸÜÿØŸà⁄∫ ŸÖ€å⁄∫ ÿ™€åŸÜ ⁄Ü⁄æ⁄©€í ŸÑ⁄Øÿß ⁄©ÿ± Ÿπ€åŸÖ ⁄©Ÿà ÿØŸÑÿßÿ¶€å ÿ¨€åÿ™", label=1),

    # crime 4shot
    InputExample(guid=8, text_a="ÿØÿßÿØÿ±€å ÿ≥ÿßŸÜÿ≠€Å: ŸÖÿ¥ÿ™ÿπŸÑ ÿ®⁄æ€å⁄ë ÿßÿÆŸÑÿßŸÇ ⁄©€í ŸæŸàÿ±€í ÿÆÿßŸÜÿØÿßŸÜ ⁄©Ÿà ÿ≤ŸÜÿØ€Å ÿ¨ŸÑÿß ÿØ€åŸÜÿß ⁄Üÿß€Åÿ™€å ÿ™⁄æ€å", label=2),
    InputExample(guid=9, text_a="⁄Ü⁄æ€å⁄ë ⁄Ü⁄æÿß⁄ë ⁄©€å ŸÖÿÆÿßŸÑŸÅÿ™ ⁄©€å ÿ™Ÿà ⁄ØÿßŸà⁄∫ ŸàÿßŸÑŸà⁄∫ ŸÜ€í ÿßÿ≥⁄©ŸàŸÑ ŸÖ€å⁄∫ ⁄Ø⁄æÿ≥ ⁄©ÿ± ⁄©€å ŸæŸπÿßÿ¶€å ÿå 34 ÿ∑ÿßŸÑÿ®ÿßÿ™ ÿ≤ÿÆŸÖ€å", label=2),
    InputExample(guid=10, text_a="ÿßÿ® €åŸàŸæ€å ⁄©€í ÿØ€åŸàÿ±€åÿß ŸÖ€å⁄∫ ŸÖÿ∏ŸÅÿ±ŸæŸàÿ± ÿ¨€åÿ≥€í ÿ≥€å⁄©ÿ≥ ÿ±€å⁄©Ÿπ ⁄©ÿß Ÿæÿ±ÿØ€Å ŸÅÿßÿ¥ÿå 24 ÿÆŸàÿßÿ™€åŸÜ ÿ¢ÿ≤ÿßÿØ ⁄©ÿ±ÿßÿ¶€å ⁄Øÿ¶€å⁄∫", label=2),
    InputExample(guid=11, text_a="ÿ¨ŸÖŸà⁄∫ Ÿà ⁄©ÿ¥ŸÖ€åÿ± : ⁄Ø⁄æÿ± ŸÑŸàŸπ ÿ±€Å€å ÿÆÿßÿ™ŸàŸÜ ⁄©Ÿà ⁄Ø⁄æÿ≥€åŸπ ⁄©ÿ± ŸÑ€í ⁄Ø€åÿß ÿ¨ŸÜ⁄ØŸÑ ÿå ÿßŸìÿ®ÿ±Ÿàÿ±€åÿ≤€å ⁄©€å ÿå ÿ®ÿ™ÿßŸÜ€í Ÿæÿ± ÿ≥ŸÜ⁄Ø€åŸÜ ŸÜÿ™ÿßÿ¶ÿ¨ ⁄©€å ÿØ€å ÿØ⁄æŸÖ⁄©€å", label=2),

]


In [6]:
# ==============================
# Define Classes
# ==============================
classes = ["entertainment", "cricket", "crime"]
label_map = {"entertainment": 0, "cricket": 1, "crime": 2}

# # Step 1: Use load_plm with 'roberta' to get the correct WrapperClass
_, _, _, WrapperClass = load_plm("roberta", "roberta-base")  # Just to get the wrapper

# # Step 2: Manually load XLM-RoBERTa model/tokenizer
model_name = "xlm-roberta-large"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
plm = XLMRobertaForMaskedLM.from_pretrained(model_name)

# ==============================
# Load Pretrained Language Model (mBERT)
# ==============================
# plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-multilingual-cased")




In [7]:

# ==============================
# Define Prompt Template (Manual)
# ==============================
template = ManualTemplate(
    text='{"placeholder":"text_a"} €å€Å ÿ¨ŸÖŸÑ€Å {"mask"} €Å€í€î',
    tokenizer=tokenizer,
)


# automatic verbalizer
verbalizer = AutomaticVerbalizer(
    tokenizer=tokenizer,
    classes=classes,
    num_candidates=1000,
    label_word_num_per_class=5,
)

In [8]:



# ==============================
# Create Prompt Model
# ==============================
prompt_model = PromptForClassification(
    template=template,
    plm=plm,
    verbalizer=verbalizer
)

# ==============================
# DataLoader for Training
# ==============================
train_loader = PromptDataLoader(
    dataset=train_dataset,
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=4,
    shuffle=True  # Even though shuffled, reproducibility is preserved by seeding
)

# ==============================
# Fine-Tuning the Prompt Model
# ==============================
prompt_model.train()
optimizer = AdamW(prompt_model.parameters(), lr=1e-5)

for epoch in range(5):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        logits = prompt_model(batch)
        loss = torch.nn.CrossEntropyLoss()(logits, batch['label'])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")

# ==============================
# Load Evaluation Dataset
# ==============================
df = pd.read_csv(r"C:\Users\stdFurqan\Downloads\INLTK_urdu\test.csv")
eval_dataset = [
    InputExample(guid=i, text_a=row['headline'], label=label_map[row['label']])
    for i, row in df.iterrows()
]


eval_loader = PromptDataLoader(
    dataset=eval_dataset,
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=8,
    shuffle=False
)

# ==============================
# Evaluate Model
# ==============================
prompt_model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in eval_loader:
        logits = prompt_model(batch)
        preds = torch.argmax(logits, dim=-1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch['label'].cpu().tolist())

# ==============================
# Print Classification Report
# ==============================
print("\nüìä Fine-tuned Classification Report:")
print(classification_report(all_labels, all_preds, target_names=classes, digits=4))


tokenizing: 12it [00:00, 1496.63it/s]


Epoch 1 Loss: 5.3057
Epoch 2 Loss: 3.4287
Epoch 3 Loss: 3.8526
Epoch 4 Loss: 3.6253
Epoch 5 Loss: 3.7148


tokenizing: 2239it [00:01, 1844.68it/s]



üìä Fine-tuned Classification Report:
               precision    recall  f1-score   support

entertainment     0.5566    0.3252    0.4105      1270
      cricket     0.3279    0.3443    0.3359       700
        crime     0.1155    0.3271    0.1707       269

     accuracy                         0.3314      2239
    macro avg     0.3333    0.3322    0.3057      2239
 weighted avg     0.4321    0.3314    0.3584      2239

