In [None]:
# !pip install "transformers==4.28.0"
# !pip install "tokenizers>=0.19,<0.23"
# !pip install git+https://github.com/thunlp/OpenPrompt.git


In [None]:
# !sed -i 's/transformers.generation_utils/transformers.generation/' /usr/local/lib/python3.12/dist-packages/openprompt/pipeline_base.py

In [None]:
# # Fix all AdamW imports in openprompt
# !grep -rl "from transformers import  AdamW" /usr/local/lib/python3.12/dist-packages/openprompt | xargs sed -i 's/from transformers import  AdamW, get_linear_schedule_with_warmup/from torch.optim import AdamW\nfrom transformers import get_linear_schedule_with_warmup/'


In [1]:
# Choose the compute device: start from CPU and switch to CUDA when it is available
import torch
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [2]:
import torch

# Query CUDA availability once, derive the device string from it, then report.
cuda_ready = torch.cuda.is_available()
device = "cuda" if cuda_ready else "cpu"

if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"✅ Using GPU: {gpu_name}")
else:
    print("⚠️ CUDA not available. Using CPU instead.")

print(f"🖥️ Device set to: {device}")

✅ Using GPU: NVIDIA GeForce RTX 4080 SUPER
🖥️ Device set to: cuda


In [3]:
# ==============================
# Imports
# ==============================
import os
import random
from collections import Counter

import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
from sklearn.metrics import classification_report
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from openprompt import PromptForClassification, PromptDataLoader
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt.prompts import SoftVerbalizer
from openprompt.prompts import AutomaticVerbalizer



In [4]:
# ==============================
# Seed every random number generator used by the notebook
# ==============================
seed = 20
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

# cuDNN flags: deterministic mode on, benchmark mode off.
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

In [5]:

# Step 1: few-shot training data (4 Urdu news headlines per class, iNLTK news).
# Headlines are grouped by integer label: 0 = entertainment, 1 = cricket, 2 = crime.
few_shot_headlines = {
    # entertainment, 4 shots
    0: [
        "ایکتا کپور نے بنائی ہندوستان کی سب سے بولڈ ویب سیریز ایکس ایکس ایکس ، یہاں دیکھیں ٹریلر",
        "فوٹو شوٹ کرانا پڑا سارہ علی خان کو بےحد مہنگا، سوشل میڈیا صارفین نے جم کر کیا ٹرول",
        "عامر خان کی وجہ سے گھنٹوں باتھ روم میں بیٹھ کر روئی یہ اداکارہ، سلمان خان نے بچایا",
        "ایشا امبانی پری ویڈنگ : جب اپنے ' ہیرو ' سے ملنے کیلئے ودیا بلن نے کی اسمرتی ایرانی سے درخواست",
    ],
    # cricket, 4 shots
    1: [
        "آسٹریلیا کے خلاف پہلے ٹیسٹ کیلئے ٹیم انڈیا کا اعلان ، امیش اور جڈیجہ باہر ، مگر اس بڑے کھلاڑی کو ملی جگہ",
        "پاکستانی کوچ مکی آرتھرنے بابراعظم کی کارکردگی اورمستقبل سے متعلق کہی یہ بات",
        "پاکستانی ٹیم کو لگا بڑا جھٹکا ، نیوزی لینڈ کے خلاف سیریز سے بھی باہر ہوسکتا ہے یہ بڑا کھلاڑی",
        "گیند بازی کے بعد بلے بازی میں بھی راشد خان کا دھماکہ ، تین گیندوں میں تین چھکے لگا کر ٹیم کو دلائی جیت",
    ],
    # crime, 4 shots
    2: [
        "دادری سانحہ: مشتعل بھیڑ اخلاق کے پورے خاندان کو زندہ جلا دینا چاہتی تھی",
        "چھیڑ چھاڑ کی مخالفت کی تو گاوں والوں نے اسکول میں گھس کر کی پٹائی ، 34 طالبات زخمی",
        "اب یوپی کے دیوریا میں مظفرپور جیسے سیکس ریکٹ کا پردہ فاش، 24 خواتین آزاد کرائی گئیں",
        "جموں و کشمیر : گھر لوٹ رہی خاتون کو گھسیٹ کر لے گیا جنگل ، آبروریزی کی ، بتانے پر سنگین نتائج کی دی دھمکی",
    ],
}

# Flatten in label order so guids run 0..11, matching the order of the headlines above.
train_dataset = [
    InputExample(guid=guid, text_a=headline, label=label)
    for guid, (label, headline) in enumerate(
        (label, headline)
        for label, headlines in few_shot_headlines.items()
        for headline in headlines
    )
]


In [6]:
# ==============================
# Class names and their integer label ids
# ==============================
classes = ["entertainment", "cricket", "crime"]
# Label id of each class is its position in `classes`.
label_map = {class_name: class_id for class_id, class_name in enumerate(classes)}

# Disabled alternative backbone: XLM-RoBERTa loaded by hand, taking only the
# wrapper class from load_plm("roberta", ...).
# _, _, _, WrapperClass = load_plm("roberta", "roberta-base")  # Just to get the wrapper
# model_name = "xlm-roberta-large"
# tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
# plm = XLMRobertaForMaskedLM.from_pretrained(model_name)

# ==============================
# Pretrained language model in use: multilingual BERT (cased)
# ==============================
plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-multilingual-cased")

# ==============================
# Manual prompt template: the headline followed by an Urdu sentence with one mask slot
# ==============================
template = ManualTemplate(
    tokenizer=tokenizer,
    text='{"placeholder":"text_a"} یہ جملہ {"mask"} ہے۔',
)

# ==============================
# Manual verbalizer: two label words per class (original note: "two words gpt")
# ==============================
class_label_words = {
    "entertainment": ["اداکارہ", "ویب سیریز"],
    "cricket": ["کھلاڑی", "ٹیم"],
    "crime": ["تشدد", "آبروریزی"],
}
verbalizer = ManualVerbalizer(
    tokenizer=tokenizer,
    classes=classes,
    label_words=class_label_words,
)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Disabled PromptDA augmentation: one copy of every training example per
# label word of its class.
# augmented_train_dataset = []
# for example in train_dataset:
#     class_name = classes[example.label]
#     class_index = classes.index(class_name)
#     label_words = verbalizer.label_words[class_index]
#     for lw in label_words:
#         augmented_train_dataset.append(InputExample(
#             guid=f"{example.guid}_{lw}",
#             text_a=example.text_a,
#             label=example.label
#         ))


# ==============================
# Prompt model: PLM + template + verbalizer
# ==============================
prompt_model = PromptForClassification(plm=plm, template=template, verbalizer=verbalizer)


# ==============================
# Training DataLoader
# ==============================
train_loader_kwargs = dict(
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=4,
    shuffle=True,
)

# When using PromptDA, feed the augmented data instead:
# train_loader = PromptDataLoader(dataset=augmented_train_dataset, **train_loader_kwargs)
train_loader = PromptDataLoader(dataset=train_dataset, **train_loader_kwargs)

tokenizing: 12it [00:00, 1999.91it/s]


In [8]:
# ==============================
# Fine-Tuning the Prompt Model
# ==============================
# Hyperparameters are named here so they can be tuned in one place.
NUM_EPOCHS = 5
LEARNING_RATE = 1e-5

prompt_model.train()
optimizer = AdamW(prompt_model.parameters(), lr=LEARNING_RATE)

# Build the loss once instead of re-instantiating it on every step.
loss_fn = torch.nn.CrossEntropyLoss()

# Put each batch on whatever device the model currently lives on, so the loop
# works unchanged whether the model sits on CPU or has been moved to a GPU.
model_device = next(prompt_model.parameters()).device

for epoch in range(NUM_EPOCHS):
    total_loss = 0.0  # sum of the per-batch losses for this epoch
    for batch in train_loader:
        batch = batch.to(model_device)
        optimizer.zero_grad()
        logits = prompt_model(batch)
        loss = loss_fn(logits, batch['label'])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1} Loss: {total_loss:.4f}")


Epoch 1 Loss: 4.3715
Epoch 2 Loss: 2.1278
Epoch 3 Loss: 1.8874
Epoch 4 Loss: 1.5385
Epoch 5 Loss: 0.9933


In [9]:
# ==============================
# Load Evaluation Dataset
# ==============================
# The CSV location defaults to the original local path; set the INLTK_EVAL_CSV
# environment variable to point the notebook at a copy elsewhere.
EVAL_CSV_PATH = os.environ.get(
    "INLTK_EVAL_CSV",
    r"C:\Users\Stdfurqan\Downloads\inltk\verbalizer.csv",
)
df = pd.read_csv(EVAL_CSV_PATH)

# Fail early with a readable message instead of a bare KeyError on an unknown
# label or a tokenization error on a missing headline.
missing_columns = {"headline", "label"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{EVAL_CSV_PATH!r} is missing required column(s): {sorted(missing_columns)}"
    )

invalid_rows = df["headline"].isna() | ~df["label"].isin(list(label_map))
if invalid_rows.any():
    raise ValueError(
        f"{int(invalid_rows.sum())} row(s) in {EVAL_CSV_PATH!r} have a missing headline "
        f"or a label outside {list(label_map)}; "
        f"first offending index: {df.index[invalid_rows][0]}"
    )

# guid is the DataFrame index label, exactly as iterrows() would report it.
eval_dataset = [
    InputExample(guid=guid, text_a=headline, label=label_map[label])
    for guid, headline, label in zip(df.index, df["headline"], df["label"])
]

eval_loader = PromptDataLoader(
    dataset=eval_dataset,
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=8,
    shuffle=False
)

tokenizing: 1493it [00:00, 2730.56it/s]


In [10]:
# ==============================
# Evaluate Model and Collect Predicted MASK Words for Correct Predictions
# ==============================
prompt_model.eval()

# Run evaluation on the device selected at the top of the notebook; batches
# are moved to the same device inside the loop.
prompt_model.to(device)

# class name -> words the PLM predicts at the mask for correctly classified examples
correct_predicted_words = {class_name: [] for class_name in classes}

all_preds = []
all_labels = []

# Loop-invariant: look the mask token id up once.
mask_token_id = tokenizer.mask_token_id

with torch.no_grad():
    for batch in eval_loader:
        batch = batch.to(device)
        logits = prompt_model(batch)
        preds = torch.argmax(logits, dim=-1)
        labels = batch['label']

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

        # Only correctly classified rows contribute mask words.
        correct_rows = (preds == labels).nonzero(as_tuple=True)[0]
        if correct_rows.numel() == 0:
            continue

        input_ids = batch['input_ids'][correct_rows]

        # Each prompt must contain exactly one mask token; say so explicitly
        # instead of failing with an opaque tensor-to-scalar error.
        is_mask = input_ids == mask_token_id
        if not bool((is_mask.sum(dim=1) == 1).all()):
            raise ValueError("Expected exactly one mask token in every evaluated prompt.")
        # With one hit per row, the column indices come back in row order.
        mask_positions = is_mask.nonzero(as_tuple=True)[1]

        # One PLM forward pass for all correct rows of the batch (instead of
        # one pass per example) to get vocabulary logits at the mask position.
        plm_inputs = {
            "input_ids": input_ids,
            "attention_mask": batch['attention_mask'][correct_rows],
        }
        if 'token_type_ids' in batch:
            plm_inputs["token_type_ids"] = batch['token_type_ids'][correct_rows]

        vocab_logits = prompt_model.plm(**plm_inputs).logits
        row_index = torch.arange(vocab_logits.size(0), device=vocab_logits.device)
        top_token_ids = vocab_logits[row_index, mask_positions].argmax(dim=-1).tolist()

        for row, token_id in zip(correct_rows.tolist(), top_token_ids):
            token = tokenizer.convert_ids_to_tokens(token_id)
            # Drop only the "##" continuation prefix. str.lstrip("##") would
            # strip every leading '#' character, turning e.g. "#" into "".
            word = token[2:] if token.startswith("##") else token
            correct_predicted_words[classes[labels[row].item()]].append(word)



In [11]:
# ==============================
# Print Classification Report and Predicted Words Summary
# ==============================
print("\n📊 Fine-tuned Classification Report:")
# Pass `labels` explicitly so the report always has one row per class and
# `target_names` lines up with it, even when a class is absent from both the
# gold labels and the predictions.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(range(len(classes))),
    target_names=classes,
))

print("\nPredicted MASK words for correct predictions by class:")

for class_name in classes:
    words = correct_predicted_words[class_name]
    print(f"\nClass '{class_name}':")
    print(f"Total correct predictions: {len(words)}")
    print("Most common predicted mask words:")
    # Ten most frequent mask-position words for this class
    for word, count in Counter(words).most_common(10):
        print(f"  {word} : {count}")


📊 Fine-tuned Classification Report:
               precision    recall  f1-score   support

entertainment       0.82      0.30      0.44       847
      cricket       0.64      0.78      0.70       467
        crime       0.24      0.82      0.37       179

     accuracy                           0.51      1493
    macro avg       0.57      0.63      0.50      1493
 weighted avg       0.69      0.51      0.51      1493


Predicted MASK words for correct predictions by class:

Class 'entertainment':
Total correct predictions: 255
Most common predicted mask words:
  نہیں : 106
  فلم : 69
  بھی : 10
  کیا : 8
  کرتا : 8
  شاعر : 7
  [UNK] : 6
  رہا : 4
  ' : 4
  حاصل : 3

Class 'cricket':
Total correct predictions: 364
Most common predicted mask words:
  نہیں : 147
  منتخب : 47
  بازی : 25
  رہا : 24
  کرتا : 19
  کیا : 17
  بھی : 10
  ملی : 9
  ہوتا : 8
  یہ : 8

Class 'crime':
Total correct predictions: 147
Most common predicted mask words:
  نہیں : 70
  رہا : 16
  بھی : 9
  ہوتا : 7
 