In [None]:
# !pip install "transformers==4.28.0"
# !pip install "tokenizers>=0.19,<0.23"
# !pip install git+https://github.com/thunlp/OpenPrompt.git


In [None]:
# !sed -i 's/transformers.generation_utils/transformers.generation/' /usr/local/lib/python3.12/dist-packages/openprompt/pipeline_base.py

In [None]:
# # Fix all AdamW imports in openprompt
# !grep -rl "from transformers import  AdamW" /usr/local/lib/python3.12/dist-packages/openprompt | xargs sed -i 's/from transformers import  AdamW, get_linear_schedule_with_warmup/from torch.optim import AdamW\nfrom transformers import get_linear_schedule_with_warmup/'


In [1]:
# Choose the compute device: "cuda" when PyTorch can see a GPU, otherwise "cpu".
import torch

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
# Choose the compute device and print which one was selected.
# NOTE(review): none of the later cells visible in this notebook pass `device`
# to the model or to the batches, so the model appears to run on its default
# device regardless of the message printed here — confirm and add the
# `.to(device)` calls consistently (model AND every batch) if GPU use is intended.
import torch

cuda_ready = torch.cuda.is_available()
device = "cuda" if cuda_ready else "cpu"

if cuda_ready:
    # Name of the first visible CUDA device, shown for information only.
    gpu_name = torch.cuda.get_device_name(0)
    print(f"✅ Using GPU: {gpu_name}")
else:
    print("⚠️ CUDA not available. Using CPU instead.")

print(f"🖥️ Device set to: {device}")

✅ Using GPU: NVIDIA GeForce RTX 4080 SUPER
🖥️ Device set to: cuda


In [3]:
# ==============================
# Imports
# ==============================
import pandas as pd
import torch
import random
import numpy as np
from collections import Counter
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt.prompts import SoftVerbalizer
from openprompt.prompts import AutomaticVerbalizer
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt import PromptForClassification, PromptDataLoader
from torch.optim import AdamW
from sklearn.metrics import classification_report



In [4]:
# ==============================
# Reproducibility: seed every random number generator used below
# ==============================
seed = 20

# Same order as before: Python `random`, NumPy, PyTorch (CPU), PyTorch (all CUDA devices).
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

# Request deterministic cuDNN behaviour and disable cuDNN benchmarking.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:

# Step 1: Training data (4-shot) urduv1
# Four texts per class. The examples are numbered consecutively: guids 0-3 get
# label 0 (positive), guids 4-7 get label 1 (negative).
positive_texts = [
    "کلاسک ویڈیو گیم کی شکل میں بنایا گیا منفرد کیک نیویارک کیک خواہ سالگرہ کا ہویا کسی اور",
    "مجاہد اعظم مولانا غلام اعظم نے اپنی جوانی سے بڑھا پے تک صبر عزمیت اور مسلسل جدوجہد کی تاریخ رقم کر دی اس",
    "میں آپکی بات سے اتفاق کرتا ہوں لیکن میں سب کی رائے لینا اور سننا پسند کرتا ہوں ایک عادی بھائی ہیں کبھی کام کی بات نئ کی پ سنتا",
    "وینا ملک کا بیٹا سوشل میڈیا پر مقبولیت میں شلپا شیٹھی کے بیٹے سے آگے کراچی وینا ملک کے بیٹے ابرام خان",
]

negative_texts = [
    "سیالکوٹ ہندوستانی فورسز کی بلا اشتعال فائرنگ پاکستان کے صوبہ پنجاب کے شہر سیالکوٹ کی ورکنگ",
    "کراچی میں تمام نیوز چینل کی بندشایم کیو ایم اور پاکستان مسلم لیگ زندہ بادپاکستان ملک ہے شہنشاہوں کا بدمعاشوں کا",
    "ﻋﺎﻟﻤﯽ ﺍﺳﺘﻌﻤﺎﺭ ﺍﻣﺮﯾﮑﺎ ﻭ ﺍﺳﺮﺍﺋﯿﻞ ﺍﻭﺭﺍ ﻧﮑﮯ ﻧﻤﮏ ﺧﻮﺍﺭ ﺳﻌﻮﺩﯼ ﻋﺮﺏ ﮐﮯ ﺳﯿﻨﮯﻣﯿﮟ ﺧﻨﺠﺮ ﮐﯽ ﻣﺎﻧﻨﺪ ﻣﺎﮦ ﻣﺤﺮﻡ ﺻﻔﺮﺍﻭﺭ ﺭﺑﯿﻊ ﺍﻻﻭﻝ",
    "جناب اسپیکر آج سے نون لیگ کو منافق لیگ کا نام دیا جائے کیونکہ نون لیگ نے سچ تو کبھی بولنا نہیں",
]

# Pair every text with its label, then number the pairs in order to get the guid.
train_dataset = [
    InputExample(guid=guid, text_a=text, label=label)
    for guid, (text, label) in enumerate(
        [(text, 0) for text in positive_texts] + [(text, 1) for text in negative_texts]
    )
]


In [6]:
# ==============================
# Class names and label ids
# ==============================
classes = ["P", "N"]
# A label id is the position of the class name in `classes`: {"P": 0, "N": 1}.
label_map = {class_name: class_id for class_id, class_name in enumerate(classes)}

# Alternative backbone, kept disabled for reference: take the wrapper class from
# load_plm("roberta", ...) and load XLM-RoBERTa by hand.
# _, _, _, WrapperClass = load_plm("roberta", "roberta-base")  # Just to get the wrapper
# model_name = "xlm-roberta-large"
# tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
# plm = XLMRobertaForMaskedLM.from_pretrained(model_name)

# ==============================
# Pretrained language model (mBERT)
# ==============================
plm_family, plm_checkpoint = "bert", "bert-base-multilingual-cased"
plm, tokenizer, model_config, WrapperClass = load_plm(plm_family, plm_checkpoint)

# ==============================
# Manual prompt template: the input text followed by a cloze sentence with one mask slot
# ==============================
prompt_text = '{"placeholder":"text_a"} یہ جملہ {"mask"} ہے۔'
template = ManualTemplate(tokenizer=tokenizer, text=prompt_text)

# ==============================
# Manual verbalizer: two label words per class
# ==============================
class_label_words = {
    "P": ["تفریح", "معاشرت"],  # two words gpt
    "N": ["سیاست", "تنقید"],
}
verbalizer = ManualVerbalizer(
    tokenizer=tokenizer,
    classes=classes,
    label_words=class_label_words,
)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Optional PromptDA augmentation (disabled): one copy of each training example
# per label word of its class.
# augmented_train_dataset = [
#     InputExample(guid=f"{example.guid}_{lw}", text_a=example.text_a, label=example.label)
#     for example in train_dataset
#     for lw in verbalizer.label_words[example.label]
# ]

# ==============================
# Prompt model: template + pretrained LM + verbalizer
# ==============================
prompt_model = PromptForClassification(
    plm=plm,
    template=template,
    verbalizer=verbalizer,
)

# ==============================
# DataLoader for training
# ==============================
# To train on PromptDA data instead, enable the block above and pass
# dataset=augmented_train_dataset here.
train_loader_options = dict(
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=4,
    shuffle=True,
)
train_loader = PromptDataLoader(dataset=train_dataset, **train_loader_options)

tokenizing: 8it [00:00, 1600.42it/s]


In [8]:
# ==============================
# Fine-tune the prompt model
# ==============================
# Five passes over `train_loader` with AdamW (lr=1e-5) and cross-entropy loss.
# The number printed per epoch is the SUM of the batch losses, not their mean.
prompt_model.train()
optimizer = AdamW(prompt_model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()  # stateless, so one instance serves every step

for epoch in range(5):
    batch_losses = []
    for batch in train_loader:
        optimizer.zero_grad()
        logits = prompt_model(batch)
        loss = loss_fn(logits, batch['label'])
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch + 1} Loss: {total_loss:.4f}")


Epoch 1 Loss: 2.9368
Epoch 2 Loss: 1.0026
Epoch 3 Loss: 1.3897
Epoch 4 Loss: 1.2369
Epoch 5 Loss: 0.3098


In [9]:
# ==============================
# Load Evaluation Dataset
# ==============================
# Reads a CSV with a `Cleaned_Tweet` text column and a `Class` column whose
# values must be keys of `label_map`, wraps every row in an InputExample and
# builds an unshuffled evaluation loader.
import os

# The original local path remains the default; set the EVAL_CSV_PATH environment
# variable to run the notebook on another machine without editing this cell.
eval_csv_path = os.environ.get(
    "EVAL_CSV_PATH",
    r"C:\Users\Stdfurqan\Downloads\urdu_v1\verbalizer.csv",
)
df = pd.read_csv(eval_csv_path)

# Validate the schema up front so a bad file fails here with a readable message
# instead of a bare KeyError or a tokenizer error further down.
missing_columns = {"Cleaned_Tweet", "Class"} - set(df.columns)
if missing_columns:
    raise ValueError(f"{eval_csv_path} is missing column(s): {sorted(missing_columns)}")

unknown_classes = set(df["Class"].unique()) - set(label_map)
if unknown_classes:
    raise ValueError(
        f"{eval_csv_path} has Class values outside {list(label_map)}: {unknown_classes}"
    )

if df["Cleaned_Tweet"].isna().any():
    raise ValueError(f"{eval_csv_path} has rows with an empty Cleaned_Tweet")

# guid is the DataFrame index label of the row, as with the previous iterrows() loop.
eval_dataset = [
    InputExample(guid=guid, text_a=text, label=label_map[class_name])
    for guid, text, class_name in zip(df.index, df["Cleaned_Tweet"], df["Class"])
]

eval_loader = PromptDataLoader(
    dataset=eval_dataset,
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=8,
    shuffle=False
)

tokenizing: 490it [00:00, 2358.55it/s]


In [10]:
# ==============================
# Evaluate Model and Collect Predicted MASK Words for Correct Predictions
# ==============================
# For every evaluation batch:
#   1. classify with the prompt model and record predictions and gold labels;
#   2. for the correctly classified rows only, run the underlying PLM once and
#      record its top-1 vocabulary token at the mask position.
# The recorded word is the PLM's unconstrained best token over the whole
# vocabulary; it is not restricted to the verbalizer's label words.
prompt_model.eval()

correct_predicted_words = {
    "P": [],
    "N": []
}

all_preds = []
all_labels = []

# Constant for the whole run, so look it up once.
mask_token_id = tokenizer.mask_token_id

with torch.no_grad():
    for batch in eval_loader:
        logits = prompt_model(batch)
        preds = torch.argmax(logits, dim=-1)

        pred_ids = preds.cpu().tolist()
        label_ids = batch['label'].cpu().tolist()
        all_preds.extend(pred_ids)
        all_labels.extend(label_ids)

        # Positions inside this batch where prediction and gold label agree.
        correct_rows = [
            row for row, (pred_id, label_id) in enumerate(zip(pred_ids, label_ids))
            if pred_id == label_id
        ]
        if not correct_rows:
            continue

        # One PLM forward pass for all correct rows of the batch (previously one
        # pass per row). token_type_ids is forwarded only when the batch has it.
        plm_inputs = {
            key: batch[key][correct_rows]
            for key in ("input_ids", "attention_mask", "token_type_ids")
            if key in batch
        }
        vocab_logits = prompt_model.plm(**plm_inputs).logits
        mask_hits = plm_inputs["input_ids"] == mask_token_id

        for row, sample_idx in enumerate(correct_rows):
            # .item() raises unless the row contains exactly one mask token.
            mask_index = mask_hits[row].nonzero(as_tuple=True)[0].item()

            predicted_token_id = torch.argmax(vocab_logits[row, mask_index]).item()
            predicted_word = tokenizer.convert_ids_to_tokens(predicted_token_id)
            # Drop only the WordPiece continuation prefix. str.lstrip("##") would
            # strip EVERY leading '#' character and mangle tokens such as "#".
            if predicted_word.startswith("##"):
                predicted_word = predicted_word[2:]

            class_name = classes[label_ids[sample_idx]]
            correct_predicted_words[class_name].append(predicted_word)



In [11]:
# ==============================
# Print Classification Report and Predicted Words Summary
# ==============================
print("\n📊 Fine-tuned Classification Report:")
# Pass `labels` explicitly so that target_names[i] always describes label id i.
# Without it, the names are matched to whichever labels occur in the data, and
# the call raises when fewer than len(classes) distinct labels appear.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(range(len(classes))),
    target_names=classes,
))

print("\nPredicted MASK words for correct predictions by class:")

# Per class: number of correct predictions and the ten most frequent mask words.
for cls in classes:
    words = correct_predicted_words[cls]
    print(f"\nClass '{cls}':")
    print(f"Total correct predictions: {len(words)}")
    print("Most common predicted mask words:")
    for w, cnt in Counter(words).most_common(10):
        print(f"  {w} : {cnt}")


📊 Fine-tuned Classification Report:
              precision    recall  f1-score   support

           P       0.48      0.89      0.63       240
           N       0.45      0.09      0.15       250

    accuracy                           0.48       490
   macro avg       0.47      0.49      0.39       490
weighted avg       0.47      0.48      0.38       490


Predicted MASK words for correct predictions by class:

Class 'P':
Total correct predictions: 213
Most common predicted mask words:
  نہیں : 61
  بھی : 31
  کیا : 21
  جاری : 16
  یہ : 14
  کرتا : 13
  ہوتا : 12
  میں : 9
  موجود : 6
  مشہور : 4

Class 'N':
Total correct predictions: 22
Most common predicted mask words:
  [UNK] : 5
  بھی : 5
  نہیں : 5
  میں : 3
  معروف : 1
  جاری : 1
  زیر : 1
  شامل : 1
