In [None]:
!pip install "transformers==4.28.0"
!pip install "tokenizers>=0.19,<0.23"
!pip install git+https://github.com/thunlp/OpenPrompt.git


Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl.metadata (109 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/110.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3.tar.gz (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.9/314.9 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: tokenizers
  [1;31merr

In [None]:
!sed -i 's/transformers.generation_utils/transformers.generation/' /usr/local/lib/python3.12/dist-packages/openprompt/pipeline_base.py

In [None]:
# Fix all AdamW imports in openprompt
!grep -rl "from transformers import  AdamW" /usr/local/lib/python3.12/dist-packages/openprompt | xargs sed -i 's/from transformers import  AdamW, get_linear_schedule_with_warmup/from torch.optim import AdamW\nfrom transformers import get_linear_schedule_with_warmup/'


In [1]:
# Set device
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
# ==============================
# Imports
# ==============================
import pandas as pd
import torch
import random
import numpy as np
from collections import Counter
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt.prompts import SoftVerbalizer
from openprompt.prompts import AutomaticVerbalizer
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt import PromptForClassification, PromptDataLoader
from torch.optim import AdamW
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ==============================
# Set random seeds for reproducibility
# ==============================
seed = 20
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
# ==============================
# Step 1: Training data (3-shot / few-shot)
# ==============================
train_dataset = [
    InputExample(guid=0, text_a="د يک‌شنبې له نښتې وروسته د سپين بولدک-چمن لارې تړل کېدل", label=0),
    InputExample(guid=1, text_a="سرچينې ليکلي: «هېڅوک بايد د خپلو بنسټيزو حقونو او د نورو د حقونو په دفاع کې د خبرو کولو له امله ونه نيول شي.»", label=0),
    InputExample(guid=2, text_a="هلمند کې د يوه جمنازيوم جوړولو چارې پيل شوې", label=0),
    InputExample(guid=3, text_a="د ننګرهار د کرنې رياست د کرنې امر نعمت‌الله اکبري پژواک اژانس ته وويل، په دغه ولايت کې د تېر کال په پرتله د ګنيو په حاصلاتو کې شاوخوا ۱۷ زره ټنه ډېروالى راغلي دي.", label=0),

    InputExample(guid=4, text_a="ننګرهار کې د ترافيکي پېښې له امله څلورو کسانو ته مرګ‌ژوبله اوښتې ده", label=1),
    InputExample(guid=5, text_a="نوموړي وايي، دوي څلور ډوله خلکو ته د پاسپورت ترلاسه کولو کې لومړيتوب ورکوي، محصلين، لوبغاړي، سوداګر او عاجل ناروغان.", label=1),
    InputExample(guid=6, text_a="هغه ويلي وؤ امارت اسلامي افغانستان دبرتانيه سره دپرله پسې تماسونو نه وروسته دغه نيول شوي کسان خوشې کړي اؤ خپل هيواد ته يې سپارلي دي.",label=1),
    InputExample(guid=7, text_a="سرچينې زياته کړې، په دې توګه ملګرو ملتونو خپلو ټولو نارينه او ښځينه کارکوونکو ته لارښوونه وکړه چې تر دويم لارښوود پورې دندو ته ولاړ نه شي.", label=1),

    InputExample(guid=8, text_a="سرچينې زياته کړې، چې اوس‌مهال د ملګرو ملتونو لږ شمېر کارکوونکي د اړينو کارونو د ترسراوي په موخه دندې ته حاضرېږي.", label=2),
    InputExample(guid=9, text_a="اوچا: که بېړنۍ مرستې و نه رسېږي، ميليونونه افغانان به د قحطۍ له ګواښ سره مخ شي", label=2),
    InputExample(guid=10, text_a="د جاپان په وزيراعظم بريد،ملزم ونيولې شو", label=2),
    InputExample(guid=11, text_a="اقتصاد پوهان وايي څو پورې چې نړيوالې مالياتي ادارې د پاکستان د پور پروګرام نه وي بحال کړي او دوست هېوادونو ورته پېسې نه وي ورکړي اقتصادي مشکلات به يې سېوا کيږي.", label=2),
]

In [5]:
# ==============================
# Define Classes
# ==============================
classes = ["Positive", "Negative", "Neutral"]
label_map = {"Positive": 0, "Negative": 1, "Neutral":2}

# # Step 1: Use load_plm with 'roberta' to get the correct WrapperClass
_, _, _, WrapperClass = load_plm("roberta", "roberta-base")  # Just to get the wrapper

# # Step 2: Manually load XLM-RoBERTa model/tokenizer
model_name = "xlm-roberta-large"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
plm = XLMRobertaForMaskedLM.from_pretrained(model_name)

# ==============================
# Load Pretrained Language Model (mBERT)
# ==============================
# plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-multilingual-cased")

# ==============================
# Define Prompt Template (Manual)
# ==============================
template = ManualTemplate(
    # text = '{"placeholder":"text_a"} هذه الجملة {"mask"}.',
    text = '{"placeholder":"text_a"} دا جمله {"mask"} ده.', # pashto
    tokenizer=tokenizer,
)

# ==============================
# Define Verbalizer (Manual)
# ==============================
verbalizer = ManualVerbalizer(
    classes=classes,
    label_words = {
       "Positive": ["ښه", "مثبت"],     # good, success
       "Negative": ["بد", "منفي"],      # bad, protest
       "Neutral":  ["بې‌طرف", "عادي"]       # statement, law
},
    tokenizer=tokenizer,
)



In [6]:
# # Apply PromptDA → augment training data
# augmented_train_dataset = []
# for example in train_dataset:
#     class_name = classes[example.label]
#     class_index = classes.index(class_name)
#     label_words = verbalizer.label_words[class_index]
#     for lw in label_words:
#         augmented_train_dataset.append(InputExample(
#             guid=f"{example.guid}_{lw}",
#             text_a=example.text_a,
#             label=example.label
#         ))


# ==============================
# Create Prompt Model
# ==============================
prompt_model = PromptForClassification(
    template=template,
    plm=plm,
    verbalizer=verbalizer
)


# # DataLoader for training (use augmented data) for when using promptda
# train_loader = PromptDataLoader(
#     dataset=augmented_train_dataset,
#     tokenizer=tokenizer,
#     template=template,
#     tokenizer_wrapper_class=WrapperClass,
#     max_seq_length=128,
#     batch_size=4,
#     shuffle=True,
# )



# # ==============================
# # DataLoader for Training
# # ==============================
train_loader = PromptDataLoader(
    dataset=train_dataset,
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=4,
    shuffle=True
)

tokenizing: 12it [00:00, ?it/s]


In [7]:
# ==============================
# Fine-Tuning the Prompt Model
# ==============================
prompt_model.train()
optimizer = AdamW(prompt_model.parameters(), lr=1e-5)

for epoch in range(5):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        logits = prompt_model(batch)
        loss = torch.nn.CrossEntropyLoss()(logits, batch['label'])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1} Loss: {total_loss:.4f}")


Epoch 1 Loss: 5.9784
Epoch 2 Loss: 2.7517
Epoch 3 Loss: 2.3703
Epoch 4 Loss: 1.5258
Epoch 5 Loss: 1.1293


In [8]:
# ==============================
# Load Evaluation Dataset
# ==============================
df = pd.read_csv(r"C:\Users\stdFurqan\Desktop\pastho_rob\dataset_train_50 - Sheet1.csv")
eval_dataset = [
    InputExample(guid=i, text_a=row['Text'], label=label_map[row['label']])
    for i, row in df.iterrows()
]

eval_loader = PromptDataLoader(
    dataset=eval_dataset,
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=8,
    shuffle=False
)

tokenizing: 0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1084 > 512). Running this sequence through the model will result in indexing errors
tokenizing: 6448it [00:01, 4394.72it/s]


In [9]:
# ==============================
# Evaluate Model and Collect Predicted MASK Words for Correct Predictions
# ==============================
prompt_model.eval()

correct_predicted_words = {
    "Positive": [],
    "Negative": [],
    "Neutral": []}

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in eval_loader:
        logits = prompt_model(batch)
        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch['label'].cpu().tolist())

        input_ids = batch['input_ids']
        mask_token_id = tokenizer.mask_token_id

        for i in range(len(preds)):
            true_label = batch['label'][i].item()
            pred_label = preds[i].item()

            if true_label == pred_label:
                # Find MASK token index
                mask_index = (input_ids[i] == mask_token_id).nonzero(as_tuple=True)[0].item()

                # Prepare input for PLM forward to get token logits
                inputs = {
                    "input_ids": input_ids[i].unsqueeze(0),
                    "attention_mask": batch['attention_mask'][i].unsqueeze(0)
                }
                if 'token_type_ids' in batch:
                    inputs["token_type_ids"] = batch['token_type_ids'][i].unsqueeze(0)

                # outputs = plm(**inputs)
                # token_logits = outputs.logits[0, mask_index]
                outputs = prompt_model.plm(**inputs)
                token_logits = outputs.logits[0, mask_index]   # logits for vocab at MASK position

                predicted_token_id = torch.argmax(token_logits).item()
                predicted_word = tokenizer.convert_ids_to_tokens(predicted_token_id).lstrip("##")

                class_name = classes[true_label]
                correct_predicted_words[class_name].append(predicted_word)



In [10]:
# ==============================
# Print Classification Report and Predicted Words Summary
# ==============================
print("\n📊 Fine-tuned Classification Report:")
print(classification_report(all_labels, all_preds, target_names=classes))

print("\nPredicted MASK words for correct predictions by class:")

for cls in classes:
    words = correct_predicted_words[cls]
    word_counts = Counter(words)
    print(f"\nClass '{cls}':")
    print(f"Total correct predictions: {len(words)}")
    print("Most common predicted mask words:")
    for w, cnt in word_counts.most_common(10):
        print(f"  {w} : {cnt}")


📊 Fine-tuned Classification Report:
              precision    recall  f1-score   support

    Positive       0.41      0.45      0.43      2618
    Negative       0.34      0.18      0.24      2035
     Neutral       0.26      0.35      0.30      1795

    accuracy                           0.34      6448
   macro avg       0.34      0.33      0.32      6448
weighted avg       0.35      0.34      0.33      6448


Predicted MASK words for correct predictions by class:

Class 'Positive':
Total correct predictions: 1188
Most common predicted mask words:
  ▁يې : 214
  ▁زور : 78
  ▁نه : 66
  ▁مشهور : 54
  ▁خبره : 46
  ▁تاريخي : 38
  ه : 37
  ▁واقعي : 35
  ▁خبر : 34
  ▁کړې : 33

Class 'Negative':
Total correct predictions: 365
Most common predicted mask words:
  ▁دا : 49
  ▁زور : 29
  ▁يې : 23
  ▁مي : 22
  ▁او : 16
  ▁واقعي : 14
  ▁داعش : 13
  ▁عجيب : 10
  ▁فاجعه : 10
  ▁مشهور : 10

Class 'Neutral':
Total correct predictions: 634
Most common predicted mask words:
  ▁نه : 90
  ▁زور : 88
  ▁