In [1]:
!pip install openprompt



In [2]:
!pip install --upgrade transformers==4.28.0

Collecting transformers==4.28.0
  Using cached transformers-4.28.0-py3-none-any.whl.metadata (109 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Using cached tokenizers-0.13.3-cp310-cp310-win_amd64.whl.metadata (6.9 kB)
Using cached transformers-4.28.0-py3-none-any.whl (7.0 MB)
Using cached tokenizers-0.13.3-cp310-cp310-win_amd64.whl (3.5 MB)
Installing collected packages: tokenizers, transformers

  Attempting uninstall: tokenizers

    Found existing installation: tokenizers 0.22.1

    Uninstalling tokenizers-0.22.1:

      Successfully uninstalled tokenizers-0.22.1

  Attempting uninstall: transformers

    Found existing installation: transformers 4.57.0

   -------------------- ------------------- 1/2 [transformers]
    Uninstalling transformers-4.57.0:
   -------------------- ------------------- 1/2 [transformers]
   -------------------- ------------------- 1/2 [transformers]
      Successfully uninstalled transformers-4.57.0
   ------------------

In [3]:
# Set device
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# ==============================
# Imports
# ==============================
import pandas as pd
import torch
import random
import numpy as np
from collections import Counter
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt.prompts import SoftVerbalizer
from openprompt.prompts import AutomaticVerbalizer
from openprompt.prompts import SoftTemplate

from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt import PromptForClassification, PromptDataLoader
from transformers import AdamW
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# ==============================
# Set random seeds for reproducibility
# ==============================
seed = 20
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [7]:
# ==============================
# Step 1: Training data (3-shot / few-shot)
# ==============================
train_dataset = [
    InputExample(guid=0, text_a="د يک‌شنبې له نښتې وروسته د سپين بولدک-چمن لارې تړل کېدل", label=0),
    InputExample(guid=1, text_a="سرچينې ليکلي: «هېڅوک بايد د خپلو بنسټيزو حقونو او د نورو د حقونو په دفاع کې د خبرو کولو له امله ونه نيول شي.»", label=0),
    InputExample(guid=2, text_a="هلمند کې د يوه جمنازيوم جوړولو چارې پيل شوې", label=0),
    InputExample(guid=3, text_a="د ننګرهار د کرنې رياست د کرنې امر نعمت‌الله اکبري پژواک اژانس ته وويل، په دغه ولايت کې د تېر کال په پرتله د ګنيو په حاصلاتو کې شاوخوا ۱۷ زره ټنه ډېروالى راغلي دي.", label=0),

    InputExample(guid=4, text_a="ننګرهار کې د ترافيکي پېښې له امله څلورو کسانو ته مرګ‌ژوبله اوښتې ده", label=1),
    InputExample(guid=5, text_a="نوموړي وايي، دوي څلور ډوله خلکو ته د پاسپورت ترلاسه کولو کې لومړيتوب ورکوي، محصلين، لوبغاړي، سوداګر او عاجل ناروغان.", label=1),
    InputExample(guid=6, text_a="هغه ويلي وؤ امارت اسلامي افغانستان دبرتانيه سره دپرله پسې تماسونو نه وروسته دغه نيول شوي کسان خوشې کړي اؤ خپل هيواد ته يې سپارلي دي.",label=1),
    InputExample(guid=7, text_a="سرچينې زياته کړې، په دې توګه ملګرو ملتونو خپلو ټولو نارينه او ښځينه کارکوونکو ته لارښوونه وکړه چې تر دويم لارښوود پورې دندو ته ولاړ نه شي.", label=1),

    InputExample(guid=8, text_a="سرچينې زياته کړې، چې اوس‌مهال د ملګرو ملتونو لږ شمېر کارکوونکي د اړينو کارونو د ترسراوي په موخه دندې ته حاضرېږي.", label=2),
    InputExample(guid=9, text_a="اوچا: که بېړنۍ مرستې و نه رسېږي، ميليونونه افغانان به د قحطۍ له ګواښ سره مخ شي", label=2),
    InputExample(guid=10, text_a="د جاپان په وزيراعظم بريد،ملزم ونيولې شو", label=2),
    InputExample(guid=11, text_a="اقتصاد پوهان وايي څو پورې چې نړيوالې مالياتي ادارې د پاکستان د پور پروګرام نه وي بحال کړي او دوست هېوادونو ورته پېسې نه وي ورکړي اقتصادي مشکلات به يې سېوا کيږي.", label=2),
]

In [8]:
# ==============================
# Define Classes
# ==============================
classes = ["Positive", "Negative", "Neutral"]
label_map = {"Positive": 0, "Negative": 1, "Neutral":2}

# Step 1: Use load_plm with 'roberta' to get the correct WrapperClass
_, _, _, WrapperClass = load_plm("roberta", "roberta-base")  # Just to get the wrapper

# Step 2: Manually load XLM-RoBERTa model/tokenizer
model_name = "xlm-roberta-large"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
plm = XLMRobertaForMaskedLM.from_pretrained(model_name)

# ==============================
# Define Prompt Template (Manual)
# ==============================

template = ManualTemplate(
    text = '{"placeholder":"text_a"} دا جمله {"mask"} ده.', # pashto
    tokenizer=tokenizer,
)







In [9]:
# automatic verbalizer
verbalizer = AutomaticVerbalizer(
    tokenizer=tokenizer,
    classes=classes,
    num_candidates=1000,
    label_word_num_per_class=5,
)

In [10]:
# ==============================
# Create Prompt Model
# ==============================
prompt_model = PromptForClassification(
    template=template,
    plm=plm,
    verbalizer=verbalizer
)


# # DataLoader for training (use augmented data) for when using promptda
train_loader = PromptDataLoader(
    dataset=train_dataset,
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=4,
    shuffle=True,
)



tokenizing: 12it [00:00, 2000.22it/s]


In [11]:
# ==============================
# Fine-Tuning the Prompt Model
# ==============================
prompt_model.train()
optimizer = AdamW(prompt_model.parameters(), lr=1e-5)

for epoch in range(5):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        logits = prompt_model(batch)
        loss = torch.nn.CrossEntropyLoss()(logits, batch['label'])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1} Loss: {total_loss:.4f}")




Epoch 1 Loss: 5.3057
Epoch 2 Loss: 3.4287
Epoch 3 Loss: 3.8526
Epoch 4 Loss: 3.6253
Epoch 5 Loss: 3.7148


In [12]:
# ==============================
# Load Evaluation Dataset
# ==============================
df = pd.read_csv(r"C:\Users\stdFurqan\Desktop\pastho_rob\dataset_test_30 - Sheet1.csv")
eval_dataset = [
    InputExample(guid=i, text_a=row['Text'], label=label_map[row['label']])
    for i, row in df.iterrows()
]

eval_loader = PromptDataLoader(
    dataset=eval_dataset,
    tokenizer=tokenizer,
    template=template,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=8,
    shuffle=False
)

tokenizing: 269it [00:00, 2637.67it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (614 > 512). Running this sequence through the model will result in indexing errors
tokenizing: 3871it [00:01, 2632.06it/s]


In [13]:
# ==============================
# Evaluate Model and Collect Predicted MASK Words for Correct Predictions
# ==============================
prompt_model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in eval_loader:
        logits = prompt_model(batch)
        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch['label'].cpu().tolist())




In [14]:
# ==============================
# Print Classification Report and Predicted Words Summary
# ==============================
print("\n📊 Fine-tuned Classification Report:")
print(classification_report(all_labels, all_preds, target_names=classes, digits=4))





📊 Fine-tuned Classification Report:
              precision    recall  f1-score   support

    Positive     0.4103    0.3331    0.3677      1573
    Negative     0.2987    0.3104    0.3044      1221
     Neutral     0.2762    0.3398    0.3047      1077

    accuracy                         0.3278      3871
   macro avg     0.3284    0.3278    0.3256      3871
weighted avg     0.3378    0.3278    0.3302      3871

