In [None]:
import os
import shutil

# Resolve the current user's home directory portably. On Windows
# os.path.expanduser("~") resolves via USERPROFILE, so the resulting path is
# the same as before; on Linux/macOS it uses HOME instead of raising KeyError.
user_profile = os.path.expanduser("~")

# Hub cache folders of the two checkpoints this notebook switches between.
# The path is assembled component by component so the separator is correct on
# every platform (a literal backslash is not a separator on POSIX).
hub_cache_dir = os.path.join(user_profile, ".cache", "huggingface", "hub")
cached_models = [
    os.path.join(hub_cache_dir, "models--bert-base-multilingual-cased"),
    os.path.join(hub_cache_dir, "models--xlm-roberta-base"),
]

# Delete each cached checkpoint that is present and report what happened.
# NOTE: this is destructive - the next from_pretrained() call for these models
# will have to fetch the files again.
for path in cached_models:
    if os.path.exists(path):
        shutil.rmtree(path)
        print(f"Removed cache: {path}")
    else:
        print(f"No cache found at: {path}")


In [1]:
# Choose where tensors should live: the GPU when CUDA is usable, else the CPU.
import torch

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
# ==============================
# Imports
# ==============================
import torch
import pandas as pd
import random
import numpy as np
from collections import Counter
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt import PromptForClassification, PromptDataLoader
from torch.optim import AdamW
from sklearn.metrics import classification_report
from collections import defaultdict
from torch.utils.data import DataLoader, Sampler

# ========================================
# Check CUDA
# ========================================
# Select the GPU only when one is actually usable; otherwise fall back to the
# CPU so `device` never names hardware that is not there.
cuda_available = torch.cuda.is_available()
device = "cuda" if cuda_available else "cpu"
print("CUDA available:", cuda_available)
if cuda_available:
    # Diagnostics for the run log: which GPU, which CUDA build, how many GPUs.
    print("GPU name:", torch.cuda.get_device_name(0))
    print("CUDA version:", torch.version.cuda)
    print("GPU count:", torch.cuda.device_count())

# ========================================
# Seeds for reproducibility
# ========================================
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


CUDA available: True
GPU name: NVIDIA GeForce RTX 4080 SUPER
CUDA version: 12.1
GPU count: 1


In [3]:
# ==============================
# Define Classes
# ==============================
classes = ["unacc", "acc"]
# Derive the ids from the list order so the mapping can never drift away from
# `classes` (yields {"unacc": 0, "acc": 1}).
label_map = {class_name: class_id for class_id, class_name in enumerate(classes)}

# ==============================
# Load Pretrained Language Model (XLM-RoBERTa)
# ==============================
# Step 1: tokenizer wrapper for masked-LM style models. This is the wrapper
# OpenPrompt registers for "roberta"; importing it directly avoids downloading
# and instantiating the unrelated `roberta-base` checkpoint just to read it.
from openprompt.plms import MLMTokenizerWrapper

WrapperClass = MLMTokenizerWrapper

# Step 2: manually load the XLM-RoBERTa model/tokenizer.
model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
plm = XLMRobertaForMaskedLM.from_pretrained(model_name)

# Alternative backbone (mBERT): comment out the section above and use
# plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-multilingual-cased")




In [4]:
# ==============================
# Define Prompt Templates (Manual)
# ==============================
# Default template handed to the prompt model. An earlier variant is kept
# below for reference:
# text = '{"placeholder":"text_a"} Ÿáÿ∞Ÿá ÿßŸÑÿ¨ŸÖŸÑÿ© {"mask"}.',
template = ManualTemplate(
    tokenizer=tokenizer,
    text='{"placeholder":"text_a"} €å€Å ÿ¨ŸÖŸÑ€Å {"mask"} €Å€í€î',
)

# Prompt wordings compared in the zero-shot sweep, as (name, text) pairs.
# Each text embeds the input sentence via the `text_a` placeholder and has a
# single {"mask"} slot that the verbalizer reads.
prompt_texts = [
    ("P1", '{"placeholder":"text_a"} €å€Å ÿ¨ŸÖŸÑ€Å {"mask"} €Å€í€î'),
    ("P2", 'ÿØ€å ⁄Øÿ¶€å ÿπÿ®ÿßÿ±ÿ™: {"placeholder":"text_a"} ⁄©ÿß ŸÖŸàÿßÿØ ÿßŸàÿ± Ÿæ€åÿ∫ÿßŸÖ {"mask"} €Å€í€î'),
    ("P3", 'ÿßÿ≥ ÿπÿ®ÿßÿ±ÿ™ ⁄©ÿß ÿ™ÿ¨ÿ≤€å€Å ⁄©ÿ±€å⁄∫: {"placeholder":"text_a"} ÿßÿ≥ ⁄©ÿß ÿßÿ≠ÿ≥ÿßÿ≥ €åÿß ŸÖŸà⁄à {"mask"} €Å€í€î'),
    ("P4", 'ÿ¨ŸÖŸÑ€Å: {"placeholder":"text_a"} €ÅŸÖ€å⁄∫ ÿ®ÿ™ÿßÿ™ÿß €Å€í ⁄©€Å €å€Å ŸÖŸàÿßÿØ {"mask"} €Å€í€î'),
    ("P5", '{"placeholder":"text_a"} ÿßÿ≥ ÿπÿ®ÿßÿ±ÿ™ ⁄©ÿß ŸÖÿ∑ŸÑÿ® {"mask"} €Å€í€î'),
    ("P6", '{"placeholder":"text_a"} ÿßÿ≥ ŸÖÿπÿßŸÖŸÑ€í ŸÖ€å⁄∫ ÿ≠ÿ™ŸÖ€å ÿ±ÿßÿ¶€í {"mask"}'),
    ("P7", '{"placeholder":"text_a"} ÿßÿ≥ ŸÖŸàÿßÿØ ⁄©€å ÿ™ÿ¥ÿ±€åÿ≠ {"mask"}'),
    ("P8", '{"placeholder":"text_a"} ÿßÿ≥ ÿ≠ŸàÿßŸÑ€í ÿ≥€í ŸÅ€åÿµŸÑ€Å {"mask"}'),
    ("P9", '{"placeholder":"text_a"} ÿßÿ≥ ŸÖÿ™ŸÜ ⁄©€å ÿØÿ±ÿ¨€Å ÿ®ŸÜÿØ€å {"mask"}'),
    ("P10", '{"placeholder":"text_a"} ÿßÿ≥ ÿßÿ∏€Åÿßÿ± ⁄©ÿß ŸÜÿ™€åÿ¨€Å {"mask"}'),
]

# One ManualTemplate per wording, kept as (name, template) pairs in sweep order.
templates = [
    (prompt_name, ManualTemplate(text=prompt_text, tokenizer=tokenizer))
    for prompt_name, prompt_text in prompt_texts
]

# ==============================
# Define Verbalizer (Manual)
# ==============================
# Label words whose scores at the mask position stand for each class.
class_label_words = {
    "unacc": ["ŸÜÿß ÿØÿ±ÿ≥ÿ™", "ÿ∫ŸÑÿ∑"],
    "acc": ["ÿµÿ≠€åÿ≠", "ÿØÿ±ÿ≥ÿ™"],
}
verbalizer = ManualVerbalizer(
    tokenizer=tokenizer,
    classes=classes,
    label_words=class_label_words,
)

In [5]:
# ==============================
# Create Prompt Model
# ==============================
# Bundle the masked language model, the default template and the verbalizer
# into a single classifier object.
prompt_components = {
    "plm": plm,
    "template": template,
    "verbalizer": verbalizer,
}
prompt_model = PromptForClassification(**prompt_components)


In [6]:
# ==============================
# Load Evaluation Dataset
# ==============================
# NOTE(review): absolute, machine-specific path - consider moving it to a
# configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(
    r"C:\Users\stdFurqan\Desktop\paft\cola_dataset\final_ColA_Dev_Urdu_labeled - Sheet1.csv"
)

# Wrap every row as an OpenPrompt InputExample: the Urdu sentence is the text,
# and the string label is converted to its integer id through `label_map`.
eval_dataset = [
    InputExample(guid=i, text_a=row['Urdu Sentence'], label=label_map[row['label']])
    for i, row in df.iterrows()
]

# ==============================
# 0-Shot Evaluation with Each Template
# ==============================
# Run on the GPU when one is available. Model and batches must live on the
# same device, so both are moved here rather than relying on earlier cells.
eval_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
prompt_model.to(eval_device)
prompt_model.eval()  # ensure model is in evaluation mode
batch_size = 8    # eval batch size

# Integer ids in the same order as `classes`, so report rows always line up
# with `target_names` even when a class is never predicted.
class_ids = list(range(len(classes)))

# Optional: store template order and results
all_pass_patterns = {}

for pass_idx, (prompt_name, current_template) in enumerate(templates, start=1):
    print(f"\nüü¶ 0-Shot Evaluation - Template {prompt_name} ({pass_idx}/{len(templates)})")

    # Re-tokenize the whole dev set with the current prompt wording.
    eval_loader = PromptDataLoader(
        dataset=eval_dataset,
        tokenizer=tokenizer,
        template=current_template,
        tokenizer_wrapper_class=WrapperClass,
        max_seq_length=128,
        batch_size=batch_size,
        shuffle=False
    )

    pass_preds = []
    pass_labels = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in eval_loader:
            batch = batch.to(eval_device)
            logits = prompt_model(batch)
            preds = torch.argmax(logits, dim=-1)
            pass_preds.extend(preds.cpu().tolist())
            pass_labels.extend(batch['label'].cpu().tolist())

    # Print report immediately after this template. zero_division=0 keeps the
    # reported 0.0 for a class that is never predicted but silences the
    # UndefinedMetricWarning scikit-learn would otherwise emit.
    print(f"\nüìä CoLA Urdu Dev Classification Report - Template {prompt_name}")
    print(
        classification_report(
            pass_labels,
            pass_preds,
            labels=class_ids,
            target_names=classes,
            digits=4,
            zero_division=0,
        )
    )

    # Remember which template was used in which pass.
    all_pass_patterns[f"pass_{pass_idx}"] = prompt_name

# Optional: print template order at the end
print("\n‚úÖ Templates used per pass:", all_pass_patterns)



üü¶ 0-Shot Evaluation - Template P1 (1/10)


tokenizing: 1043it [00:00, 3007.73it/s]



üìä STS_B Urdu Dev Classification Report - Template P1
              precision    recall  f1-score   support

       unacc     0.2222    0.0062    0.0121       322
         acc     0.6905    0.9903    0.8137       721

    accuracy                         0.6865      1043
   macro avg     0.4564    0.4983    0.4129      1043
weighted avg     0.5459    0.6865    0.5662      1043


üü¶ 0-Shot Evaluation - Template P2 (2/10)


tokenizing: 1043it [00:00, 2548.50it/s]



üìä STS_B Urdu Dev Classification Report - Template P2
              precision    recall  f1-score   support

       unacc     0.3423    0.3168    0.3290       322
         acc     0.7047    0.7282    0.7162       721

    accuracy                         0.6012      1043
   macro avg     0.5235    0.5225    0.5226      1043
weighted avg     0.5928    0.6012    0.5967      1043


üü¶ 0-Shot Evaluation - Template P3 (3/10)


tokenizing: 1043it [00:00, 2469.83it/s]



üìä STS_B Urdu Dev Classification Report - Template P3
              precision    recall  f1-score   support

       unacc     0.3022    0.2112    0.2486       322
         acc     0.6895    0.7822    0.7329       721

    accuracy                         0.6059      1043
   macro avg     0.4959    0.4967    0.4908      1043
weighted avg     0.5699    0.6059    0.5834      1043


üü¶ 0-Shot Evaluation - Template P4 (4/10)


tokenizing: 1043it [00:00, 2398.71it/s]



üìä STS_B Urdu Dev Classification Report - Template P4
              precision    recall  f1-score   support

       unacc     0.1818    0.0186    0.0338       322
         acc     0.6871    0.9626    0.8018       721

    accuracy                         0.6711      1043
   macro avg     0.4345    0.4906    0.4178      1043
weighted avg     0.5311    0.6711    0.5647      1043


üü¶ 0-Shot Evaluation - Template P5 (5/10)


tokenizing: 1043it [00:00, 2854.27it/s]



üìä STS_B Urdu Dev Classification Report - Template P5
              precision    recall  f1-score   support

       unacc     0.2588    0.2981    0.2771       322
         acc     0.6637    0.6186    0.6403       721

    accuracy                         0.5197      1043
   macro avg     0.4612    0.4584    0.4587      1043
weighted avg     0.5387    0.5197    0.5282      1043


üü¶ 0-Shot Evaluation - Template P6 (6/10)


tokenizing: 1043it [00:00, 2998.06it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



üìä STS_B Urdu Dev Classification Report - Template P6
              precision    recall  f1-score   support

       unacc     0.3087    1.0000    0.4718       322
         acc     0.0000    0.0000    0.0000       721

    accuracy                         0.3087      1043
   macro avg     0.1544    0.5000    0.2359      1043
weighted avg     0.0953    0.3087    0.1457      1043


üü¶ 0-Shot Evaluation - Template P7 (7/10)


tokenizing: 1043it [00:00, 3422.37it/s]



üìä STS_B Urdu Dev Classification Report - Template P7
              precision    recall  f1-score   support

       unacc     0.2835    0.6957    0.4029       322
         acc     0.6126    0.2150    0.3183       721

    accuracy                         0.3634      1043
   macro avg     0.4481    0.4553    0.3606      1043
weighted avg     0.5110    0.3634    0.3444      1043


üü¶ 0-Shot Evaluation - Template P8 (8/10)


tokenizing: 1043it [00:00, 3317.56it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



üìä STS_B Urdu Dev Classification Report - Template P8
              precision    recall  f1-score   support

       unacc     0.3087    1.0000    0.4718       322
         acc     0.0000    0.0000    0.0000       721

    accuracy                         0.3087      1043
   macro avg     0.1544    0.5000    0.2359      1043
weighted avg     0.0953    0.3087    0.1457      1043


üü¶ 0-Shot Evaluation - Template P9 (9/10)


tokenizing: 1043it [00:00, 3186.07it/s]



üìä STS_B Urdu Dev Classification Report - Template P9
              precision    recall  f1-score   support

       unacc     0.3075    0.9224    0.4612       322
         acc     0.6753    0.0721    0.1303       721

    accuracy                         0.3346      1043
   macro avg     0.4914    0.4972    0.2958      1043
weighted avg     0.5618    0.3346    0.2325      1043


üü¶ 0-Shot Evaluation - Template P10 (10/10)


tokenizing: 1043it [00:00, 3300.70it/s]



üìä STS_B Urdu Dev Classification Report - Template P10
              precision    recall  f1-score   support

       unacc     0.2976    0.8012    0.4340       322
         acc     0.6364    0.1553    0.2497       721

    accuracy                         0.3547      1043
   macro avg     0.4670    0.4783    0.3418      1043
weighted avg     0.5318    0.3547    0.3066      1043


‚úÖ Templates used per pass: {'pass_1': 'P1', 'pass_2': 'P2', 'pass_3': 'P3', 'pass_4': 'P4', 'pass_5': 'P5', 'pass_6': 'P6', 'pass_7': 'P7', 'pass_8': 'P8', 'pass_9': 'P9', 'pass_10': 'P10'}


In [None]:
XLM-RoBERTa (xlm-roberta-base) - saved zero-shot results

üü¶ 0-Shot Evaluation - Template P1 (1/10)
tokenizing: 1043it [00:00, 3007.73it/s]

üìä STS_B Urdu Dev Classification Report - Template P1
              precision    recall  f1-score   support

       unacc     0.2222    0.0062    0.0121       322
         acc     0.6905    0.9903    0.8137       721

    accuracy                         0.6865      1043
   macro avg     0.4564    0.4983    0.4129      1043
weighted avg     0.5459    0.6865    0.5662      1043


üü¶ 0-Shot Evaluation - Template P2 (2/10)
tokenizing: 1043it [00:00, 2548.50it/s]

üìä STS_B Urdu Dev Classification Report - Template P2
              precision    recall  f1-score   support

       unacc     0.3423    0.3168    0.3290       322
         acc     0.7047    0.7282    0.7162       721

    accuracy                         0.6012      1043
   macro avg     0.5235    0.5225    0.5226      1043
weighted avg     0.5928    0.6012    0.5967      1043


üü¶ 0-Shot Evaluation - Template P3 (3/10)
tokenizing: 1043it [00:00, 2469.83it/s]

üìä STS_B Urdu Dev Classification Report - Template P3
              precision    recall  f1-score   support

       unacc     0.3022    0.2112    0.2486       322
         acc     0.6895    0.7822    0.7329       721

    accuracy                         0.6059      1043
   macro avg     0.4959    0.4967    0.4908      1043
weighted avg     0.5699    0.6059    0.5834      1043


üü¶ 0-Shot Evaluation - Template P4 (4/10)
tokenizing: 1043it [00:00, 2398.71it/s]

üìä STS_B Urdu Dev Classification Report - Template P4
              precision    recall  f1-score   support

       unacc     0.1818    0.0186    0.0338       322
         acc     0.6871    0.9626    0.8018       721

    accuracy                         0.6711      1043
   macro avg     0.4345    0.4906    0.4178      1043
weighted avg     0.5311    0.6711    0.5647      1043


üü¶ 0-Shot Evaluation - Template P5 (5/10)
tokenizing: 1043it [00:00, 2854.27it/s]

üìä STS_B Urdu Dev Classification Report - Template P5
              precision    recall  f1-score   support

       unacc     0.2588    0.2981    0.2771       322
         acc     0.6637    0.6186    0.6403       721

    accuracy                         0.5197      1043
   macro avg     0.4612    0.4584    0.4587      1043
weighted avg     0.5387    0.5197    0.5282      1043


üü¶ 0-Shot Evaluation - Template P6 (6/10)
tokenizing: 1043it [00:00, 2998.06it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P6
              precision    recall  f1-score   support

       unacc     0.3087    1.0000    0.4718       322
         acc     0.0000    0.0000    0.0000       721

    accuracy                         0.3087      1043
   macro avg     0.1544    0.5000    0.2359      1043
weighted avg     0.0953    0.3087    0.1457      1043


üü¶ 0-Shot Evaluation - Template P7 (7/10)
tokenizing: 1043it [00:00, 3422.37it/s]

üìä STS_B Urdu Dev Classification Report - Template P7
              precision    recall  f1-score   support

       unacc     0.2835    0.6957    0.4029       322
         acc     0.6126    0.2150    0.3183       721

    accuracy                         0.3634      1043
   macro avg     0.4481    0.4553    0.3606      1043
weighted avg     0.5110    0.3634    0.3444      1043


üü¶ 0-Shot Evaluation - Template P8 (8/10)
tokenizing: 1043it [00:00, 3317.56it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P8
              precision    recall  f1-score   support

       unacc     0.3087    1.0000    0.4718       322
         acc     0.0000    0.0000    0.0000       721

    accuracy                         0.3087      1043
   macro avg     0.1544    0.5000    0.2359      1043
weighted avg     0.0953    0.3087    0.1457      1043


üü¶ 0-Shot Evaluation - Template P9 (9/10)
tokenizing: 1043it [00:00, 3186.07it/s]

üìä STS_B Urdu Dev Classification Report - Template P9
              precision    recall  f1-score   support

       unacc     0.3075    0.9224    0.4612       322
         acc     0.6753    0.0721    0.1303       721

    accuracy                         0.3346      1043
   macro avg     0.4914    0.4972    0.2958      1043
weighted avg     0.5618    0.3346    0.2325      1043


üü¶ 0-Shot Evaluation - Template P10 (10/10)
tokenizing: 1043it [00:00, 3300.70it/s]

üìä STS_B Urdu Dev Classification Report - Template P10
              precision    recall  f1-score   support

       unacc     0.2976    0.8012    0.4340       322
         acc     0.6364    0.1553    0.2497       721

    accuracy                         0.3547      1043
   macro avg     0.4670    0.4783    0.3418      1043
weighted avg     0.5318    0.3547    0.3066      1043

In [None]:
mBERT (bert-base-multilingual-cased) - saved zero-shot results
üü¶ 0-Shot Evaluation - Template P1 (1/10)
tokenizing: 1043it [00:00, 3609.98it/s]

üìä STS_B Urdu Dev Classification Report - Template P1
              precision    recall  f1-score   support

       unacc     0.3119    0.6025    0.4110       322
         acc     0.6960    0.4064    0.5131       721

    accuracy                         0.4669      1043
   macro avg     0.5039    0.5044    0.4621      1043
weighted avg     0.5774    0.4669    0.4816      1043


üü¶ 0-Shot Evaluation - Template P2 (2/10)
tokenizing: 1043it [00:00, 1738.66it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P2
              precision    recall  f1-score   support

       unacc     0.0000    0.0000    0.0000       322
         acc     0.6913    1.0000    0.8175       721

    accuracy                         0.6913      1043
   macro avg     0.3456    0.5000    0.4087      1043
weighted avg     0.4779    0.6913    0.5651      1043


üü¶ 0-Shot Evaluation - Template P3 (3/10)
tokenizing: 1043it [00:00, 1631.28it/s]

üìä STS_B Urdu Dev Classification Report - Template P3
              precision    recall  f1-score   support

       unacc     0.3310    0.8634    0.4785       322
         acc     0.7833    0.2205    0.3442       721

    accuracy                         0.4190      1043
   macro avg     0.5571    0.5419    0.4113      1043
weighted avg     0.6436    0.4190    0.3856      1043


üü¶ 0-Shot Evaluation - Template P4 (4/10)
tokenizing: 1043it [00:00, 1837.70it/s]

üìä STS_B Urdu Dev Classification Report - Template P4
              precision    recall  f1-score   support

       unacc     0.3345    0.5807    0.4245       322
         acc     0.7211    0.4840    0.5793       721

    accuracy                         0.5139      1043
   macro avg     0.5278    0.5324    0.5019      1043
weighted avg     0.6017    0.5139    0.5315      1043


üü¶ 0-Shot Evaluation - Template P5 (5/10)
tokenizing: 1043it [00:00, 2850.99it/s]

üìä STS_B Urdu Dev Classification Report - Template P5
              precision    recall  f1-score   support

       unacc     0.3487    0.4224    0.3820       322
         acc     0.7152    0.6477    0.6798       721

    accuracy                         0.5781      1043
   macro avg     0.5319    0.5350    0.5309      1043
weighted avg     0.6020    0.5781    0.5878      1043


üü¶ 0-Shot Evaluation - Template P6 (6/10)
tokenizing: 1043it [00:00, 3509.68it/s]

üìä STS_B Urdu Dev Classification Report - Template P6
              precision    recall  f1-score   support

       unacc     0.3186    0.4472    0.3721       322
         acc     0.6988    0.5728    0.6296       721

    accuracy                         0.5340      1043
   macro avg     0.5087    0.5100    0.5008      1043
weighted avg     0.5814    0.5340    0.5501      1043


üü¶ 0-Shot Evaluation - Template P7 (7/10)
tokenizing: 1043it [00:00, 2102.33it/s]

üìä STS_B Urdu Dev Classification Report - Template P7
              precision    recall  f1-score   support

       unacc     0.1667    0.0093    0.0176       322
         acc     0.6888    0.9792    0.8087       721

    accuracy                         0.6798      1043
   macro avg     0.4277    0.4943    0.4132      1043
weighted avg     0.5276    0.6798    0.5645      1043


üü¶ 0-Shot Evaluation - Template P8 (8/10)
tokenizing: 1043it [00:00, 2303.86it/s]

üìä STS_B Urdu Dev Classification Report - Template P8
              precision    recall  f1-score   support

       unacc     0.3127    0.8789    0.4613       322
         acc     0.7174    0.1373    0.2305       721

    accuracy                         0.3663      1043
   macro avg     0.5150    0.5081    0.3459      1043
weighted avg     0.5925    0.3663    0.3018      1043


üü¶ 0-Shot Evaluation - Template P9 (9/10)
tokenizing: 1043it [00:00, 2174.24it/s]

üìä STS_B Urdu Dev Classification Report - Template P9
              precision    recall  f1-score   support

       unacc     0.2713    0.4255    0.3313       322
         acc     0.6561    0.4896    0.5608       721

    accuracy                         0.4698      1043
   macro avg     0.4637    0.4575    0.4460      1043
weighted avg     0.5373    0.4698    0.4899      1043


üü¶ 0-Shot Evaluation - Template P10 (10/10)
tokenizing: 1043it [00:00, 2084.16it/s]

üìä STS_B Urdu Dev Classification Report - Template P10
              precision    recall  f1-score   support

       unacc     0.3087    1.0000    0.4718       322
         acc     0.0000    0.0000    0.0000       721

    accuracy                         0.3087      1043
   macro avg     0.1544    0.5000    0.2359      1043
weighted avg     0.0953    0.3087    0.1457      1043