In [None]:
import os
import shutil

# Resolve the current user's home directory portably. The original read
# os.environ["USERPROFILE"], which only exists on Windows and raises KeyError elsewhere.
user_profile = os.path.expanduser("~")

# Hugging Face hub cache folders of the two checkpoints to purge. The path is
# joined component by component so the separator is right on every OS.
hub_cache_dir = os.path.join(user_profile, ".cache", "huggingface", "hub")
cached_models = [
    os.path.join(hub_cache_dir, "models--bert-base-multilingual-cased"),
    os.path.join(hub_cache_dir, "models--xlm-roberta-base"),
]

# Delete each cached checkpoint directory if present and report what happened
for path in cached_models:
    if os.path.exists(path):
        shutil.rmtree(path)
        print(f"Removed cache: {path}")
    else:
        print(f"No cache found at: {path}")


In [1]:
# Choose the compute device: "cuda" when a CUDA GPU is usable, otherwise "cpu"
import torch

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
# ==============================
# Imports
# ==============================
import torch
import pandas as pd
import random
import numpy as np
from collections import Counter
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt import PromptForClassification, PromptDataLoader
from torch.optim import AdamW
from sklearn.metrics import classification_report
from collections import defaultdict
from torch.utils.data import DataLoader, Sampler

# ========================================
# Check CUDA
# ========================================
# Fall back to the CPU when no CUDA device is usable; the fallback used to be
# commented out, leaving `device` hard-coded to "cuda" even on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))
    print("CUDA version:", torch.version.cuda)
    print("GPU count:", torch.cuda.device_count())

# ========================================
# Seeds for reproducibility
# ========================================
# Seed every random number generator the notebook touches (Python, NumPy, torch CPU and CUDA)
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic kernels and disable its auto-tuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


CUDA available: True
GPU name: NVIDIA GeForce RTX 4080 SUPER
CUDA version: 12.1
GPU count: 1


In [3]:
### Classes ###
classes = ['unrelated', 'distant', 'similar', 'equivalent', 'identical']

### Label Map ###
# Derived from `classes` so the class order and the integer ids cannot drift apart.
# Yields {'unrelated': 0, 'distant': 1, 'similar': 2, 'equivalent': 3, 'identical': 4}.
label_map = {class_name: class_id for class_id, class_name in enumerate(classes)}


# Step 1: take the masked-LM tokenizer wrapper straight from OpenPrompt.
# The original called load_plm("roberta", "roberta-base") and discarded everything
# except the wrapper class, loading a full unrelated checkpoint for nothing.
# NOTE(review): MLMTokenizerWrapper is presumably the wrapper load_plm returns for "roberta" — confirm.
from openprompt.plms import MLMTokenizerWrapper
WrapperClass = MLMTokenizerWrapper

# Step 2: manually load the XLM-RoBERTa tokenizer and masked-LM model
model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
plm = XLMRobertaForMaskedLM.from_pretrained(model_name)

# ==============================
# Load Pretrained Language Model (mBERT)
# ==============================
# Alternative backbone: uncomment to evaluate mBERT instead of XLM-RoBERTa.
# plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-multilingual-cased")



In [4]:
# ==============================
# Define Prompt Template (Manual)
# ==============================
# Base manual prompt: both sentence placeholders plus a single {"mask"} slot
template = ManualTemplate(
    tokenizer=tokenizer,
    text='{"placeholder":"text_a"} ÿßŸàÿ± {"placeholder":"text_b"} ÿ¢Ÿæÿ≥ ŸÖ€å⁄∫ {"mask"} €Å€å⁄∫€î',
)


# Prompt wording for each named template (P1..P10). Every wording contains the two
# sentence placeholders and exactly one {"mask"} slot.
_prompt_texts = [
    ("P1", '{"placeholder":"text_a"} ÿßŸàÿ± {"placeholder":"text_b"} ÿ¢Ÿæÿ≥ ŸÖ€å⁄∫ {"mask"} €Å€å⁄∫€î'),
    ("P2", '{"placeholder":"text_a"} ÿßŸàÿ± {"placeholder":"text_b"} ⁄©€í ŸÖÿπŸÜ€å {"mask"} €Å€å⁄∫€î'),
    ("P3", '{"placeholder":"text_a"} ⁄©ÿß {"placeholder":"text_b"} ÿ≥€í ÿ™ÿπŸÑŸÇ {"mask"} €Å€í€î'),
    ("P4", '{"placeholder":"text_a"} ÿßŸàÿ± {"placeholder":"text_b"} ŸÖÿπŸÜ€å ⁄©€í ŸÑÿ≠ÿßÿ∏ ÿ≥€í {"mask"} €Å€å⁄∫€î'),
    ("P5", '{"placeholder":"text_a"} ÿßŸàÿ± {"placeholder":"text_b"} ÿß€å⁄© ÿØŸàÿ≥ÿ±€í ÿ≥€í {"mask"} €Å€å⁄∫€î'),
    ("P6", '{"placeholder":"text_a"} ⁄©Ÿà {"placeholder":"text_b"} ÿ≥€í ŸÖŸÑÿß€åÿß ÿ¨ÿßÿ¶€í ÿ™Ÿà ŸÜÿ™€åÿ¨€Å {"mask"} €Å€í€î'),
    ("P7", '{"placeholder":"text_a"} ÿßŸàÿ± {"placeholder":"text_b"} ⁄©ÿß ŸÖŸÅ€ÅŸàŸÖ {"mask"} ÿ®ŸÜÿ™ÿß €Å€í€î'),
    ("P8", '{"placeholder":"text_a"} ⁄©€í ŸÖŸÇÿßÿ®ŸÑ€í ŸÖ€å⁄∫ {"placeholder":"text_b"} {"mask"} €Å€í€î'),
    ("P9", '{"placeholder":"text_a"} ÿßŸàÿ± {"placeholder":"text_b"} ŸÖ€å⁄∫ ŸÖÿπŸÜ€å ⁄©ÿß ÿ±ÿ¥ÿ™€Å {"mask"} €Å€í€î'),
    ("P10", '{"placeholder":"text_a"} ÿßŸàÿ± {"placeholder":"text_b"} ÿß€å⁄© ÿ¨€åÿ≥€í €ÅŸàŸÜ€í ⁄©€í ŸÑÿ≠ÿßÿ∏ ÿ≥€í {"mask"} €Å€å⁄∫€î'),
]

# (name, ManualTemplate) pairs in P1..P10 order, all bound to the shared tokenizer
templates = [
    (prompt_id, ManualTemplate(text=prompt_text, tokenizer=tokenizer))
    for prompt_id, prompt_text in _prompt_texts
]









# Two candidate label words per class, keyed by the class names in `classes`
_label_words = {
    "unrelated": ["ÿ∫€åÿ±ŸÖÿ™ÿπŸÑŸÇ", "ÿßŸÑ⁄Ø"],
    "distant": ["⁄©ŸÖ", "ÿØŸàÿ±"],
    "similar": ["ŸÖŸÑÿ™€í", "ŸÖÿ¥ÿßÿ®€Å"],
    "equivalent": ["ÿ™ŸÇÿ±€åÿ®ÿßŸã", "ÿ®ÿ±ÿßÿ®ÿ±"],
    "identical": ["ÿ®ÿßŸÑ⁄©ŸÑ", "ÿß€å⁄©"],
}

# Manual verbalizer over the five classes, sharing the tokenizer used by the templates
verbalizer = ManualVerbalizer(
    tokenizer=tokenizer,
    classes=classes,
    label_words=_label_words,
)

In [5]:
# ==============================
# Create Prompt Model
# ==============================
# Combine the loaded masked-LM backbone, the base prompt template and the
# verbalizer into a single prompt-based classifier.
prompt_model = PromptForClassification(
    plm=plm,
    template=template,
    verbalizer=verbalizer,
)


In [6]:
# ==============================
# Load Evaluation Dataset
# ==============================
import os

# Labelled STS-B Urdu dev split. The historical absolute path remains the default;
# set the STSB_URDU_DEV_CSV environment variable to run on another machine.
DEV_CSV_PATH = os.environ.get(
    "STSB_URDU_DEV_CSV",
    r"C:\Users\stdFurqan\Desktop\paft\STS-B\Final_dev_labeled.csv",
)
df = pd.read_csv(DEV_CSV_PATH)

# Fail early with a readable message instead of a bare KeyError further down
required_columns = ["sentence1", "sentence2", "score_to_labels"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"{DEV_CSV_PATH} is missing required columns: {missing_columns}")

# Labels must all be keys of `label_map`; NaN labels are flagged as well
unmapped_rows = df[~df["score_to_labels"].isin(list(label_map))]
if not unmapped_rows.empty:
    raise ValueError(
        f"{len(unmapped_rows)} rows have labels outside {list(label_map)}: "
        f"{unmapped_rows['score_to_labels'].unique().tolist()}"
    )

# Make InputExamples: one per row, guid = DataFrame index, label = integer class id
eval_dataset = [
    InputExample(
        guid=row.Index,
        text_a=row.sentence1,
        text_b=row.sentence2,
        label=label_map[row.score_to_labels],
    )
    for row in df.itertuples()
]

# ==============================
# 0-Shot Evaluation with Each Template
# ==============================
# Run on the GPU when one is usable. The device is resolved here so this cell does
# not depend on the `device` string set earlier. The model and the batches were
# previously never moved off the CPU.
eval_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
prompt_model.to(eval_device)
prompt_model.eval()  # ensure model is in evaluation mode
batch_size = 8    # eval batch size

# Explicit label ids keep the report rows aligned with `classes` even if a class
# never occurs in the labels or predictions of a pass.
label_ids = list(range(len(classes)))

# Optional: store template order and results
all_pass_patterns = {}

# NOTE(review): prompt_model was built with the base `template`; the per-pass prompt
# text is applied by the PromptDataLoader below. Presumably a ManualTemplate does no
# per-batch processing inside the model's forward pass — confirm against OpenPrompt.
for pass_idx, (prompt_name, current_template) in enumerate(templates, start=1):
    print(f"\nüü¶ 0-Shot Evaluation - Template {prompt_name} ({pass_idx}/{len(templates)})")

    # Create PromptDataLoader with current template
    eval_loader = PromptDataLoader(
        dataset=eval_dataset,
        tokenizer=tokenizer,
        template=current_template,
        tokenizer_wrapper_class=WrapperClass,
        max_seq_length=128,
        batch_size=batch_size,
        shuffle=False
    )

    pass_preds = []
    pass_labels = []

    # Run evaluation without building autograd graphs
    with torch.no_grad():
        for batch in eval_loader:
            batch = batch.to(eval_device)  # inputs must live on the same device as the model
            logits = prompt_model(batch)
            preds = torch.argmax(logits, dim=-1)
            pass_preds.extend(preds.cpu().tolist())
            pass_labels.extend(batch['label'].cpu().tolist())

    # Print report immediately after this template.
    # zero_division=0 keeps the 0.0 scores for never-predicted classes but silences
    # the UndefinedMetricWarning that was emitted for them.
    print(f"\nüìä STS_B Urdu Dev Classification Report - Template {prompt_name}")
    print(classification_report(
        pass_labels,
        pass_preds,
        labels=label_ids,
        target_names=classes,
        digits=4,
        zero_division=0,
    ))

    # Store template name (optional)
    all_pass_patterns[f"pass_{pass_idx}"] = prompt_name

# Optional: print template order at the end
print("\n‚úÖ Templates used per pass:", all_pass_patterns)



üü¶ 0-Shot Evaluation - Template P1 (1/10)


tokenizing: 1500it [00:00, 2029.41it/s]



üìä STS_B Urdu Dev Classification Report - Template P1
              precision    recall  f1-score   support

   unrelated     0.1836    0.2413    0.2085       344
     distant     0.2081    0.1614    0.1818       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.1600    0.0110    0.0205       365
   identical     0.2017    0.6288    0.3054       264

    accuracy                         0.1960      1500
   macro avg     0.1507    0.2085    0.1433      1500
weighted avg     0.1518    0.1960    0.1374      1500


üü¶ 0-Shot Evaluation - Template P2 (2/10)


tokenizing: 1500it [00:00, 2079.86it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



üìä STS_B Urdu Dev Classification Report - Template P2
              precision    recall  f1-score   support

   unrelated     0.2226    0.8140    0.3496       344
     distant     0.2414    0.0827    0.1232       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.0000    0.0000    0.0000       365
   identical     0.1503    0.0871    0.1103       264

    accuracy                         0.2160      1500
   macro avg     0.1229    0.1968    0.1166      1500
weighted avg     0.1184    0.2160    0.1204      1500


üü¶ 0-Shot Evaluation - Template P3 (3/10)


tokenizing: 1500it [00:00, 3158.25it/s]



üìä STS_B Urdu Dev Classification Report - Template P3
              precision    recall  f1-score   support

   unrelated     0.1368    0.0378    0.0592       344
     distant     0.1688    0.8661    0.2826       254
     similar     0.1569    0.0293    0.0494       273
  equivalent     0.3846    0.0137    0.0265       365
   identical     0.1316    0.0189    0.0331       264

    accuracy                         0.1673      1500
   macro avg     0.1957    0.1932    0.0902      1500
weighted avg     0.2053    0.1673    0.0827      1500


üü¶ 0-Shot Evaluation - Template P4 (4/10)


tokenizing: 1500it [00:00, 2737.80it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



üìä STS_B Urdu Dev Classification Report - Template P4
              precision    recall  f1-score   support

   unrelated     0.2273    0.6773    0.3404       344
     distant     0.1916    0.2520    0.2177       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.2500    0.0110    0.0210       365
   identical     0.1680    0.0795    0.1080       264

    accuracy                         0.2147      1500
   macro avg     0.1674    0.2040    0.1374      1500
weighted avg     0.1750    0.2147    0.1390      1500


üü¶ 0-Shot Evaluation - Template P5 (5/10)


tokenizing: 1500it [00:00, 1922.15it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



üìä STS_B Urdu Dev Classification Report - Template P5
              precision    recall  f1-score   support

   unrelated     0.2088    0.3459    0.2604       344
     distant     0.1719    0.6260    0.2697       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.0000    0.0000    0.0000       365
   identical     0.0000    0.0000    0.0000       264

    accuracy                         0.1853      1500
   macro avg     0.0761    0.1944    0.1060      1500
weighted avg     0.0770    0.1853    0.1054      1500


üü¶ 0-Shot Evaluation - Template P6 (6/10)


tokenizing: 1500it [00:00, 1875.17it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



üìä STS_B Urdu Dev Classification Report - Template P6
              precision    recall  f1-score   support

   unrelated     0.2064    0.5436    0.2992       344
     distant     0.0909    0.0079    0.0145       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.2661    0.0795    0.1224       365
   identical     0.1641    0.2879    0.2091       264

    accuracy                         0.1960      1500
   macro avg     0.1455    0.1838    0.1290      1500
weighted avg     0.1564    0.1960    0.1376      1500


üü¶ 0-Shot Evaluation - Template P7 (7/10)


tokenizing: 1500it [00:00, 1961.13it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



üìä STS_B Urdu Dev Classification Report - Template P7
              precision    recall  f1-score   support

   unrelated     0.6000    0.0087    0.0172       344
     distant     0.2083    0.0787    0.1143       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.3571    0.0274    0.0509       365
   identical     0.1831    0.9508    0.3070       264

    accuracy                         0.1893      1500
   macro avg     0.2697    0.2131    0.0979      1500
weighted avg     0.2920    0.1893    0.0897      1500


üü¶ 0-Shot Evaluation - Template P8 (8/10)


tokenizing: 1500it [00:00, 2393.23it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



üìä STS_B Urdu Dev Classification Report - Template P8
              precision    recall  f1-score   support

   unrelated     0.1401    0.0640    0.0878       344
     distant     0.2036    0.1339    0.1615       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.3758    0.3315    0.3523       365
   identical     0.1475    0.4773    0.2254       264

    accuracy                         0.2020      1500
   macro avg     0.1734    0.2013    0.1654      1500
weighted avg     0.1840    0.2020    0.1729      1500


üü¶ 0-Shot Evaluation - Template P9 (9/10)


tokenizing: 1500it [00:00, 2084.20it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



üìä STS_B Urdu Dev Classification Report - Template P9
              precision    recall  f1-score   support

   unrelated     0.1333    0.0058    0.0111       344
     distant     0.1809    0.5079    0.2668       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.4286    0.0082    0.0161       365
   identical     0.2157    0.6250    0.3207       264

    accuracy                         0.1993      1500
   macro avg     0.1917    0.2294    0.1230      1500
weighted avg     0.2035    0.1993    0.1081      1500


üü¶ 0-Shot Evaluation - Template P10 (10/10)


tokenizing: 1500it [00:00, 1992.18it/s]



üìä STS_B Urdu Dev Classification Report - Template P10
              precision    recall  f1-score   support

   unrelated     0.2264    0.8924    0.3612       344
     distant     0.2000    0.0512    0.0815       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.2222    0.0110    0.0209       365
   identical     0.1311    0.0303    0.0492       264

    accuracy                         0.2213      1500
   macro avg     0.1560    0.1970    0.1026      1500
weighted avg     0.1629    0.2213    0.1104      1500


‚úÖ Templates used per pass: {'pass_1': 'P1', 'pass_2': 'P2', 'pass_3': 'P3', 'pass_4': 'P4', 'pass_5': 'P5', 'pass_6': 'P6', 'pass_7': 'P7', 'pass_8': 'P8', 'pass_9': 'P9', 'pass_10': 'P10'}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
XLM-RoBERTa (xlm-roberta-base) results
üü¶ 0-Shot Evaluation - Template P1 (1/10)
tokenizing: 1500it [00:00, 2029.41it/s]

üìä STS_B Urdu Dev Classification Report - Template P1
              precision    recall  f1-score   support

   unrelated     0.1836    0.2413    0.2085       344
     distant     0.2081    0.1614    0.1818       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.1600    0.0110    0.0205       365
   identical     0.2017    0.6288    0.3054       264

    accuracy                         0.1960      1500
   macro avg     0.1507    0.2085    0.1433      1500
weighted avg     0.1518    0.1960    0.1374      1500


üü¶ 0-Shot Evaluation - Template P2 (2/10)
tokenizing: 1500it [00:00, 2079.86it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P2
              precision    recall  f1-score   support

   unrelated     0.2226    0.8140    0.3496       344
     distant     0.2414    0.0827    0.1232       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.0000    0.0000    0.0000       365
   identical     0.1503    0.0871    0.1103       264

    accuracy                         0.2160      1500
   macro avg     0.1229    0.1968    0.1166      1500
weighted avg     0.1184    0.2160    0.1204      1500


üü¶ 0-Shot Evaluation - Template P3 (3/10)
tokenizing: 1500it [00:00, 3158.25it/s]

üìä STS_B Urdu Dev Classification Report - Template P3
              precision    recall  f1-score   support

   unrelated     0.1368    0.0378    0.0592       344
     distant     0.1688    0.8661    0.2826       254
     similar     0.1569    0.0293    0.0494       273
  equivalent     0.3846    0.0137    0.0265       365
   identical     0.1316    0.0189    0.0331       264

    accuracy                         0.1673      1500
   macro avg     0.1957    0.1932    0.0902      1500
weighted avg     0.2053    0.1673    0.0827      1500


üü¶ 0-Shot Evaluation - Template P4 (4/10)
tokenizing: 1500it [00:00, 2737.80it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P4
              precision    recall  f1-score   support

   unrelated     0.2273    0.6773    0.3404       344
     distant     0.1916    0.2520    0.2177       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.2500    0.0110    0.0210       365
   identical     0.1680    0.0795    0.1080       264

    accuracy                         0.2147      1500
   macro avg     0.1674    0.2040    0.1374      1500
weighted avg     0.1750    0.2147    0.1390      1500


üü¶ 0-Shot Evaluation - Template P5 (5/10)
tokenizing: 1500it [00:00, 1922.15it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P5
              precision    recall  f1-score   support

   unrelated     0.2088    0.3459    0.2604       344
     distant     0.1719    0.6260    0.2697       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.0000    0.0000    0.0000       365
   identical     0.0000    0.0000    0.0000       264

    accuracy                         0.1853      1500
   macro avg     0.0761    0.1944    0.1060      1500
weighted avg     0.0770    0.1853    0.1054      1500


üü¶ 0-Shot Evaluation - Template P6 (6/10)
tokenizing: 1500it [00:00, 1875.17it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P6
              precision    recall  f1-score   support

   unrelated     0.2064    0.5436    0.2992       344
     distant     0.0909    0.0079    0.0145       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.2661    0.0795    0.1224       365
   identical     0.1641    0.2879    0.2091       264

    accuracy                         0.1960      1500
   macro avg     0.1455    0.1838    0.1290      1500
weighted avg     0.1564    0.1960    0.1376      1500


üü¶ 0-Shot Evaluation - Template P7 (7/10)
tokenizing: 1500it [00:00, 1961.13it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P7
              precision    recall  f1-score   support

   unrelated     0.6000    0.0087    0.0172       344
     distant     0.2083    0.0787    0.1143       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.3571    0.0274    0.0509       365
   identical     0.1831    0.9508    0.3070       264

    accuracy                         0.1893      1500
   macro avg     0.2697    0.2131    0.0979      1500
weighted avg     0.2920    0.1893    0.0897      1500


üü¶ 0-Shot Evaluation - Template P8 (8/10)
tokenizing: 1500it [00:00, 2393.23it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P8
              precision    recall  f1-score   support

   unrelated     0.1401    0.0640    0.0878       344
     distant     0.2036    0.1339    0.1615       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.3758    0.3315    0.3523       365
   identical     0.1475    0.4773    0.2254       264

    accuracy                         0.2020      1500
   macro avg     0.1734    0.2013    0.1654      1500
weighted avg     0.1840    0.2020    0.1729      1500


üü¶ 0-Shot Evaluation - Template P9 (9/10)
tokenizing: 1500it [00:00, 2084.20it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P9
              precision    recall  f1-score   support

   unrelated     0.1333    0.0058    0.0111       344
     distant     0.1809    0.5079    0.2668       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.4286    0.0082    0.0161       365
   identical     0.2157    0.6250    0.3207       264

    accuracy                         0.1993      1500
   macro avg     0.1917    0.2294    0.1230      1500
weighted avg     0.2035    0.1993    0.1081      1500


üü¶ 0-Shot Evaluation - Template P10 (10/10)
tokenizing: 1500it [00:00, 1992.18it/s]

üìä STS_B Urdu Dev Classification Report - Template P10
              precision    recall  f1-score   support

   unrelated     0.2264    0.8924    0.3612       344
     distant     0.2000    0.0512    0.0815       254
     similar     0.0000    0.0000    0.0000       273
  equivalent     0.2222    0.0110    0.0209       365
   identical     0.1311    0.0303    0.0492       264

    accuracy                         0.2213      1500
   macro avg     0.1560    0.1970    0.1026      1500
weighted avg     0.1629    0.2213    0.1104      1500

In [None]:
mBERT (bert-base-multilingual-cased) results
üü¶ 0-Shot Evaluation - Template P1 (1/10)
tokenizing: 1500it [00:01, 1190.24it/s]

üìä STS_B Urdu Dev Classification Report - Template P1
              precision    recall  f1-score   support

   unrelated     0.1250    0.0029    0.0057       344
     distant     0.2212    0.0984    0.1362       254
     similar     0.3019    0.0586    0.0982       273
  equivalent     0.2000    0.0027    0.0054       365
   identical     0.1794    0.8977    0.2991       264

    accuracy                         0.1867      1500
   macro avg     0.2055    0.2121    0.1089      1500
weighted avg     0.2013    0.1867    0.0962      1500


üü¶ 0-Shot Evaluation - Template P2 (2/10)
tokenizing: 1500it [00:01, 1430.39it/s]

üìä STS_B Urdu Dev Classification Report - Template P2
              precision    recall  f1-score   support

   unrelated     0.2277    0.0669    0.1034       344
     distant     0.2710    0.1142    0.1607       254
     similar     0.3333    0.0073    0.0143       273
  equivalent     0.0000    0.0000    0.0000       365
   identical     0.1844    0.8977    0.3060       264

    accuracy                         0.1940      1500
   macro avg     0.2033    0.2172    0.1169      1500
weighted avg     0.1912    0.1940    0.1074      1500


üü¶ 0-Shot Evaluation - Template P3 (3/10)
tokenizing: 1500it [00:00, 1866.93it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P3
              precision    recall  f1-score   support

   unrelated     0.2645    0.3576    0.3041       344
     distant     0.2750    0.0433    0.0748       254
     similar     0.2357    0.1209    0.1598       273
  equivalent     0.0000    0.0000    0.0000       365
   identical     0.1544    0.5000    0.2359       264

    accuracy                         0.1993      1500
   macro avg     0.1859    0.2043    0.1549      1500
weighted avg     0.1773    0.1993    0.1530      1500


üü¶ 0-Shot Evaluation - Template P4 (4/10)
tokenizing: 1500it [00:01, 1117.28it/s]

üìä STS_B Urdu Dev Classification Report - Template P4
              precision    recall  f1-score   support

   unrelated     0.0000    0.0000    0.0000       344
     distant     0.2047    0.2047    0.2047       254
     similar     0.2667    0.0147    0.0278       273
  equivalent     0.0909    0.0027    0.0053       365
   identical     0.1866    0.8523    0.3061       264

    accuracy                         0.1880      1500
   macro avg     0.1498    0.2149    0.1088      1500
weighted avg     0.1382    0.1880    0.0949      1500


üü¶ 0-Shot Evaluation - Template P5 (5/10)
tokenizing: 1500it [00:00, 1841.44it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P5
              precision    recall  f1-score   support

   unrelated     0.0000    0.0000    0.0000       344
     distant     0.1709    0.7087    0.2754       254
     similar     0.1803    0.0403    0.0659       273
  equivalent     0.0000    0.0000    0.0000       365
   identical     0.2416    0.3523    0.2866       264

    accuracy                         0.1893      1500
   macro avg     0.1186    0.2202    0.1256      1500
weighted avg     0.1043    0.1893    0.1091      1500


üü¶ 0-Shot Evaluation - Template P6 (6/10)
tokenizing: 1500it [00:00, 1724.04it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P6
              precision    recall  f1-score   support

   unrelated     0.0000    0.0000    0.0000       344
     distant     0.1814    0.1535    0.1663       254
     similar     0.1667    0.0623    0.0907       273
  equivalent     0.0000    0.0000    0.0000       365
   identical     0.1733    0.7765    0.2833       264

    accuracy                         0.1740      1500
   macro avg     0.1043    0.1985    0.1081      1500
weighted avg     0.0915    0.1740    0.0945      1500


üü¶ 0-Shot Evaluation - Template P7 (7/10)
tokenizing: 1500it [00:01, 1108.51it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P7
              precision    recall  f1-score   support

   unrelated     0.0000    0.0000    0.0000       344
     distant     0.1737    0.1142    0.1378       254
     similar     0.2424    0.0586    0.0944       273
  equivalent     0.0000    0.0000    0.0000       365
   identical     0.1755    0.8409    0.2904       264

    accuracy                         0.1780      1500
   macro avg     0.1183    0.2027    0.1045      1500
weighted avg     0.1044    0.1780    0.0916      1500


üü¶ 0-Shot Evaluation - Template P8 (8/10)
tokenizing: 1500it [00:00, 1883.37it/s]

üìä STS_B Urdu Dev Classification Report - Template P8
              precision    recall  f1-score   support

   unrelated     0.1081    0.0116    0.0210       344
     distant     0.1930    0.0433    0.0707       254
     similar     0.2903    0.0330    0.0592       273
  equivalent     0.3158    0.0164    0.0312       365
   identical     0.1748    0.8977    0.2926       264

    accuracy                         0.1780      1500
   macro avg     0.2164    0.2004    0.0950      1500
weighted avg     0.2179    0.1780    0.0867      1500


üü¶ 0-Shot Evaluation - Template P9 (9/10)
tokenizing: 1500it [00:00, 1836.65it/s]
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\stdFurqan\anaconda3\envs\py310\lib\site-packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

üìä STS_B Urdu Dev Classification Report - Template P9
              precision    recall  f1-score   support

   unrelated     0.0000    0.0000    0.0000       344
     distant     0.2095    0.0866    0.1226       254
     similar     0.1875    0.0110    0.0208       273
  equivalent     0.0000    0.0000    0.0000       365
   identical     0.1748    0.9129    0.2934       264

    accuracy                         0.1773      1500
   macro avg     0.1144    0.2021    0.0873      1500
weighted avg     0.1004    0.1773    0.0762      1500


üü¶ 0-Shot Evaluation - Template P10 (10/10)
tokenizing: 1500it [00:00, 1691.58it/s]

üìä STS_B Urdu Dev Classification Report - Template P10
              precision    recall  f1-score   support

   unrelated     0.0000    0.0000    0.0000       344
     distant     0.1619    0.1339    0.1466       254
     similar     0.2500    0.0037    0.0072       273
  equivalent     0.3333    0.0027    0.0054       365
   identical     0.1839    0.8939    0.3051       264

    accuracy                         0.1813      1500
   macro avg     0.1858    0.2068    0.0929      1500
weighted avg     0.1864    0.1813    0.0812      1500