# STS vs LLM


## Imports


In [21]:
import os
import logging
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from textwrap import dedent
import json
import re
import time
from vllm import LLM, SamplingParams # type: ignore
import gc
import torch
from dotenv import load_dotenv

# Pull settings from a local .env file (if present) into os.environ.
load_dotenv()
# vLLM documents VLLM_LOGGING_LEVEL as the variable controlling its log level;
# the previously used VLLM_LOG_LEVEL is not a documented name, so it is kept
# only for backward compatibility with anything else that may read it.
# NOTE(review): vllm is imported above this line, so this likely only affects
# engine subprocesses started later — confirm, or move it above the import.
os.environ.setdefault("VLLM_LOGGING_LEVEL", "ERROR")
os.environ.setdefault("VLLM_LOG_LEVEL", "ERROR")
# Silence the in-process loggers as well.
logging.getLogger("vllm").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.WARNING)


## Data


In [6]:
# Benchmark workbook: sheet 'faq' holds the candidate FAQ entries (columns
# 'idx' and 'faq' are used below), sheet 'samples' holds user queries
# ('sample') with their ground-truth FAQ id ('gt_idx').
file_path = 'Ka-ChatBot_BenchMark.xlsx'
faq_ex = pd.read_excel(file_path, sheet_name='faq')
samples_ex = pd.read_excel(file_path, sheet_name='samples')
# Ground-truth FAQ ids as a plain list.
labels = samples_ex['gt_idx'].to_list()
# Set of every known FAQ id; parse_idx() uses it to validate model output.
VALID_FAQ_IDS = set(faq_ex["idx"].tolist())
# Same FAQ table keyed by 'idx' so rows can be selected by FAQ id.
faq_lookup = faq_ex.set_index("idx")

## Helper functions


In [16]:
# Single-pass character map for normalizer():
#   Arabic yeh / kaf           -> Persian yeh / kaf
#   zero-width non-joiner      -> ordinary space
#   right-to-left mark, hair space -> removed
_NORMALIZE_TABLE = str.maketrans({
	"ي": "ی",
	"ك": "ک",
	"\u200c": " ",
	"\u200f": None,
	"\u200a": None,
})


def normalizer(text) -> str:
	"""Return *text* as a whitespace-collapsed, Persian-normalised string.

	``None`` and float NaN (what pandas yields for an empty spreadsheet
	cell) both map to the empty string; any other value is passed through
	``str()`` first. Arabic yeh/kaf are replaced by their Persian forms,
	zero-width non-joiners become spaces, right-to-left marks and hair
	spaces are dropped, and every run of whitespace is collapsed to a
	single space with leading/trailing whitespace removed.
	"""
	if text is None:
		return ""
	# NaN is the only float that is not equal to itself; without this guard
	# an empty cell would be turned into the literal text "nan".
	if isinstance(text, float) and text != text:
		return ""
	return " ".join(str(text).translate(_NORMALIZE_TABLE).split())


# Worked examples inserted into the prompt as the {few_shot} field, keyed by
# prompt language ("en" / "fa"). Each example shows an allowed idx list, a
# FAQ list, a query and the expected one-key JSON answer. The ids and FAQ
# texts here are illustrative and are not read from the benchmark workbook.
FEW_SHOT_BLOCKS = {
	"en": dedent("""\
Example 1
Allowed idx list: [101, 305]
FAQ list:
idx=101: Resetting email or portal password.
idx=305: Tracking the status of an online order.
Query:
I forgot the password to my email portal.
Valid JSON:
{"idx": 101}

Example 2
Allowed idx list: [12, 77]
FAQ list:
idx=12: Updating the billing address on my account.
idx=77: Canceling an active subscription or plan.
Query:
Please cancel my current subscription immediately.
Valid JSON:
{"idx": 77}
"""),
	"fa": dedent("""\
مثال ۱
لیست idx مجاز: [101, 305]
لیست FAQ:
idx=101: بازیابی یا تغییر رمز عبور ایمیل یا پنل کاربری.
idx=305: پیگیری وضعیت سفارش اینترنتی.
پرسش:
رمز عبور پنل ایمیلم را فراموش کرده‌ام.
خروجی JSON معتبر:
{"idx": 101}

مثال ۲
لیست idx مجاز: [12, 77]
لیست FAQ:
idx=12: تغییر آدرس صورت‌حساب در حساب کاربری.
idx=77: لغو اشتراک یا طرح فعال.
پرسش:
لطفاً اشتراک فعلی من را لغو کنید.
خروجی JSON معتبر:
{"idx": 77}
"""),
}

# Prompt templates keyed by language, filled in by build_prompt() through
# str.format. Placeholders: {few_shot}, {allowed_ids}, {faq_block}, {query}.
# The doubled braces ({{ ... }}) are str.format escapes that render as literal
# braces in the JSON schema shown to the model.
PROMPT_RULES = {
	"en": dedent("""\
You are a strict classifier. Always output exactly one valid JSON object and nothing else.
Rules:
1) Select exactly one idx from the allowed list.
2) The response must match this schema exactly: {{"idx": <integer from the allowed list>}}.
3) Do NOT write explanations, code fences, or extra words.
4) If unsure, pick the idx that best matches the query; never return null.

Here are correct examples you must imitate:
{few_shot}

Now solve the real task.
Allowed idx list: [{allowed_ids}]
FAQ list:
{faq_block}

Query:
{query}

Return ONLY the JSON object.
"""),
	"fa": dedent("""\
شما یک دسته‌بند دقیق هستید. همیشه فقط یک شیء JSON معتبر تولید کنید و هیچ متن دیگری ننویسید.
قوانین:
1) دقیقاً یک idx از لیست مجاز انتخاب کنید.
2) پاسخ باید دقیقاً با این قالب باشد: {{"idx": <عدد صحیح از لیست مجاز>}}.
3) هیچ توضیحی، متن اضافه، کدفنس، یا علامت اضافی ننویسید.
4) اگر مطمئن نیستید، بهترین گزینه را انتخاب کنید؛ هرگز null برنگردانید.

این‌ها مثال‌های صحیح هستند و باید دقیقاً از آن‌ها تقلید کنید:
{few_shot}

اکنون مسئلهٔ واقعی را حل کنید.
لیست idx مجاز: [{allowed_ids}]
لیست FAQ:
{faq_block}

پرسش:
{query}

فقط و فقط شیء JSON را برگردانید.
"""),
}

# Text appended to the original prompt when the first answer could not be
# parsed into an allowed idx (see the retry loop in evalute_llm). These strings
# are concatenated, not passed through str.format, so single braces are fine.
RETRY_SUFFIX = {
	"en": "\n\nPrevious output was invalid. Reply again with ONLY: {\"idx\": <allowed integer>}.",
	"fa": "\n\nپاسخ قبلی نامعتبر بود. فقط همین را برگردانید: {\"idx\": <عدد صحیح از لیست مجاز>}.",
}

# Pattern handed to SamplingParams(guided_regex=...) in evalute_llm: a single
# JSON object {"idx": <digits>} with optional surrounding whitespace. It
# accepts any digit string, so membership in the allowed id list is still
# checked after parsing.
# NOTE(review): the pattern permits leading whitespace while generation stops
# on "\n\n"; confirm this cannot produce empty outputs.
GUIDED_REGEX = r'^\s*\{\s*"idx"\s*:\s*\d+\s*\}\s*$'

_DIGIT_TRANS = str.maketrans(
	"۰۱۲۳۴۵۶۷۸۹٠١٢٣٤٥٦٧٨٩",
	"01234567890123456789",
)


def to_ascii_digits(text: str) -> str:
	return str(text).translate(_DIGIT_TRANS)


def extract_json_like(text: str) -> str:
	"""Return the part of *text* most likely to be a JSON object.

	The text is stripped, a leading Markdown code fence (three backticks,
	optionally followed by ``json``) and a trailing fence are removed, and
	the span from the first ``{`` to the last ``}`` is returned. When no
	such span exists, the fence-stripped text is returned unchanged.
	"""
	t = text.strip()
	# In a raw string the whitespace class is written `\s`; the earlier `\\s`
	# matched a literal backslash followed by "s", so fences were never removed.
	t = re.sub(r"^```(?:json)?\s*", "", t, flags=re.IGNORECASE)
	t = re.sub(r"\s*```$", "", t)
	start = t.find("{")
	end = t.rfind("}")
	if start != -1 and end > start:
		return t[start : end + 1]
	return t


def build_prompt(language, faq_subset, sample, allowed_ids):
	"""Render the classification prompt for one query.

	Args:
		language: key into PROMPT_RULES / FEW_SHOT_BLOCKS.
		faq_subset: DataFrame whose index holds the FAQ ids and whose 'faq'
			column holds the FAQ text; one "idx=<id>: <text>" line is
			emitted per row.
		sample: the user query; passed through normalizer().
		allowed_ids: iterable of ids rendered as a comma-separated list.

	Returns:
		The fully formatted prompt string.

	Raises:
		ValueError: if *language* has no template.
	"""
	template = PROMPT_RULES.get(language)
	if template is None:
		raise ValueError(f"Unsupported language: {language}")

	faq_block = "\n".join(
		f"idx={int(faq_id)}: {normalizer(row['faq'])}"
		for faq_id, row in faq_subset.iterrows()
	)
	query_text = normalizer(sample)
	allowed_str = ", ".join(str(int(one_id)) for one_id in allowed_ids)

	return template.format(
		few_shot=FEW_SHOT_BLOCKS[language],
		allowed_ids=allowed_str,
		faq_block=faq_block,
		query=query_text,
	)


# Fallback for output that is not valid JSON: finds `idx`, an optional closing
# quote of the key, ':' or '=', then an integer that is either bare or fully
# quoted. This covers truncated objects such as `{"idx": 5`, loose forms such
# as `idx=5`, and digits JSON rejects such as `{"idx": 007}`.
_IDX_FALLBACK_RE = re.compile(r'idx"?\s*[:=]\s*("?)(\d+)\1')


def parse_idx(text):
	"""Extract the predicted FAQ id from raw model output.

	Persian/Arabic digits are converted to ASCII and the JSON-looking part
	of the text is isolated, then two strategies are tried in order:

	1. parse it as JSON and read an integer-convertible ``"idx"`` value;
	2. search for an ``idx: <digits>`` / ``idx=<digits>`` fragment.

	Returns:
		``(idx, is_valid)`` where ``is_valid`` tells whether ``idx`` is in
		VALID_FAQ_IDS, or ``(None, False)`` when no id could be extracted.
	"""
	t = extract_json_like(to_ascii_digits(text))

	try:
		obj = json.loads(t.strip())
	except (ValueError, RecursionError):
		# Not valid JSON (JSONDecodeError is a ValueError); use the fallback.
		obj = None

	if isinstance(obj, dict):
		value = obj.get("idx")
		# bool is an int subclass: int(True) == 1 would silently turn
		# {"idx": true} into a prediction for FAQ 1.
		if not isinstance(value, bool):
			try:
				idx = int(value)
			except (TypeError, ValueError, OverflowError):
				pass  # missing / non-numeric value: try the regex fallback
			else:
				return (idx, idx in VALID_FAQ_IDS)

	m = _IDX_FALLBACK_RE.search(t)
	if m:
		idx = int(m.group(2))
		return (idx, idx in VALID_FAQ_IDS)
	return (None, False)

## STS


### model: multilingual-e5-base


In [31]:
# Sentence-embedding model used for the semantic-similarity (STS) baseline.
model_name = "intfloat/multilingual-e5-base"
model = SentenceTransformer(model_name)

### embedding


In [None]:
# Prefix FAQ texts with "passage: " and user queries with "query: " before
# encoding (the prefix convention described on the E5 model card).
faq_inputs = [f"passage: {t}" for t in faq_ex['faq'].apply(normalizer).to_list()]
sample_inputs = [f"query: {t}" for t in samples_ex['sample'].apply(normalizer).to_list()]


faq_enc = model.encode(faq_inputs, batch_size=64, normalize_embeddings=True, show_progress_bar=True)
sample_enc = model.encode(sample_inputs, batch_size=64, normalize_embeddings=True, show_progress_bar=True)

# S[i, j] is the dot product of FAQ i and sample j; the embeddings are
# requested normalised, so this is their cosine similarity.
S = faq_enc @ sample_enc.T

# Best-matching FAQ row per sample and the score of that match.
best_j = np.argmax(S, axis=0)
top1_score = S[best_j, np.arange(S.shape[1])]
# best_j holds row POSITIONS, so map them to FAQ ids positionally rather than
# through the Series' index labels (which only coincide for a default index).
pred_idx_sts = faq_ex['idx'].to_numpy()[best_j].tolist()

samples_ex['sts_idx'] = pred_idx_sts
samples_ex['sts_score'] = top1_score

acc = (samples_ex['sts_idx'] == samples_ex['gt_idx']).mean()
err = 1 - acc
print(f"STS Accuracy: {acc:.4f}, Error Rate: {err:.4f}")

print("\nGround Truth vs STS result: (top 10 samples)")
display(samples_ex[['gt_idx','sts_idx']].head(10))

# Misclassified samples, least-confident first.
wrong = samples_ex[samples_ex["sts_idx"] != samples_ex["gt_idx"]].copy()
print(f"Wrong samples with lowest STS scores: (top 10 samples out of {len(wrong)})")
wrong_sorted = wrong.sort_values("sts_score", ascending=True).head(10)
display(wrong_sorted[["sample", "gt_idx", "sts_idx", "sts_score"]])

Batches: 100%|██████████| 1/1 [00:00<00:00, 37.50it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  8.43it/s]

STS Accuracy: 0.5914, Error Rate: 0.4086

Ground Truth vs STS result: (top 10 samples)





Unnamed: 0,gt_idx,sts_idx
0,1,1
1,1,6
2,1,1
3,1,1
4,1,20
5,1,1
6,1,20
7,1,20
8,2,3
9,2,3


Wrong samples with lowest STS scores: (top 10 samples out of 38)


Unnamed: 0,sample,gt_idx,sts_idx,sts_score
63,چکونه میتوان رمزعبورراتعقیرداد,13,9,0.80997
26,چگونه شماره موبایل خود را تایید کنم,5,3,0.824448
69,سلام خسته نباشید من برام از برنامه پیام اومد و...,16,20,0.830442
19,سلام \nشماره همراهی که به بانک معرفی کرده بودن...,5,13,0.836262
61,سلام روز بخیر شرایط دریافت وام با امتیاز به چه...,12,8,0.838129
80,سلام خسته نباشید \nبنده برای وام ازدواج اقدام ...,21,16,0.838673
42,سلام\nخسته نباشید\nمیخواستم بدانم که پرداخت اق...,8,4,0.840247
8,سلام وقت بخیر\nچجوری میتونم یک حساب جدید از طر...,2,3,0.840808
73,سلام روزبخیر\nچرا دسترسی به فایل حسابهای سالها...,18,5,0.843183
58,سلام خسته نباشید امتیاز تسهیلات من چرا زیاد نم...,11,10,0.843339


### top-k acc


In [None]:
def acc_top_k(S, faq, samples, k=3):
	"""Compute top-k accuracy from a FAQ-by-sample similarity matrix.

	Args:
		S: 2-D score array indexed ``[faq_row, sample_column]``.
		faq: DataFrame with an 'idx' column aligned with the rows of *S*.
		samples: DataFrame with a 'gt_idx' column aligned with the columns
			of *S*.
		k: number of highest-scoring FAQ rows considered per sample.

	Returns:
		``(accuracy, topk_idx)`` where *accuracy* is the fraction of samples
		whose ground-truth id appears among their k best FAQ ids, and
		*topk_idx* holds those ids, best first along axis 0.
	"""
	# Rank FAQ rows per column from highest to lowest score, keep the first k.
	ranked_rows = np.argsort(-S, axis=0)
	best_rows = ranked_rows[:k, :]

	topk_idx = faq['idx'].to_numpy()[best_rows]
	truth = samples['gt_idx'].to_numpy()

	# Broadcast the per-sample truth across the k candidate rows.
	matches = topk_idx == truth
	return matches.any(axis=0).mean(), topk_idx

# Report STS accuracy when the ground truth may appear anywhere in the top k.
# Note: this rebinds the notebook-level names `acc` and `top_idxs`.
for k in [1, 3, 5]:
	acc, top_idxs = acc_top_k(S, faq_ex, samples_ex, k=k)
	print(f"Top-{k} Accuracy: {acc:.4f}")

Top-1 Accuracy: 0.5914
Top-3 Accuracy: 0.8387
Top-5 Accuracy: 0.8925


### classification report


In [34]:
# Per-FAQ precision/recall/F1 of the STS predictions; `labels` pins the report
# to every FAQ id, and zero_division=0 reports 0 instead of warning where a
# metric is undefined.
print(classification_report(samples_ex['gt_idx'].to_numpy(), samples_ex['sts_idx'].to_numpy(), labels=faq_ex['idx'].to_numpy(), zero_division=0))

              precision    recall  f1-score   support

           1       0.67      0.50      0.57         8
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         4
           4       0.33      0.67      0.44         3
           5       0.78      0.70      0.74        10
           6       0.50      0.43      0.46         7
           7       0.50      0.50      0.50         2
           8       0.57      0.57      0.57         7
           9       0.83      1.00      0.91         5
          10       0.86      0.86      0.86         7
          11       0.00      0.00      0.00         4
          12       0.50      0.33      0.40         3
          13       0.00      0.00      0.00         2
          14       1.00      0.50      0.67         2
          15       1.00      1.00      1.00         3
          16       0.00      0.00      0.00         2
          17       1.00      0.50      0.67         2
          18       1.00    

### save to xlsx


In [None]:
# Write both frames back into the source workbook, replacing the sheets of the
# same name (append mode keeps any other sheets in the file).
with pd.ExcelWriter(file_path, engine="openpyxl", mode='a', if_sheet_exists='replace') as writer:
	for _sheet, _frame in (('faq', faq_ex), ('samples', samples_ex)):
		_frame.to_excel(writer, sheet_name=_sheet, index=False)
print(f"Results saved to {file_path}")

Results saved to Ka-ChatBot_BenchMark.xlsx


## LLM


### llm models:

- google/gemma-3-4b-it
- Qwen/Qwen2.5-7B-Instruct


In [22]:
def evalute_llm(model, *, language, samples_df, max_retries=1):
	"""Classify every sample with one vLLM model and score the predictions.

	A fresh vLLM engine is built for *model*, every row of *samples_df* is
	turned into a prompt listing all FAQ entries, and the generated text is
	parsed into a FAQ id. Outputs that cannot be parsed into an allowed id
	are retried with RETRY_SUFFIX appended to the prompt. The engine is
	released before returning, including when an error is raised.

	Args:
		model: Hugging Face model id passed to vLLM.
		language: prompt language key ("en" or "fa").
		samples_df: DataFrame with 'sample' (query text) and 'gt_idx'
			(ground-truth FAQ id) columns.
		max_retries: maximum number of retry rounds for unparseable outputs.

	Returns:
		dict with keys ``language``, ``model``, ``n_samples``,
		``total_time_s``, ``ms_per_sample`` (both timing only the first
		generation pass, not retries), ``acc``, ``err``, ``parse_ok_rate``,
		``raw`` (final raw text per sample), ``pred`` (predicted id or None),
		``ok`` (per-sample parse flag) and ``report_parse_ok_only`` (a
		classification report restricted to successfully parsed rows).
	"""
	# gemma-3 models require bf16; others can stay on fp16
	dtype = "bfloat16" if ("gemma-3" in model or "gemma3" in model) else "half"

	llm = LLM(
		model=model,
		dtype=dtype,
		trust_remote_code=True,
		gpu_memory_utilization=0.9,
		max_model_len=4096,
	)

	try:
		# Greedy decoding; the answer is one short JSON object.
		base_kwargs = dict(temperature=0.0, top_p=1.0, max_tokens=32, stop=["\n\n"])
		try:
			sampling = SamplingParams(guided_regex=GUIDED_REGEX, **base_kwargs)
		except TypeError:
			# This SamplingParams does not accept `guided_regex`:
			# fall back to unconstrained decoding.
			sampling = SamplingParams(**base_kwargs)

		def run_llm(batch):
			outs = llm.generate(batch, sampling_params=sampling)
			return [o.outputs[0].text for o in outs]

		sample_texts = samples_df['sample'].to_list()
		all_ids = [int(i) for i in faq_lookup.index.tolist()]
		faq_subset_all = faq_lookup.loc[all_ids]
		prompts = [
			build_prompt(language, faq_subset_all, sample_text, all_ids)
			for sample_text in sample_texts
		]
		n_prompts = len(prompts)

		t0 = time.perf_counter()
		raw = run_llm(prompts)
		t1 = time.perf_counter() - t0
		# Guard against an empty sample set (previously a ZeroDivisionError here).
		ms_per_sample = 1000.0 * t1 / max(1, n_prompts)
		print(f"[{language}] LLM total time: {t1:.2f}s | per sample: {ms_per_sample:.1f} ms")

		pred = [None] * n_prompts
		ok = [False] * n_prompts
		# Every sample is offered the full FAQ list, so one shared set suffices.
		allowed = set(all_ids)

		def parse_and_store(i, text):
			idx, parse_ok = parse_idx(text)
			if parse_ok and idx not in allowed:
				parse_ok = False
				idx = None
			pred[i] = idx
			ok[i] = parse_ok

		for i, out in enumerate(raw):
			parse_and_store(i, out)

		for _ in range(max_retries):
			retry_ids = [i for i, flag in enumerate(ok) if not flag]
			if not retry_ids:
				break
			retry_prompts = [prompts[i] + RETRY_SUFFIX[language] for i in retry_ids]
			retry_raw = run_llm(retry_prompts)
			for row_idx, out in zip(retry_ids, retry_raw):
				raw[row_idx] = out
				parse_and_store(row_idx, out)

		# Object dtype keeps None for rows that never produced a usable id.
		y_pred = np.array(pred, dtype=object)

		acc = (y_pred == samples_df['gt_idx']).mean()
		err = 1 - acc
		ok_rate = float(np.mean(ok))
		ok_mask = np.array(ok, dtype=bool)

		labels = faq_ex["idx"].to_numpy()
		if ok_mask.any():
			y_pred_ok = y_pred[ok_mask].astype(labels.dtype, copy=False)
			y_true_ok = samples_df['gt_idx'][ok_mask].astype(labels.dtype, copy=False)
			report_ok = classification_report(y_true_ok, y_pred_ok, labels=labels, zero_division=0)
		else:
			report_ok = "No parseable outputs (parse_ok_rate=0)."
	finally:
		# Drop the engine even on failure so the next model can be loaded.
		del llm
		gc.collect()
		try:
			torch.cuda.empty_cache()
		except Exception:
			# Best-effort cleanup only; never mask the real result or error.
			pass

	return {
		"language": language,
		"model": model,
		"n_samples": n_prompts,
		"total_time_s": t1,
		"ms_per_sample": ms_per_sample,
		"acc": float(acc),
		"err": float(err),
		"parse_ok_rate": ok_rate,
		"raw": raw,
		"pred": pred,
		"ok": ok,
		"report_parse_ok_only": report_ok,
	}

In [23]:
# Models to evaluate; each is loaded once per prompt language.
models = [
	"google/gemma-3-4b-it",
	"Qwen/Qwen2.5-7B-Instruct"
]

languages = ["en", "fa"]
results = []
samples_by_language = {}
# Short column names used in the combined frame for each (language, model) run.
PRED_COLUMN_MAP = {
    ("en", "google/gemma-3-4b-it"): "en_gemma",
    ("fa", "google/gemma-3-4b-it"): "fa_gemma",
    ("en", "Qwen/Qwen2.5-7B-Instruct"): "en_qwen",
    ("fa", "Qwen/Qwen2.5-7B-Instruct"): "fa_qwen",
}

# Run every model for every prompt language. Each evalute_llm call builds its
# own vLLM engine and releases it before returning.
for lang in languages:
    print(f"\n=== Running language: {lang} ===")
    lang_samples = samples_ex.copy()
    for m in models:
        print(f"Evaluating LLM model: {m}")
        r = evalute_llm(m, language=lang, samples_df=lang_samples)
        results.append(r)
        # Column-safe model name: "/" and "-" replaced by "_".
        safe_name = r["model"].replace("/", "_").replace("-", "_")
        lang_samples[f"llm_idx__{lang}__{safe_name}"] = r["pred"]
        lang_samples[f"llm_ok__{lang}__{safe_name}"] = r["ok"]
        print(f"acc={r['acc']:.4f} err={r['err']:.4f} parse_ok_rate={r['parse_ok_rate']:.4f}")
        print(f"time={r['total_time_s']:.2f}s | {r['ms_per_sample']:.1f} ms/sample\n")
        print("Classification report (parse_ok only):\n")
        print(r["report_parse_ok_only"])
    samples_by_language[lang] = lang_samples

samples_en_df = samples_by_language.get("en")
samples_fa_df = samples_by_language.get("fa")

# One frame with a single prediction column per (language, model) run; runs
# without an entry in PRED_COLUMN_MAP are skipped.
samples_combined = samples_ex.copy()
for r in results:
    col_name = PRED_COLUMN_MAP.get((r["language"], r["model"]))
    if col_name:
        samples_combined[col_name] = r["pred"]

# Rebind the notebook-level frame so later cells see the new columns.
samples_ex = samples_combined

# Scalar metrics only (the raw / pred / ok lists and the report are left out).
metrics_columns = ["language", "model", "n_samples", "total_time_s", "ms_per_sample", "acc", "err", "parse_ok_rate"]
metrics_df = pd.DataFrame([{k: v for k, v in r.items() if k in metrics_columns} for r in results])
metrics_en_df = metrics_df[metrics_df["language"] == "en"].reset_index(drop=True)
metrics_fa_df = metrics_df[metrics_df["language"] == "fa"].reset_index(drop=True)

print("\nSamples (EN) preview:")
display(samples_en_df.head() if samples_en_df is not None else None)

print("\nSamples (FA) preview:")
display(samples_fa_df.head() if samples_fa_df is not None else None)

print("\nCombined samples with prediction columns:")
display(samples_ex.head())

print("\nMetrics (EN):")
display(metrics_en_df)

print("\nMetrics (FA):")
display(metrics_fa_df)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.



=== Running language: en ===
Evaluating LLM model: google/gemma-3-4b-it


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


[0;36m(EngineCore_DP0 pid=51427)[0;0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
[0;36m(EngineCore_DP0 pid=51427)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`
Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.57it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00,  1.64s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00,  1.49s/it]
[0;36m(EngineCore_DP0 pid=51427)[0;0m 
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:03<00:00, 16.40it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35

[en] LLM total time: 1.29s | per sample: 13.9 ms


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


acc=0.7849 err=0.2151 parse_ok_rate=1.0000
time=1.29s | 13.9 ms/sample

Classification report (parse_ok only):

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         8
           2       0.50      1.00      0.67         2
           3       1.00      0.75      0.86         4
           4       1.00      1.00      1.00         3
           5       1.00      0.60      0.75        10
           6       0.86      0.86      0.86         7
           7       1.00      0.50      0.67         2
           8       1.00      1.00      1.00         7
           9       0.62      1.00      0.77         5
          10       1.00      0.57      0.73         7
          11       0.50      1.00      0.67         4
          12       1.00      0.33      0.50         3
          13       0.33      1.00      0.50         2
          14       0.67      1.00      0.80         2
          15       0.00      0.00      0.00         3
          16       0.00

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


[0;36m(EngineCore_DP0 pid=51777)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`
Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:03,  1.12s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:02<00:02,  1.21s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:03<00:01,  1.20s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:04<00:00,  1.22s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:04<00:00,  1.21s/it]
[0;36m(EngineCore_DP0 pid=51777)[0;0m 
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:05<00:00, 10.08it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:02<00:00, 12.10it/s]
Adding requests: 100%|██████████| 93/93 [00:00<00:00, 623.34it/s]
Processed prompts: 100%|██████████| 93/93 [00:03<00:00, 25.66it/s, est. speed input: 29429.73 toks/

[en] LLM total time: 3.78s | per sample: 40.6 ms


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


acc=0.8065 err=0.1935 parse_ok_rate=1.0000
time=3.78s | 40.6 ms/sample

Classification report (parse_ok only):

              precision    recall  f1-score   support

           1       0.80      1.00      0.89         8
           2       0.50      1.00      0.67         2
           3       0.75      0.75      0.75         4
           4       1.00      1.00      1.00         3
           5       1.00      0.70      0.82        10
           6       1.00      0.57      0.73         7
           7       0.25      0.50      0.33         2
           8       1.00      1.00      1.00         7
           9       0.71      1.00      0.83         5
          10       1.00      0.86      0.92         7
          11       0.50      0.75      0.60         4
          12       0.00      0.00      0.00         3
          13       1.00      0.50      0.67         2
          14       0.50      1.00      0.67         2
          15       0.00      0.00      0.00         3
          16       0.50

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


[0;36m(EngineCore_DP0 pid=52132)[0;0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
[0;36m(EngineCore_DP0 pid=52132)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`
Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.60it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.58it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.58it/s]
[0;36m(EngineCore_DP0 pid=52132)[0;0m 
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:03<00:00, 16.16it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35

[fa] LLM total time: 1.27s | per sample: 13.7 ms


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


acc=0.7312 err=0.2688 parse_ok_rate=1.0000
time=1.27s | 13.7 ms/sample

Classification report (parse_ok only):

              precision    recall  f1-score   support

           1       0.89      1.00      0.94         8
           2       0.67      1.00      0.80         2
           3       1.00      0.75      0.86         4
           4       1.00      1.00      1.00         3
           5       0.80      0.40      0.53        10
           6       0.83      0.71      0.77         7
           7       0.50      0.50      0.50         2
           8       1.00      1.00      1.00         7
           9       0.38      1.00      0.56         5
          10       0.86      0.86      0.86         7
          11       0.50      0.75      0.60         4
          12       1.00      0.33      0.50         3
          13       0.40      1.00      0.57         2
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         3
          16       0.00

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


[0;36m(EngineCore_DP0 pid=52469)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`
Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:03,  1.09s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:02<00:02,  1.21s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:03<00:01,  1.20s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:04<00:00,  1.24s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:04<00:00,  1.22s/it]
[0;36m(EngineCore_DP0 pid=52469)[0;0m 
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:05<00:00, 10.09it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:02<00:00, 12.00it/s]
Adding requests: 100%|██████████| 93/93 [00:00<00:00, 561.18it/s]
Processed prompts: 100%|██████████| 93/93 [00:03<00:00, 23.47it/s, est. speed input: 33309.26 toks/

[fa] LLM total time: 4.13s | per sample: 44.4 ms


Adding requests: 100%|██████████| 15/15 [00:00<00:00, 531.32it/s]
Processed prompts: 100%|██████████| 15/15 [00:02<00:00,  7.02it/s, est. speed input: 10451.25 toks/s, output: 165.68 toks/s]


acc=0.6774 err=0.3226 parse_ok_rate=0.8925
time=4.13s | 44.4 ms/sample

Classification report (parse_ok only):

              precision    recall  f1-score   support

           1       0.67      0.50      0.57         4
           2       0.29      1.00      0.44         2
           3       0.75      1.00      0.86         3
           4       1.00      1.00      1.00         3
           5       1.00      0.50      0.67        10
           6       1.00      0.67      0.80         6
           7       0.33      0.50      0.40         2
           8       1.00      1.00      1.00         7
           9       1.00      0.60      0.75         5
          10       1.00      1.00      1.00         4
          11       0.60      0.75      0.67         4
          12       0.00      0.00      0.00         2
          13       0.40      1.00      0.57         2
          14       0.33      0.50      0.40         2
          15       0.00      0.00      0.00         3
          16       0.50

Unnamed: 0,sample,gt_idx,sts_idx,sts_score,llm_idx__google_gemma_3_4b_it,llm_ok__google_gemma_3_4b_it,llm_idx__Qwen_Qwen2.5_7b_Instruct,llm_ok__Qwen_Qwen2.5_7b_Instruct,llm_idx__en__google_gemma_3_4b_it,llm_ok__en__google_gemma_3_4b_it,llm_idx__en__Qwen_Qwen2.5_7B_Instruct,llm_ok__en__Qwen_Qwen2.5_7B_Instruct
0,سلام وقت بخیر من میخوام افتتاح حساب کنم ولی ت...,1,1,0.885865,1,True,,False,1,True,1,True
1,سلام من کارت ملی ندارم\nاما شناسنامه عکسدار و ...,1,6,0.856472,1,True,2.0,True,1,True,1,True
2,درود وقت بخیر برای افتتاح حساب تو قسمت احراز ...,1,1,0.870513,1,True,,False,1,True,1,True
3,سلام عکس روی شناسنامه من بدون ریشه وای الان ری...,1,1,0.853145,1,True,,False,1,True,1,True
4,سلام من بدا یز ثبت نام مشکل دارم من در بانک ب...,1,20,0.858814,1,True,,False,1,True,1,True



Samples (FA) preview:


Unnamed: 0,sample,gt_idx,sts_idx,sts_score,llm_idx__google_gemma_3_4b_it,llm_ok__google_gemma_3_4b_it,llm_idx__Qwen_Qwen2.5_7b_Instruct,llm_ok__Qwen_Qwen2.5_7b_Instruct,llm_idx__fa__google_gemma_3_4b_it,llm_ok__fa__google_gemma_3_4b_it,llm_idx__fa__Qwen_Qwen2.5_7B_Instruct,llm_ok__fa__Qwen_Qwen2.5_7B_Instruct
0,سلام وقت بخیر من میخوام افتتاح حساب کنم ولی ت...,1,1,0.885865,1,True,,False,1,True,,False
1,سلام من کارت ملی ندارم\nاما شناسنامه عکسدار و ...,1,6,0.856472,1,True,2.0,True,1,True,2.0,True
2,درود وقت بخیر برای افتتاح حساب تو قسمت احراز ...,1,1,0.870513,1,True,,False,1,True,1.0,True
3,سلام عکس روی شناسنامه من بدون ریشه وای الان ری...,1,1,0.853145,1,True,,False,1,True,,False
4,سلام من بدا یز ثبت نام مشکل دارم من در بانک ب...,1,20,0.858814,1,True,,False,1,True,,False



Combined samples with prediction columns:


Unnamed: 0,sample,gt_idx,sts_idx,sts_score,llm_idx__google_gemma_3_4b_it,llm_ok__google_gemma_3_4b_it,llm_idx__Qwen_Qwen2.5_7b_Instruct,llm_ok__Qwen_Qwen2.5_7b_Instruct,en_gemma,en_qwen,fa_gemma,fa_qwen
0,سلام وقت بخیر من میخوام افتتاح حساب کنم ولی ت...,1,1,0.885865,1,True,,False,1,1,1,
1,سلام من کارت ملی ندارم\nاما شناسنامه عکسدار و ...,1,6,0.856472,1,True,2.0,True,1,1,1,2.0
2,درود وقت بخیر برای افتتاح حساب تو قسمت احراز ...,1,1,0.870513,1,True,,False,1,1,1,1.0
3,سلام عکس روی شناسنامه من بدون ریشه وای الان ری...,1,1,0.853145,1,True,,False,1,1,1,
4,سلام من بدا یز ثبت نام مشکل دارم من در بانک ب...,1,20,0.858814,1,True,,False,1,1,1,



Metrics (EN):


Unnamed: 0,language,model,n_samples,total_time_s,ms_per_sample,acc,err,parse_ok_rate
0,en,google/gemma-3-4b-it,93,1.292481,13.897642,0.784946,0.215054,1.0
1,en,Qwen/Qwen2.5-7B-Instruct,93,3.778668,40.630843,0.806452,0.193548,1.0



Metrics (FA):


Unnamed: 0,language,model,n_samples,total_time_s,ms_per_sample,acc,err,parse_ok_rate
0,fa,google/gemma-3-4b-it,93,1.272614,13.684027,0.731183,0.268817,1.0
1,fa,Qwen/Qwen2.5-7B-Instruct,93,4.133215,44.443174,0.677419,0.322581,0.892473


In [None]:
# Remove prediction columns that are not produced by the cells above
# (apparently left in the workbook by an earlier version of this notebook).
# errors='ignore' keeps the cell re-runnable: the four separate drops used
# before raised KeyError once the columns were gone.
ac = samples_ex  # alias; the drop below modifies samples_ex in place
ac.drop(
	columns=[
		'llm_idx__Qwen_Qwen2.5_7b_Instruct',
		'llm_ok__Qwen_Qwen2.5_7b_Instruct',
		'llm_idx__google_gemma_3_4b_it',
		'llm_ok__google_gemma_3_4b_it',
	],
	inplace=True,
	errors='ignore',
)

### save to xlsx


In [31]:
# Write both frames back into the source workbook, replacing the sheets of the
# same name (append mode keeps any other sheets in the file).
with pd.ExcelWriter(file_path, engine="openpyxl", mode='a', if_sheet_exists='replace') as writer:
	for _sheet, _frame in (('faq', faq_ex), ('samples', samples_ex)):
		_frame.to_excel(writer, sheet_name=_sheet, index=False)
print(f"Results saved to {file_path}")

Results saved to Ka-ChatBot_BenchMark.xlsx
