# STS vs LLM


## Imports


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from textwrap import dedent
import json
import re
import time
from vllm import LLM, SamplingParams # type: ignore
import gc
import torch

## Data


In [79]:
# Benchmark workbook; the result cells further down write back into this same file.
file_path = 'Ka-ChatBot_BenchMark.xlsx'

# Open the workbook once and pull both sheets from the same handle.
with pd.ExcelFile(file_path) as workbook:
	faq_ex = workbook.parse(sheet_name='faq')
	samples_ex = workbook.parse(sheet_name='samples')

# Ground-truth FAQ idx of every sample, as a plain list.
labels = samples_ex['gt_idx'].to_list()
# Known FAQ ids; parse_idx() checks model outputs against this set.
VALID_FAQ_IDS = set(faq_ex["idx"].to_list())

## Helper functions


In [None]:
def normalizer(text) -> str:
	"""Return *text* as a single-spaced string with unified letter forms.

	``None`` yields the empty string; any other value is converted with
	``str()`` before being cleaned.
	"""
	if text is None:
		return ""
	# One-pass character table: two letter substitutions, U+200C becomes a
	# space, U+200F and U+200A are dropped.
	char_map = str.maketrans({
		"ي": "ی",
		"ك": "ک",
		"\u200c": " ",
		"\u200f": None,
		"\u200a": None,
	})
	translated = str(text).translate(char_map)
	# split()/join collapses every whitespace run to one space and trims both ends.
	return " ".join(translated.split())

def build_prompt(faq, sample):
	"""Build the classification prompt for one user query.

	Args:
		faq: table exposing ``idx`` and ``faq`` columns, one row per FAQ entry.
		sample: raw query text; it is cleaned with ``normalizer``.

	Returns:
		The prompt string: instruction line, the FAQ list (one
		``idx=<id>: <text>`` line per entry), the query, and the JSON answer
		format, terminated by a newline.
	"""
	# Iterate the two columns side by side; no per-row Series is needed.
	faq_lines = [
		f"idx={faq_id}: {normalizer(faq_text)}"
		for faq_id, faq_text in zip(faq['idx'], faq['faq'])
	]
	# The prompt is assembled from explicit lines instead of running dedent()
	# over an indented f-string: interpolated FAQ lines start at column 0, which
	# leaves dedent() with no common margin, so the template lines would keep
	# their leading tabs in the text sent to the model.
	prompt_lines = [
		"You are a classifier. Choose the single best matching FAQ idx.",
		"FAQ list:",
		"\n".join(faq_lines),
		"",
		"Query:",
		normalizer(sample),
		"",
		"Return only valid Json(no extra text):",
		'{"idx": <best matching faq idx>}',
	]
	return "\n".join(prompt_lines) + "\n"

def parse_idx(text):
	"""Extract the predicted FAQ idx from a raw model output.

	The output is first tried as a JSON object with an ``"idx"`` key. If that
	does not produce a result, a regex looks for ``idx: 5`` / ``idx=5`` /
	``"idx": "5"`` style fragments anywhere in the text (for example JSON that
	is wrapped in a code fence or surrounded by extra words).

	Args:
		text: raw text generated by the model.

	Returns:
		``(idx, is_valid)`` where ``is_valid`` tells whether ``idx`` is a member
		of ``VALID_FAQ_IDS``; ``(None, False)`` when nothing could be extracted
		or ``text`` is not a string.
	"""
	if not isinstance(text, str):
		return (None, False)

	try:
		obj = json.loads(text.strip())
	except ValueError:
		# json.JSONDecodeError is a ValueError: not pure JSON, try the regex below.
		obj = None

	if isinstance(obj, dict):
		idx = obj.get("idx", None)
		# A quoted number ({"idx": "5"}) is converted so it is compared as an
		# int, exactly like the regex path below does.
		if isinstance(idx, str) and idx.strip().isdecimal():
			idx = int(idx.strip())
		try:
			return (idx, idx in VALID_FAQ_IDS)
		except TypeError:
			# Unhashable value (list/dict) cannot be looked up; use the regex.
			pass

	# The optional quote right after the key lets the requested format
	# `"idx": 5` match when the JSON is embedded in other text.
	m = re.search(r'idx["\']?\s*[:=]\s*("?)(\d+)\1', text)
	if m:
		idx = int(m.group(2))
		return (idx, idx in VALID_FAQ_IDS)

	return (None, False)

## STS


### model: multilingual-e5-base


In [31]:
# Sentence-embedding model for the STS baseline; the embedding cell below
# calls model.encode() on the FAQ texts and on the user samples.
model_name = "intfloat/multilingual-e5-base"
model = SentenceTransformer(model_name)

### embedding


In [None]:
# FAQ texts get the "passage: " prefix and user samples the "query: " prefix
# before encoding; both are cleaned with normalizer() first.
faq_inputs = [f"passage: {t}" for t in faq_ex['faq'].apply(normalizer).to_list()]
sample_inputs = [f"query: {t}" for t in samples_ex['sample'].apply(normalizer).to_list()]


# normalize_embeddings=True is requested so that the dot product below can be
# read as a cosine similarity (presumably unit-length vectors — see the
# sentence-transformers docs).
faq_enc = model.encode(faq_inputs, batch_size=64, normalize_embeddings=True, show_progress_bar=True)
sample_enc = model.encode(sample_inputs, batch_size=64, normalize_embeddings=True, show_progress_bar=True)

# S[i, j] = similarity between FAQ row i and sample j.
S = faq_enc @ sample_enc.T

# Row position of the best-scoring FAQ for each sample, and that score.
best_j = np.argmax(S, axis=0)
top1_score = S[best_j, np.arange(S.shape[1])]
# best_j holds row positions, so translate them through the positional array;
# a label-based Series lookup would only be right for a default 0..n-1 index.
pred_idx_sts = faq_ex['idx'].to_numpy()[best_j].tolist()

samples_ex['sts_idx'] = pred_idx_sts
samples_ex['sts_score'] = top1_score

acc = (samples_ex['sts_idx'] == samples_ex['gt_idx']).mean()
err = 1 - acc
print(f"STS Accuracy: {acc:.4f}, Error Rate: {err:.4f}")

print("\nGround Truth vs STS result: (top 10 samples)")
display(samples_ex[['gt_idx','sts_idx']].head(10))

# Misclassified samples, least-confident matches first.
wrong = samples_ex[samples_ex["sts_idx"] != samples_ex["gt_idx"]].copy()
print(f"Wrong samples with lowest STS scores: (top 10 samples out of {len(wrong)})")
wrong_sorted = wrong.sort_values("sts_score", ascending=True).head(10)
display(wrong_sorted[["sample", "gt_idx", "sts_idx", "sts_score"]])

Batches: 100%|██████████| 1/1 [00:00<00:00, 37.50it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  8.43it/s]

STS Accuracy: 0.5914, Error Rate: 0.4086

Ground Truth vs STS result: (top 10 samples)





Unnamed: 0,gt_idx,sts_idx
0,1,1
1,1,6
2,1,1
3,1,1
4,1,20
5,1,1
6,1,20
7,1,20
8,2,3
9,2,3


Wrong samples with lowest STS scores: (top 10 samples out of 38)


Unnamed: 0,sample,gt_idx,sts_idx,sts_score
63,چکونه میتوان رمزعبورراتعقیرداد,13,9,0.80997
26,چگونه شماره موبایل خود را تایید کنم,5,3,0.824448
69,سلام خسته نباشید من برام از برنامه پیام اومد و...,16,20,0.830442
19,سلام \nشماره همراهی که به بانک معرفی کرده بودن...,5,13,0.836262
61,سلام روز بخیر شرایط دریافت وام با امتیاز به چه...,12,8,0.838129
80,سلام خسته نباشید \nبنده برای وام ازدواج اقدام ...,21,16,0.838673
42,سلام\nخسته نباشید\nمیخواستم بدانم که پرداخت اق...,8,4,0.840247
8,سلام وقت بخیر\nچجوری میتونم یک حساب جدید از طر...,2,3,0.840808
73,سلام روزبخیر\nچرا دسترسی به فایل حسابهای سالها...,18,5,0.843183
58,سلام خسته نباشید امتیاز تسهیلات من چرا زیاد نم...,11,10,0.843339


### top-k acc


In [None]:
def acc_top_k(S,faq, samples, k=3):
	"""Compute top-k accuracy from a similarity matrix.

	Args:
		S: 2-D array indexed as ``S[faq_row, sample]``.
		faq: table whose ``idx`` column gives the FAQ id of each row of ``S``.
		samples: table whose ``gt_idx`` column gives the true FAQ id per sample.
		k: number of highest-scoring FAQs considered per sample.

	Returns:
		``(accuracy, topk_idx)``: the fraction of samples whose true id is
		among their k best FAQs, and the ``(k, n_samples)`` array of those ids.
	"""
	# Negating S makes the ascending argsort rank rows from best to worst.
	ranked_rows = np.argsort(-S, axis=0)
	topk_idx = faq['idx'].to_numpy()[ranked_rows[:k]]

	truth = samples['gt_idx'].to_numpy()
	# Broadcast the truth row against all k candidate rows; one match is a hit.
	hits = np.any(topk_idx == truth, axis=0)

	return hits.mean(), topk_idx

# Print the STS hit rate for several cut-offs k, using the similarity matrix S
# computed in the embedding cell.
# Note: this rebinds the notebook-level name `acc` on every iteration.
for k in [1, 3, 5]:
	acc, top_idxs = acc_top_k(S, faq_ex, samples_ex, k=k)
	print(f"Top-{k} Accuracy: {acc:.4f}")

Top-1 Accuracy: 0.5914
Top-3 Accuracy: 0.8387
Top-5 Accuracy: 0.8925


### classification report


In [34]:
# Per-FAQ report of the STS predictions against the ground truth. The label
# list comes from the FAQ sheet and zero_division=0 is passed explicitly.
sts_report = classification_report(
	samples_ex['gt_idx'].to_numpy(),
	samples_ex['sts_idx'].to_numpy(),
	labels=faq_ex['idx'].to_numpy(),
	zero_division=0,
)
print(sts_report)

              precision    recall  f1-score   support

           1       0.67      0.50      0.57         8
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         4
           4       0.33      0.67      0.44         3
           5       0.78      0.70      0.74        10
           6       0.50      0.43      0.46         7
           7       0.50      0.50      0.50         2
           8       0.57      0.57      0.57         7
           9       0.83      1.00      0.91         5
          10       0.86      0.86      0.86         7
          11       0.00      0.00      0.00         4
          12       0.50      0.33      0.40         3
          13       0.00      0.00      0.00         2
          14       1.00      0.50      0.67         2
          15       1.00      1.00      1.00         3
          16       0.00      0.00      0.00         2
          17       1.00      0.50      0.67         2
          18       1.00    

### save to xlsx


In [None]:
# Write both frames back into the benchmark workbook: append mode, with any
# existing sheet of the same name replaced.
frames_by_sheet = {'faq': faq_ex, 'samples': samples_ex}
with pd.ExcelWriter(file_path, engine="openpyxl", mode='a', if_sheet_exists='replace') as writer:
	for sheet_name, frame in frames_by_sheet.items():
		frame.to_excel(writer, sheet_name=sheet_name, index=False)
print(f"Results saved to {file_path}")

Results saved to Ka-ChatBot_BenchMark.xlsx


## LLM


### LLM models:

- google/gemma-3-4b-it
- Qwen/Qwen2.5-7B-Instruct


In [None]:
def evalute_llm(model):
	"""Run one vLLM model over every benchmark sample and score its FAQ picks.

	Args:
		model: model identifier handed to ``LLM(model=...)``.

	Returns:
		dict with the keys ``model``, ``n_samples``, ``total_time_s``,
		``ms_per_sample``, ``acc``, ``err``, ``parse_ok_rate``, ``raw``
		(generated texts), ``pred`` (idx returned by ``parse_idx`` per sample,
		``None`` when nothing was extracted), ``ok`` (per-sample validity flags
		from ``parse_idx``) and ``report_parse_ok_only`` (classification report
		restricted to the samples flagged ok, or a message when there are none).
	"""
	# Build the prompts first so a prompt error never costs a model load.
	prompts = [build_prompt(faq_ex, s) for s in samples_ex['sample'].to_list()]

	# NOTE(review): prompts are sent through llm.generate() as raw text —
	# confirm whether these instruct models should get their chat template.
	llm = LLM(
		model=model,
		dtype="half",
		trust_remote_code=True,
		gpu_memory_utilization=0.9,
		max_model_len=4096,
	)
	try:
		# temperature=0.0 for deterministic decoding; stop at the first blank line.
		sampling = SamplingParams(
			temperature=0.0,
			top_p=1.0,
			max_tokens=64,
			stop=["\n\n"],
		)
		# perf_counter() is a monotonic clock, suited to measuring intervals.
		t0 = time.perf_counter()
		outs = llm.generate(prompts, sampling_params=sampling)
		raw = [o.outputs[0].text for o in outs]
		t1 = time.perf_counter() - t0
	finally:
		# Release the engine even when generation fails, so the next model is
		# not loaded while this one is still referenced.
		# NOTE(review): vLLM may need extra teardown to fully return GPU
		# memory between models — verify on the target version.
		del llm
		gc.collect()
		try:
			torch.cuda.empty_cache()
		except Exception as exc:
			# Best-effort cleanup: report instead of failing the evaluation.
			print(f"torch.cuda.empty_cache() skipped: {exc}")

	# Single guarded computation, shared by the log line and the result dict.
	ms_per_sample = 1000.0 * t1 / max(1, len(prompts))
	print(f"LLM total time: {t1:.2f}s | per sample: {ms_per_sample:.1f} ms")

	parsed = [parse_idx(out) for out in raw]
	pred = [idx for idx, _ in parsed]
	ok = [parse_ok for _, parse_ok in parsed]

	# dtype=object lets None (nothing extracted) sit next to the parsed ids.
	y_pred = np.array(pred, dtype=object)

	# Outputs without a usable idx simply never equal the ground truth, so
	# they count as errors in the overall accuracy.
	acc = (y_pred == samples_ex['gt_idx']).mean()
	err = 1 - acc
	ok_rate = float(np.mean(ok))
	ok_mask = np.array(ok, dtype=bool)

	label_ids = faq_ex["idx"].to_numpy()
	if ok_mask.any():
		# Cast both sides to the dtype of the label list before reporting.
		y_pred_ok = y_pred[ok_mask].astype(label_ids.dtype, copy=False)
		y_true_ok = samples_ex['gt_idx'][ok_mask].astype(label_ids.dtype, copy=False)
		report_ok = classification_report(y_true_ok, y_pred_ok, labels=label_ids, zero_division=0)
	else:
		report_ok = "No parseable outputs (parse_ok_rate=0)."

	return {
		"model": model,
		"n_samples": len(prompts),
		"total_time_s": t1,
		"ms_per_sample": ms_per_sample,
		"acc": float(acc),
		"err": float(err),
		"parse_ok_rate": ok_rate,
		"raw": raw,
		"pred": pred,
		"ok": ok,
		"report_parse_ok_only": report_ok,
	}

In [None]:
# Model ids to benchmark, spelled exactly as in the model list above
# (canonical repository casing: "7B").
models = [
	"google/gemma-3-4b-it",
	"Qwen/Qwen2.5-7B-Instruct",
]

# Evaluate each model in turn and print its headline numbers and report.
results = []
for m in models:
	print(f"\nEvaluating LLM model: {m}")
	r = evalute_llm(m)
	results.append(r)

	print(f"acc={r['acc']:.4f} err={r['err']:.4f} parse_ok_rate={r['parse_ok_rate']:.4f}")
	print(f"time={r['total_time_s']:.2f}s | {r['ms_per_sample']:.1f} ms/sample\n")
	print("Classification report (parse_ok only):\n")
	print(r["report_parse_ok_only"])

# One summary row per model; the per-sample payloads and the text report are
# left out of the table.
summary = pd.DataFrame([{k: v for k, v in r.items() if k not in ("raw","pred","ok","report_parse_ok_only")} for r in results])
display(summary)

# Attach each model's predictions and validity flags to the samples frame,
# under column names derived from the model id ("/" and "-" become "_").
for r in results:
	safe_name = r["model"].replace("/", "_").replace("-", "_")
	samples_ex[f"llm_idx__{safe_name}"] = r["pred"]
	samples_ex[f"llm_ok__{safe_name}"] = r["ok"]

### save to xlsx


In [None]:
# Write both frames back into the benchmark workbook: append mode, with any
# existing sheet of the same name replaced.
frames_by_sheet = {'faq': faq_ex, 'samples': samples_ex}
with pd.ExcelWriter(file_path, engine="openpyxl", mode='a', if_sheet_exists='replace') as writer:
	for sheet_name, frame in frames_by_sheet.items():
		frame.to_excel(writer, sheet_name=sheet_name, index=False)
print(f"Results saved to {file_path}")