<a href="https://colab.research.google.com/github/Benyormin/Question_answering/blob/main/AI_TASK_E_F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q sentence-transformers gdown tqdm

## Loading Dataset

In [None]:
# === Imports ===
import os
import json
import zipfile
import shutil
import gdown
from typing import Dict, List, Tuple

# === Helper functions ===
def download_drive_file(file_id: str, dest_path: str):
    """Download file from Google Drive given a file_id."""
    url = f"https://drive.google.com/uc?id={file_id}"
    print("Downloading from:", url)
    gdown.download(url, dest_path, quiet=False)


def unzip_to(zip_path: str, dest_dir: str):
    print("Unzipping", zip_path, "->", dest_dir)
    if os.path.exists(dest_dir):
        print("dest exists, removing and re-creating")
        shutil.rmtree(dest_dir)
    os.makedirs(dest_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall(dest_dir)
    print("   extracted")


def load_dataset(path: str) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, List[str]]]:
    """Load dataset JSON into queries, corpus, relevant docs."""
    with open(path, "r", encoding="utf-8") as f:
        d = json.load(f)
    queries = d["queries"]          # qid -> question text
    corpus = d["corpus"]            # docid -> chunk text
    relevant = d["relevant_docs"]   # qid -> [docid, ...]
    return queries, corpus, relevant


# === Google Drive File IDs ===
GDRIVE_FILE_ID_SENTENCE = "1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH"
GDRIVE_FILE_ID_WORD     = "19EJy3dh1IAW5Ko6Z-lz7gmm9OxniH00k"

# === Step 1: Download ===
download_drive_file(GDRIVE_FILE_ID_SENTENCE, "sentence_dataset.zip")
download_drive_file(GDRIVE_FILE_ID_WORD, "word_dataset.zip")

# === Unzip ===
unzip_to("sentence_dataset.zip", "sentence_dataset")
unzip_to("word_dataset.zip", "word_dataset")




Downloading from: https://drive.google.com/uc?id=1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH


Downloading...
From: https://drive.google.com/uc?id=1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH
To: /content/sentence_dataset.zip
100%|██████████| 122k/122k [00:00<00:00, 14.1MB/s]


Downloading from: https://drive.google.com/uc?id=19EJy3dh1IAW5Ko6Z-lz7gmm9OxniH00k


Downloading...
From: https://drive.google.com/uc?id=19EJy3dh1IAW5Ko6Z-lz7gmm9OxniH00k
To: /content/word_dataset.zip
100%|██████████| 134k/134k [00:00<00:00, 2.77MB/s]

Unzipping sentence_dataset.zip -> sentence_dataset
   extracted
Unzipping word_dataset.zip -> word_dataset
   extracted





In [None]:
# === Load datasets ===
sentence_queries_train, sentence_corpus_train, sentence_relevant_train = load_dataset('/content/sentence_dataset/train_dataset.json')
sentence_queries_val, sentence_corpus_val, sentence_relevant_val = load_dataset('/content/sentence_dataset/val_dataset.json')

word_queries_train, word_corpus_train, word_relevant_train = load_dataset("/content/word_dataset/train_dataset.json")
word_queries_val, word_corpus_val, word_relevant_val = load_dataset("/content/word_dataset/val_dataset.json")

print("Sentence dataset size:", len(sentence_queries_train), "queries,", len(sentence_corpus_train), "docs")
print("Word-based dataset size:", len(word_queries_train), "queries,", len(word_corpus_train), "docs")


Sentence dataset size: 309 queries, 78 docs
Word-based dataset size: 366 queries, 93 docs


In [None]:
sentence_queries_train, sentence_corpus_train, sentence_relevant_train

({'7645a669-81df-4cd7-a2b2-ca503100b1e0': 'دیکلوفناک چگونه با التهاب و درد مقابله می کند؟',
  '0df0cc7c-0dbc-4d02-a4c7-3d9a6c320502': 'دیکلوفناک به صورت چند ملح در بازار موجود است؟',
  'b7f349cb-7927-40c0-ba47-857906fc2ee1': 'تفاوت دیکلوفناک سدیم و پتاسیم در چیست؟',
  '86c5b2ec-aeab-466c-8875-17c37f52e950': 'شیاف دیکلوفناک چه علائم آرتریت مفاصل را تسکین می دهد؟',
  'a6601503-7266-458d-9d07-85681526d987': 'شیاف دیکلوفناک برای تسکین درد قاعدگی چقدر زمان می برد؟',
  'b87c36b2-91e3-47f5-a63a-3911868feece': 'آمپول دیکلوفناک چه علائمی را در بیماری آرتروز آرام می کند؟',
  'd0fffdb7-e7ca-4c00-a046-a8919821aaa9': 'دیکلوفناک تولید کدام آنزیم التهاب ساز را مهار می کند؟',
  'be587b32-cdab-49fa-b09e-ad289d759d6b': 'دُز بالای آمپول دیکلوفناک به عنوان چه نوع مسکنی شناخته می شود؟',
  '6a4a27f8-8bf7-41c1-b23f-2923513b4202': 'با کاهش میزان مواد شیمیایی التهابی در بدن چه اتفاقی می افتد؟',
  '2da0f6d4-ddf8-4a58-ab01-afcc45b53032': 'پزشک معالج بعد از تجویز دیکلوفناک چه چیزی به بیمار می دهد؟',
  'dd9810ad-b

## Preprocessing

In [None]:
import random
# ---------- CONFIG ----------
RND_SEED = 42
VAL_FRAC = 0.10
POS_PER_QUERY = 1
TRAIN_BATCH_SIZE = 16

# Paths to datasets
SENT_TRAIN_PATH = "/content/sentence_dataset/train_dataset.json"
SENT_VAL_PATH   = "/content/sentence_dataset/val_dataset.json"
WORD_TRAIN_PATH = "/content/word_dataset/train_dataset.json"
WORD_VAL_PATH   = "/content/word_dataset/val_dataset.json"

OUT_DIR = "/content/dataset_splits"
os.makedirs(OUT_DIR, exist_ok=True)
random.seed(RND_SEED)

In [None]:
import os
import json
import random
from typing import Dict, List, Tuple
from sentence_transformers import InputExample, SentencesDataset, SentenceTransformer
from torch.utils.data import DataLoader


# ---------- Helpers ----------
def save_split_json(queries, corpus, relevant, out_path):
    obj = {"queries": queries, "corpus": corpus, "relevant_docs": relevant}
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def split_train_holdout(train_queries, train_corpus, train_relevant, val_frac=0.1, seed=42):
    """Split queries into train and validation. Keep full corpus for both."""
    qids = list(train_queries.keys())
    random.Random(seed).shuffle(qids)
    n_valid = max(1, int(len(qids) * val_frac))
    valid_qids = set(qids[:n_valid])
    train_qids = list(qids[n_valid:])

    train_q_map = {qid: train_queries[qid] for qid in train_qids}
    valid_q_map = {qid: train_queries[qid] for qid in valid_qids}

    # full corpus kept intact
    return (
        (train_q_map, dict(train_corpus), {qid: train_relevant.get(qid, []) for qid in train_qids}),
        (valid_q_map, dict(train_corpus), {qid: train_relevant.get(qid, []) for qid in valid_qids})
    )

def build_input_examples(queries, corpus, relevant, pos_per_query=1, seed=42, use_all_positives=False):
    """Convert queries + positives into InputExamples. Negatives are handled by MNLR during training."""
    rng = random.Random(seed)
    examples = []
    for qid, qtext in queries.items():
        positives = relevant.get(qid, [])
        if not positives:
            continue
        chosen = positives if use_all_positives else rng.sample(positives, min(len(positives), pos_per_query))
        for docid in chosen:
            if docid in corpus:
                examples.append(InputExample(texts=[qtext, corpus[docid]]))
    return examples

# ---------- Main function ----------
def prepare_dataset_for_training(
    train_path, test_path, name,
    base_model_name="intfloat/multilingual-e5-base",  # model name now parameterized
    val_frac=0.1, pos_per_query=1, seed=42,
    out_dir=OUT_DIR, train_batch_size=16
):
    """Prepare train/valid splits and dataloaders for fine-tuning."""
    # load original files
    def _load(path):
        with open(path, "r", encoding="utf-8") as f:
            d = json.load(f)
        return d["queries"], d["corpus"], d["relevant_docs"]

    train_q, train_c, train_rel = _load(train_path)
    test_q, test_c, test_rel    = _load(test_path)

    # split
    (new_train_q, new_train_c, new_train_rel), (new_valid_q, new_valid_c, new_valid_rel) = split_train_holdout(
        train_q, train_c, train_rel, val_frac=val_frac, seed=seed
    )

    # save splits
    base_out = os.path.join(out_dir, name)
    os.makedirs(base_out, exist_ok=True)
    save_split_json(new_train_q, new_train_c, new_train_rel, os.path.join(base_out, "train_split.json"))
    save_split_json(new_valid_q, new_valid_c, new_valid_rel, os.path.join(base_out, "valid_split.json"))
    save_split_json(test_q, test_c, test_rel, os.path.join(base_out, "test_dataset.json"))

    # build examples
    train_examples = build_input_examples(new_train_q, new_train_c, new_train_rel, pos_per_query, seed, True)
    valid_examples = build_input_examples(new_valid_q, new_valid_c, new_valid_rel, pos_per_query, seed, True)


    # create datasets
    model_for_tokenization = SentenceTransformer(base_model_name)
    train_dataset = SentencesDataset(train_examples, model_for_tokenization)
    valid_dataset = SentencesDataset(valid_examples, model_for_tokenization)

    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=train_batch_size)

    return {
        "name": name,
        "train_examples": len(train_examples),
        "valid_examples": len(valid_examples),
        "train_loader": train_loader,
        "valid_loader": valid_loader,
        "base_model": base_model_name,
        "split_dir": base_out
    }



In [None]:

info_sentence = prepare_dataset_for_training(SENT_TRAIN_PATH, SENT_VAL_PATH, "sentence",
                                             base_model_name="intfloat/multilingual-e5-base")

info_word = prepare_dataset_for_training(WORD_TRAIN_PATH, WORD_VAL_PATH, "word",
                                         base_model_name="intfloat/multilingual-e5-base")

train_loader_sentence = info_sentence["train_loader"]
valid_loader_sentence = info_sentence["valid_loader"]
train_loader_word     = info_word["train_loader"]
valid_loader_word     = info_word["valid_loader"]

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
info_sentence

{'name': 'sentence',
 'train_examples': 279,
 'valid_examples': 30,
 'train_loader': <torch.utils.data.dataloader.DataLoader at 0x7a1f0baea840>,
 'valid_loader': <torch.utils.data.dataloader.DataLoader at 0x7a1f0a976f30>,
 'base_model': 'intfloat/multilingual-e5-base',
 'split_dir': '/content/dataset_splits/sentence'}

In [None]:
train_loader_sentence

<torch.utils.data.dataloader.DataLoader at 0x7eba64c800b0>

## Fine-Tuning

### multilingual-e5-base

In [None]:
import os, json, math, random
from datetime import datetime
from typing import Dict, List
import numpy as np
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

# ---- Config (change if needed) ----
BASE_MODEL = "intfloat/multilingual-e5-base"   # your chosen model
SAVE_ROOT = "/content/models"
os.makedirs(SAVE_ROOT, exist_ok=True)

TRAIN_BATCH_SIZE = 8
EPOCHS = 4
LR = 2e-5
RND_SEED = 42
random.seed(RND_SEED)
np.random.seed(RND_SEED)

In [None]:
# Required extra imports
import math
import torch
from datetime import datetime
from sentence_transformers import losses

# ---- Helper: evaluate (re-uses your evaluate_cosine_mrr idea) ----
def encode_texts(model: SentenceTransformer, texts: List[str], batch_size:int=64, normalize:bool=True):
    embs = model.encode(texts, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)
    if normalize:
        norms = np.linalg.norm(embs, axis=1, keepdims=True)
        norms[norms==0] = 1.0
        embs = embs / norms
    return embs

def evaluate_cosine_mrr(model: SentenceTransformer,
                        queries_map: Dict[str,str],
                        corpus_map: Dict[str,str],
                        relevant_map: Dict[str, List[str]],
                        batch_size:int = 64,
                        max_rank:int = 1000):
    qids = list(queries_map.keys())
    corpus_ids = list(corpus_map.keys())
    query_texts = [queries_map[q] for q in qids]
    corpus_texts = [corpus_map[c] for c in corpus_ids]

    print("Encoding corpus...")
    corpus_embs = encode_texts(model, corpus_texts, batch_size=batch_size, normalize=True)
    print("Encoding queries...")
    query_embs = encode_texts(model, query_texts, batch_size=batch_size, normalize=True)

    sims = np.dot(query_embs, corpus_embs.T)
    top1_cosines = sims.max(axis=1)
    mean_top1_cosine = float(np.mean(top1_cosines))

    rr_list = []
    hit1 = 0
    per_query = []
    for i, qid in enumerate(qids):
        sim_row = sims[i]
        order = np.argsort(-sim_row)
        rel_set = set(relevant_map.get(qid, []))
        first_rank = None
        for rank_idx, doc_idx in enumerate(order, start=1):
            if corpus_ids[doc_idx] in rel_set:
                first_rank = rank_idx
                break
            if rank_idx >= max_rank:
                break
        rr = 1.0/first_rank if first_rank is not None else 0.0
        rr_list.append(rr)
        if first_rank == 1:
            hit1 += 1
        top1_idx = order[0]
        per_query.append({
            "qid": qid,
            "top1_id": corpus_ids[top1_idx],
            "top1_score": float(sim_row[top1_idx]),
            "first_rel_rank": first_rank,
            "reciprocal_rank": rr
        })

    mrr = float(np.mean(rr_list)) if len(rr_list)>0 else 0.0
    hit1_rate = hit1 / float(len(qids)) if len(qids)>0 else 0.0
    return {"mean_top1_cosine": mean_top1_cosine, "mrr": mrr, "hit@1": hit1_rate, "per_query": per_query}

# ---- Helper: build all-positive InputExamples from split JSON file paths ----
def load_split_json(path: str):
    with open(path, "r", encoding="utf-8") as f:
        d = json.load(f)
    return d["queries"], d["corpus"], d["relevant_docs"]

def build_allpos_input_examples(queries: Dict[str,str], corpus: Dict[str,str], relevant: Dict[str,List[str]]):
    examples = []
    for qid, qtext in queries.items():
        positives = relevant.get(qid, []) or []
        for docid in positives:
            doc_text = corpus.get(docid)
            if doc_text is None:
                continue
            # label=1.0 indicates positive pair for CosineSimilarityLoss
            examples.append(InputExample(texts=[qtext, doc_text], label=1.0))
    return examples

# ---- your split files ----
TRAIN_SPLIT = "/content/dataset_splits/sentence/train_split.json"
VALID_SPLIT = "/content/dataset_splits/sentence/valid_split.json"

assert os.path.exists(TRAIN_SPLIT) and os.path.exists(VALID_SPLIT), f"Missing split files: {TRAIN_SPLIT}, {VALID_SPLIT}"

# ---- Load splits and build examples ----
train_queries, train_corpus, train_relevant = load_split_json(TRAIN_SPLIT)
valid_queries, valid_corpus, valid_relevant = load_split_json(VALID_SPLIT)

train_examples = build_allpos_input_examples(train_queries, train_corpus, train_relevant)
valid_examples = build_allpos_input_examples(valid_queries, valid_corpus, valid_relevant)

print("Train examples (all positives):", len(train_examples))
print("Valid examples (all positives):", len(valid_examples))

# ---- Create DataLoaders  ----
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
valid_dataloader = DataLoader(valid_examples, shuffle=False, batch_size=TRAIN_BATCH_SIZE)

# ---- Device, mixed precision and grad accumulation configuration ----
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
if device == "cuda":
    torch.cuda.empty_cache()

# gradient accumulation hyperparam (used only for computing effective steps; NOT passed to fit())
GRADIENT_ACCUMULATION_STEPS = 1  # set to 1 if you don't want accumulation

# ---- Load model ----
print("Loading base model:", BASE_MODEL)
model = SentenceTransformer(BASE_MODEL)
# Move model to device if desired (SentenceTransformer.fit will also handle device, but explicit move is fine)
if device == "cuda":
    model = model.to(device)

# ---- Loss ----
train_loss = losses.CosineSimilarityLoss(model=model)   # positive pairs labelled 1.0

# ---- Warmup steps heuristic using effective steps (account for grad-accum) ----
# Calculate number of optimization steps per epoch = ceil(len(dataloader) / grad_accum)
steps_per_epoch_effective = math.ceil(len(train_dataloader) / float(GRADIENT_ACCUMULATION_STEPS))
total_training_steps = steps_per_epoch_effective * EPOCHS
warmup_steps = int(0.1 * total_training_steps)
print(f"train examples: {len(train_examples)}, dataloader batches: {len(train_dataloader)}, "
      f"grad_accum={GRADIENT_ACCUMULATION_STEPS}, effective_steps_per_epoch={steps_per_epoch_effective}, "
      f"total_steps={total_training_steps}, warmup_steps={warmup_steps}")


Train examples (all positives): 279
Valid examples (all positives): 30
Device: cuda
Loading base model: intfloat/multilingual-e5-base
train examples: 279, dataloader batches: 35, grad_accum=1, effective_steps_per_epoch=35, total_steps=140, warmup_steps=14


In [None]:
# ---- Train ----
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
SAVE_DIR = os.path.join(SAVE_ROOT, f"sentence_e5_allpos_{timestamp}")
os.makedirs(SAVE_DIR, exist_ok=True)

print("Starting training...")

# now run training with validation
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    optimizer_params={"lr": LR},
    output_path=SAVE_DIR,
    show_progress_bar=True,
    use_amp=(device == "cuda")  # AMP for CUDA
)

print("Training finished. Model saved to:", SAVE_DIR)

Starting training...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbeny11min[0m ([33mbeny11min-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Training finished. Model saved to: /content/models/sentence_e5_allpos_20250903_124010


### distiluse-base-multilingual-cased-v1

In [None]:

info_sentence = prepare_dataset_for_training(SENT_TRAIN_PATH, SENT_VAL_PATH, "sentence",
                                             base_model_name="sentence-transformers/distiluse-base-multilingual-cased-v1"
)

train_loader_sentence = info_sentence["train_loader"]
valid_loader_sentence = info_sentence["valid_loader"]

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [None]:
# ---- Imports ----
import os, json, math, random
from datetime import datetime
from typing import Dict, List
import numpy as np
from torch.utils.data import DataLoader

from sentence_transformers import SentenceTransformer, InputExample, losses

# ---- Config (change if needed) ----
BASE_MODEL = "sentence-transformers/distiluse-base-multilingual-cased-v1"
SAVE_ROOT = "/content/models"
os.makedirs(SAVE_ROOT, exist_ok=True)

TRAIN_BATCH_SIZE = 8
EPOCHS = 4
LR = 2e-5
RND_SEED = 42
random.seed(RND_SEED)
np.random.seed(RND_SEED)

In [None]:
TRAIN_SPLIT = "/content/dataset_splits/sentence/train_split.json"
VALID_SPLIT = "/content/dataset_splits/sentence/valid_split.json"

assert os.path.exists(TRAIN_SPLIT) and os.path.exists(VALID_SPLIT), f"Missing split files: {TRAIN_SPLIT}, {VALID_SPLIT}"

# ---- Load splits and build examples ----
train_queries, train_corpus, train_relevant = load_split_json(TRAIN_SPLIT)
valid_queries, valid_corpus, valid_relevant = load_split_json(VALID_SPLIT)

train_examples = build_allpos_input_examples(train_queries, train_corpus, train_relevant)
valid_examples = build_allpos_input_examples(valid_queries, valid_corpus, valid_relevant)

print("Train examples (all positives):", len(train_examples))
print("Valid examples (all positives):", len(valid_examples))

# ---- Create DataLoaders  ----
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
valid_dataloader = DataLoader(valid_examples, shuffle=False, batch_size=TRAIN_BATCH_SIZE)

# ---- Device, mixed precision and grad accumulation configuration ----
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
if device == "cuda":
    torch.cuda.empty_cache()

# gradient accumulation hyperparam (used only for computing effective steps; NOT passed to fit())
GRADIENT_ACCUMULATION_STEPS = 1  # set to 1 if you don't want accumulation

# ---- Load model ----
print("Loading base model:", BASE_MODEL)
model = SentenceTransformer(BASE_MODEL)
# Move model to device if desired (SentenceTransformer.fit will also handle device, but explicit move is fine)
if device == "cuda":
    model = model.to(device)

# ---- Loss ----
train_loss = losses.CosineSimilarityLoss(model=model)   # positive pairs labelled 1.0

# ---- Warmup steps heuristic using effective steps (account for grad-accum) ----
# Calculate number of optimization steps per epoch = ceil(len(dataloader) / grad_accum)
steps_per_epoch_effective = math.ceil(len(train_dataloader) / float(GRADIENT_ACCUMULATION_STEPS))
total_training_steps = steps_per_epoch_effective * EPOCHS
warmup_steps = int(0.1 * total_training_steps)
print(f"train examples: {len(train_examples)}, dataloader batches: {len(train_dataloader)}, "
      f"grad_accum={GRADIENT_ACCUMULATION_STEPS}, effective_steps_per_epoch={steps_per_epoch_effective}, "
      f"total_steps={total_training_steps}, warmup_steps={warmup_steps}")


Train examples (all positives): 279
Valid examples (all positives): 30
Device: cuda
Loading base model: sentence-transformers/distiluse-base-multilingual-cased-v1
train examples: 279, dataloader batches: 35, grad_accum=1, effective_steps_per_epoch=35, total_steps=140, warmup_steps=14


In [None]:
# ---- Train ----
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
SAVE_DIR = os.path.join(SAVE_ROOT, f"sentence_distiluse-base-multilingual-cased-v1_{timestamp}")
os.makedirs(SAVE_DIR, exist_ok=True)

print("Starting training...")

# now run training with validation
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    optimizer_params={"lr": LR},
    output_path=SAVE_DIR,
    show_progress_bar=True,
    use_amp=(device == "cuda")  # AMP for CUDA
)

print("Training finished. Model saved to:", SAVE_DIR)

Starting training...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


Training finished. Model saved to: /content/models/sentence_distiluse-base-multilingual-cased-v1_20250903_124209


## Testing

If you are running all the cels the FINETUNEMODEL_DIR is different for you. check the /content/models

In [None]:
try:
    import faiss
except Exception:
    # CPU faiss
    !pip install faiss-cpu -q
    import faiss

# ------------- Imports -------------
import os, json, glob, math, time
import numpy as np
import pandas as pd
from typing import Dict, List
from sentence_transformers import SentenceTransformer
from datetime import datetime

# ------------- Config: set paths -------------
TEST_SPLIT = "/content/dataset_splits/sentence/test_dataset.json"
OUT_ROOT = "/content/results/semantic_search_faiss"
os.makedirs(OUT_ROOT, exist_ok=True)

# If you want to force a specific finetuned model directory, set this (leave None to auto-find)
FINETUNED_MODEL_DIR = "/content/models/sentence_e5_allpos_20250903_124010"
FINETUNED_MODEL_DIR2 = "/content/models/sentence_distiluse-base-multilingual-cased-v1_20250903_124209"

In [None]:


# ------------- Helpers -------------
def load_split_json(path: str):
    with open(path, "r", encoding="utf-8") as f:
        d = json.load(f)
    return d["queries"], d["corpus"], d["relevant_docs"]



def normalize_embeddings(embs: np.ndarray, eps: float=1e-12):
    norms = np.linalg.norm(embs, axis=1, keepdims=True)
    norms[norms < eps] = 1.0
    return embs / norms

# compute metrics function (same semantics as you used before)
def compute_metrics_for_k(retrieved_ids: List[List[str]], relevant_map: Dict[str, List[str]], qids: List[str], k:int):
    # identical to your previous implementation, returns dict with hit@k, precision@k, recall@k, mrr, map@k
    n = len(qids)
    hits = 0
    precision_sum = 0.0
    recall_sum = 0.0
    rr_sum = 0.0
    ap_sum = 0.0
    for i, qid in enumerate(qids):
        retrieved = retrieved_ids[i][:k]
        relevant = set(relevant_map.get(qid, []))
        if len(relevant) == 0:
            continue
        # hit@k
        is_hit = len(set(retrieved) & relevant) > 0
        if is_hit:
            hits += 1
        # precision@k
        rel_retrieved = len(set(retrieved) & relevant)
        precision_sum += rel_retrieved / float(k)
        # recall@k
        recall_sum += rel_retrieved / float(len(relevant))
        # reciprocal rank & average precision
        rr = 0.0
        ap = 0.0
        first_rel_rank = None
        num_rel_found = 0
        for rank_idx, docid in enumerate(retrieved, start=1):
            if docid in relevant:
                num_rel_found += 1
                if first_rel_rank is None:
                    first_rel_rank = rank_idx
                ap += (num_rel_found / rank_idx)
        if first_rel_rank is not None:
            rr = 1.0 / float(first_rel_rank)
            ap = ap / float(len(relevant))
        else:
            rr = 0.0
            ap = 0.0
        rr_sum += rr
        ap_sum += ap
    valid_qids = [q for q in qids if len(relevant_map.get(q, []))>0]
    m = len(valid_qids)
    if m == 0:
        return {"hit@k": None, "precision@k": None, "recall@k": None, "mrr": None, "map@k": None}
    return {
        "hit@k": hits / m,
        "precision@k": precision_sum / m,
        "recall@k": recall_sum / m,
        "mrr": rr_sum / m,
        "map@k": ap_sum / m
    }

# ------------- Load test split -------------
assert os.path.exists(TEST_SPLIT), f"Test split not found: {TEST_SPLIT}"
q_map, corpus_map, relevant_map = load_split_json(TEST_SPLIT)
qids = list(q_map.keys())
corpus_ids = list(corpus_map.keys())
query_texts = [q_map[q] for q in qids]
corpus_texts = [corpus_map[c] for c in corpus_ids]
print(f"Loaded test: {len(qids)} queries, {len(corpus_ids)} corpus chunks")


# ------------- Candidate models to evaluate -------------
models_to_eval = []

models_to_eval.append(("finetuned_distiluse-base-multilingual-cased-v1", FINETUNED_MODEL_DIR2))

if FINETUNED_MODEL_DIR is not None and os.path.isdir(FINETUNED_MODEL_DIR):
    models_to_eval.append(("finetuned_sentence_model_multilingual_e5", FINETUNED_MODEL_DIR))


Loaded test: 79 queries, 20 corpus chunks


In [None]:

# ------------- Main loop: encode, build FAISS, retrieve, evaluate -------------
TOPK_LIST = [1,3,5,10]
summary_rows = []
per_query_outputs = {}

for model_tag, model_spec in models_to_eval:
    print("\n=== Evaluating model:", model_tag, "->", model_spec)
    t0 = time.time()
    # load model
    model = SentenceTransformer(model_spec)
    # encode corpus and queries (batched inside encode)
    print("Encoding corpus with", model_tag)
    corpus_embs = model.encode(corpus_texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
    print("Encoding queries with", model_tag)
    query_embs = model.encode(query_texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
    # normalize
    corpus_embs = normalize_embeddings(corpus_embs)
    query_embs = normalize_embeddings(query_embs)

    # Build FAISS index for inner-product (cosine on normalized vectors)
    dim = corpus_embs.shape[1]
    # use IndexFlatIP for simplicity (CPU & GPU supported variants exist)
    index = faiss.IndexFlatIP(dim)
    # if you want GPU: faiss.index_cpu_to_all_gpus(index)
    index.add(corpus_embs.astype(np.float32))
    print("FAISS index built, n = ", index.ntotal)

    # Search top-K (max TOPK)
    max_k = max(TOPK_LIST)
    D, I = index.search(query_embs.astype(np.float32), max_k)   # D: inner products, I: indices into corpus_embs
    # map indices -> ids and scores
    retrieved_ids_all = [[corpus_ids[idx] for idx in row] for row in I.tolist()]
    retrieved_scores_all = D.tolist()   # inner product (cosine if normalized)

    # mean_top1_cosine
    mean_top1_cos = float(np.mean([row[0] for row in D])) if D.shape[0]>0 else float('nan')

    # compute metrics for each topk
    for k in TOPK_LIST:
        metrics = compute_metrics_for_k(retrieved_ids_all, relevant_map, qids, k)
        summary_rows.append({
            "dataset": "sentence_test",
            "retriever": "faiss_ip",
            "model": model_tag,
            "top_k": k,
            "EM": metrics["hit@k"] if k==1 else None,
            "hit@k": metrics["hit@k"],
            "precision@k": metrics["precision@k"],
            "recall@k": metrics["recall@k"],
            "mrr": metrics["mrr"],
            "map@k": metrics["map@k"],
            "mean_top1_cosine": mean_top1_cos if k==1 else (mean_top1_cos if k==max(TOPK_LIST) else None)
        })

    # per-query save
    perq_rows = []
    for qi, qid in enumerate(qids):
        perq_rows.append({
            "qid": qid,
            "query": q_map[qid],
            "expected": relevant_map.get(qid, []),
            "retrieved_ids_topk": retrieved_ids_all[qi],
            "retrieved_scores_topk": retrieved_scores_all[qi]
        })
    per_query_fname = os.path.join(OUT_ROOT, f"per_query_{model_tag}.csv")
    pd.DataFrame(perq_rows).to_csv(per_query_fname, index=False)
    per_query_outputs[model_tag] = per_query_fname

    t1 = time.time()
    print(f"Model {model_tag} done. mean_top1_cosine={mean_top1_cos:.4f}, time={t1-t0:.1f}s, per-query saved to {per_query_fname}")

# ------------- Save summary CSV -------------
df_summary = pd.DataFrame(summary_rows)
summary_csv = os.path.join(OUT_ROOT, f"faiss_evaluation_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
df_summary.to_csv(summary_csv, index=False)
print("\nSaved summary to:", summary_csv)
print(df_summary)

# ------------- quick print best method (by MRR@1? or by hit@1) -------------
# pick best by mrr at k=10
best = df_summary[df_summary.top_k==10].sort_values("mrr", ascending=False).head(1)
if not best.empty:
    print("\nBest model (by MRR@10):")
    print(best[["model","mrr","mean_top1_cosine","hit@k"]].to_string(index=False))



=== Evaluating model: finetuned_distiluse-base-multilingual-cased-v1 -> /content/models/sentence_distiluse-base-multilingual-cased-v1_20250903_124209
Encoding corpus with finetuned_distiluse-base-multilingual-cased-v1


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding queries with finetuned_distiluse-base-multilingual-cased-v1


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

FAISS index built, n =  20
Model finetuned_distiluse-base-multilingual-cased-v1 done. mean_top1_cosine=0.9973, time=1.0s, per-query saved to /content/results/semantic_search_faiss/per_query_finetuned_distiluse-base-multilingual-cased-v1.csv

=== Evaluating model: finetuned_sentence_model_multilingual_e5 -> /content/models/sentence_e5_allpos_20250903_124010
Encoding corpus with finetuned_sentence_model_multilingual_e5


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding queries with finetuned_sentence_model_multilingual_e5


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

FAISS index built, n =  20
Model finetuned_sentence_model_multilingual_e5 done. mean_top1_cosine=0.9950, time=4.2s, per-query saved to /content/results/semantic_search_faiss/per_query_finetuned_sentence_model_multilingual_e5.csv

Saved summary to: /content/results/semantic_search_faiss/faiss_evaluation_summary_20250903_124635.csv
         dataset retriever                                           model  \
0  sentence_test  faiss_ip  finetuned_distiluse-base-multilingual-cased-v1   
1  sentence_test  faiss_ip  finetuned_distiluse-base-multilingual-cased-v1   
2  sentence_test  faiss_ip  finetuned_distiluse-base-multilingual-cased-v1   
3  sentence_test  faiss_ip  finetuned_distiluse-base-multilingual-cased-v1   
4  sentence_test  faiss_ip        finetuned_sentence_model_multilingual_e5   
5  sentence_test  faiss_ip        finetuned_sentence_model_multilingual_e5   
6  sentence_test  faiss_ip        finetuned_sentence_model_multilingual_e5   
7  sentence_test  faiss_ip        finetuned_

In [None]:
##The file path would be different for you since the path depends on current timestamp for debugging
pd.read_csv('/content/results/semantic_search_faiss/faiss_evaluation_summary_20250903_124635.csv')

Unnamed: 0,dataset,retriever,model,top_k,EM,hit@k,precision@k,recall@k,mrr,map@k,mean_top1_cosine
0,sentence_test,faiss_ip,finetuned_distiluse-base-multilingual-cased-v1,1,0.151899,0.151899,0.151899,0.151899,0.151899,0.151899,0.997289
1,sentence_test,faiss_ip,finetuned_distiluse-base-multilingual-cased-v1,3,,0.341772,0.113924,0.341772,0.234177,0.234177,
2,sentence_test,faiss_ip,finetuned_distiluse-base-multilingual-cased-v1,5,,0.506329,0.101266,0.506329,0.272152,0.272152,
3,sentence_test,faiss_ip,finetuned_distiluse-base-multilingual-cased-v1,10,,0.670886,0.067089,0.670886,0.293555,0.293555,0.997289
4,sentence_test,faiss_ip,finetuned_sentence_model_multilingual_e5,1,0.518987,0.518987,0.518987,0.518987,0.518987,0.518987,0.995033
5,sentence_test,faiss_ip,finetuned_sentence_model_multilingual_e5,3,,0.708861,0.236287,0.708861,0.601266,0.601266,
6,sentence_test,faiss_ip,finetuned_sentence_model_multilingual_e5,5,,0.772152,0.15443,0.772152,0.615823,0.615823,
7,sentence_test,faiss_ip,finetuned_sentence_model_multilingual_e5,10,,0.936709,0.093671,0.936709,0.635845,0.635845,0.995033


## Save our best model

In [None]:

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
from sentence_transformers import SentenceTransformer


best_model_path ="/content/models/sentence_e5_allpos_20250903_124010"
save_path = "/content/drive/MyDrive/Colab Notebooks/AI_task/RAG/model-e5"

model = SentenceTransformer(best_model_path)

# Save to Google Drive
model.save(save_path)
print(f"Model saved to: {save_path}")

Model saved to: /content/drive/MyDrive/Colab Notebooks/AI_task/RAG/model-e5


In [None]:
from google.colab import drive
import zipfile
import os

# Mount Google Drive
drive.mount('/content/drive')

# Path to your model folder in Google Drive
model_folder_path = "/content/drive/MyDrive/Colab Notebooks/AI_task/RAG/model-e5"
zip_output_path = "/content/drive/MyDrive/Colab Notebooks/AI_task/RAG/model-e5.zip"

# Create zip file
with zipfile.ZipFile(zip_output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(model_folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            # Add file to zip with relative path
            arcname = os.path.relpath(file_path, model_folder_path)
            zipf.write(file_path, arcname)

print(f"✓ Zip file created at: {zip_output_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Zip file created at: /content/drive/MyDrive/Colab Notebooks/AI_task/RAG/model-e5.zip


## Final result (UI)

You can run all the below cells and see the results without training the model. There are couple of things to substitute, including google API key, Ngrok auth token, and hugging face auth token.

In [None]:

!pip install -q sentence-transformers faiss-cpu streamlit pyngrok tqdm


In [None]:
!pip install --upgrade google-genai




In [None]:
from google.colab import userdata
from google import genai

# Get API key from Colab secrets
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

client = genai.Client(api_key=GOOGLE_API_KEY)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Hey"
)
print(response.text)

Hey there! How can I help you today?


In [None]:
import os
from google.colab import userdata


NGROK_AUTH_TOKEN = userdata.get("NGROK_AUTH_TOKEN")
!ngrok config add-authtoken $NGROK_AUTH_TOKEN


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
## LOAD our model
## google drive link: #https://drive.google.com/file/d/1F0X0b0akte2PgWTbAJ_KxQcJeeNl_yte/view?usp=sharing
!pip install gdown

def download_drive_file(file_id: str, dest_path: str):
    """Download file from Google Drive given a file_id."""
    url = f"https://drive.google.com/uc?id={file_id}"
    print("Downloading from:", url)
    gdown.download(url, dest_path, quiet=False)

def unzip_to(zip_path: str, dest_dir: str):
    print("Unzipping", zip_path, "->", dest_dir)
    if os.path.exists(dest_dir):
        print("dest exists, removing and re-creating")
        shutil.rmtree(dest_dir)
    os.makedirs(dest_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall(dest_dir)
    print("   extracted")





In [None]:


import gdown
import zipfile
import os
from sentence_transformers import SentenceTransformer


file_id = "1F0X0b0akte2PgWTbAJ_KxQcJeeNl_yte"
zip_path = "/content/model.zip"
extract_path = "/content/model"


print("Downloading model zip file...")
download_drive_file(file_id, zip_path)


print("Extracting model...")
unzip_to(zip_path, extract_path)


print(" Loading Sentence Transformer model...")
model = SentenceTransformer(extract_path)

# Step 4: Test the model
print("🧪 Testing model...")
embeddings = model.encode(["This is a test sentence"])
print(f" Model loaded successfully!")
print(f"   Embedding shape: {embeddings.shape}")
print(f"   Model dimension: {embeddings.shape[1]}")


os.remove(zip_path)
print("Cleaned up temporary zip file")


Downloading model zip file...
Downloading from: https://drive.google.com/uc?id=1F0X0b0akte2PgWTbAJ_KxQcJeeNl_yte


Downloading...
From (original): https://drive.google.com/uc?id=1F0X0b0akte2PgWTbAJ_KxQcJeeNl_yte
From (redirected): https://drive.google.com/uc?id=1F0X0b0akte2PgWTbAJ_KxQcJeeNl_yte&confirm=t&uuid=8d301c32-78b0-4afd-a7d3-c9f31e187465
To: /content/model.zip
100%|██████████| 800M/800M [00:11<00:00, 70.2MB/s]


Extracting model...
Unzipping /content/model.zip -> /content/model
   extracted
 Loading Sentence Transformer model...
🧪 Testing model...
 Model loaded successfully!
   Embedding shape: (1, 768)
   Model dimension: 768
Cleaned up temporary zip file


In [None]:


def load_dataset(path: str) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, List[str]]]:
    """Load dataset JSON into queries, corpus, relevant docs."""
    with open(path, "r", encoding="utf-8") as f:
        d = json.load(f)
    queries = d["queries"]          # qid -> question text
    corpus = d["corpus"]            # docid -> chunk text
    relevant = d["relevant_docs"]   # qid -> [docid, ...]
    return queries, corpus, relevant



In [None]:
GDRIVE_FILE_ID_SENTENCE = "1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH"
download_drive_file(GDRIVE_FILE_ID_SENTENCE, "sentence_dataset.zip")
unzip_to("sentence_dataset.zip", "sentence_dataset")

Downloading from: https://drive.google.com/uc?id=1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH


Downloading...
From: https://drive.google.com/uc?id=1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH
To: /content/sentence_dataset.zip
100%|██████████| 122k/122k [00:00<00:00, 3.65MB/s]

Unzipping sentence_dataset.zip -> sentence_dataset
dest exists, removing and re-creating
   extracted





In [None]:
import os, json, math, pickle
from tqdm.auto import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# ---------- Paths (adjust if needed) ----------
TRAIN_PATH = "/content/sentence_dataset/train_dataset.json"
VAL_PATH   = "/content/sentence_dataset/val_dataset.json"

# Output artifacts
ARTIFACT_DIR = "/content/faiss_store"
os.makedirs(ARTIFACT_DIR, exist_ok=True)
INDEX_PATH = os.path.join(ARTIFACT_DIR, "faiss_index.bin")
META_PATH = os.path.join(ARTIFACT_DIR, "docid_texts.pkl")


MODEL_NAME = "/content/model"

# ---------- helper to load dataset JSON ----------
def load_dataset(path: str):
    with open(path, "r", encoding="utf-8") as f:
        d = json.load(f)
    return d["queries"], d["corpus"], d["relevant_docs"]

# ---------- load train + val corpora and merge them ----------
_, corpus_train, _ = load_dataset(TRAIN_PATH)
_, corpus_val,   _ = load_dataset(VAL_PATH)

# Merge while preserving deterministic order; prefer train entries then val (avoid duplicates)
merged_corpus = {}
for d in (corpus_train, corpus_val):
    for docid, text in d.items():
        if docid not in merged_corpus:
            merged_corpus[docid] = text

print(f"Total unique chunks to index: {len(merged_corpus)}")

# Optional: you can further concatenate neighboring chunks or de-duplicate here
# For now we index each chunk as a separate passage (docid -> chunk text)

# ---------- prepare lists in stable order ----------
docids = list(merged_corpus.keys())
texts  = [merged_corpus[d] for d in docids]

# ---------- load model and compute embeddings (batched) ----------
model = SentenceTransformer(MODEL_NAME)

BATCH_SIZE = 64
embs = []
for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Encoding corpus"):
    batch_texts = texts[i:i+BATCH_SIZE]
    batch_emb = model.encode(batch_texts, batch_size=len(batch_texts), convert_to_numpy=True, show_progress_bar=False)
    embs.append(batch_emb)
embs = np.vstack(embs).astype("float32")
print("Embeddings shape:", embs.shape)

# Normalize vectors (so inner product == cosine similarity)
norms = np.linalg.norm(embs, axis=1, keepdims=True)
norms[norms == 0.0] = 1.0
embs = embs / norms

# ---------- build FAISS Index (IndexFlatIP on normalized vectors -> cosine) ----------
d = embs.shape[1]
index = faiss.IndexFlatIP(d)   # inner product on unit vectors = cosine similarity
index.add(embs)
print("FAISS index built. ntotal =", index.ntotal)

# save index and metadata
faiss.write_index(index, INDEX_PATH)
with open(META_PATH, "wb") as f:
    pickle.dump({"docids": docids, "texts": texts, "model_used": MODEL_NAME}, f)

print("Saved index to:", INDEX_PATH)
print("Saved metadata to:", META_PATH)


Total unique chunks to index: 98


Encoding corpus:   0%|          | 0/2 [00:00<?, ?it/s]

Embeddings shape: (98, 768)
FAISS index built. ntotal = 98
Saved index to: /content/faiss_store/faiss_index.bin
Saved metadata to: /content/faiss_store/docid_texts.pkl


In [None]:
%%bash
cat > app.py <<'PY'
import streamlit as st
import faiss, pickle, os, json
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

# ------- Config -------

GOOGLE_API_KEY = "Your API KEY SHOULD BE HERE" #Your google API key
ARTIFACT_DIR = "/content/faiss_store"
INDEX_PATH = os.path.join(ARTIFACT_DIR, "faiss_index.bin")
META_PATH  = os.path.join(ARTIFACT_DIR, "docid_texts.pkl")

DEFAULT_MODEL = "intfloat/multilingual-e5-base"
MODEL_PATH = "/content/model"  # fine-tuned model

# ------- Helpers -------
def load_faiss_and_meta(index_path=INDEX_PATH, meta_path=META_PATH):
    if not os.path.exists(index_path) or not os.path.exists(meta_path):
        raise FileNotFoundError("FAISS index or metadata not found.")
    idx = faiss.read_index(index_path)
    with open(meta_path, "rb") as f:
        meta = pickle.load(f)
    return idx, meta

def load_model():
    if os.path.exists(MODEL_PATH) and os.path.isdir(MODEL_PATH):
        try:
            # First try to load normally
            model = SentenceTransformer(MODEL_PATH)
            # Ensure model is on the right device
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            model = model.to(device)
            return model
        except Exception as e:
            st.error(f"Error loading fine-tuned model: {e}. Using default model.")
            return SentenceTransformer(DEFAULT_MODEL)
    else:
        st.info("Fine-tuned model not found, using default model.")
        return SentenceTransformer(DEFAULT_MODEL)



def search_index(query, model, index, meta, top_k=10):
    q_emb = model.encode([query], convert_to_numpy=True)
    q_emb = q_emb.astype("float32")
    q_emb = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-12)
    D, I = index.search(q_emb, top_k)
    results = []
    for score, idx in zip(D[0], I[0]):
        if idx < 0:
            continue
        docid = meta["docids"][idx]
        text = meta["texts"][idx]
        results.append({"docid": docid, "score": float(score), "text": text})
    return results

# ------- Streamlit UI -------
st.set_page_config(page_title="مدل بازیابی داروها", layout="wide")

# Inject CSS for RTL
st.markdown(
    """
    <style>
    body, .css-1d391kg { direction: rtl; text-align: right; }
    textarea, input, .stNumberInput, .stTextInput { direction: rtl; text-align: right; }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("پرسش و پاسخ در مورد داروها")
st.markdown("سوال خود را وارد کنید و جستجو را بزنید.")

# Query input
query = st.text_area("پرسش", height=160)
top_k = int(st.number_input("تعداد جواب ها (Top K)", min_value=1, max_value=10, value=5, step=1))

# Select output type
output_type = st.radio("نوع خروجی:", ["نتایج خام (Top-K)", "خروجی Gemini Flash"])

# Load model and FAISS index once
try:
    index, meta = load_faiss_and_meta()
    model = load_model()
except Exception as e:
    st.error(f"Error loading model/index: {e}")
    st.stop()

if output_type == "نتایج خام (Top-K)":
    if st.button("جستجو"):
        if not query.strip():
            st.warning("لطفا متن خود را وارد کنید.")
            st.stop()

        # FAISS search
        results = search_index(query, model, index, meta, top_k=top_k)
        st.markdown(f"**Top-{top_k} results (FAISS)**")
        for i, r in enumerate(results, start=1):
            st.markdown(f"**{i}. docid:** `{r['docid']}` — **score:** {r['score']:.4f}")
            st.write(r["text"])
            st.markdown("---")

elif output_type == "خروجی Gemini Flash":
    if st.button("جستجو"):
        if not query.strip():
            st.warning("لطفا متن خود را وارد کنید.")
            st.stop()

        # Retrieve top 10 docs from FAISS
        top_docs = search_index(query, model, index, meta, top_k=10)
        # Add docid references to the context
        context_with_refs = "\n\n".join([f"[{d['docid']}]: {d['text']}" for d in top_docs])

        # Build the prompt for Gemini Flash with reference requirement
        prompt = f"""بر اساس متن‌های زیر، لطفاً یک پاسخ جامع به پرسش '{query}' ارائه دهید.

اگر پاسخ در متن زیر نبود بگو که پاسخی یافت نشد.
اگر جوابی پیدا شد صفحه جواب رو به عنوان رفرنس بده (مثلاً صفحه اول میشود p1).

متن‌های مرجع:
{context_with_refs}"""

        # Call the Gemini Flash API
        try:
            from google import genai
            client = genai.Client(api_key=GOOGLE_API_KEY)

            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=prompt
            )

            gemini_response = response.text  # extract generated text
            st.markdown("**خروجی:**")
            st.write(gemini_response)

        except Exception as e:
            st.error(f"Gemini Flash API call failed: {e}")
            st.stop()

PY


In [None]:

from pyngrok import ngrok
import subprocess, time, os



PORT = 8501
public_url = ngrok.connect(PORT).public_url
print("Public URL:", public_url)

# Start streamlit (headless) in background
cmd = f"streamlit run /content/app.py --server.port {PORT} --server.headless true"
# Use Popen so the notebook doesn't block
proc = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)

print("Streamlit started. Visit the public URL above.")
# Optionally, to see logs:
# print(proc.stdout.readline().decode())


Public URL: https://2e767c71c351.ngrok-free.app
Streamlit started. Visit the public URL above.
