<a href="https://colab.research.google.com/github/Benyormin/Question_answering/blob/main/AI_Task_D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q sentence-transformers gdown tqdm

## Loading Dataset

In [None]:
# === Imports ===
import os
import json
import zipfile
import shutil
import gdown
from typing import Dict, List, Tuple

# === Helper functions ===
def download_drive_file(file_id: str, dest_path: str):
    """Fetch a Google Drive file by its ID and store it at ``dest_path``.

    Args:
        file_id: Drive file identifier; it is substituted into the ``uc?id=`` URL.
        dest_path: Local path passed to ``gdown.download`` as the output target.

    The resolved URL is printed before the transfer starts and gdown's own
    output is left enabled (``quiet=False``). Nothing is returned.
    """
    source_url = f"https://drive.google.com/uc?id={file_id}"
    print("Downloading from:", source_url)
    gdown.download(url=source_url, output=dest_path, quiet=False)


def unzip_to(zip_path: str, dest_dir: str):
    """Extract ``zip_path`` into a freshly (re)created ``dest_dir``.

    The archive is opened *before* the destination is touched, so a missing or
    corrupt archive raises without deleting a previously extracted ``dest_dir``.

    Args:
        zip_path: Path of the zip archive to extract.
        dest_dir: Directory to extract into. If it already exists it is removed
            and re-created, so stale files from an earlier extraction do not survive.

    Raises:
        FileNotFoundError: if ``zip_path`` does not exist.
        zipfile.BadZipFile: if ``zip_path`` is not a valid zip archive.
    """
    print("Unzipping", zip_path, "->", dest_dir)
    # Opening first validates the archive; only then is it safe to wipe dest_dir.
    with zipfile.ZipFile(zip_path, 'r') as z:
        if os.path.exists(dest_dir):
            print("dest exists, removing and re-creating")
            shutil.rmtree(dest_dir)
        os.makedirs(dest_dir, exist_ok=True)
        z.extractall(dest_dir)
    print("   extracted")


def load_dataset(path: str) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, List[str]]]:
    """Read a retrieval dataset JSON file and return its three sections.

    Args:
        path: Path to a UTF-8 encoded JSON file.

    Returns:
        A ``(queries, corpus, relevant_docs)`` tuple holding the values stored
        under the "queries", "corpus" and "relevant_docs" keys, in that order
        (annotated as qid -> question text, docid -> chunk text, qid -> [docid, ...]).

    Raises:
        KeyError: if any of the three keys is absent from the file.
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return tuple(payload[section] for section in ("queries", "corpus", "relevant_docs"))


# === Google Drive File IDs ===
# One archive per chunking variant ("sentence" and "word" datasets).
GDRIVE_FILE_ID_SENTENCE = "1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH"
GDRIVE_FILE_ID_WORD     = "19EJy3dh1IAW5Ko6Z-lz7gmm9OxniH00k"

# === Step 1: Download ===
# Destinations are relative to the current working directory. Later cells read from
# /content/..., so this assumes the notebook runs with /content as cwd.
download_drive_file(GDRIVE_FILE_ID_SENTENCE, "sentence_dataset.zip")
download_drive_file(GDRIVE_FILE_ID_WORD, "word_dataset.zip")

# === Step 2: Unzip ===
# unzip_to() removes an existing target directory before extracting.
unzip_to("sentence_dataset.zip", "sentence_dataset")
unzip_to("word_dataset.zip", "word_dataset")




Downloading from: https://drive.google.com/uc?id=1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH


Downloading...
From: https://drive.google.com/uc?id=1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH
To: /content/sentence_dataset.zip
100%|██████████| 122k/122k [00:00<00:00, 4.16MB/s]


Downloading from: https://drive.google.com/uc?id=19EJy3dh1IAW5Ko6Z-lz7gmm9OxniH00k


Downloading...
From: https://drive.google.com/uc?id=19EJy3dh1IAW5Ko6Z-lz7gmm9OxniH00k
To: /content/word_dataset.zip
100%|██████████| 134k/134k [00:00<00:00, 3.98MB/s]

Unzipping sentence_dataset.zip -> sentence_dataset
   extracted
Unzipping word_dataset.zip -> word_dataset
   extracted





In [None]:
# === Load datasets ===
# Each archive ships a train_dataset.json and a val_dataset.json; load_dataset() returns
# (queries, corpus, relevant_docs) for each file.
sentence_queries_train, sentence_corpus_train, sentence_relevant_train = load_dataset('/content/sentence_dataset/train_dataset.json')
sentence_queries_val, sentence_corpus_val, sentence_relevant_val = load_dataset('/content/sentence_dataset/val_dataset.json')

word_queries_train, word_corpus_train, word_relevant_train = load_dataset("/content/word_dataset/train_dataset.json")
word_queries_val, word_corpus_val, word_relevant_val = load_dataset("/content/word_dataset/val_dataset.json")

# Quick size check of the two training files (number of queries and corpus entries).
print("Sentence dataset size:", len(sentence_queries_train), "queries,", len(sentence_corpus_train), "docs")
print("Word-based dataset size:", len(word_queries_train), "queries,", len(word_corpus_train), "docs")


Sentence dataset size: 309 queries, 78 docs
Word-based dataset size: 366 queries, 93 docs


In [None]:
# Preview the first three entries of each mapping (queries, corpus, relevant docs);
# displaying the full dicts floods the cell output.
[dict(list(mapping.items())[:3]) for mapping in (sentence_queries_train, sentence_corpus_train, sentence_relevant_train)]

({'7645a669-81df-4cd7-a2b2-ca503100b1e0': 'دیکلوفناک چگونه با التهاب و درد مقابله می کند؟',
  '0df0cc7c-0dbc-4d02-a4c7-3d9a6c320502': 'دیکلوفناک به صورت چند ملح در بازار موجود است؟',
  'b7f349cb-7927-40c0-ba47-857906fc2ee1': 'تفاوت دیکلوفناک سدیم و پتاسیم در چیست؟',
  '86c5b2ec-aeab-466c-8875-17c37f52e950': 'شیاف دیکلوفناک چه علائم آرتریت مفاصل را تسکین می دهد؟',
  'a6601503-7266-458d-9d07-85681526d987': 'شیاف دیکلوفناک برای تسکین درد قاعدگی چقدر زمان می برد؟',
  'b87c36b2-91e3-47f5-a63a-3911868feece': 'آمپول دیکلوفناک چه علائمی را در بیماری آرتروز آرام می کند؟',
  'd0fffdb7-e7ca-4c00-a046-a8919821aaa9': 'دیکلوفناک تولید کدام آنزیم التهاب ساز را مهار می کند؟',
  'be587b32-cdab-49fa-b09e-ad289d759d6b': 'دُز بالای آمپول دیکلوفناک به عنوان چه نوع مسکنی شناخته می شود؟',
  '6a4a27f8-8bf7-41c1-b23f-2923513b4202': 'با کاهش میزان مواد شیمیایی التهابی در بدن چه اتفاقی می افتد؟',
  '2da0f6d4-ddf8-4a58-ab01-afcc45b53032': 'پزشک معالج بعد از تجویز دیکلوفناک چه چیزی به بیمار می دهد؟',
  'dd9810ad-b

## Preprocessing

In [None]:
# ------------------ Prepare dataset splits + DataLoaders (copy-paste) ------------------
import os
import json
import random
from typing import Dict, List, Tuple
from sentence_transformers import InputExample, SentencesDataset, SentenceTransformer
from torch.utils.data import DataLoader

# ---------- CONFIG (keep your values or adjust) ----------
RND_SEED = 42          # seed for the train/validation shuffle and for positive sampling
VAL_FRAC = 0.10        # fraction of training queries held out as validation
POS_PER_QUERY = 1      # max number of positive docs sampled per query
TRAIN_BATCH_SIZE = 16  # batch size used for both DataLoaders built below

# paths (make sure these point to your datasets)
SENT_TRAIN_PATH = "/content/sentence_dataset/train_dataset.json"
SENT_VAL_PATH   = "/content/sentence_dataset/val_dataset.json"
WORD_TRAIN_PATH = "/content/word_dataset/train_dataset.json"
WORD_VAL_PATH   = "/content/word_dataset/val_dataset.json"

# Directory that receives one sub-folder of split JSON files per dataset name.
OUT_DIR = "/content/dataset_splits"
os.makedirs(OUT_DIR, exist_ok=True)
# Seeds the global `random` module; the helpers below use their own random.Random(seed).
random.seed(RND_SEED)

# Model name loaded inside prepare_dataset_for_training and passed to SentencesDataset.
# NOTE(review): the original comment says it is "needed for tokenization" — confirm
# against the installed sentence-transformers version.
# You can change this to your chosen base model.
BASE_MODEL_FOR_DATASET = "distiluse-base-multilingual-cased-v2"

# ---------- helpers ----------
def save_split_json(queries: Dict[str,str], corpus: Dict[str,str], relevant: Dict[str,List[str]], out_path: str):
    """Write one dataset split to ``out_path`` as indented UTF-8 JSON.

    The file holds a single object with the keys "queries", "corpus" and
    "relevant_docs" (in that order). Non-ASCII text is written as-is rather
    than escaped. An existing file at ``out_path`` is overwritten.
    """
    with open(out_path, mode="w", encoding="utf-8") as sink:
        json.dump(
            dict(queries=queries, corpus=corpus, relevant_docs=relevant),
            sink,
            indent=2,
            ensure_ascii=False,
        )

def split_train_holdout(
    train_queries: Dict[str,str],
    train_corpus: Dict[str,str],
    train_relevant: Dict[str,List[str]],
    val_frac: float = VAL_FRAC,
    seed: int = RND_SEED
) -> Tuple[Tuple[Dict[str,str],Dict[str,str],Dict[str,List[str]]], Tuple[Dict[str,str],Dict[str,str],Dict[str,List[str]]]]:
    """
    Split train queries into new_train and new_valid (by query ids).

    Query ids are shuffled with ``random.Random(seed)``; the first
    ``max(1, int(len(qids) * val_frac))`` ids become the validation split and the
    remainder the training split. Both splits keep the shuffled order, so for a
    given input and seed the resulting dicts (and files written from them) are
    identical across interpreter sessions.

    Corpus is kept intact for both splits (retriever expects full corpus); each
    split receives its own shallow copy.

    Args:
        train_queries: qid -> query text.
        train_corpus: docid -> document text.
        train_relevant: qid -> list of relevant docids; a qid missing here gets ``[]``.
        val_frac: fraction of queries to hold out (at least one id is always requested).
        seed: seed for the shuffle.

    Returns ((train_q, train_c, train_rel), (valid_q, valid_c, valid_rel))
    """
    qids = list(train_queries.keys())
    random.Random(seed).shuffle(qids)
    n_valid = max(1, int(len(qids) * val_frac))
    # Keep lists, not a set: iterating a set of str ids depends on per-process hash
    # randomization, which would make the validation order differ between runs.
    valid_qids = qids[:n_valid]
    train_qids = qids[n_valid:]

    train_q_map = {qid: train_queries[qid] for qid in train_qids}
    valid_q_map = {qid: train_queries[qid] for qid in valid_qids}

    # keep the whole corpus in both splits
    train_corpus_map = dict(train_corpus)
    valid_corpus_map = dict(train_corpus)

    train_rel = {qid: train_relevant.get(qid, []) for qid in train_qids}
    valid_rel = {qid: train_relevant.get(qid, []) for qid in valid_qids}

    return (train_q_map, train_corpus_map, train_rel), (valid_q_map, valid_corpus_map, valid_rel)

def build_input_examples(
    queries_map: Dict[str,str],
    corpus_map: Dict[str,str],
    relevant_map: Dict[str,List[str]],
    pos_per_query: int = POS_PER_QUERY,
    seed: int = RND_SEED,
    use_all_positives: bool = False
) -> List[InputExample]:
    """
    Build one InputExample(texts=[query, positive_doc]) per selected positive.

    For every query in ``queries_map`` (in dict order):
      * queries with no relevant docids are skipped;
      * all positives are used when ``use_all_positives`` is true or when there are
        at most ``pos_per_query`` of them; otherwise ``pos_per_query`` positives are
        drawn with a ``random.Random(seed)`` instance shared across all queries;
      * docids that are absent from ``corpus_map`` are silently dropped.

    Returns the examples in generation order.
    """
    sampler = random.Random(seed)
    pairs: List[InputExample] = []
    for query_id, query_text in queries_map.items():
        candidates = list(relevant_map.get(query_id, []) or [])
        if not candidates:
            continue
        # Sampling (and thus RNG consumption) happens only when there is a surplus.
        take_all = use_all_positives or len(candidates) <= pos_per_query
        selected = candidates if take_all else sampler.sample(candidates, pos_per_query)
        for doc_id in selected:
            passage = corpus_map.get(doc_id)
            if passage is not None:
                pairs.append(InputExample(texts=[query_text, passage]))
    return pairs

# ---------- main function ----------
def prepare_dataset_for_training(
    train_path: str,
    test_path: str,
    name: str,
    val_frac: float = VAL_FRAC,
    pos_per_query: int = POS_PER_QUERY,
    seed: int = RND_SEED,
    out_dir: str = OUT_DIR,
    base_model_for_dataset: str = BASE_MODEL_FOR_DATASET,
    train_batch_size: int = TRAIN_BATCH_SIZE
):
    """Split one dataset, persist the splits, and build train/validation DataLoaders.

    Steps:
      1. Read ``train_path`` and ``test_path`` (JSON with "queries", "corpus",
         "relevant_docs"). The test file is passed through untouched.
      2. Hold out ``val_frac`` of the training queries via ``split_train_holdout``.
      3. Write ``train_split.json``, ``valid_split.json`` and ``test_dataset.json``
         into ``<out_dir>/<name>/``.
      4. Build (query, positive) InputExamples for the train and validation splits.
      5. Load ``base_model_for_dataset``, wrap both example lists in
         ``SentencesDataset`` and then in ``DataLoader`` (train shuffled,
         validation not), both with ``train_batch_size``.

    Returns:
        A dict with the keys "name", "orig_train_q", "orig_train_docs",
        "orig_test_q", "new_train_q", "new_valid_q", "train_examples",
        "valid_examples", "paths", "dataloaders" and "tokenizer_model_used".
        A short summary is also printed.
    """
    def _read_sections(json_path):
        # Same three-key layout as the dataset files shipped in the archives.
        with open(json_path, "r", encoding="utf-8") as fh:
            blob = json.load(fh)
        return blob["queries"], blob["corpus"], blob["relevant_docs"]

    train_q, train_c, train_rel = _read_sections(train_path)
    test_q, test_c, test_rel = _read_sections(test_path)  # final test set: left as-is

    # Each part is a (queries, corpus, relevant_docs) triple.
    train_part, valid_part = split_train_holdout(
        train_q, train_c, train_rel, val_frac=val_frac, seed=seed
    )
    new_train_q = train_part[0]
    new_valid_q = valid_part[0]

    # Persist the three files under <out_dir>/<name>/.
    base_out = os.path.join(out_dir, name)
    os.makedirs(base_out, exist_ok=True)
    split_paths = {
        "train": os.path.join(base_out, "train_split.json"),
        "valid": os.path.join(base_out, "valid_split.json"),
        "test": os.path.join(base_out, "test_dataset.json"),
    }
    save_split_json(*train_part, split_paths["train"])
    save_split_json(*valid_part, split_paths["valid"])
    save_split_json(test_q, test_c, test_rel, split_paths["test"])

    # (query, positive_doc) pairs for each split.
    train_examples = build_input_examples(*train_part, pos_per_query, seed)
    valid_examples = build_input_examples(*valid_part, pos_per_query, seed)

    # SentencesDataset is given a loaded model alongside the examples.
    encoder = SentenceTransformer(base_model_for_dataset)
    train_dataset = SentencesDataset(train_examples, encoder)
    valid_dataset = SentencesDataset(valid_examples, encoder)
    loaders = {
        "train_loader": DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size),
        "valid_loader": DataLoader(valid_dataset, shuffle=False, batch_size=train_batch_size),
    }

    info = {
        "name": name,
        "orig_train_q": len(train_q),
        "orig_train_docs": len(train_c),
        "orig_test_q": len(test_q),
        "new_train_q": len(new_train_q),
        "new_valid_q": len(new_valid_q),
        "train_examples": len(train_examples),
        "valid_examples": len(valid_examples),
        "paths": split_paths,
        "dataloaders": loaders,
        "tokenizer_model_used": base_model_for_dataset
    }

    print(f"[{name}] original train queries: {len(train_q)}, docs: {len(train_c)}; test queries: {len(test_q)}")
    print(f"[{name}] new train queries: {len(new_train_q)}, validation queries: {len(new_valid_q)}")
    print(f"[{name}] train_examples (InputExample pairs): {len(train_examples)}; valid_examples: {len(valid_examples)}")
    print(f"[{name}] saved splits to: {base_out}")
    return info

# ---------- Run for both sentence & word datasets ----------
# The *_VAL_PATH files are passed as `test_path`: prepare_dataset_for_training treats them
# as the final test set and carves its own validation split out of the training file.
info_sentence = prepare_dataset_for_training(SENT_TRAIN_PATH, SENT_VAL_PATH, "sentence",
                                             val_frac=VAL_FRAC, pos_per_query=POS_PER_QUERY, seed=RND_SEED,
                                             out_dir=OUT_DIR, base_model_for_dataset=BASE_MODEL_FOR_DATASET,
                                             train_batch_size=TRAIN_BATCH_SIZE)

info_word = prepare_dataset_for_training(WORD_TRAIN_PATH, WORD_VAL_PATH, "word",
                                         val_frac=VAL_FRAC, pos_per_query=POS_PER_QUERY, seed=RND_SEED,
                                         out_dir=OUT_DIR, base_model_for_dataset=BASE_MODEL_FOR_DATASET,
                                         train_batch_size=TRAIN_BATCH_SIZE)

# Expose the dataloaders as module-level variables for the following cells
train_loader_sentence = info_sentence["dataloaders"]["train_loader"]
valid_loader_sentence = info_sentence["dataloaders"]["valid_loader"]
train_loader_word     = info_word["dataloaders"]["train_loader"]
valid_loader_word     = info_word["dataloaders"]["valid_loader"]


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

[sentence] original train queries: 309, docs: 78; test queries: 79
[sentence] new train queries: 279, validation queries: 30
[sentence] train_examples (InputExample pairs): 279; valid_examples: 30
[sentence] saved splits to: /content/dataset_splits/sentence
[word] original train queries: 366, docs: 93; test queries: 95
[word] new train queries: 330, validation queries: 36
[word] train_examples (InputExample pairs): 330; valid_examples: 36
[word] saved splits to: /content/dataset_splits/word


In [None]:
# Display the summary dict returned by prepare_dataset_for_training for the sentence dataset.
info_sentence

{'name': 'sentence',
 'orig_train_q': 309,
 'orig_train_docs': 78,
 'orig_test_q': 79,
 'new_train_q': 279,
 'new_valid_q': 30,
 'train_examples': 279,
 'valid_examples': 30,
 'paths': {'train': '/content/dataset_splits/sentence/train_split.json',
  'valid': '/content/dataset_splits/sentence/valid_split.json',
  'test': '/content/dataset_splits/sentence/test_dataset.json'},
 'dataloaders': {'train_loader': <torch.utils.data.dataloader.DataLoader at 0x7fab63945670>,
  'valid_loader': <torch.utils.data.dataloader.DataLoader at 0x7fab637ff530>},
 'tokenizer_model_used': 'distiluse-base-multilingual-cased-v2'}

## Fine-Tuning

In [None]:

import os, json, math
from sentence_transformers import SentenceTransformer, losses, evaluation
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from datetime import datetime

# ---------- Config (adjust if needed) ----------
BASE_MODEL = "distiluse-base-multilingual-cased-v2"   # base model
# NOTE(review): this rebinds the module-level OUT_DIR that the preprocessing cell set to
# "/content/dataset_splits"; from here on OUT_DIR means the model output directory.
OUT_DIR = "/content/models"                           # where to save fine-tuned model
os.makedirs(OUT_DIR, exist_ok=True)
SPLIT_DIR = "/content/dataset_splits/sentence"       # where prepare_dataset_for_training saved splits
VALID_SPLIT_PATH = os.path.join(SPLIT_DIR, "valid_split.json")
TRAIN_DATALOADER = train_loader_sentence              # built by the preprocessing cell
VALID_DATALOADER = valid_loader_sentence              # built by the preprocessing cell; not referenced again in the visible cells

# Training hyperparams (you can tweak)
EPOCHS = 3
BATCH_SIZE = None  # not used because DataLoader already set batch_size
WARMUP_RATIO = 0.1   # fraction of total steps used for warmup
LEARNING_RATE = 2e-5
# Timestamped output folder: every execution of this cell produces a new SAVE_DIR.
SAVE_DIR = os.path.join(OUT_DIR, f"sentence_finetuned_mnlr_{datetime.now().strftime('%Y%m%d_%H%M%S')}")

print("SAVE_DIR:", SAVE_DIR)
os.makedirs(SAVE_DIR, exist_ok=True)

# ---------- helper: load valid split for evaluator ----------
def load_split_json(path):
    """Load a split file and return ``(queries, corpus, relevant_docs)``.

    ``path`` must point to a UTF-8 JSON object containing the keys "queries",
    "corpus" and "relevant_docs"; a missing key raises ``KeyError``.
    """
    with open(path, "r", encoding="utf-8") as fh:
        split = json.load(fh)
    queries, corpus, relevant = split["queries"], split["corpus"], split["relevant_docs"]
    return queries, corpus, relevant

# Fail early with a clear message if the preprocessing cell has not produced the split.
if not os.path.exists(VALID_SPLIT_PATH):
    raise FileNotFoundError(f"Validation split not found: {VALID_SPLIT_PATH}. Run the split-creation cell first.")

valid_queries_map, valid_corpus_map, valid_relevant_map = load_split_json(VALID_SPLIT_PATH)
print(f"Validation: {len(valid_queries_map)} queries, {len(valid_corpus_map)} docs")

# ---------- Load base model (will be fine-tuned) ----------
print("Loading base model:", BASE_MODEL)
model = SentenceTransformer(BASE_MODEL)

# ---------- Loss ----------
# The training DataLoader yields the (query, positive_doc) pairs built by build_input_examples.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# ---------- Evaluator ----------
# InformationRetrievalEvaluator expects: queries_map (qid->text), corpus_map (docid->text), relevant_map (qid->[docid,...])
ir_evaluator = InformationRetrievalEvaluator(valid_queries_map, valid_corpus_map, valid_relevant_map, name="sentence-valid-eval")

# ---------- Warmup steps calculation ----------
# len(DataLoader) is the number of batches per epoch, so
# total training steps = batches per epoch * EPOCHS, and warmup is WARMUP_RATIO of that
# (at least one step).
train_steps_per_epoch = len(TRAIN_DATALOADER)
total_train_steps = train_steps_per_epoch * EPOCHS
warmup_steps = max(1, int(total_train_steps * WARMUP_RATIO))
print(f"Training steps per epoch: {train_steps_per_epoch}, total_steps: {total_train_steps}, warmup_steps: {warmup_steps}")

# ---------- Experiment tracking ----------
# In the recorded run, model.fit() stopped to ask for a Weights & Biases API key, which
# blocks an unattended "Restart & Run All". Default to disabled tracking; set WANDB_MODE
# yourself (e.g. "online") before running this cell to opt back in.
os.environ.setdefault("WANDB_MODE", "disabled")

# ---------- Fit (train) ----------
# model.fit accepts list of (dataloader, loss) tuples
print("Starting training... (this cell will take time depending on GPU)")

model.fit(
    train_objectives=[(TRAIN_DATALOADER, train_loss)],
    evaluator=ir_evaluator,
    epochs=EPOCHS,
    evaluation_steps=max(1, train_steps_per_epoch),   # evaluate once per epoch (approx)
    output_path=SAVE_DIR,
    warmup_steps=warmup_steps,
    optimizer_params={'lr': LEARNING_RATE},
    use_amp=True   # automatic mixed precision (if available) to speed up/memory save
)

print("Training finished. Model saved to:", SAVE_DIR)



SAVE_DIR: /content/models/sentence_finetuned_mnlr_20250901_181037
Validation: 30 queries, 78 docs
Loading base model: distiluse-base-multilingual-cased-v2
Training steps per epoch: 18, total_steps: 54, warmup_steps: 5
Starting training... (this cell will take time depending on GPU)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbeny11min[0m ([33mbeny11min-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Sentence-valid-eval Cosine Accuracy@1,Sentence-valid-eval Cosine Accuracy@3,Sentence-valid-eval Cosine Accuracy@5,Sentence-valid-eval Cosine Accuracy@10,Sentence-valid-eval Cosine Precision@1,Sentence-valid-eval Cosine Precision@3,Sentence-valid-eval Cosine Precision@5,Sentence-valid-eval Cosine Precision@10,Sentence-valid-eval Cosine Recall@1,Sentence-valid-eval Cosine Recall@3,Sentence-valid-eval Cosine Recall@5,Sentence-valid-eval Cosine Recall@10,Sentence-valid-eval Cosine Ndcg@10,Sentence-valid-eval Cosine Mrr@10,Sentence-valid-eval Cosine Map@100
18,No log,No log,0.466667,0.733333,0.733333,0.9,0.466667,0.244444,0.146667,0.09,0.466667,0.733333,0.733333,0.9,0.667908,0.596614,0.601112
36,No log,No log,0.5,0.733333,0.8,0.866667,0.5,0.244444,0.16,0.086667,0.5,0.733333,0.8,0.866667,0.682884,0.623889,0.632084
54,No log,No log,0.466667,0.733333,0.8,0.9,0.466667,0.244444,0.16,0.09,0.466667,0.733333,0.8,0.9,0.682558,0.613056,0.617738


Training finished. Model saved to: /content/models/sentence_finetuned_mnlr_20250901_181037


In [None]:
# Run the validation evaluator on the in-memory `model` after training and write its
# results file under SAVE_DIR/eval_results; the call's return value is kept in ir_metrics.
eval_dir = os.path.join(SAVE_DIR, "eval_results")
os.makedirs(eval_dir, exist_ok=True)
ir_metrics = ir_evaluator(model, output_path=eval_dir)


In [None]:
# Display the metrics returned by the evaluator call in the previous cell.
ir_metrics

{'sentence-valid-eval_cosine_accuracy@1': 0.4666666666666667,
 'sentence-valid-eval_cosine_accuracy@3': 0.7333333333333333,
 'sentence-valid-eval_cosine_accuracy@5': 0.8,
 'sentence-valid-eval_cosine_accuracy@10': 0.9,
 'sentence-valid-eval_cosine_precision@1': 0.4666666666666667,
 'sentence-valid-eval_cosine_precision@3': 0.2444444444444444,
 'sentence-valid-eval_cosine_precision@5': 0.16000000000000003,
 'sentence-valid-eval_cosine_precision@10': 0.09000000000000002,
 'sentence-valid-eval_cosine_recall@1': 0.4666666666666667,
 'sentence-valid-eval_cosine_recall@3': 0.7333333333333333,
 'sentence-valid-eval_cosine_recall@5': 0.8,
 'sentence-valid-eval_cosine_recall@10': 0.9,
 'sentence-valid-eval_cosine_ndcg@10': 0.6825579591405239,
 'sentence-valid-eval_cosine_mrr@10': 0.6130555555555556,
 'sentence-valid-eval_cosine_map@100': 0.6177375895385641}

In [None]:
import pandas as pd

# Build the path from eval_dir instead of hard-coding it: SAVE_DIR contains a timestamp,
# so a literal path from an earlier run does not exist after the notebook is re-run.
eval_csv_path = os.path.join(eval_dir, "Information-Retrieval_evaluation_sentence-valid-eval_results.csv")
pd.read_csv(eval_csv_path)

Unnamed: 0,epoch,steps,cosine-Accuracy@1,cosine-Accuracy@3,cosine-Accuracy@5,cosine-Accuracy@10,cosine-Precision@1,cosine-Recall@1,cosine-Precision@3,cosine-Recall@3,cosine-Precision@5,cosine-Recall@5,cosine-Precision@10,cosine-Recall@10,cosine-MRR@10,cosine-NDCG@10,cosine-MAP@100
0,-1,-1,0.466667,0.733333,0.8,0.9,0.466667,0.466667,0.244444,0.733333,0.16,0.8,0.09,0.9,0.613056,0.682558,0.617738
