<a href="https://colab.research.google.com/github/Benyormin/Question_answering/blob/main/AI_Task_C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#https://drive.google.com/file/d/1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH/view?usp=sharing #sentence_dataset

#https://drive.google.com/file/d/19EJy3dh1IAW5Ko6Z-lz7gmm9OxniH00k/view?usp=sharing #word-based_dataset


In [None]:
from sentence_transformers import SentenceTransformer

# Instantiate a SentenceTransformer for the multilingual MiniLM paraphrase checkpoint.
# NOTE(review): `model` is not referenced again in the visible cells, and the same
# checkpoint name is loaded later as `baseline` — confirm whether this load is still needed.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


In [None]:
# Google Drive file ids of the two dataset archives; they match the ids in the
# share links listed in the first cell (sentence dataset / word-based dataset).
GDRIVE_FILE_ID_SENTENCE = "1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH"
GDRIVE_FILE_ID_WORD     = "19EJy3dh1IAW5Ko6Z-lz7gmm9OxniH00k"

In [None]:
def download_drive_file(file_id: str, dest_path: str):
    """Fetch a Google Drive file with gdown and store it at ``dest_path``.

    Args:
        file_id: Drive file identifier; it is placed in the ``uc?id=`` URL.
        dest_path: Local path passed to ``gdown.download`` as the output.
    """
    # NOTE: a function with this same name is defined again in a later cell.
    drive_url = f"https://drive.google.com/uc?id={file_id}"
    print("Downloading from:", drive_url)
    gdown.download(drive_url, dest_path, quiet=False)

# helper: unzip
def unzip_to(zip_path: str, dest_dir: str):
    """Extract every member of the archive at ``zip_path`` into ``dest_dir``.

    Args:
        zip_path: Path of the zip archive to read.
        dest_dir: Directory that receives the extracted members.
    """
    # NOTE: a function with this same name is defined again in a later cell.
    print("Unzipping", zip_path, "->", dest_dir)
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=dest_dir)


In [None]:
from typing import Dict, List, Tuple
def load_dataset(path: str) -> Tuple[Dict[str,str], Dict[str,str], Dict[str,List[str]]]:
    """Read a dataset JSON file and return its three top-level sections.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        ``(queries, corpus, relevant_docs)`` taken from the keys of the same
        names: qid -> question text, docid -> chunk text, qid -> [docid, ...].

    Raises:
        KeyError: If one of the three keys is missing from the file.
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload["queries"], payload["corpus"], payload["relevant_docs"]



In [None]:
# === Imports ===
import os
import json
import zipfile
import gdown
from typing import Dict, List, Tuple

# === Helper functions ===
def download_drive_file(file_id: str, dest_path: str):
    """Download a file from Google Drive given its file id.

    Args:
        file_id: Drive file identifier, interpolated into the ``uc?id=`` URL.
        dest_path: Local path passed to ``gdown.download`` as the output.
    """
    source_url = f"https://drive.google.com/uc?id={file_id}"
    print("Downloading from:", source_url)
    gdown.download(source_url, dest_path, quiet=False)


def unzip_to(zip_path: str, dest_dir: str):
    """Extract ``zip_path`` into a freshly created ``dest_dir``.

    An existing ``dest_dir`` is deleted first, so files left over from an
    earlier extraction never mix with the new contents.

    Args:
        zip_path: Path of the zip archive to read.
        dest_dir: Directory to (re)create and extract into.
    """
    # Imported locally because no import cell in the notebook brings in shutil;
    # without it the "dest exists" branch below raised NameError on re-runs.
    import shutil

    print("Unzipping", zip_path, "->", dest_dir)
    if os.path.exists(dest_dir):
        print("   dest exists, removing and re-creating")
        shutil.rmtree(dest_dir)
    os.makedirs(dest_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall(dest_dir)
    print("   extracted")


def load_dataset(path: str) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, List[str]]]:
    """Load dataset JSON into queries, corpus, relevant docs.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        A 3-tuple read from the keys ``"queries"`` (qid -> question text),
        ``"corpus"`` (docid -> chunk text) and ``"relevant_docs"``
        (qid -> [docid, ...]), in that order.

    Raises:
        KeyError: If any of the three keys is absent.
    """
    with open(path, "r", encoding="utf-8") as fh:
        parsed = json.load(fh)
    # Keys are looked up in the same order as the returned tuple.
    return tuple(parsed[section] for section in ("queries", "corpus", "relevant_docs"))


# === Google Drive File IDs ===
GDRIVE_FILE_ID_SENTENCE = "1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH"
GDRIVE_FILE_ID_WORD = "19EJy3dh1IAW5Ko6Z-lz7gmm9OxniH00k"

# One entry per dataset: (Drive file id, local archive name, extraction directory).
_DATASET_ARCHIVES = (
    (GDRIVE_FILE_ID_SENTENCE, "sentence_dataset.zip", "sentence_dataset"),
    (GDRIVE_FILE_ID_WORD, "word_dataset.zip", "word_dataset"),
)

# === Step 1: Download ===
# Both archives are fetched before either one is extracted.
for _file_id, _archive_name, _ in _DATASET_ARCHIVES:
    download_drive_file(_file_id, _archive_name)

# === Step 2: Unzip ===
for _, _archive_name, _extract_dir in _DATASET_ARCHIVES:
    unzip_to(_archive_name, _extract_dir)




Downloading from: https://drive.google.com/uc?id=1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH


Downloading...
From: https://drive.google.com/uc?id=1rrqJLtBFgSl7mnfOOE5DtbtaVJnyasFH
To: /content/sentence_dataset.zip
100%|██████████| 122k/122k [00:00<00:00, 77.8MB/s]


Downloading from: https://drive.google.com/uc?id=19EJy3dh1IAW5Ko6Z-lz7gmm9OxniH00k


Downloading...
From: https://drive.google.com/uc?id=19EJy3dh1IAW5Ko6Z-lz7gmm9OxniH00k
To: /content/word_dataset.zip
100%|██████████| 134k/134k [00:00<00:00, 73.1MB/s]

Unzipping sentence_dataset.zip -> sentence_dataset
   extracted
Unzipping word_dataset.zip -> word_dataset
   extracted





In [None]:
# === Step 3: Load datasets ===
# The unzip step extracted into "sentence_dataset" / "word_dataset" relative to
# the working directory, so the files are read from those same relative folders
# instead of a hard-coded "/content/..." prefix (the two only coincide when the
# working directory is /content).
sentence_queries, sentence_corpus, sentence_relevant = load_dataset(
    os.path.join("sentence_dataset", "val_dataset.json")
)
word_queries, word_corpus, word_relevant = load_dataset(
    os.path.join("word_dataset", "val_dataset.json")
)

print("Sentence dataset size:", len(sentence_queries), "queries,", len(sentence_corpus), "docs")
print("Word-based dataset size:", len(word_queries), "queries,", len(word_corpus), "docs")

Sentence dataset size: 79 queries, 20 docs
Word-based dataset size: 95 queries, 24 docs


In [None]:
# === Constants ===
SENT_FINE_ID = "14qwqVOG-W5N5whojJVH6H75P7B9HcSMf"   # sentence-finetuned zip id
WORD_FINE_ID = "1XiTlLR65cTcsrL3MGgVTk2WIyNMi9v-_"   # word-finetuned zip id
BASELINE_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Local archive files and the directories they are extracted into.
os.makedirs("/content/models", exist_ok=True)
sent_zip, sent_dir = "/content/models/sent_finetuned.zip", "/content/models/sent_finetuned"
word_zip, word_dir = "/content/models/word_finetuned.zip", "/content/models/word_finetuned"

OUT_DIR = "/content/results"
os.makedirs(OUT_DIR, exist_ok=True)

# === Download + unzip if not exists ===
# A checkpoint is fetched only when its extracted directory is missing.
for _notice, _file_id, _zip_path, _model_dir in (
    ("Sentence-level model not found, downloading...", SENT_FINE_ID, sent_zip, sent_dir),
    ("Word-level model not found, downloading...", WORD_FINE_ID, word_zip, word_dir),
):
    if os.path.exists(_model_dir):
        continue
    print(_notice)
    download_drive_file(_file_id, _zip_path)
    unzip_to(_zip_path, _model_dir)

# === Load models ===
baseline = SentenceTransformer(BASELINE_MODEL)
sent_model = SentenceTransformer(sent_dir)
word_model = SentenceTransformer(word_dir)

print("✅ All models are ready.")


✅ All models are ready.


In [None]:
!pip install -q sentence-transformers gdown tqdm rank_bm25


In [None]:
import os
import json
import math
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from typing import Dict, List, Tuple, Any, Optional
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
import re

# --------- CONFIG ----------
# Directory that receives the per-query CSVs and the summary CSV written below.
OUT_DIR = "/content/results"
os.makedirs(OUT_DIR, exist_ok=True)


# NOTE(review): these three loads repeat the ones in the earlier model-setup
# cell and rely on BASELINE_MODEL, sent_dir and word_dir defined there —
# confirm whether loading the models a second time is intended.
baseline = SentenceTransformer(BASELINE_MODEL)
sent_model = SentenceTransformer(sent_dir)
word_model = SentenceTransformer(word_dir)

# --------- helpers (normalization/tokenize) ----------
def simple_normalize(text: str) -> str:
    """Apply minimal Persian-oriented normalization to ``text``.

    Steps, in order:
      1. ``None`` is mapped to the empty string.
      2. The BOM and zero-width / directional marks are replaced by spaces.
      3. A hyphen followed by whitespace and then a line break is removed,
         joining the text on both sides.
      4. Remaining control characters (including newlines and tabs) become
         spaces.
      5. Arabic yeh / kaf are mapped to their Persian forms.
      6. Whitespace runs are collapsed to one space and the ends stripped.

    Args:
        text: Raw text, or ``None``.

    Returns:
        The normalized string.
    """
    if text is None:
        return ""
    text = text.replace('\ufeff', ' ')
    text = re.sub(r'[\u200c\u200b\u200d\u200e\u200f]+', ' ', text)
    # De-hyphenation has to run BEFORE control characters are stripped: the
    # strip below turns every "\n" into a space, so when this rule came after
    # it, it could never match. The pattern needs at least one whitespace
    # character between the hyphen and the line break (e.g. "- \n" or "-\r\n").
    text = re.sub(r'-\s+\n', '', text)
    # Also converts newlines to spaces, so no separate newline rule is needed.
    text = re.sub(r'[\x00-\x1f]', ' ', text)
    text = text.replace('ي', 'ی').replace('ك', 'ک')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Word tokens: maximal runs of Unicode word characters.
_token_re = re.compile(r"(?u)\b\w+\b")


def tokenize_for_bm25(text: str) -> List[str]:
    """Normalize ``text`` with ``simple_normalize``, lower-case it and split it into word tokens.

    Args:
        text: Raw query or document text.

    Returns:
        Tokens matched by ``_token_re``, in order of appearance.
    """
    lowered = simple_normalize(text).lower()
    return _token_re.findall(lowered)

# --------- embedding encode (batched) ----------
def encode_texts(model: SentenceTransformer, texts: List[str], batch_size:int=64, normalize:bool=True) -> np.ndarray:
    """Encode ``texts`` with ``model`` and optionally scale each row to unit L2 norm.

    Args:
        model: Object whose ``encode`` method produces the embeddings.
        texts: Strings to embed.
        batch_size: Forwarded to ``model.encode``.
        normalize: When true, each row is divided by its L2 norm; all-zero
            rows are left unchanged (their divisor is forced to 1).

    Returns:
        The embedding array returned by ``model.encode``, normalized if requested.
    """
    vectors = model.encode(texts, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)
    if not normalize:
        return vectors
    lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    zero_rows = lengths == 0
    lengths[zero_rows] = 1.0  # avoid 0/0 for all-zero embeddings
    return vectors / lengths

# --------- retrieve top-k for embeddings ----------
def retrieve_topk_embeddings(query_embs: np.ndarray, corpus_embs: np.ndarray, corpus_ids: List[str], topk:int=10):
    """Rank corpus entries for every query by dot-product similarity.

    Args:
        query_embs: One embedding per row, one row per query.
        corpus_embs: One embedding per row, one row per corpus entry.
        corpus_ids: Ids aligned with the rows of ``corpus_embs``.
        topk: Maximum number of results kept per query.

    Returns:
        ``(ids, scores)``: a list (one entry per query) of id lists in
        descending score order, and the array of the matching scores.
    """
    # Equals cosine similarity when both inputs are L2-normalized.
    similarity = np.dot(query_embs, corpus_embs.T)
    descending = np.argsort(-similarity, axis=1)
    best_idx = descending[:, :topk]
    best_scores = np.take_along_axis(similarity, best_idx, axis=1)
    best_ids = []
    for row in best_idx:
        best_ids.append([corpus_ids[col] for col in row])
    return best_ids, best_scores

# --------- TF-IDF: build + query ----------
def build_tfidf(corpus_texts: List[str]):
    """Fit a word-level TF-IDF model (unigrams and bigrams) on the corpus.

    Args:
        corpus_texts: Documents to fit on.

    Returns:
        ``(vectorizer, tfidf_matrix)``: the fitted vectorizer and the matrix
        produced by ``fit_transform`` on ``corpus_texts``.
    """
    options = dict(
        lowercase=True,
        analyzer='word',
        token_pattern=r"(?u)\b\w+\b",
        ngram_range=(1,2),
        max_features=None,
    )
    vectorizer = TfidfVectorizer(**options)
    doc_matrix = vectorizer.fit_transform(corpus_texts)  # (n_docs, n_feats)
    return vectorizer, doc_matrix

def retrieve_topk_tfidf(vectorizer: TfidfVectorizer, tfidf_matrix, corpus_ids: List[str], queries: List[str], topk:int=10):
    """Rank corpus entries for every query by TF-IDF cosine similarity.

    Args:
        vectorizer: Fitted vectorizer used to transform the queries.
        tfidf_matrix: Document matrix compared against the query vectors.
        corpus_ids: Ids aligned with the rows of ``tfidf_matrix``.
        queries: Query strings.
        topk: Maximum number of results kept per query.

    Returns:
        ``(ids, scores)``: per-query id lists in descending score order and
        the array of matching similarity scores.
    """
    query_matrix = vectorizer.transform(queries)
    similarity = cosine_similarity(query_matrix, tfidf_matrix)  # (n_queries, n_docs)
    best_idx = np.argsort(-similarity, axis=1)[:, :topk]
    best_scores = np.take_along_axis(similarity, best_idx, axis=1)
    best_ids = []
    for row in best_idx:
        best_ids.append([corpus_ids[col] for col in row])
    return best_ids, best_scores

# --------- BM25: build + query ----------
def build_bm25(tokenized_corpus: List[List[str]]):
    """Create a ``BM25Okapi`` index over the tokenized corpus with k1=1.5 and b=0.75."""
    index = BM25Okapi(tokenized_corpus, k1=1.5, b=0.75)
    return index

def retrieve_topk_bm25(bm25: BM25Okapi, tokenized_corpus: List[List[str]], corpus_ids: List[str], queries: List[str], topk:int=10):
    """Rank corpus entries for every query by BM25 score.

    Args:
        bm25: Index whose ``get_scores`` yields one score per corpus entry.
        tokenized_corpus: Accepted for interface compatibility; not read here.
        corpus_ids: Ids aligned with the scores returned by ``bm25``.
        queries: Raw query strings; each is tokenized with ``tokenize_for_bm25``.
        topk: Maximum number of results kept per query.

    Returns:
        ``(ids, scores)``: two lists with one entry per query, holding the
        ranked ids and their scores as plain floats.
    """
    ranked_ids, ranked_scores = [], []
    for query in queries:
        doc_scores = bm25.get_scores(tokenize_for_bm25(query))  # 1D array of length n_docs
        order = np.argsort(-doc_scores)[:topk]
        ranked_ids.append([corpus_ids[pos] for pos in order])
        ranked_scores.append([float(doc_scores[pos]) for pos in order])
    return ranked_ids, ranked_scores

# --------- metrics (your compute_metrics_for_k function) ----------
def compute_metrics_for_k(retrieved_ids: List[List[str]], relevant_map: Dict[str, List[str]], qids: List[str], k:int):
    """Compute hit rate, precision, recall, MRR and MAP over the top-``k`` results.

    Queries with no relevant documents in ``relevant_map`` are excluded from
    every average.

    Args:
        retrieved_ids: Ranked document ids per query, aligned with ``qids``.
        relevant_map: qid -> list of relevant document ids.
        qids: Query ids, in the same order as ``retrieved_ids``.
        k: Cut-off rank; only the first ``k`` retrieved ids are considered.

    Returns:
        Dict with keys ``"hit@k"``, ``"precision@k"``, ``"recall@k"``,
        ``"mrr"`` and ``"map@k"``. All values are ``None`` when no query has
        relevant documents.
    """
    evaluated = 0
    hit_count = 0
    precision_total = 0.0
    recall_total = 0.0
    rr_total = 0.0
    ap_total = 0.0

    for position, qid in enumerate(qids):
        top = retrieved_ids[position][:k]
        gold = set(relevant_map.get(qid, []))
        if not gold:
            continue
        evaluated += 1

        overlap = len(set(top) & gold)
        if overlap > 0:
            hit_count += 1
        precision_total += overlap / float(k)
        recall_total += overlap / float(len(gold))

        # Walk the ranking once: remember the first relevant rank and sum the
        # precision observed at every relevant position.
        first_rank = None
        found = 0
        precision_at_hits = 0.0
        for rank, docid in enumerate(top, start=1):
            if docid not in gold:
                continue
            found += 1
            if first_rank is None:
                first_rank = rank
            precision_at_hits += (found / rank)

        if first_rank is not None:
            rr_total += 1.0 / float(first_rank)
            ap_total += precision_at_hits / float(len(gold))

    if evaluated == 0:
        return {"hit@k": None, "precision@k": None, "recall@k": None, "mrr": None, "map@k": None}
    return {
        "hit@k": hit_count / evaluated,
        "precision@k": precision_total / evaluated,
        "recall@k": recall_total / evaluated,
        "mrr": rr_total / evaluated,
        "map@k": ap_total / evaluated,
    }

# --------- evaluation driver (fixed: safe directory creation) ----------
def _mean_top1_score(topk_scores) -> float:
    """Return the mean of each query's rank-1 score, or NaN when there are no queries."""
    if len(topk_scores) == 0:
        return float('nan')
    return float(np.mean([row[0] for row in topk_scores]))


def _metric_rows(name, retriever, model_label, retrieved_ids, relevant_map, qids, topk_list,
                 mean_top1_cosine=np.nan, mean_top1_bm25=np.nan):
    """Build one summary row per cut-off in ``topk_list`` for a single retriever."""
    rows = []
    for k in topk_list:
        m = compute_metrics_for_k(retrieved_ids, relevant_map, qids, k)
        is_top1 = (k == 1)
        rows.append({
            "dataset": name, "retriever": retriever, "model": model_label, "top_k": k,
            # "EM" and the mean rank-1 scores are reported on the k == 1 row only.
            "EM": m["hit@k"] if is_top1 else None,
            "hit@k": m["hit@k"], "precision@k": m["precision@k"],
            "recall@k": m["recall@k"], "mrr": m["mrr"], "map@k": m["map@k"],
            "mean_top1_cosine": mean_top1_cosine if is_top1 else np.nan,
            "mean_top1_bm25_score": mean_top1_bm25 if is_top1 else np.nan,
        })
    return rows


def _write_per_query_csv(filename, qids, queries_map, relevant_map, retrieved_ids, retrieved_scores) -> str:
    """Write the per-query retrieval results to ``OUT_DIR/filename`` and return that path."""
    path = os.path.join(OUT_DIR, filename)
    os.makedirs(os.path.dirname(path), exist_ok=True)   # ensure parent exists
    records = [
        {"qid": qid, "query": queries_map[qid], "expected": relevant_map.get(qid, []),
         "retrieved_ids_topk": ids,
         # Score rows are numpy arrays for embedding/TF-IDF and plain lists for BM25.
         "retrieved_scores_topk": scores.tolist() if hasattr(scores, "tolist") else scores}
        for qid, ids, scores in zip(qids, retrieved_ids, retrieved_scores)
    ]
    pd.DataFrame(records).to_csv(path, index=False)
    return path


def evaluate_dataset_variant(
    name: str,
    queries_map: Dict[str,str],
    corpus_map: Dict[str,str],
    relevant_map: Dict[str, List[str]],
    baseline_model: SentenceTransformer,
    finetuned_model: Optional[SentenceTransformer] = None,
    run_tfidf: bool = True,
    run_bm25: bool = True,
    topk_list: List[int] = [1,3,5,10],
    batch_size:int=64
) -> Tuple[pd.DataFrame, Dict[Tuple[str,str], str]]:
    """Evaluate embedding, TF-IDF and BM25 retrieval on one dataset.

    Each enabled retriever ranks the corpus for every query up to
    ``max(topk_list)``; metrics are computed for every cut-off in
    ``topk_list`` and a per-query CSV is written under ``OUT_DIR``.

    Args:
        name: Dataset label used in the summary rows and in the CSV file names.
        queries_map: qid -> query text.
        corpus_map: docid -> document text.
        relevant_map: qid -> list of relevant docids.
        baseline_model: Embedding model evaluated as the baseline.
        finetuned_model: Optional second embedding model; skipped when ``None``.
        run_tfidf: Whether to run the TF-IDF retriever.
        run_bm25: Whether to run the BM25 retriever.
        topk_list: Cut-offs to report. The default list is only read, never
            mutated.
        batch_size: Batch size forwarded to ``encode_texts``.

    Returns:
        ``(df, saved_per_query)``: ``df`` holds one row per retriever and
        cut-off; ``saved_per_query`` maps ``(retriever, variant)`` to the path
        of the per-query CSV that was written.
    """
    # ensure output dir exists
    os.makedirs(OUT_DIR, exist_ok=True)

    qids = list(queries_map.keys())
    corpus_ids = list(corpus_map.keys())
    query_texts = [queries_map[q] for q in qids]
    corpus_texts = [corpus_map[c] for c in corpus_ids]
    rows = []
    saved_per_query = {}
    max_k = max(topk_list)

    def run_embedding(model, variant, model_label):
        """Evaluate one embedding model; ``variant`` names the CSV file and result key."""
        print(f"\n-> Embedding {variant} for dataset {name}")
        corpus_embs = encode_texts(model, corpus_texts, batch_size=batch_size, normalize=True)
        query_embs = encode_texts(model, query_texts, batch_size=batch_size, normalize=True)
        ids, scores = retrieve_topk_embeddings(query_embs, corpus_embs, corpus_ids, topk=max_k)
        rows.extend(_metric_rows(name, "embedding", model_label, ids, relevant_map, qids, topk_list,
                                 mean_top1_cosine=_mean_top1_score(scores)))
        saved_per_query[("embedding", variant)] = _write_per_query_csv(
            f"per_query_{name}_embedding_{variant}.csv", qids, queries_map, relevant_map, ids, scores)

    # Embedding baseline
    run_embedding(baseline_model, "baseline", "baseline_paraphrase_miniLM")

    # Embedding finetuned (if provided)
    if finetuned_model is not None:
        run_embedding(finetuned_model, "finetuned", "finetuned")

    # TF-IDF
    if run_tfidf:
        print(f"\n-> TF-IDF retrieval for dataset {name}")
        vectorizer, tfidf_matrix = build_tfidf([simple_normalize(t) for t in corpus_texts])
        tfidf_ids, tfidf_scores = retrieve_topk_tfidf(
            vectorizer, tfidf_matrix, corpus_ids, [simple_normalize(q) for q in query_texts], topk=max_k)
        rows.extend(_metric_rows(name, "tfidf", "tfidf", tfidf_ids, relevant_map, qids, topk_list,
                                 mean_top1_cosine=_mean_top1_score(tfidf_scores)))
        saved_per_query[("tfidf","tfidf")] = _write_per_query_csv(
            f"per_query_{name}_tfidf.csv", qids, queries_map, relevant_map, tfidf_ids, tfidf_scores)

    # BM25
    if run_bm25:
        print(f"\n-> BM25 retrieval for dataset {name}")
        tokenized_corpus = [tokenize_for_bm25(t) for t in corpus_texts]
        bm25 = build_bm25(tokenized_corpus)
        bm25_ids, bm25_scores = retrieve_topk_bm25(bm25, tokenized_corpus, corpus_ids, query_texts, topk=max_k)
        rows.extend(_metric_rows(name, "bm25", "bm25", bm25_ids, relevant_map, qids, topk_list,
                                 mean_top1_bm25=_mean_top1_score(bm25_scores)))
        saved_per_query[("bm25","bm25")] = _write_per_query_csv(
            f"per_query_{name}_bm25.csv", qids, queries_map, relevant_map, bm25_ids, bm25_scores)

    df = pd.DataFrame(rows)
    return df, saved_per_query


os.makedirs(OUT_DIR, exist_ok=True)

# Evaluate each dataset with the baseline model plus its matching fine-tuned model.
df_sent, perq_sent = evaluate_dataset_variant(
    "sentence", sentence_queries, sentence_corpus, sentence_relevant, baseline,
    finetuned_model=sent_model, run_tfidf=True, run_bm25=True,
)
df_word, perq_word = evaluate_dataset_variant(
    "word", word_queries, word_corpus, word_relevant, baseline,
    finetuned_model=word_model, run_tfidf=True, run_bm25=True,
)

# Stack both summaries and write them to a single CSV.
_summary_frames = [df_sent, df_word]
df_all = pd.concat(_summary_frames, ignore_index=True)
_summary_path = os.path.join(OUT_DIR, "semantic_similarity_mrr_summary.csv")
df_all.to_csv(_summary_path, index=False)



-> Embedding baseline for dataset sentence


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]


-> Embedding finetuned for dataset sentence


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]


-> TF-IDF retrieval for dataset sentence

-> BM25 retrieval for dataset sentence

-> Embedding baseline for dataset word


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]


-> Embedding finetuned for dataset word


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]


-> TF-IDF retrieval for dataset word

-> BM25 retrieval for dataset word


In [None]:
# Read the summary back from OUT_DIR, the same location the evaluation cell
# wrote it to, rather than repeating the directory as a literal path.
pd.read_csv(os.path.join(OUT_DIR, "semantic_similarity_mrr_summary.csv"))

Unnamed: 0,dataset,retriever,model,top_k,EM,hit@k,precision@k,recall@k,mrr,map@k,mean_top1_cosine,mean_top1_bm25_score
0,sentence,embedding,baseline_paraphrase_miniLM,1,0.506329,0.506329,0.506329,0.506329,0.506329,0.506329,0.659837,
1,sentence,embedding,baseline_paraphrase_miniLM,3,,0.696203,0.232068,0.696203,0.590717,0.590717,,
2,sentence,embedding,baseline_paraphrase_miniLM,5,,0.772152,0.15443,0.772152,0.607173,0.607173,,
3,sentence,embedding,baseline_paraphrase_miniLM,10,,0.860759,0.086076,0.860759,0.617877,0.617877,,
4,sentence,embedding,finetuned,1,0.468354,0.468354,0.468354,0.468354,0.468354,0.468354,0.657955,
5,sentence,embedding,finetuned,3,,0.670886,0.223629,0.670886,0.565401,0.565401,,
6,sentence,embedding,finetuned,5,,0.797468,0.159494,0.797468,0.593882,0.593882,,
7,sentence,embedding,finetuned,10,,0.886076,0.088608,0.886076,0.605129,0.605129,,
8,sentence,tfidf,tfidf,1,0.797468,0.797468,0.797468,0.797468,0.797468,0.797468,0.221911,
9,sentence,tfidf,tfidf,3,,0.936709,0.312236,0.936709,0.864979,0.864979,,
