# 🔍 Information Retrieval System using TF-IDF, Word2Vec, BERT, Hybrid

This notebook presents a full pipeline for building and evaluating an Information Retrieval (IR) system using different vector representations and integrating a generative component (RAG).

In [10]:
import sys
import os

# احصل على المسار الكامل لمجلد src
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # يعود خطوة للخلف من notebooks/
src_path = os.path.join(project_root, "src")

# أضف src إلى مسار بايثون
if src_path not in sys.path:
    sys.path.append(src_path)

print("✅ src path added:", src_path)


✅ src path added: d:\retrieval_project\src


## 📦 Step 1: Imports and Utilities

In [None]:
import os
import json
import joblib
import numpy as np
import torch
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer, AutoModel
from scipy.sparse import hstack, issparse
from preprocessing import clean_text
from app import generate_rag_answer



  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'preprocessing'

## 🧹 Step 2: Data Preprocessing

We clean the raw dataset using tokenization, stopword removal, and lemmatization.

In [2]:
!python ../src/preprocess_dataset.py

🚀 بدء المعالجة المسبقة للبيانات...


Traceback (most recent call last):
  File "d:\retrieval_project\src\preprocess_dataset.py", line 27, in <module>
    preprocess_file(
  File "d:\retrieval_project\src\preprocess_dataset.py", line 15, in preprocess_file
    with open(input_path, 'r', encoding='utf-8') as infile, \
FileNotFoundError: [Errno 2] No such file or directory: 'data/antique\\corpus.jsonl'


## 🔠 Step 3: Build Representations

In [None]:
!python ../src/representation.py

C:\Users\TOSHIBA\AppData\Local\Programs\Python\Python310\python.exe: can't open file 'd:\\retrieval_project\\notebooks\\representation.py': [Errno 2] No such file or directory


## 🧭 Step 4: Build Indexes

In [None]:
!python ../src/indexing.py

## 🔍 Step 5: Retrieve Top-K Documents

In [None]:
from retrieval import retrieve_top_k_index
from preprocessing import clean_text

# Example:
dataset = "antique"
representation = "hybrid"

vectorizer_path = f"../vector_stores/{dataset}_{representation}_vectorizer.joblib"
index_path = f"../vector_stores/{dataset}_{representation}_index.joblib"
corpus_path = f"../data/{dataset}/cleaned_corpus.jsonl"

vectorizer = joblib.load(vectorizer_path)
doc_ids, index = joblib.load(index_path)
corpus = {json.loads(line)["_id"]: json.loads(line) for line in open(corpus_path, encoding='utf-8')}

results = retrieve_top_k_index("information about search engine", vectorizer, index, doc_ids, corpus)
results[:3]

## 📊 Step 6: Evaluate Representations

In [7]:
!python ../src/evaluation.py

⏩ Skipping TFIDF (already evaluated)
⏩ Skipping WORD2VEC (already evaluated)
⏩ Skipping BERT (already evaluated)
⏩ Skipping HYBRID (already evaluated)

📊 Final Evaluation Results:
╒═══════════╤══════════════════╤════════╤════════╤════════╤═════════════╕
│ Dataset   │ Representation   │    MAP │    MRR │   P@10 │   Recall@10 │
╞═══════════╪══════════════════╪════════╪════════╪════════╪═════════════╡
│ QUORA     │ TFIDF            │ 0.4366 │ 0.4649 │ 0.0719 │      0.5405 │
├───────────┼──────────────────┼────────┼────────┼────────┼─────────────┤
│ QUORA     │ WORD2VEC         │ 0.3001 │ 0.3278 │ 0.0501 │      0.3835 │
├───────────┼──────────────────┼────────┼────────┼────────┼─────────────┤
│ QUORA     │ BERT             │ 0.5048 │ 0.5289 │ 0.0826 │      0.5993 │
├───────────┼──────────────────┼────────┼────────┼────────┼─────────────┤
│ QUORA     │ HYBRID           │ 0.5057 │ 0.5297 │ 0.0827 │      0.6001 │
╘═══════════╧══════════════════╧════════╧════════╧════════╧═════════════╛


## 🧠 Step 7: RAG - Generate Answers

Use top retrieved results as context to a generative model.

In [3]:
from app import generate_rag_answer
from retrieval import retrieve_top_k_index
from preprocessing import clean_text
import joblib
import json
import os

# إعداد المسارات (يجب تعديلها حسب هيكل المشروع)
BASE_DIR = os.path.abspath(os.path.dirname("app.py"))
VECTOR_STORE = os.path.join(BASE_DIR, "..", "vector_stores")
CORPUS_DIR = os.path.join(BASE_DIR, "..", "data")

# اختيار البيانات والتمثيل
dataset = "quora"
representation = "bert"

# تحميل الموارد
index_path = os.path.join(VECTOR_STORE, f"{dataset}_{representation}_index.joblib")
doc_ids, index = joblib.load(index_path)

if representation == "bert":
    from transformers import AutoTokenizer, AutoModel
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    vectorizer = (tokenizer, model)
elif representation == "hybrid":
    tfidf_vectorizer = joblib.load(os.path.join(VECTOR_STORE, f"{dataset}_tfidf_vectorizer.joblib"))
    tokenizer, model = joblib.load(os.path.join(VECTOR_STORE, f"{dataset}_bert_vectorizer.joblib"))
    vectorizer = (tfidf_vectorizer, tokenizer, model)
else:
    vectorizer_path = os.path.join(VECTOR_STORE, f"{dataset}_{representation}_vectorizer.joblib")
    vectorizer = joblib.load(vectorizer_path)

# تحميل النصوص
corpus_path = os.path.join(CORPUS_DIR, dataset, "cleaned_corpus.jsonl")
corpus = {}
with open(corpus_path, "r", encoding="utf-8") as f:
    for line in f:
        doc = json.loads(line)
        corpus[doc["_id"]] = {"text": doc["text"]}

# استعلام مثال
query = "What is vector space model?"

# استرجاع النتائج
results = retrieve_top_k_index(query, vectorizer, index, doc_ids, corpus)

# إعداد النصوص المسترجعة
context = [r["text"] for r in results[:5]]

# توليد إجابة باستخدام RAG
answer = generate_rag_answer(query, context)
print("RAG Answer:")
print(answer)


ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /api/models/sentence-transformers/all-MiniLM-L6-v2/tree/main/additional_chat_templates?recursive=False&expand=False (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001BF7373DE10>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 0e0b5444-29e0-4758-bb72-02b508dfa91e)')

1. تحميل نتائج التقييم الأساسية:

In [11]:
import json
import os

base_path = "outputs/quora"
reps = ["tfidf", "word2vec", "bert", "hybrid"]
results = []

for rep in reps:
    with open(os.path.join(base_path, f"{rep}_evaluation.json"), encoding="utf-8") as f:
        results.append(json.load(f))


🟢 1. استيراد الدوال والملفات الأساسية

In [13]:
from evaluation import (
    load_queries,
    load_qrels,
    load_corpus,
    load_resources,
    evaluate_rag  # إذا كانت الدالة موجودة داخل evaluation.py
)


🟢 2. تحميل البيانات الأساسية

In [14]:
# المسارات
DATASET = "quora"
BASE_DIR = "../data/quora"

# تحميل البيانات
queries_original = load_queries(os.path.join(BASE_DIR, "cleaned_queries.jsonl"))
queries_suggested = load_queries(os.path.join(BASE_DIR, "cleaned_queries_with_suggestions.jsonl"))
qrels = load_qrels(os.path.join(BASE_DIR, "qrels.jsonl"))
corpus = load_corpus(os.path.join(BASE_DIR, "cleaned_corpus.jsonl"))

# تحميل موارد BERT لأن RAG يعتمد عليه
vectorizer, index, doc_ids = load_resources("bert")


🟢 3. تقييم الاستعلامات الأصلية باستخدام RAG

In [17]:
rag_result_original = evaluate_rag(
    queries=queries_original,
    qrels=qrels,
    corpus=corpus,
    vectorizer=vectorizer,
    index=index,
    doc_ids=doc_ids
)
rag_result_original["representation"] = "RAG (original)"

🔍 Evaluating with RAG:   0%|          | 0/15000 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🔍 Evaluating with RAG:   0%|          | 1/15000 [00:17<73:21:49, 17.61s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🔍 Evaluating with RAG:   0%|          | 2/15000 [00:20<37:51:02,  9.09s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🔍 Evaluating with RAG:   0%|          | 2/15000 [00:24<50:27:33, 12.11s/it]


KeyboardInterrupt: 

4. تقييم الاستعلامات المحسنة باستخدام Query Suggestions

In [6]:
print("✅ Evaluating suggested queries with RAG...")
rag_result_suggested = evaluate_rag(
    queries=queries_suggested,
    qrels=qrels,
    corpus=corpus,
    vectorizer=vectorizer,
    index=index,
    doc_ids=doc_ids
)
rag_result_suggested["representation"] = "RAG + Query Suggestions"


✅ Evaluating suggested queries with RAG...


🔍 Evaluating with RAG:   0%|          | 0/15000 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🔍 Evaluating with RAG:   0%|          | 1/15000 [00:28<117:01:46, 28.09s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🔍 Evaluating with RAG:   0%|          | 2/15000 [00:38<74:45:00, 17.94s/it] The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🔍 Evaluating with RAG:   0%|          | 3/15000 [00:42<47:59:35, 11.52s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🔍 Evaluating with RAG:   0%|          | 4/15000 [00:46<35:30:59,  8.53s/it]The following generation flags are not valid and may be ignore

🟢 5. عرض النتائج في جدول مقارنة احترافي

In [18]:
import pandas as pd

df = pd.DataFrame([
    rag_result_original,
    rag_result_suggested
])[["representation", "MAP", "MRR", "P@10", "Recall@10"]]

df.sort_values("MAP", ascending=False).reset_index(drop=True)


NameError: name 'rag_result_original' is not defined

🟢 6. رسم شارت للمقارنة

In [16]:
import matplotlib.pyplot as plt

df.set_index("representation").plot(
    kind="bar",
    figsize=(10,6),
    title="Comparison of RAG with and without Query Suggestions",
    grid=True,
    ylabel="Score",
    ylim=(0, 1),
    colormap="tab20"
)

plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'matplotlib'