<a href="https://colab.research.google.com/github/Asaad972/CollabFirstNoteBook/blob/main/HW02_Cloud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CELL 1: Minimal package installation (only if missing)
import importlib.util, sys, subprocess

def ensure(pkg, import_name=None):
    """Install ``pkg`` with pip unless its import module can already be found.

    Args:
        pkg: Distribution name handed to ``pip install``.
        import_name: Module name used for the availability probe. Falls back
            to ``pkg`` when omitted (or otherwise falsy).

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    module_name = import_name if import_name else pkg
    already_available = importlib.util.find_spec(module_name) is not None
    if already_available:
        return
    # Run pip through the current interpreter so the install targets this environment.
    pip_command = [sys.executable, "-m", "pip", "install", "-q", pkg]
    subprocess.check_call(pip_command)

# (pip distribution name, import module name) pairs, checked in this order.
# pandas is usually preinstalled in Colab but is verified anyway; the other
# four are the packages listed as required for the homework plan.
for dist_name, module_name in (
    ("pandas", "pandas"),
    ("nltk", "nltk"),
    ("sentence-transformers", "sentence_transformers"),
    ("faiss-cpu", "faiss"),
    ("pymupdf", "fitz"),
):
    ensure(dist_name, module_name)

print(" Dependencies ready")


 Dependencies ready


In [2]:
# CELL 2: Imports + NLTK resources (run once per runtime)

import re
from collections import defaultdict
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sentence_transformers import SentenceTransformer
import faiss

# Fetch the NLTK data packages needed for stopwords, tokenization and
# lemmatization. Download order matches the original cell.
for resource_name in ("stopwords", "punkt", "wordnet", "omw-1.4", "punkt_tab"):
    nltk.download(resource_name)

print(" Imports ready + NLTK resources downloaded")




 Imports ready + NLTK resources downloaded


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# CELL 3: Store Classes (Vector Store + Inverted Index)
# =====================================================
"""
 CELL 3: STORE CLASSES
- SimpleVectorStore: stores embeddings + documents + metadatas + ids (like Tirgul 7)
- InvertedIndexStore: stores required index schema term -> DocIDs (homework requirement)
"""

import numpy as np
from collections import defaultdict

# ---------- Vector Store (similar to Tirgul 7) ----------
class SimpleVectorStore:
    """Minimal in-memory vector store searched by brute-force cosine similarity.

    Four parallel lists are kept (``embeddings``, ``documents``, ``metadatas``,
    ``ids``); position ``i`` in each list describes the same stored item.
    """

    def __init__(self):
        self.documents = []
        self.embeddings = []   # list of float32 numpy arrays
        self.metadatas = []
        self.ids = []
        print(" SimpleVectorStore initialized")

    def add(self, embeddings, documents, metadatas, ids):
        """Append a batch of items to the store.

        Args:
            embeddings: One vector per item (anything ``np.asarray`` accepts).
            documents: Document texts, aligned with ``embeddings``.
            metadatas: Metadata objects, aligned with ``embeddings``.
            ids: Item identifiers, aligned with ``embeddings``.

        Raises:
            ValueError: If the four arguments differ in length. Nothing is
                stored in that case, so the parallel lists stay aligned.
        """
        # Ensure numpy arrays
        embeddings = [np.asarray(e, dtype=np.float32) for e in embeddings]
        documents = list(documents)
        metadatas = list(metadatas)
        ids = list(ids)

        # A length mismatch would shift the parallel lists against each other and
        # make query() return the wrong document/id for a vector, so reject it
        # before mutating any state.
        sizes = (len(embeddings), len(documents), len(metadatas), len(ids))
        if len(set(sizes)) != 1:
            raise ValueError(
                "embeddings, documents, metadatas and ids must have the same "
                f"length, got {sizes}"
            )

        self.embeddings.extend(embeddings)
        self.documents.extend(documents)
        self.metadatas.extend(metadatas)
        self.ids.extend(ids)
        print(f" Added {len(documents)} documents to simple vector store")

    def query(self, query_embeddings, n_results=5):
        """Return the ``n_results`` stored items most similar to the first query vector.

        Only ``query_embeddings[0]`` is used. The result is a dict with keys
        ``ids``, ``documents``, ``metadatas`` and ``distances``, each holding a
        single inner list ordered from most to least similar. ``distances`` are
        ``1 - cosine_similarity``. An empty store or a non-positive
        ``n_results`` yields empty inner lists.
        """
        # Non-positive n_results means "no hits"; without this guard a negative
        # value would slice [:-k] and return all but the last k hits.
        if not self.embeddings or n_results <= 0:
            return {'ids': [[]], 'documents': [[]], 'metadatas': [[]], 'distances': [[]]}

        q = np.asarray(query_embeddings[0], dtype=np.float32)

        E = np.vstack(self.embeddings)  # one row per stored embedding

        # cosine similarity without sklearn; the epsilon avoids division by zero
        q_norm = np.linalg.norm(q) + 1e-12
        E_norm = np.linalg.norm(E, axis=1) + 1e-12
        sims = (E @ q) / (E_norm * q_norm)

        # argsort is ascending, so reverse it to rank the highest similarity first
        top_idx = np.argsort(sims)[::-1][:n_results]

        return {
            'ids': [[self.ids[i] for i in top_idx]],
            'documents': [[self.documents[i] for i in top_idx]],
            'metadatas': [[self.metadatas[i] for i in top_idx]],
            'distances': [[float(1 - sims[i]) for i in top_idx]]  # distance-like
        }

    def count(self):
        """Return the number of stored documents."""
        return len(self.documents)


# ---------- Inverted Index (required by homework) ----------
class InvertedIndexStore:
    """Inverted index mapping each term to the set of document IDs containing it."""

    def __init__(self):
        # defaultdict(set) lets add_occurrence() insert without a key check.
        self.term_to_docids = defaultdict(set)
        print(" InvertedIndexStore initialized")

    def add_occurrence(self, term: str, doc_id: str):
        """Record that ``doc_id`` contains ``term``; repeats have no further effect."""
        postings = self.term_to_docids[term]
        postings.add(doc_id)

    def get_docids(self, term: str):
        """Return the sorted doc IDs for ``term``, or ``[]`` for an unknown term.

        Looking up an unknown term does not create an entry for it.
        """
        if term not in self.term_to_docids:
            return []
        return sorted(self.term_to_docids[term])

    def count_terms(self) -> int:
        """Return how many distinct terms are indexed."""
        return len(self.term_to_docids)

    def to_required_format(self):
        """Return ``[{"term": ..., "DocIDs": [...]}, ...]`` sorted by term.

        Each ``DocIDs`` list is sorted as well.
        """
        records = []
        for term in sorted(self.term_to_docids):
            doc_ids = sorted(self.term_to_docids[term])
            records.append({"term": term, "DocIDs": doc_ids})
        return records


print(" Store classes defined!")
print(" Next: Cell 4 (core logic: preprocess + build index + embeddings)")


 Store classes defined!
 Next: Cell 4 (core logic: preprocess + build index + embeddings)


In [4]:
# CELL 4: Core setup (custom stopwords + stemming + embedding model + FAISS)

# --- Custom stopwords (you define them) ---
# We remove these words because they are very frequent function words (articles,
# conjunctions, prepositions, forms of "to be", demonstratives, pronouns,
# negations and forms of "to do").
# They usually do not add topic meaning, but they increase index size and add noise to retrieval.
# NOTE(review): dropping "not"/"no" discards negation; presumably acceptable for
# topic-level indexing - confirm against the homework requirements.
CUSTOM_STOPWORDS = {
    "the","a","an","and","or","but",                          # articles, conjunctions
    "to","of","in","on","at","for","from","by","with","as",   # prepositions
    "is","are","was","were","be","been","being",              # forms of "to be"
    "this","that","these","those",                            # demonstratives
    "it","its","they","them","their","we","our","you","your", # pronouns
    "i","me","my","he","him","his","she","her",               # pronouns (continued)
    "not","no","do","does","did","doing"                      # negations, forms of "to do"
}

# Single Porter stemmer instance, used by preprocess_text() below.
stemmer = PorterStemmer()

def preprocess_text(text: str):
    """
    Turn raw text into the list of terms used for indexing.

    Pipeline, in order:
    - lowercase the whole text
    - tokenize with word_tokenize
    - keep purely alphabetic tokens that are not in CUSTOM_STOPWORDS
    - stem each surviving token with the module-level Porter stemmer

    Token order and duplicates are preserved.
    """
    lowered = text.lower()
    return [
        stemmer.stem(token)
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in CUSTOM_STOPWORDS
    ]

# --- Embedding model (for semantic retrieval) ---
# Loaded once at module level; the same instance encodes documents (Cell 7)
# and queries (Cells 11-12).
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- FAISS index (stores embeddings for doc-level retrieval) ---
# Placeholders only: both names are assigned when the index is built in Cell 7.
faiss_index = None
vector_dim = None

# Parallel stores (FAISS row -> doc data)
# Position i in each list corresponds to FAISS row i; both are rebound in Cell 7.
vector_doc_ids = []   # doc_id
vector_texts = []     # full doc text

print(" Core setup ready (custom stopwords + stemming + embeddings + FAISS)")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


 Core setup ready (custom stopwords + stemming + embeddings + FAISS)


In [5]:
# CELL 5: Wikipedia source links (seed documents for the corpus)

# Seed pages for the corpus; Cell 6 fetches each one through the Wikipedia API.
# The fetch follows redirects, so a stored title / doc ID can differ from the URL
# (in the recorded run, Powdery_mildew was loaded as "Erysiphaceae").
wiki_links = [
    "https://en.wikipedia.org/wiki/Plant_disease",
    "https://en.wikipedia.org/wiki/Plant_pathology",
    "https://en.wikipedia.org/wiki/Fungus",
    "https://en.wikipedia.org/wiki/Bacterial_wilt",
    "https://en.wikipedia.org/wiki/Powdery_mildew"
]

# Echo the list, numbered from 1.
print("Wikipedia links used:")
for i, link in enumerate(wiki_links, 1):
    print(f"{i}. {link}")


Wikipedia links used:
1. https://en.wikipedia.org/wiki/Plant_disease
2. https://en.wikipedia.org/wiki/Plant_pathology
3. https://en.wikipedia.org/wiki/Fungus
4. https://en.wikipedia.org/wiki/Bacterial_wilt
5. https://en.wikipedia.org/wiki/Powdery_mildew


In [6]:
# CELL 6: Load documents from Wikipedia (API fetch + normalization + metadata)

import requests
import re

WIKI_API = "https://en.wikipedia.org/w/api.php"

# Wikipedia sometimes blocks requests that do not send a descriptive User-Agent
HEADERS = {
    "User-Agent": "HW02-Cloud-RAG/1.0 (student project; contact: student@example.com)"
}

def title_from_wiki_url(url: str) -> str:
    """Extract the human-readable page title from a ``.../wiki/<Title>`` URL.

    The part after the first ``/wiki/`` is taken, any ``#fragment`` and
    ``?query`` suffix is dropped, percent-escapes are decoded (so
    ``Caf%C3%A9`` becomes ``Café``) and underscores become spaces.

    Args:
        url: A Wikipedia article URL containing ``/wiki/``.

    Returns:
        The decoded title with spaces instead of underscores.

    Raises:
        ValueError: If ``url`` does not contain ``/wiki/``.
    """
    from urllib.parse import unquote  # local import keeps this cell self-contained

    if "/wiki/" not in url:
        raise ValueError(f"Unsupported Wikipedia URL: {url}")

    title = url.split("/wiki/", 1)[1]
    title = title.split("#", 1)[0]      # remove anchors
    title = title.split("?", 1)[0]      # remove query string (e.g. ?action=history)
    # Decode only after cutting at the raw '#'/'?' delimiters, so an escaped
    # %23 or %3F that belongs to the title itself is preserved. Without the
    # decoding, the escaped title would be percent-encoded a second time when
    # it is sent as a request parameter.
    title = unquote(title)
    return title.replace("_", " ")

def fetch_page_extract_by_title(title: str):
    """Fetch one page's plain-text extract and URL from the Wikipedia API.

    Sends a single ``action=query`` request (``prop=extracts|info``, plain
    text, redirects followed) to ``WIKI_API`` using ``HEADERS``.

    Args:
        title: Page title to look up.

    Returns:
        A dict with keys ``pageid``, ``title``, ``url`` and ``text``. When the
        response contains no usable page (no ``query``/``pages`` section, an
        empty page map, or a page flagged ``missing`` or ``invalid``), an empty
        record is returned: ``pageid`` is None, ``title`` is the requested
        title, and ``url``/``text`` are empty strings.

    Raises:
        requests.HTTPError: If the HTTP status indicates failure
            (via ``raise_for_status``).
    """
    not_found = {"pageid": None, "title": title, "url": "", "text": ""}

    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts|info",
        "titles": title,
        "inprop": "url",
        "explaintext": True,
        "redirects": 1,   # follow redirects
        "origin": "*"     # helps in some environments
    }
    r = requests.get(WIKI_API, params=params, headers=HEADERS, timeout=30)
    r.raise_for_status()

    # A successful HTTP response may still lack "query"/"pages" (for example
    # when the API reports an error in the JSON body). Treat that like a
    # missing page instead of raising KeyError/StopIteration; the caller
    # already skips records whose text is empty.
    payload = r.json()
    pages = (payload.get("query") or {}).get("pages") or {}
    page = next(iter(pages.values()), None)

    # Handle missing / invalid page
    if page is None or "missing" in page or "invalid" in page:
        return not_found

    return {
        "pageid": page.get("pageid"),
        "title": page.get("title", title),
        "url": page.get("fullurl", ""),
        "text": page.get("extract", "")
    }

def slugify(s: str) -> str:
    """Return a lowercase slug of ``s``.

    Surrounding whitespace is stripped and the text lowercased. Every run of
    characters outside ``a-z0-9`` then becomes a single hyphen, and leading or
    trailing hyphens are removed.
    """
    lowered = s.strip().lower()
    return re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")

def load_docs_from_wiki_links(wiki_links):
    """Download every linked Wikipedia page and key its text and metadata by doc ID.

    For each URL the title is derived with title_from_wiki_url() and the page
    fetched with fetch_page_extract_by_title(). Pages whose extract is empty
    after stripping are reported and skipped. The doc ID is
    ``"wiki_" + slugify(<title returned by the API>)``.

    Returns:
        ``(docs, docs_meta)``: ``docs`` maps doc ID to the stripped page text,
        and ``docs_meta`` maps doc ID to a dict with ``title``, ``url``,
        ``source`` and ``pageid``.
    """
    texts_by_id = {}
    meta_by_id = {}

    for link in wiki_links:
        requested_title = title_from_wiki_url(link)
        page = fetch_page_extract_by_title(requested_title)
        body = (page.get("text") or "").strip()

        if body:
            page_title = page["title"]
            key = f"wiki_{slugify(page_title)}"
            texts_by_id[key] = body
            meta_by_id[key] = {
                "title": page_title,
                # fall back to the input link when the API returned no URL
                "url": page.get("url") or link,
                "source": "wikipedia",
                "pageid": page.get("pageid"),
            }
            print(f"Loaded: {page_title} -> {key} | chars={len(body)}")
        else:
            print(f"Empty/blocked page: {requested_title} | {link}")

    return texts_by_id, meta_by_id

# Fetch every page in wiki_links now (this performs network requests) and keep
# the text / metadata dicts as module-level state for the following cells.
docs, docs_meta = load_docs_from_wiki_links(wiki_links)
print("Docs loaded:", len(docs))


Loaded: Plant disease -> wiki_plant-disease | chars=9654
Loaded: Plant pathology -> wiki_plant-pathology | chars=5228
Loaded: Fungus -> wiki_fungus | chars=65562
Loaded: Bacterial wilt -> wiki_bacterial-wilt | chars=3688
Loaded: Erysiphaceae -> wiki_erysiphaceae | chars=14230
Docs loaded: 5


In [7]:
# CELL 7: Build the required index (term -> DocIDs) + build FAISS embeddings store (doc-level)

# 1) Build inverted index (term -> DocIDs)
inv_index = InvertedIndexStore()

for doc_id, text in docs.items():
    terms = preprocess_text(text)   # uses custom stopwords + stemming
    # De-duplicate first: the index records presence only, so one
    # add_occurrence() per distinct term per document is enough.
    for t in set(terms):            # presence only (not frequency)
        inv_index.add_occurrence(t, doc_id)

print(f" Inverted index built. Unique terms: {inv_index.count_terms()}")

# 2) Build embeddings + FAISS (one vector per doc)
# doc_ids fixes the row order: FAISS row i <-> doc_ids[i] <-> texts[i].
doc_ids = list(docs.keys())
texts = [docs[d] for d in doc_ids]

# NOTE(review): each full article is encoded as a single vector. Sentence-embedding
# models usually truncate input beyond their maximum sequence length, so long
# articles are presumably represented by their beginning only - confirm, and
# consider chunking if retrieval quality matters.
emb = embed_model.encode(texts, convert_to_numpy=True, normalize_embeddings=True).astype("float32")

# NOTE(review): reading emb.shape[1] assumes a 2-D result, i.e. at least one
# document was loaded - TODO confirm behaviour for an empty corpus.
vector_dim = emb.shape[1]
faiss_index = faiss.IndexFlatIP(vector_dim)  # cosine similarity via normalized embeddings
faiss_index.add(emb)

# parallel arrays for retrieval results
# (these bind the same list objects as doc_ids / texts; no copy is made)
vector_doc_ids = doc_ids
vector_texts = texts

print(f" FAISS built. Vectors: {faiss_index.ntotal} | dim={vector_dim}")


 InvertedIndexStore initialized
 Inverted index built. Unique terms: 2580
 FAISS built. Vectors: 5 | dim=384


In [8]:
# CELL 8: Firebase / Firestore initialization (cloud persistence setup)

!pip -q install firebase-admin

import firebase_admin
from firebase_admin import credentials, firestore

# Initialize Firebase Admin SDK using a service account key
# NOTE(review): the key is referenced by a bare file name, so it presumably has
# to be present in the runtime's working directory before this cell runs - confirm.
# A service-account key is a credential and should not be committed with the notebook.
cred = credentials.Certificate(
    "hw02-cloud-inverted-index-firebase-adminsdk-fbsvc-437db7abaa.json"
)

# Prevent re-initialization errors in notebook environments
# (re-running the cell skips initialize_app once an app is registered;
# note that `_apps` is an underscore-prefixed attribute of the SDK module)
if not firebase_admin._apps:
    firebase_admin.initialize_app(cred)

# Create Firestore client
# `db` is the module-level client used by the upload functions in Cells 9-10.
db = firestore.client()
print("Firestore connected:", db.project)


Firestore connected: hw02-cloud-inverted-index


In [9]:
# CELL 9: Upload inverted index to Firestore (cloud storage of term → DocIDs)

from google.cloud.firestore_v1 import ArrayUnion

def upload_inverted_index(inv_index, collection_name="inverted_index", batch_size=400):
    """
    Write every term of the inverted index to Firestore using batched commits.

    One Firestore document is written per term:
      {collection_name}/{term}     (document ID = first 1500 characters of the term)

    Fields of each document:
      - term     : the term as returned by inv_index.to_required_format()
      - doc_ids  : list of document IDs containing the term
      - df       : document frequency, i.e. len(doc_ids)

    Writes are queued on a batch that is committed as soon as `batch_size`
    writes are pending; whatever is left over is committed at the end.
    Uses the module-level Firestore client `db`.
    """
    collection = db.collection(collection_name)
    entries = inv_index.to_required_format()  # [{"term": ..., "DocIDs": [...]}, ...]

    pending_batch = db.batch()
    pending = 0

    for entry in entries:
        term, postings = entry["term"], entry["DocIDs"]

        # The term doubles as the document ID; the slice caps its length.
        target = collection.document(term[:1500])
        payload = {
            "term": term,
            "doc_ids": postings,
            "df": len(postings),
        }
        pending_batch.set(target, payload)
        pending += 1

        if pending < batch_size:
            continue

        # Batch is full: flush it and start a fresh one.
        pending_batch.commit()
        pending_batch = db.batch()
        pending = 0

    # Flush the final, partially filled batch.
    if pending > 0:
        pending_batch.commit()

    print(f"Uploaded {len(entries)} terms to Firestore collection '{collection_name}'")

# Upload the inverted index built in Cell 7 (runs immediately and writes to Firestore)
upload_inverted_index(inv_index)


Uploaded 2580 terms to Firestore collection 'inverted_index'


In [10]:
# CELL 10: Upload Wikipedia document metadata to Firestore (documents collection)

def upload_wiki_meta(docs_meta, collection_name="documents", batch_size=400):
    """
    Write the metadata of each loaded Wikipedia page to Firestore.

    One Firestore document is written per entry of `docs_meta`:
      {collection_name}/{doc_id}

    Fields of each document:
      - doc_id  : internal document ID (e.g., wiki_plant-disease)
      - title   : page title              (default "")
      - url     : page URL                (default "")
      - source  : origin label            (default "wikipedia")
      - pageid  : Wikipedia page id       (default None)

    Writes use merge=True and are committed in batches of `batch_size`.
    Only metadata is uploaded, never the article text. Uses the module-level
    Firestore client `db`.
    """
    collection = db.collection(collection_name)

    pending_batch = db.batch()
    pending = 0

    for doc_id, meta in docs_meta.items():
        target = collection.document(doc_id)
        fields = {
            "doc_id": doc_id,
            "title": meta.get("title", ""),
            "url": meta.get("url", ""),
            "source": meta.get("source", "wikipedia"),
            "pageid": meta.get("pageid"),
        }
        pending_batch.set(target, fields, merge=True)
        pending += 1

        if pending < batch_size:
            continue

        # Batch is full: flush it and start a fresh one.
        pending_batch.commit()
        pending_batch = db.batch()
        pending = 0

    # Flush the final, partially filled batch.
    if pending > 0:
        pending_batch.commit()

    print(f"Uploaded {len(docs_meta)} wiki docs to '{collection_name}'")

# Upload metadata for the Wikipedia docs loaded in Cell 6 (runs immediately and writes to Firestore)
upload_wiki_meta(docs_meta)


Uploaded 5 wiki docs to 'documents'


In [11]:
# CELL 11: Embedding-based document retrieval using FAISS (semantic search)

def retrieve_top_docs(query: str, top_k: int = 5):
    """
    Build a plain-text report of the `top_k` documents closest to `query`.

    Steps:
      1) Encode the query with the module-level embedding model (normalized, float32)
      2) Search the module-level FAISS index for the `top_k` nearest rows
      3) Emit one entry per hit: rank, doc ID, title, similarity score and a
         whitespace-collapsed snippet of at most 350 characters

    Hits whose row index is -1 are skipped. If no index has been built yet
    (or it holds no vectors) an explanatory message string is returned
    instead of a report.

    Note: This is retrieval only (no generation / no LLM).
    """
    if faiss_index is None or faiss_index.ntotal == 0:
        return "FAISS index is empty. Build vectors first."

    # Embed and normalize the query
    query_vec = embed_model.encode(
        [query], convert_to_numpy=True, normalize_embeddings=True
    ).astype("float32")

    # Search FAISS index; row 0 of each result belongs to our single query
    scores, rows = faiss_index.search(query_vec, top_k)

    report = [f"Query: {query}", "=" * 60]

    # Format ranked results (rank is the position in the FAISS result, 1-based)
    for rank, (row, raw_score) in enumerate(zip(rows[0], scores[0]), start=1):
        if row == -1:
            continue

        doc_id = vector_doc_ids[row]
        title = docs_meta.get(doc_id, {}).get("title", "")
        snippet = re.sub(r"\s+", " ", vector_texts[row])[:350]
        similarity = float(raw_score)

        report.extend([
            f"{rank}) {doc_id} | {title} | similarity: {similarity:.4f}",
            f"Snippet: {snippet}...",
            "-" * 60,
        ])

    return "\n".join(report)

print("Retrieval function ready")


Retrieval function ready


In [12]:
# CELL 12: RAG-style output (retrieval + "enriched" answer without OpenAI)
# We will: retrieve top docs, then produce a simple enriched response by extracting key sentences.

def split_sentences(text: str):
    """Split `text` into sentences and keep those longer than 30 characters.

    Whitespace runs are collapsed to single spaces first. The text is then cut
    at whitespace that follows '.', '!' or '?' (a simple baseline splitter).
    """
    collapsed = re.sub(r"\s+", " ", text).strip()
    candidates = re.split(r'(?<=[.!?])\s+', collapsed)
    return [sentence for sentence in candidates if len(sentence) > 30]

def rag_answer_without_llm(query: str, top_k: int = 3, max_sentences_per_doc: int = 2):
    """
    Build a retrieval report plus an extractive "answer" for `query` (no LLM).

    The report has two sections:
      1) The `top_k` documents returned by the module-level FAISS index, with
         rank, doc ID, title and similarity.
      2) For each retrieved document, up to `max_sentences_per_doc` sentences
         that share the most preprocessed terms with the query. Sentences with
         no shared term are never selected, and ties keep document order.

    If no index has been built yet (or it holds no vectors) an explanatory
    message string is returned instead.
    """
    if faiss_index is None or faiss_index.ntotal == 0:
        return "FAISS index is empty. Build vectors first."

    query_vec = embed_model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
    scores, rows = faiss_index.search(query_vec, top_k)

    out = [f"Query: {query}", "=" * 60, "Top retrieved documents:"]

    # Retrieval section: remember (doc_id, title) of every valid hit for part 2
    hits = []
    for rank, (row, raw_score) in enumerate(zip(rows[0], scores[0]), start=1):
        if row == -1:
            continue
        doc_id = vector_doc_ids[row]
        title = docs_meta.get(doc_id, {}).get("title", "")
        similarity = float(raw_score)
        hits.append((doc_id, title))
        out.append(f"{rank}) {doc_id} | {title} | similarity: {similarity:.4f}")
    out.append("=" * 60)

    # Enriched response (extractive, no LLM)
    out.append("Enriched response (extractive, no LLM):")
    query_terms = set(preprocess_text(query))

    for doc_id, title in hits:
        # Rank this document's sentences by how many stemmed terms they share
        # with the query.
        ranked = []
        for sentence in split_sentences(docs[doc_id]):
            shared = len(query_terms & set(preprocess_text(sentence)))
            if shared > 0:
                ranked.append((shared, sentence))

        # Stable sort: sentences with equal overlap stay in document order.
        ranked.sort(key=lambda pair: pair[0], reverse=True)
        picks = [sentence for _, sentence in ranked[:max_sentences_per_doc]]

        out.append(f"- Source: {doc_id} | {title}")
        if not picks:
            out.append("  • (No strong matching sentences found)")
        else:
            out.extend(f"  • {sentence}" for sentence in picks)
        out.append("-" * 60)

    return "\n".join(out)

print(" RAG-style (no OpenAI) function ready")


 RAG-style (no OpenAI) function ready


In [13]:
# CELL 13: Quick demo (edit the query text)

# Same query for both functions so the two outputs can be compared directly.
demo_query = "how to detect plant diseases using sensors and ai"

print(retrieve_top_docs(demo_query, top_k=3))
print()
print(rag_answer_without_llm(demo_query, top_k=3))


Query: how to detect plant diseases using sensors and ai
1) wiki_plant-disease | Plant disease | similarity: 0.4522
Snippet: Plant diseases are diseases in plants caused by pathogens (infectious organisms) and environmental conditions (physiological factors). Organisms that cause infectious disease include fungi, oomycetes, bacteria, viruses, viroids, virus-like organisms, phytoplasmas, protozoa, nematodes and parasitic plants. Not included are ectoparasites like insects...
------------------------------------------------------------
2) wiki_plant-pathology | Plant pathology | similarity: 0.4479
Snippet: Plant pathology or phytopathology is the scientific study of plant diseases caused by pathogens (infectious organisms) and environmental conditions (physiological factors). Plant pathology involves the study of pathogen identification, disease etiology, disease cycles, economic impact, plant disease epidemiology, plant disease resistance, how plant...
----------------------------------