<a href="https://colab.research.google.com/github/Asaad972/AutoPARKPorject/blob/main/HW02_Cloud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CELL 1: Minimal package installation (only if missing)
import importlib.util, sys, subprocess

def ensure(pkg, import_name=None):
    """Install ``pkg`` with pip unless its module can already be found.

    Args:
        pkg: Distribution name handed to ``pip install``.
        import_name: Module name probed with ``importlib.util.find_spec``;
            falls back to ``pkg`` when omitted or empty.
    """
    module_name = import_name if import_name else pkg
    if importlib.util.find_spec(module_name) is not None:
        # Module is already discoverable: nothing to install.
        return
    install_cmd = [sys.executable, "-m", "pip", "install", "-q", pkg]
    subprocess.check_call(install_cmd)

# Usually already installed in Colab, but keep safe:
ensure("pandas", "pandas")

# Required for your homework plan:
ensure("nltk", "nltk")
ensure("sentence-transformers", "sentence_transformers")
ensure("faiss-cpu", "faiss")
ensure("pymupdf", "fitz")


print(" Dependencies ready")


 Dependencies ready


In [2]:
# CELL 2: Imports + NLTK resources (run once per runtime)

import re
from collections import defaultdict
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sentence_transformers import SentenceTransformer
import faiss

# NLTK downloads (required for stopwords/tokenizer/lemmatizer)
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt_tab")

print(" Imports ready + NLTK resources downloaded")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...


 Imports ready + NLTK resources downloaded


[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# CELL 3: Store Classes (Vector Store + Inverted Index)
# =====================================================
"""
 CELL 3: STORE CLASSES
- SimpleVectorStore: stores embeddings + documents + metadatas + ids (like Tirgul 7)
- InvertedIndexStore: stores required index schema term -> DocIDs (homework requirement)
"""

import numpy as np
from collections import defaultdict

# ---------- Vector Store (similar to Tirgul 7) ----------
class SimpleVectorStore:
    """Simple in-memory vector store (fallback).

    Holds four parallel lists (embeddings, documents, metadatas, ids); entry
    ``i`` of each list describes the same item. Queries are answered by a
    brute-force cosine-similarity scan over every stored embedding.
    """

    def __init__(self):
        self.documents = []
        self.embeddings = []   # list of float32 numpy arrays, one per document
        self.metadatas = []
        self.ids = []
        print(" SimpleVectorStore initialized")

    def add(self, embeddings, documents, metadatas, ids):
        """Append items to the store.

        Args:
            embeddings: Iterable of vectors; each is converted to float32.
            documents: Iterable of document payloads.
            metadatas: Iterable of per-document metadata objects.
            ids: Iterable of document identifiers.

        Raises:
            ValueError: If the four inputs do not have the same length. The
                store is left unchanged in that case, so the parallel lists
                can never drift out of alignment.
        """
        # Ensure numpy arrays
        embeddings = [np.asarray(e, dtype=np.float32) for e in embeddings]
        documents = list(documents)
        metadatas = list(metadatas)
        ids = list(ids)

        # Validate before mutating: query() indexes all four lists with the
        # same row numbers.
        lengths = (len(embeddings), len(documents), len(metadatas), len(ids))
        if len(set(lengths)) != 1:
            raise ValueError(
                "embeddings, documents, metadatas and ids must have the same "
                f"length, got {lengths}"
            )

        self.embeddings.extend(embeddings)
        self.documents.extend(documents)
        self.metadatas.extend(metadatas)
        self.ids.extend(ids)
        print(f" Added {len(documents)} documents to simple vector store")

    def query(self, query_embeddings, n_results=5):
        """Return the ``n_results`` stored items most similar to the query.

        Only ``query_embeddings[0]`` is used. The result is a dict with keys
        ``ids``, ``documents``, ``metadatas`` and ``distances``, each holding
        one inner list (best match first). ``distances`` holds
        ``1 - cosine_similarity``.
        """
        if not self.embeddings:
            return {'ids': [[]], 'documents': [[]], 'metadatas': [[]], 'distances': [[]]}

        q = np.asarray(query_embeddings[0], dtype=np.float32)

        E = np.vstack(self.embeddings)  # shape: (N, d)

        # cosine similarity without sklearn; the epsilon avoids division by
        # zero for all-zero vectors
        q_norm = np.linalg.norm(q) + 1e-12
        E_norm = np.linalg.norm(E, axis=1) + 1e-12
        sims = (E @ q) / (E_norm * q_norm)

        top_idx = np.argsort(sims)[::-1][:n_results]

        return {
            'ids': [[self.ids[i] for i in top_idx]],
            'documents': [[self.documents[i] for i in top_idx]],
            'metadatas': [[self.metadatas[i] for i in top_idx]],
            'distances': [[float(1 - sims[i]) for i in top_idx]]  # distance-like
        }

    def count(self):
        """Return the number of stored documents."""
        return len(self.documents)


# ---------- Inverted Index (required by homework) ----------
class InvertedIndexStore:
    """Inverted index in the required shape: term -> DocIDs."""

    def __init__(self):
        # A term gets an empty posting set the first time it is written to.
        self.term_to_docids = defaultdict(set)
        print(" InvertedIndexStore initialized")

    def add_occurrence(self, term: str, doc_id: str):
        """Record that ``doc_id`` contains ``term`` (repeats collapse)."""
        posting_set = self.term_to_docids[term]
        posting_set.add(doc_id)

    def get_docids(self, term: str):
        """Return the sorted DocIDs for ``term``; unknown terms give ``[]``."""
        # Membership test rather than indexing, so a lookup never inserts a key.
        if term not in self.term_to_docids:
            return []
        return sorted(self.term_to_docids[term])

    def count_terms(self) -> int:
        """Return the number of distinct terms in the index."""
        return len(self.term_to_docids)

    def to_required_format(self):
        """Return ``[{"term": ..., "DocIDs": [...]}, ...]`` sorted by term."""
        rows = []
        for term in sorted(self.term_to_docids):
            rows.append({"term": term, "DocIDs": sorted(self.term_to_docids[term])})
        return rows


print(" Store classes defined!")
print(" Next: Cell 4 (core logic: preprocess + build index + embeddings)")


 Store classes defined!
 Next: Cell 4 (core logic: preprocess + build index + embeddings)


In [4]:
# CELL 4A: Core text preprocessing (custom stopwords + stemming)
# This part is needed for the inverted index (term -> DocIDs) and also reused later.

# --- Custom stopwords (you define them) ---
# We remove these words because they are very frequent function words (articles, prepositions, pronouns).
# They usually do not add topic meaning, but they increase index size and add noise to retrieval.
CUSTOM_STOPWORDS = {
    "the","a","an","and","or","but",
    "to","of","in","on","at","for","from","by","with","as",
    "is","are","was","were","be","been","being",
    "this","that","these","those",
    "it","its","they","them","their","we","our","you","your",
    "i","me","my","he","him","his","she","her",
    "not","no","do","does","did","doing"
}

stemmer = PorterStemmer()

def preprocess_text(text: str):
    """
    Turn raw text into the list of index terms.

    Pipeline: lowercase -> ``word_tokenize`` -> drop tokens that are not
    purely alphabetic or that appear in ``CUSTOM_STOPWORDS`` -> stem each
    surviving token with the module-level ``stemmer``.
    Order and duplicates of the surviving tokens are preserved.
    """
    lowered_tokens = word_tokenize(text.lower())
    return [
        stemmer.stem(token)
        for token in lowered_tokens
        if token.isalpha() and token not in CUSTOM_STOPWORDS
    ]

print(" Core preprocessing ready (custom stopwords + stemming)")


 Core preprocessing ready (custom stopwords + stemming)


In [5]:
# CELL 5: Wikipedia source links (seed documents for the corpus)

wiki_links = [
    "https://en.wikipedia.org/wiki/Plant_disease",
    "https://en.wikipedia.org/wiki/Plant_pathology",
    "https://en.wikipedia.org/wiki/Fungus",
    "https://en.wikipedia.org/wiki/Bacterial_wilt",
    "https://en.wikipedia.org/wiki/Powdery_mildew"
]

print("Wikipedia links used:")
for i, link in enumerate(wiki_links, 1):
    print(f"{i}. {link}")


Wikipedia links used:
1. https://en.wikipedia.org/wiki/Plant_disease
2. https://en.wikipedia.org/wiki/Plant_pathology
3. https://en.wikipedia.org/wiki/Fungus
4. https://en.wikipedia.org/wiki/Bacterial_wilt
5. https://en.wikipedia.org/wiki/Powdery_mildew


In [6]:
# CELL 6: Load documents from Wikipedia (API fetch + normalization + metadata)

import requests
import re

WIKI_API = "https://en.wikipedia.org/w/api.php"

# Wikipedia blocks requests without a proper User-Agent sometimes
HEADERS = {
    "User-Agent": "HW02-Cloud-RAG/1.0 (student project; contact: student@example.com)"
}

def title_from_wiki_url(url: str) -> str:
    """Extract a human-readable page title from a ``.../wiki/<Title>`` URL.

    The fragment (``#...``) and query string (``?...``) are dropped,
    percent-escapes are decoded (``Caf%C3%A9`` -> ``Café``) and underscores
    become spaces. Decoding matters because the title is later sent as a
    request parameter, where a still-encoded ``%`` would be encoded again.

    Raises:
        ValueError: If the URL has no ``/wiki/`` segment or the title is empty.
    """
    from urllib.parse import unquote  # local import keeps the cell self-contained

    if "/wiki/" not in url:
        raise ValueError(f"Unsupported Wikipedia URL: {url}")
    title = url.split("/wiki/", 1)[1]
    title = title.split("#", 1)[0]      # remove anchors
    title = title.split("?", 1)[0]      # remove query string
    title = unquote(title).replace("_", " ").strip()
    if not title:
        raise ValueError(f"Unsupported Wikipedia URL: {url}")
    return title

def fetch_page_extract_by_title(title: str):
    """Fetch the plain-text extract and basic info for one Wikipedia page.

    Sends a single ``action=query`` request (``prop=extracts|info``) to
    ``WIKI_API`` using ``HEADERS``, following redirects.

    Returns:
        dict with keys ``pageid``, ``title``, ``url`` and ``text``. When the
        page is missing or invalid, or the response carries no page data
        (e.g. an API error payload), an "empty" record is returned:
        ``{"pageid": None, "title": title, "url": "", "text": ""}``.

    Raises:
        requests.HTTPError: For non-2xx HTTP responses (``raise_for_status``).
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts|info",
        "titles": title,
        "inprop": "url",
        "explaintext": True,
        "redirects": 1,   # follow redirects
        "origin": "*"     # helps in some environments
    }
    r = requests.get(WIKI_API, params=params, headers=HEADERS, timeout=30)
    r.raise_for_status()

    empty_record = {"pageid": None, "title": title, "url": "", "text": ""}

    # Error payloads have no "query"/"pages"; treat them as "no page"
    # instead of raising KeyError / StopIteration.
    payload = r.json()
    query_part = payload.get("query") if isinstance(payload, dict) else None
    pages = (query_part or {}).get("pages") or {}
    page = next(iter(pages.values()), None)

    # Handle missing / invalid page
    if page is None or "missing" in page or "invalid" in page:
        return empty_record

    return {
        "pageid": page.get("pageid"),
        "title": page.get("title", title),
        "url": page.get("fullurl", ""),
        "text": page.get("extract", "")
    }

def slugify(s: str) -> str:
    """Return a lowercase slug: runs of non ``[a-z0-9]`` characters become one '-'.

    Surrounding whitespace is trimmed first and leading/trailing hyphens are
    removed from the result.
    """
    return re.sub(r"[^a-z0-9]+", "-", s.strip().lower()).strip("-")

def load_docs_from_wiki_links(wiki_links):
    """Download every linked Wikipedia page and build the corpus dictionaries.

    Args:
        wiki_links: Iterable of ``.../wiki/<Title>`` URLs.

    Returns:
        ``(docs, docs_meta)`` where ``docs`` maps ``wiki_<slug>`` to the page
        text and ``docs_meta`` maps the same id to title, url, source and
        pageid. Pages with an empty extract are reported and skipped.
    """
    texts_by_id, meta_by_id = {}, {}

    for link in wiki_links:
        page_title = title_from_wiki_url(link)
        page = fetch_page_extract_by_title(page_title)

        body = (page.get("text") or "").strip()
        if body:
            # The id comes from the title the API returned (after redirects).
            key = f"wiki_{slugify(page['title'])}"
            texts_by_id[key] = body
            meta_by_id[key] = {
                "title": page["title"],
                "url": page.get("url") or link,
                "source": "wikipedia",
                "pageid": page.get("pageid"),
            }
            print(f"Loaded: {page['title']} -> {key} | chars={len(body)}")
        else:
            print(f"Empty/blocked page: {page_title} | {link}")

    return texts_by_id, meta_by_id

docs, docs_meta = load_docs_from_wiki_links(wiki_links)
print("Docs loaded:", len(docs))


Loaded: Plant disease -> wiki_plant-disease | chars=9654
Loaded: Plant pathology -> wiki_plant-pathology | chars=5228
Loaded: Fungus -> wiki_fungus | chars=65562
Loaded: Bacterial wilt -> wiki_bacterial-wilt | chars=3688
Loaded: Erysiphaceae -> wiki_erysiphaceae | chars=14230
Docs loaded: 5


In [7]:
# CELL 7A: Build the required inverted index (term -> DocIDs)

inv_index = InvertedIndexStore()

for doc_id, text in docs.items():
    terms = preprocess_text(text)   # custom stopwords + stemming
    for t in set(terms):            # presence only (not frequency)
        inv_index.add_occurrence(t, doc_id)

print(f"Inverted index built. Unique terms: {inv_index.count_terms()}")


 InvertedIndexStore initialized
Inverted index built. Unique terms: 2580


In [8]:
# CELL 8: Firebase / Firestore initialization (cloud persistence setup)

!pip -q install firebase-admin

import firebase_admin
from firebase_admin import credentials, firestore

# Initialize Firebase Admin SDK using a service account key
cred = credentials.Certificate(
    "hw02-cloud-inverted-index-firebase-adminsdk-fbsvc-437db7abaa.json"
)

# Prevent re-initialization errors in notebook environments
if not firebase_admin._apps:
    firebase_admin.initialize_app(cred)

# Create Firestore client
db = firestore.client()
print("Firestore connected:", db.project)


Firestore connected: hw02-cloud-inverted-index


In [9]:
# CELL 9: Upload inverted index to Firestore (cloud storage of term → DocIDs)

from google.cloud.firestore_v1 import ArrayUnion

def upload_inverted_index(inv_index, collection_name="inverted_index", batch_size=400):
    """
    Write the inverted index to Firestore, one document per term.

    Document path:
      {collection_name}/{term}   (term trimmed to 1500 characters for the ID)

    Fields written for each term:
      - term     : the stemmed term
      - doc_ids  : list of document IDs containing the term
      - df       : document frequency, i.e. len(doc_ids)

    Writes are grouped into batches; a batch is committed as soon as it holds
    ``batch_size`` writes, and any remainder is committed at the end.
    """
    collection = db.collection(collection_name)
    entries = inv_index.to_required_format()  # [{"term": ..., "DocIDs": [...]}, ...]

    writer = db.batch()
    pending = 0

    for entry in entries:
        postings = entry["DocIDs"]
        payload = {
            "term": entry["term"],
            "doc_ids": postings,
            "df": len(postings),
        }
        # The (trimmed) term doubles as the Firestore document ID.
        writer.set(collection.document(entry["term"][:1500]), payload)

        pending += 1
        if pending < batch_size:
            continue

        # Batch is full: flush it and start a fresh one.
        writer.commit()
        writer = db.batch()
        pending = 0

    # Flush whatever is left over.
    if pending > 0:
        writer.commit()

    print(f"Uploaded {len(entries)} terms to Firestore collection '{collection_name}'")

# Upload the built inverted index
upload_inverted_index(inv_index)


Uploaded 2580 terms to Firestore collection 'inverted_index'


In [10]:
# CELL 10: Upload Wikipedia document metadata to Firestore (documents collection)

def upload_wiki_meta(docs_meta, collection_name="documents", batch_size=400):
    """
    Write Wikipedia document metadata to Firestore (metadata only, no text).

    Document path:
      {collection_name}/{doc_id}

    Fields written (merged into any existing document):
      - doc_id  : internal document ID (e.g., wiki_plant-disease)
      - title   : Wikipedia page title ("" when absent)
      - url     : Wikipedia page URL ("" when absent)
      - source  : defaults to "wikipedia"
      - pageid  : Wikipedia page id, or None

    A batch is committed whenever it reaches ``batch_size`` writes; the
    remainder is committed at the end.
    """
    collection = db.collection(collection_name)

    writer = db.batch()
    pending = 0

    for doc_id in docs_meta:
        info = docs_meta[doc_id]
        record = {
            "doc_id": doc_id,
            "title": info.get("title", ""),
            "url": info.get("url", ""),
            "source": info.get("source", "wikipedia"),
            "pageid": info.get("pageid"),
        }
        writer.set(collection.document(doc_id), record, merge=True)

        pending += 1
        if pending < batch_size:
            continue

        # Batch is full: flush it and start a fresh one.
        writer.commit()
        writer = db.batch()
        pending = 0

    if pending > 0:
        writer.commit()

    print(f"Uploaded {len(docs_meta)} wiki docs to '{collection_name}'")

# Upload metadata for the loaded Wikipedia docs
upload_wiki_meta(docs_meta)


Uploaded 5 wiki docs to 'documents'


In [11]:
# CELL 11: Read inverted index from Firestore (term -> DocIDs)
# This cell is used to fetch candidate documents for hybrid retrieval (Option A)

def fetch_postings_for_query(query: str, collection_name="inverted_index"):
    """
    Reads only the inverted-index entries needed for the query.

    The query goes through ``preprocess_text`` (the same stopword removal and
    stemming used at indexing time); each distinct term is then looked up as
    the Firestore document ``{collection_name}/{term[:1500]}``.

    Returns:
        dict: {term: [doc_id, ...]} with one key per distinct query term in
        sorted order; terms without an index entry map to ``[]``. An empty
        dict is returned when preprocessing leaves no terms.
    """
    # Use the same preprocessing (stopwords + stemming) as indexing
    q_terms = sorted(set(preprocess_text(query)))
    if not q_terms:
        return {}

    col = db.collection(collection_name)

    # Each term was stored as a Firestore document (doc_id = trimmed term)
    term_by_doc_key = {t[:1500]: t for t in q_terms}
    refs = [col.document(key) for key in term_by_doc_key]

    # get_all() does not guarantee that snapshots come back in request order,
    # so each snapshot is matched to its term by document id, not position.
    postings = {t: [] for t in q_terms}
    for snap in db.get_all(refs):
        term = term_by_doc_key.get(snap.id)
        if term is not None and snap.exists:
            postings[term] = (snap.to_dict() or {}).get("doc_ids", [])

    return postings


def candidates_from_postings(postings: dict, mode="OR", min_match=1, max_candidates=2000):
    """
    Converts postings into a candidate DocID list with a deterministic order.

    Terms whose posting list is empty are ignored in both modes.

    mode (case-insensitive):
      - "AND": DocIDs present in every non-empty posting list, sorted by DocID
      - anything else ("OR", the default): union of DocIDs, ordered by how
        many query terms matched (most first), ties broken by DocID
    min_match:
      - minimum number of query terms a document must match (OR mode only)
    max_candidates:
      - the result is truncated to this many DocIDs

    Returns:
        list of DocIDs ([] when no term has postings).
    """
    from collections import Counter  # local import keeps the cell self-contained

    sets = [set(v) for v in postings.values() if v]
    if not sets:
        return []

    if mode.upper() == "AND":
        # Sort so that truncation does not depend on set iteration order.
        return sorted(set.intersection(*sets))[:max_candidates]

    # OR mode with minimum match constraint
    counts = Counter(doc_id for s in sets for doc_id in s)
    ranked = sorted(
        (d for d, c in counts.items() if c >= min_match),
        key=lambda d: (-counts[d], d),   # more matches first, then DocID
    )
    return ranked[:max_candidates]


print("CELL 11 ready: Firestore inverted-index reader")


CELL 11 ready: Firestore inverted-index reader


In [12]:
# CELL 12: Core semantic retrieval setup (embedding model + FAISS globals)
# This cell prepares the semantic components that will be used AFTER Cell 11
# (Cell 11 reads candidate DocIDs from Firestore; Cell 12 sets up embeddings/FAISS globals).

# --- Embedding model (for semantic retrieval) ---
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- FAISS index (stores embeddings for doc-level retrieval) ---
faiss_index = None
vector_dim = None

# Parallel stores (FAISS row -> doc data)
vector_doc_ids = []   # doc_id (aligned with FAISS rows / embeddings)
vector_texts = []     # full doc text (aligned with FAISS rows / embeddings)

# Extra mappings needed for Option A (Firestore candidates -> semantic rerank)
doc_embeddings = None   # numpy array (N, dim), aligned with vector_doc_ids
docid_to_row = {}       # dict: doc_id -> row index in doc_embeddings/vector_doc_ids

print("CELL 12 ready: semantic setup (embeddings + FAISS globals)")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


CELL 12 ready: semantic setup (embeddings + FAISS globals)


In [13]:
# CELL 14: Hybrid retrieval (Firestore inverted-index -> candidates -> semantic rerank)
# Uses:
# - CELL 11: fetch_postings_for_query + candidates_from_postings (Firestore inverted index)
# - CELL 12: semantic globals (embed_model, doc_embeddings, docid_to_row, vector_* )
# - CELL 13: built embeddings + FAISS (doc_embeddings/docid_to_row filled)

def retrieve_top_docs(query: str, top_k: int = 5,
                      mode="OR", min_match=1, max_candidates=2000):
    """
    Hybrid retrieval (Option A), returned as a printable report string.

    Steps:
      1) Ask the Firestore inverted index for candidate DocIDs
      2) Embed the query (normalized) and score the candidates by dot product
         against ``doc_embeddings``
      3) List the ``top_k`` docs with title, similarity and a 350-char snippet

    When the inverted index yields no candidates, the whole FAISS index is
    searched instead. A short message string is returned when the FAISS index
    is empty or when no candidate is known to ``docid_to_row``.
    """
    if faiss_index is None or faiss_index.ntotal == 0:
        return "FAISS index is empty. Build vectors first."

    def _hit_lines(rank, row, score):
        # Three report lines for one result: header, snippet, separator.
        doc_id = vector_doc_ids[row]
        title = docs_meta.get(doc_id, {}).get("title", "")
        snippet = re.sub(r"\s+", " ", vector_texts[row])[:350]
        return [
            f"{rank}) {doc_id} | {title} | similarity: {score:.4f}",
            f"Snippet: {snippet}...",
            "-" * 60,
        ]

    # 1) Candidates from the Firestore inverted index (Cell 11)
    candidates = candidates_from_postings(
        fetch_postings_for_query(query),
        mode=mode, min_match=min_match, max_candidates=max_candidates,
    )

    # 2) Normalized query embedding
    encoded = embed_model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    q_emb = encoded.astype("float32")[0]

    if not candidates:
        # Fallback: nothing from Firestore, so search the full FAISS index.
        distances, indices = faiss_index.search(q_emb.reshape(1, -1), top_k)

        report = [f"Query: {query}", "=" * 60, "Mode: FAISS-only (no Firestore candidates)"]
        for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), start=1):
            if idx != -1:   # -1 marks an unfilled result slot
                report.extend(_hit_lines(rank, idx, float(dist)))
        return "\n".join(report)

    # 3) Rerank the candidates that have a local embedding row
    rows = [docid_to_row[d] for d in candidates if d in docid_to_row]
    if not rows:
        return "No candidate DocIDs matched local embeddings. (Docs mismatch between Firestore and local runtime.)"

    sims = doc_embeddings[rows] @ q_emb   # (C,) dot products
    best_first = np.argsort(sims)[::-1][:top_k]

    report = [
        f"Query: {query}",
        "=" * 60,
        f"Mode: Hybrid Firestore({mode}, min_match={min_match}) -> semantic rerank",
        "-" * 60,
    ]
    for rank, j in enumerate(best_first, start=1):
        pos = int(j)
        report.extend(_hit_lines(rank, rows[pos], float(sims[pos])))

    return "\n".join(report)

print("CELL 14 ready: Hybrid retrieval function (Firestore + embeddings)")


CELL 14 ready: Hybrid retrieval function (Firestore + embeddings)


In [14]:
# CELL 15: Hybrid RAG-style output (Firestore inverted-index -> candidates -> semantic rerank -> extractive answer)
# Uses:
# - CELL 11: Firestore inverted-index reader
# - CELL 12/13: embeddings + mappings
# - CELL 14: hybrid retrieve_top_docs (for consistent retrieval behavior)

def split_sentences(text: str):
    """Split text into sentences and keep those longer than 30 characters.

    Whitespace runs are collapsed to single spaces first; a sentence boundary
    is any whitespace that follows '.', '!' or '?'.
    """
    flattened = re.sub(r"\s+", " ", text).strip()
    pieces = re.split(r'(?<=[.!?])\s+', flattened)
    return list(filter(lambda piece: len(piece) > 30, pieces))

def rag_answer_without_llm(query: str, top_k: int = 3, max_sentences_per_doc: int = 2,
                           mode="OR", min_match=1, max_candidates=2000):
    """
    Build an extractive, LLM-free answer report for ``query``.

    Retrieval follows the same steps as ``retrieve_top_docs``: candidates come
    from the Firestore inverted index and are reranked by embedding dot
    product; with no candidates the full FAISS index is searched instead.
    For each retrieved doc, the sentences sharing the most preprocessed terms
    with the query (up to ``max_sentences_per_doc``) are quoted.

    The "Mode:" line states which retrieval path actually ran.

    Returns:
        The report as one string, or a short message string when the FAISS
        index is empty or no candidate is known to ``docid_to_row``.
    """
    if faiss_index is None or faiss_index.ntotal == 0:
        return "FAISS index is empty. Build vectors first."

    # --- Hybrid retrieval step (same logic as CELL 14) ---
    postings = fetch_postings_for_query(query)
    candidates = candidates_from_postings(
        postings, mode=mode, min_match=min_match, max_candidates=max_candidates
    )

    q_emb = embed_model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")[0]

    retrieved = []  # list of (doc_id, title, score)

    if not candidates:
        # Fallback: no candidates -> full FAISS search
        mode_line = "Mode: FAISS-only (no Firestore candidates) -> extractive"
        distances, indices = faiss_index.search(q_emb.reshape(1, -1), top_k)
        for idx, dist in zip(indices[0], distances[0]):
            if idx == -1:   # unfilled result slot
                continue
            doc_id = vector_doc_ids[idx]
            title = docs_meta.get(doc_id, {}).get("title", "")
            retrieved.append((doc_id, title, float(dist)))
    else:
        mode_line = f"Mode: Hybrid Firestore({mode}, min_match={min_match}) -> semantic rerank -> extractive"
        # Rerank candidates by semantic similarity
        rows = [docid_to_row[d] for d in candidates if d in docid_to_row]
        if not rows:
            return "No candidate DocIDs matched local embeddings. (Docs mismatch between Firestore and local runtime.)"

        sims = doc_embeddings[rows] @ q_emb
        top_local = np.argsort(sims)[::-1][:top_k]

        for j in top_local:
            row = rows[int(j)]
            doc_id = vector_doc_ids[row]
            title = docs_meta.get(doc_id, {}).get("title", "")
            retrieved.append((doc_id, title, float(sims[int(j)])))

    # --- Output formatting ---
    lines = [f"Query: {query}", "=" * 60, mode_line, "-" * 60]

    # Retrieval section
    lines.append("Top retrieved documents:")
    for i, (doc_id, title, score) in enumerate(retrieved, start=1):
        lines.append(f"{i}) {doc_id} | {title} | similarity: {score:.4f}")
    lines.append("=" * 60)

    # Enriched response (extractive, no LLM)
    lines.append("Enriched response (extractive, no LLM):")
    q_terms = set(preprocess_text(query))

    for doc_id, title, score in retrieved:
        sents = split_sentences(docs[doc_id])

        # Score each sentence by the number of distinct query terms it shares.
        scored = []
        for s in sents:
            overlap = len(q_terms & set(preprocess_text(s)))
            if overlap > 0:
                scored.append((overlap, s))

        # Stable sort: equally scored sentences keep their document order.
        scored.sort(key=lambda x: x[0], reverse=True)
        best = [s for _, s in scored[:max_sentences_per_doc]]

        lines.append(f"- Source: {doc_id} | {title}")
        if best:
            for b in best:
                lines.append(f"  • {b}")
        else:
            lines.append("  • (No strong matching sentences found)")
        lines.append("-" * 60)

    return "\n".join(lines)

print("CELL 15 ready: Hybrid RAG-style (no OpenAI) function")


CELL 15 ready: Hybrid RAG-style (no OpenAI) function


In [15]:
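# CELL 16: Quick demo (edit the query text).
# NOTE: the FAISS/embedding build step must have been run first; otherwise both calls
# only return "FAISS index is empty. Build vectors first."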

print(retrieve_top_docs("how to detect plant diseases using sensors and ai", top_k=3))
print()
print(rag_answer_without_llm("how to detect plant diseases using sensors and ai", top_k=3))


FAISS index is empty. Build vectors first.

FAISS index is empty. Build vectors first.


In [16]:
'''
# CELL 4: Core setup (custom stopwords + stemming + embedding model + FAISS)

# --- Custom stopwords (you define them) ---
# We remove these words because they are very frequent function words (articles, prepositions, pronouns).
# They usually do not add topic meaning, but they increase index size and add noise to retrieval.
CUSTOM_STOPWORDS = {
    "the","a","an","and","or","but",
    "to","of","in","on","at","for","from","by","with","as",
    "is","are","was","were","be","been","being",
    "this","that","these","those",
    "it","its","they","them","their","we","our","you","your",
    "i","me","my","he","him","his","she","her",
    "not","no","do","does","did","doing"
}

stemmer = PorterStemmer()

def preprocess_text(text: str):
    """
    Returns list of terms for indexing:
    - lowercase
    - tokenize
    - keep alphabetic tokens only
    - remove custom stopwords
    - apply stemming
    """
    text = text.lower()
    tokens = word_tokenize(text)
    terms = []
    for tok in tokens:
        if tok.isalpha() and tok not in CUSTOM_STOPWORDS:
            terms.append(stemmer.stem(tok))
    return terms

# --- Embedding model (for semantic retrieval) ---
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- FAISS index (stores embeddings for doc-level retrieval) ---
faiss_index = None
vector_dim = None

# Parallel stores (FAISS row -> doc data)
vector_doc_ids = []   # doc_id
vector_texts = []     # full doc text

print(" Core setup ready (custom stopwords + stemming + embeddings + FAISS)")
'''



'\n# CELL 4: Core setup (custom stopwords + stemming + embedding model + FAISS)\n\n# --- Custom stopwords (you define them) ---\n# We remove these words because they are very frequent function words (articles, prepositions, pronouns).\n# They usually do not add topic meaning, but they increase index size and add noise to retrieval.\nCUSTOM_STOPWORDS = {\n    "the","a","an","and","or","but",\n    "to","of","in","on","at","for","from","by","with","as",\n    "is","are","was","were","be","been","being",\n    "this","that","these","those",\n    "it","its","they","them","their","we","our","you","your",\n    "i","me","my","he","him","his","she","her",\n    "not","no","do","does","did","doing"\n}\n\nstemmer = PorterStemmer()\n\ndef preprocess_text(text: str):\n    """\n    Returns list of terms for indexing:\n    - lowercase\n    - tokenize\n    - keep alphabetic tokens only\n    - remove custom stopwords\n    - apply stemming\n    """\n    text = text.lower()\n    tokens = word_tokenize(text)

In [17]:
'''
# CELL 7: Build the required index (term -> DocIDs) + build FAISS embeddings store (doc-level)

# 1) Build inverted index (term -> DocIDs)
inv_index = InvertedIndexStore()

for doc_id, text in docs.items():
    terms = preprocess_text(text)   # uses custom stopwords + stemming
    for t in set(terms):            # presence only (not frequency)
        inv_index.add_occurrence(t, doc_id)

print(f" Inverted index built. Unique terms: {inv_index.count_terms()}")

# 2) Build embeddings + FAISS (one vector per doc)
doc_ids = list(docs.keys())
texts = [docs[d] for d in doc_ids]

emb = embed_model.encode(texts, convert_to_numpy=True, normalize_embeddings=True).astype("float32")

vector_dim = emb.shape[1]
faiss_index = faiss.IndexFlatIP(vector_dim)  # cosine similarity via normalized embeddings
faiss_index.add(emb)

# parallel arrays for retrieval results
vector_doc_ids = doc_ids
vector_texts = texts

print(f" FAISS built. Vectors: {faiss_index.ntotal} | dim={vector_dim}")
'''

'\n# CELL 7: Build the required index (term -> DocIDs) + build FAISS embeddings store (doc-level)\n\n# 1) Build inverted index (term -> DocIDs)\ninv_index = InvertedIndexStore()\n\nfor doc_id, text in docs.items():\n    terms = preprocess_text(text)   # uses custom stopwords + stemming\n    for t in set(terms):            # presence only (not frequency)\n        inv_index.add_occurrence(t, doc_id)\n\nprint(f" Inverted index built. Unique terms: {inv_index.count_terms()}")\n\n# 2) Build embeddings + FAISS (one vector per doc)\ndoc_ids = list(docs.keys())\ntexts = [docs[d] for d in doc_ids]\n\nemb = embed_model.encode(texts, convert_to_numpy=True, normalize_embeddings=True).astype("float32")\n\nvector_dim = emb.shape[1]\nfaiss_index = faiss.IndexFlatIP(vector_dim)  # cosine similarity via normalized embeddings\nfaiss_index.add(emb)\n\n# parallel arrays for retrieval results\nvector_doc_ids = doc_ids\nvector_texts = texts\n\nprint(f" FAISS built. Vectors: {faiss_index.ntotal} | dim={vecto

In [18]:
# From here on: Asaad's part

In [19]:
!pip -q install firebase-admin ipywidgets matplotlib

import firebase_admin
from firebase_admin import credentials, firestore

# Path to the uploaded JSON file
SERVICE_ACCOUNT_PATH = "/content/hw02-cloud-inverted-index-firebase-adminsdk-fbsvc-437db7abaa.json"

# Init Firebase Admin
if not firebase_admin._apps:
    cred = credentials.Certificate(SERVICE_ACCOUNT_PATH)
    firebase_admin.initialize_app(cred)

db = firestore.client()
print("✅ Connected to Firestore in project:", db.project)


✅ Connected to Firestore in project: hw02-cloud-inverted-index


In [20]:
import requests
import pandas as pd

BASE_URL = "https://server-cloud-v645.onrender.com"

def fetch_history(feed: str, limit: int = 30) -> pd.DataFrame:
    """Fetch IoT history from course server. Returns DataFrame with created_at,value.

    Args:
        feed: Feed name sent as the ``feed`` query parameter.
        limit: Maximum number of samples requested (sent as an int).

    Returns:
        DataFrame sorted by ``created_at``; rows whose timestamp or value
        cannot be parsed are dropped. When the server returns no samples the
        frame is empty but still has the ``created_at`` and ``value`` columns.

    Raises:
        requests.HTTPError: For non-2xx responses.
        ValueError: If the JSON payload is not an object with a ``data`` key.
    """
    resp = requests.get(f"{BASE_URL}/history", params={"feed": feed, "limit": int(limit)}, timeout=120)
    resp.raise_for_status()
    data = resp.json()
    if not isinstance(data, dict) or "data" not in data:
        raise ValueError(f"Server error: {data}")

    records = data["data"]
    if not records:
        # pd.DataFrame([]) has no columns, so df["created_at"] would raise
        # KeyError; return a typed empty frame instead.
        return pd.DataFrame({
            "created_at": pd.Series(dtype="datetime64[ns]"),
            "value": pd.Series(dtype="float64"),
        })

    df = pd.DataFrame(records)
    df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df = df.dropna(subset=["created_at", "value"]).sort_values("created_at")
    return df


In [21]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
from datetime import datetime, timezone

# -------------------------
# Screen A — Plant Image
# -------------------------
plant_name = widgets.Text(placeholder="Plant name")
plant_upload = widgets.FileUpload(accept="image/*", multiple=False)
plant_save = widgets.Button(description="Save", button_style="success")
plant_out = widgets.Output()

def _first_upload(upload_value):
    """Return ``(filename, content_bytes)`` for the first uploaded file, or None.

    Handles both shapes of ``FileUpload.value``: the ipywidgets 7 dict
    ``{filename: {"content": ...}}`` and the ipywidgets 8 tuple of dicts
    with ``name`` / ``content`` keys.
    """
    if not upload_value:
        return None
    if isinstance(upload_value, dict):
        filename, info = next(iter(upload_value.items()))
    else:
        info = upload_value[0]
        filename = info["name"]
    # bytes() also converts a memoryview payload to bytes.
    return filename, bytes(info["content"])

def plant_show(_):
    """Preview the uploaded image in ``plant_out`` (clears previous output)."""
    with plant_out:
        clear_output()
        uploaded = _first_upload(plant_upload.value)
        if uploaded is not None:
            _filename, content = uploaded
            display(widgets.Image(value=content))

def plant_save_meta(_):
    """Store plant name, file name and a UTC timestamp in ``plant_images``.

    Only metadata is written; the image bytes are not uploaded.
    """
    with plant_out:
        uploaded = _first_upload(plant_upload.value)
        plant = plant_name.value.strip()
        if uploaded is None or not plant:
            print("Upload image + plant name")
            return
        filename, _content = uploaded
        db.collection("plant_images").add({
            "plant": plant,
            "file": filename,
            "time": datetime.now(timezone.utc)
        })
        print("Saved")

plant_upload.observe(plant_show, names="value")
plant_save.on_click(plant_save_meta)

screenA = widgets.VBox([plant_name, plant_upload, plant_save, plant_out])

# -------------------------
# Screen B — IoT Data
# -------------------------
iot_feed = widgets.Dropdown(options=["humidity","soil","temperature"], value="humidity", description="Feed:")
iot_limit = widgets.IntSlider(value=10, min=1, max=100, step=1, description="Samples:")
iot_btn = widgets.Button(description="Get Data", button_style="success")
iot_out = widgets.Output()

def iot_click(_):
    """Fetch the selected feed and show the table plus the latest sample.

    All output goes to ``iot_out``, which is cleared first. When no usable
    rows come back, a message is shown instead of indexing into an empty
    frame (which would raise IndexError).
    """
    with iot_out:
        clear_output()
        print("Waking server (first request can take time)...")
        df = fetch_history(iot_feed.value, iot_limit.value)
        print("Rows returned:", len(df))
        if df.empty:
            print("No data available for this feed.")
            return
        display(df)
        # fetch_history sorts by created_at, so the last row is the newest.
        print("\nLatest value:", df["value"].iloc[-1], "| at:", df["created_at"].iloc[-1])

iot_btn.on_click(iot_click)

screenB = widgets.VBox([iot_feed, iot_limit, iot_btn, iot_out])

# -------------------------
# Screen C — Query / Search
# -------------------------
index_box = widgets.Text(value="inverted_index", description="Index:")
query_box = widgets.Text(value="about", description="Query:")
search_btn = widgets.Button(description="Search", button_style="primary")
search_out = widgets.Output()

def search_inverted_index(index_name: str, term: str):
    """Look up one term in a Firestore inverted-index collection.

    The index is built from Porter-stemmed terms, so the lowercased query is
    tried as typed and then in its stemmed form (e.g. "diseases" ->
    "diseas"). For each form the document whose ID equals the term is read
    first, then a query on the ``term`` field is used as a fallback.

    Returns:
        ``(result, error)``: on success ``result`` is a dict with ``term``
        (the form that matched), ``df`` and ``doc_ids`` and ``error`` is
        None; otherwise ``result`` is None and ``error`` is a message.
    """
    from nltk.stem import PorterStemmer  # local import: this cell may run without Cell 2

    index_name = index_name.strip()
    term = term.strip().lower()
    if not index_name or not term:
        return None, "Enter both Index and Query."
    if "/" in term:
        # A slash would be read as a path separator in the document ID.
        return None, "Query must be a single term without '/'."

    lookup_terms = [term]
    stemmed = PorterStemmer().stem(term)
    if stemmed != term:
        lookup_terms.append(stemmed)

    col = db.collection(index_name)

    def _as_result(key, data):
        return {
            "term": key,
            "df": data.get("df"),
            "doc_ids": data.get("doc_ids", [])
        }

    for key in lookup_terms:
        # doc id == term (trimmed the same way as at upload time)
        doc = col.document(key[:1500]).get()
        if doc.exists:
            return _as_result(key, doc.to_dict() or {}), None

        # fallback by field
        qs = list(col.where("term", "==", key).limit(1).stream())
        if qs:
            return _as_result(key, qs[0].to_dict() or {}), None

    return None, f"No results for '{term}' in '{index_name}'."

def on_search(_):
    """Run the index lookup for the current widget values and print the result.

    Output is written to ``search_out`` after clearing it: either the error
    message, or the term, its ``df`` (when present) and one line per DocID.
    """
    with search_out:
        clear_output()
        hit, problem = search_inverted_index(index_box.value, query_box.value)
        if problem:
            print(problem)
        else:
            print("term:", hit["term"])
            doc_freq = hit["df"]
            if doc_freq is not None:
                print("df:", doc_freq)
            print("doc_ids:")
            for doc_id in hit["doc_ids"]:
                print(" -", doc_id)

search_btn.on_click(on_search)

screenC = widgets.VBox([index_box, query_box, search_btn, search_out])

# -------------------------
# Screen D — Dashboard
# -------------------------
dash_feed = widgets.Dropdown(options=["humidity","soil","temperature"], value="soil", description="Feed:")
dash_limit = widgets.IntSlider(value=30, min=10, max=200, step=10, description="Samples:")
dash_btn = widgets.Button(description="Build", button_style="warning")
dash_out = widgets.Output()

def dashboard_status(feed, latest):
    """Map the latest reading of a feed to a status label.

    - "soil": below 30 -> "Critical", below 45 -> "Warning", else "Healthy"
    - "humidity": below 30 -> "Warning", else "OK"
    - "temperature": below 10 or above 35 -> "Warning", else "OK"
    - any other feed -> "OK"
    """
    if feed == "soil":
        if latest < 30:
            return "Critical"
        return "Warning" if latest < 45 else "Healthy"
    if feed == "humidity":
        return "Warning" if latest < 30 else "OK"
    if feed == "temperature" and (latest < 10 or latest > 35):
        return "Warning"
    return "OK"

def on_dash(_):
    """Show the status label and a time-series plot for the selected feed.

    All output goes to ``dash_out``, which is cleared first. When no usable
    rows come back, a message is shown instead of indexing into an empty
    frame (which would raise IndexError).
    """
    with dash_out:
        clear_output()
        df = fetch_history(dash_feed.value, dash_limit.value)
        if df.empty:
            print("No data available for this feed.")
            return

        # fetch_history sorts by created_at, so the last row is the newest.
        latest = df["value"].iloc[-1]
        print("Status:", dashboard_status(dash_feed.value, latest))

        plt.figure()
        plt.plot(df["created_at"], df["value"], marker="o")
        plt.title(f"{dash_feed.value} over time")
        plt.xlabel("time")
        plt.ylabel("value")
        plt.xticks(rotation=30)
        plt.show()

dash_btn.on_click(on_dash)

screenD = widgets.VBox([dash_feed, dash_limit, dash_btn, dash_out])

# -------------------------
# Tabs
# -------------------------
tabs = widgets.Tab(children=[screenA, screenB, screenC, screenD])
tabs.set_title(0, "Plant Image")
tabs.set_title(1, "IoT Data")
tabs.set_title(2, "Query/Search")
tabs.set_title(3, "Dashboard")

display(tabs)


Tab(children=(VBox(children=(Text(value='', placeholder='Plant name'), FileUpload(value={}, accept='image/*', …