<a href="https://colab.research.google.com/github/Asaad972/CollabFirstNoteBook/blob/main/HW02_Cloud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# CELL 1: Minimal package installation (only if missing)
import importlib.util, sys, subprocess

def ensure(pkg, import_name=None):
    """Install ``pkg`` with pip unless its module can already be located.

    Args:
        pkg: Distribution name passed to ``pip install``.
        import_name: Module name to probe with ``importlib``; when falsy,
            ``pkg`` itself is used as the module name.
    """
    module_name = import_name if import_name else pkg
    if importlib.util.find_spec(module_name) is not None:
        # Module is resolvable: nothing to install.
        return
    pip_command = [sys.executable, "-m", "pip", "install", "-q", pkg]
    subprocess.check_call(pip_command)

# Usually already installed in Colab, but keep safe:
ensure("pandas", "pandas")

# Install the remaining dependencies only if they are not already available
ensure("nltk", "nltk")
ensure("sentence-transformers", "sentence_transformers")
ensure("faiss-cpu", "faiss")
ensure("pymupdf", "fitz")


print(" Dependencies ready")


In [None]:
# CELL 2: Imports + NLTK resources (run once per runtime)

import re
from collections import defaultdict
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sentence_transformers import SentenceTransformer
import faiss

# NLTK downloads (required for stopwords/tokenizer/lemmatizer)
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt_tab")

print(" Imports ready + NLTK resources downloaded")


In [None]:
# CELL 3: Store Classes (Vector Store + Inverted Index)
# =====================================================
"""
 CELL 3: STORE CLASSES
- SimpleVectorStore: stores embeddings + documents + metadatas + ids (like Tirgul 7)
- InvertedIndexStore: stores required index schema term -> DocIDs (homework requirement)
"""

import numpy as np
from collections import defaultdict

# ---------- Vector Store (similar to Tirgul 7) ----------
class SimpleVectorStore:
    """Simple in-memory vector store (fallback).

    Records are kept in four parallel lists (embeddings, documents,
    metadatas, ids): position ``i`` in each list describes the same record.
    """

    def __init__(self):
        self.documents = []
        self.embeddings = []   # list of numpy float32 arrays
        self.metadatas = []
        self.ids = []
        print(" SimpleVectorStore initialized")

    def add(self, embeddings, documents, metadatas, ids):
        """Append records to the store.

        Args:
            embeddings: Iterable of vectors (anything ``np.asarray`` accepts).
            documents: Sized collection of document texts.
            metadatas: Iterable of metadata objects, one per document.
            ids: Iterable of identifiers, one per document.

        Raises:
            ValueError: If the four inputs do not have the same length. Nothing
                is stored in that case, so the parallel lists stay aligned.
        """
        # Ensure numpy arrays
        embeddings = [np.asarray(e, dtype=np.float32) for e in embeddings]
        # Materialize so generators can be measured before anything is stored.
        metadatas = list(metadatas)
        ids = list(ids)

        # query() indexes all four lists with the same position, so a length
        # mismatch here would silently corrupt later results.
        sizes = (len(embeddings), len(documents), len(metadatas), len(ids))
        if len(set(sizes)) != 1:
            raise ValueError(
                "embeddings, documents, metadatas and ids must have the same "
                f"length, got {sizes}"
            )

        self.embeddings.extend(embeddings)
        self.documents.extend(documents)
        self.metadatas.extend(metadatas)
        self.ids.extend(ids)
        print(f" Added {len(documents)} documents to simple vector store")

    def query(self, query_embeddings, n_results=5):
        """Return the ``n_results`` records most similar to the first query vector.

        Only ``query_embeddings[0]`` is used. The result is a dict with keys
        ``ids``, ``documents``, ``metadatas`` and ``distances``, each holding a
        single inner list ordered from most to least similar. ``distances``
        holds ``1 - cosine_similarity``.
        """
        if not self.embeddings:
            return {'ids': [[]], 'documents': [[]], 'metadatas': [[]], 'distances': [[]]}

        q = np.asarray(query_embeddings[0], dtype=np.float32)

        E = np.vstack(self.embeddings)  # one row per stored embedding

        # cosine similarity without sklearn; the epsilon avoids division by zero
        q_norm = np.linalg.norm(q) + 1e-12
        E_norm = np.linalg.norm(E, axis=1) + 1e-12
        sims = (E @ q) / (E_norm * q_norm)

        # argsort is ascending, so reverse it to get the best matches first
        top_idx = np.argsort(sims)[::-1][:n_results]

        return {
            'ids': [[self.ids[i] for i in top_idx]],
            'documents': [[self.documents[i] for i in top_idx]],
            'metadatas': [[self.metadatas[i] for i in top_idx]],
            'distances': [[float(1 - sims[i]) for i in top_idx]]  # distance-like
        }

    def count(self):
        """Return the number of stored documents."""
        return len(self.documents)


# ---------- Inverted Index (required by homework) ----------
class InvertedIndexStore:
    """Inverted index mapping each term to the set of document IDs containing it."""

    def __init__(self):
        # defaultdict(set) lets add_occurrence insert without an existence check.
        self.term_to_docids = defaultdict(set)
        print(" InvertedIndexStore initialized")

    def add_occurrence(self, term: str, doc_id: str):
        """Record that ``term`` occurs in ``doc_id`` (duplicates are ignored)."""
        postings = self.term_to_docids[term]
        postings.add(doc_id)

    def get_docids(self, term: str):
        """Return the sorted document IDs for ``term``, or ``[]`` if unknown."""
        # Membership test first, so looking up an unknown term never creates an entry.
        if term not in self.term_to_docids:
            return []
        return sorted(self.term_to_docids[term])

    def count_terms(self) -> int:
        """Return the number of distinct terms in the index."""
        return len(self.term_to_docids.keys())

    def to_required_format(self):
        """Return ``[{"term": ..., "DocIDs": [...]}, ...]`` sorted by term."""
        records = []
        for term in sorted(self.term_to_docids):
            doc_ids = sorted(self.term_to_docids[term])
            records.append({"term": term, "DocIDs": doc_ids})
        return records


print(" Store classes defined!")
print(" Next: Cell 4 (core logic: preprocess + build index + embeddings)")


In [None]:
# CELL 4: Core setup (custom stopwords + stemming + embedding model + FAISS)

# --- Custom stopwords (you define them) ---
# We remove these words because they are very frequent function words (articles, prepositions, pronouns).
# They usually do not add topic meaning, but they increase index size and add noise to retrieval.
CUSTOM_STOPWORDS = {
    "the","a","an","and","or","but",
    "to","of","in","on","at","for","from","by","with","as",
    "is","are","was","were","be","been","being",
    "this","that","these","those",
    "it","its","they","them","their","we","our","you","your",
    "i","me","my","he","him","his","she","her",
    "not","no","do","does","did","doing"
}

stemmer = PorterStemmer()

def preprocess_text(text: str):
    """Turn raw text into the list of index terms.

    The text is lowercased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in ``CUSTOM_STOPWORDS``, are
    dropped; the remaining tokens are stemmed with the module-level stemmer.
    Order and duplicates are preserved.
    """
    lowered = text.lower()
    return [
        stemmer.stem(token)
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in CUSTOM_STOPWORDS
    ]

# --- Embedding model (for semantic retrieval) ---
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- FAISS index (stores embeddings for doc-level retrieval) ---
faiss_index = None
vector_dim = None

# Parallel stores (FAISS row -> doc data)
vector_doc_ids = []   # doc_id
vector_texts = []     # full doc text

print(" Core setup ready (custom stopwords + stemming + embeddings + FAISS)")

In [None]:
# --- NEW CELL: GEMINI SETUP ---
!pip install -q -U google-generativeai
import google.generativeai as genai

# REPLACE WITH YOUR KEY
genai.configure(api_key="AIzaSyBN7qfLTK2hcgctWkIUrnZc4efJMZB0n9I")

def ask_gemini(context, user_question):
    """Ask the Gemini model to answer ``user_question`` using only ``context``.

    Returns the model's text reply. When ``context`` is falsy a fixed
    "no info" message is returned without calling the model. Any exception
    raised while calling the model or reading the reply is returned as an
    ``"Error: ..."`` string rather than propagated.
    """
    # Guard clause: without retrieved context there is nothing to ground on.
    if not context:
        return "No relevant info found."

    prompt = f"""
    Answer based ONLY on this context:
    {context}

    Question: {user_question}
    """
    try:
        gemini = genai.GenerativeModel('gemini-pro')
        reply = gemini.generate_content(prompt)
        # Reading .text stays inside the try: it is part of what may fail.
        return reply.text
    except Exception as err:
        return f"Error: {err}"

print("‚úÖ Gemini AI is ready!")

In [None]:
# CELL 5: Wikipedia source links (seed documents for the corpus)

# Each entry must be an article URL containing "/wiki/"; the loader derives
# the page title from the part after it.
wiki_links = [
    "https://en.wikipedia.org/wiki/Plant_disease",
    "https://en.wikipedia.org/wiki/Plant_pathology",
    "https://en.wikipedia.org/wiki/Fungus",
    "https://en.wikipedia.org/wiki/Bacterial_wilt",
    "https://en.wikipedia.org/wiki/Powdery_mildew",
]

# Echo the corpus sources as a 1-based numbered list.
print("Wikipedia links used:")
for i, link in enumerate(wiki_links, start=1):
    print(str(i) + ". " + link)


In [None]:
# CELL 6: Load documents from Wikipedia (API fetch + normalization + metadata)

import requests
import re

WIKI_API = "https://en.wikipedia.org/w/api.php"

# Wikipedia blocks requests without a proper User-Agent sometimes
HEADERS = {
    "User-Agent": "HW02-Cloud-RAG/1.0 (student project; contact: student@example.com)"
}

def title_from_wiki_url(url: str) -> str:
    """Extract the human-readable page title from a Wikipedia article URL.

    The title is the part after ``/wiki/`` with any ``#anchor`` and
    ``?query`` removed, percent-escapes decoded, and underscores turned into
    spaces.

    Args:
        url: Article URL containing ``/wiki/``.

    Returns:
        The page title, e.g. ``"Plant disease"``.

    Raises:
        ValueError: If ``url`` does not contain ``/wiki/``.
    """
    from urllib.parse import unquote  # local import keeps this cell self-contained

    if "/wiki/" not in url:
        raise ValueError(f"Unsupported Wikipedia URL: {url}")
    title = url.split("/wiki/", 1)[1]
    title = title.split("#", 1)[0]      # remove anchors
    title = title.split("?", 1)[0]      # remove query string
    # Decode after stripping, so an encoded "%23"/"%3F" inside a title survives.
    # Without decoding, the title would be percent-encoded a second time when
    # sent as a request parameter and the lookup would miss the page.
    title = unquote(title)
    title = title.replace("_", " ")
    return title

def fetch_page_extract_by_title(title: str):
    """Fetch the plain-text extract and basic info for one Wikipedia page.

    Args:
        title: Page title to look up (redirects are followed).

    Returns:
        A dict with keys ``pageid``, ``title``, ``url`` and ``text``. When the
        page is missing or invalid, or the response carries no usable page
        data, ``pageid`` is ``None`` and ``url``/``text`` are empty strings so
        the caller can skip the page.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts|info",
        "titles": title,
        "inprop": "url",
        "explaintext": True,
        "redirects": 1,   # follow redirects
        "origin": "*"     # helps in some environments
    }
    r = requests.get(WIKI_API, params=params, headers=HEADERS, timeout=30)
    r.raise_for_status()

    # Record returned whenever no usable page data is available.
    empty_record = {"pageid": None, "title": title, "url": "", "text": ""}

    try:
        payload = r.json()
    except ValueError:
        # Body is not JSON (e.g. an HTML block page): treat as no data.
        return empty_record

    # An API-level error payload has no "query"/"pages" section; return the
    # empty record instead of raising KeyError.
    pages = (payload.get("query") or {}).get("pages") or {}
    if not pages:
        return empty_record

    page = next(iter(pages.values()))

    # Handle missing or invalid page
    if "missing" in page or "invalid" in page:
        return empty_record

    return {
        "pageid": page.get("pageid"),
        "title": page.get("title", title),
        "url": page.get("fullurl", ""),
        "text": page.get("extract", "")
    }

def slugify(s: str) -> str:
    """Return a lowercase, hyphen-separated slug of ``s``.

    Runs of ASCII letters and digits are kept; everything between them
    collapses to a single hyphen, with no leading or trailing hyphen.
    """
    lowered = s.strip().lower()
    # Joining the alphanumeric runs is equivalent to replacing each separator
    # run with "-" and then trimming hyphens at both ends.
    alnum_runs = re.findall(r"[a-z0-9]+", lowered)
    return "-".join(alnum_runs)

def load_docs_from_wiki_links(wiki_links):
    """Download every linked Wikipedia page and build the corpus dictionaries.

    Args:
        wiki_links: Iterable of Wikipedia article URLs.

    Returns:
        A ``(docs, docs_meta)`` tuple. ``docs`` maps ``doc_id`` to the page
        text; ``docs_meta`` maps ``doc_id`` to a dict with ``title``, ``url``,
        ``source`` and ``pageid``. Pages whose text is empty are reported and
        skipped.
    """
    docs, docs_meta = {}, {}

    for url in wiki_links:
        title = title_from_wiki_url(url)
        page = fetch_page_extract_by_title(title)

        body = (page.get("text") or "").strip()
        if body:
            # The ID is built from the title the API returned, not the URL title.
            doc_id = "wiki_" + slugify(page["title"])
            docs[doc_id] = body
            docs_meta[doc_id] = dict(
                title=page["title"],
                url=page.get("url") or url,
                source="wikipedia",
                pageid=page.get("pageid"),
            )
            print(f"Loaded: {page['title']} -> {doc_id} | chars={len(body)}")
        else:
            print(f"Empty/blocked page: {title} | {url}")

    return docs, docs_meta

docs, docs_meta = load_docs_from_wiki_links(wiki_links)
print("Docs loaded:", len(docs))


In [None]:
# CELL 7: Build the required index (term -> DocIDs) + build FAISS embeddings store (doc-level)
# Reads the notebook globals `docs`, `embed_model`, `preprocess_text` and
# `InvertedIndexStore`; (re)assigns `inv_index`, `doc_ids`, `texts`, `emb`,
# `vector_dim`, `faiss_index`, `vector_doc_ids` and `vector_texts`, which the
# retrieval cells read.

# 1) Build inverted index (term -> DocIDs)
inv_index = InvertedIndexStore()

for doc_id, text in docs.items():
    terms = preprocess_text(text)   # uses custom stopwords + stemming
    for t in set(terms):            # presence only (not frequency)
        inv_index.add_occurrence(t, doc_id)

print(f" Inverted index built. Unique terms: {inv_index.count_terms()}")

# 2) Build embeddings + FAISS (one vector per doc)
# `doc_ids` and `texts` are built in the same order, so FAISS row i belongs to doc_ids[i].
doc_ids = list(docs.keys())
texts = [docs[d] for d in doc_ids]

# NOTE(review): each whole article is embedded as a single text; the model
# presumably truncates long inputs - confirm against the model's limits.
emb = embed_model.encode(texts, convert_to_numpy=True, normalize_embeddings=True).astype("float32")

vector_dim = emb.shape[1]
faiss_index = faiss.IndexFlatIP(vector_dim)  # cosine similarity via normalized embeddings
faiss_index.add(emb)

# parallel arrays for retrieval results
# (these are the same list objects as doc_ids/texts, not copies)
vector_doc_ids = doc_ids
vector_texts = texts

print(f" FAISS built. Vectors: {faiss_index.ntotal} | dim={vector_dim}")


In [None]:
# CELL 8: Firebase Initialization (Hybrid Safe Mode)

!pip -q install firebase-admin

import firebase_admin
from firebase_admin import credentials, firestore
import base64

# --- 1. Public Configuration (Safe to share) ---
# We keep the standard info visible so the code is easy to understand.
config = {
  "type": "service_account",
  "project_id": "hw02-cloud-inverted-index",
  "private_key_id": "437db7abaab45e69cf2bf0c22aa8c2e23cbbc71e",
  "client_email": "firebase-adminsdk-fbsvc@hw02-cloud-inverted-index.iam.gserviceaccount.com",
  "client_id": "105185385505390955098",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/firebase-adminsdk-fbsvc%40hw02-cloud-inverted-index.iam.gserviceaccount.com",
  "universe_domain": "googleapis.com"
}

# --- 2. Private Key (Hidden) ---
# Paste the string you generated in Step 1 here.
scrambled_key = "LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2QUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktZd2dnU2lBZ0VBQW9JQkFRQzVTWERrU0NNYmJ2bTMKOTNWbzFvOVpRTUwwRUdwbDNhaUdaekl6Y29ZYUk2S2FmNjk3NkxuRkxjdyt3M2RmZ09JVDZPTWdtV3FuU2FGeApYR0FsQnZ4Z2t4ekFoWUhveEk1Um9abjl5TnYzYitoQXJXam5GN2ZXak13ZXluUkVCdmRBNExzZ0VxUU1XWHVRCkQxUlMrMXo0WG02ZTFjZUtPOVB4VkpCMXo3dEdTQk1KTjBWOGJHMmFKMHR4bzF3RzNacm1yYk1kZ1hJVHdrUGYKa2lCSnpwME12c2ovZndvZ3l5WmZBR3JVVTlScS8vU2lBQ1pwMnhFWXNLL1BjOERFU0ZoMUtPK3k1ZDlxNGM1SQp6S3FNRGRJQkc2V1VBSGZnbHhvRFRlbzRoNENnZ0wvcXUrS3hZdWxmeDEydEpPa1hKZzUzYlJGY2lKOGtROW5BCk9rQXNtZTJkQWdNQkFBRUNnZ0VBRFByUEZMN1U3c1FNYkUzQ2hOQ2JCQ2FjUVpxd3lXZ0l1VG1iYzYwdkpiK2YKVVhGbWFxaTM4czh0Z3F3UXZiajZuV2h3R01XR2lpZUhUcmlvNTQ4Z3VPYzFXV3RBMlh5RGQ4WjVVaVR5KzlkMApEcXZYTUhFaDZMNitRZDN1M1NFYnl3aXpNeUQ3S3Y1TndKN0NTbm5mWG1ySEZ3dGt5aE04MnFnUTRwL2x2NXVJClpSSWZRWnl3cTBTUkJRai9vL0lKdFVVR1A0TFFCUmkzWDd0ZTFXeFhPeHF0TjNuUHhNQ2NRR3g4UmxVeVVJemoKRmd3SllaRlZZSHhMbUcwaXdnQkdiSmJIQ0ZaSUNCQXZpNVZoWTVERXRYcm4vdE44MG9nQWFOS0ViT2lmcG5MWApvc09BRW56Y1NPRWNkbmEvcFgzNXdvUHVyZDFNcytlV3JpNUZNQjdqb1FLQmdRRGtuUzFNS0VITWVBU25OMkxCCmJaZTc1K0JzdUl0UHkzVk9USGh1WklXUXFHSmFONkRLSmhUOW9pazUyb2REMHFvQVJjMVh6VXM2VWdzSDZ4OHUKRCtGeTlXUUFqME9qUkg3VUF1VTAzRkZCMnNXOVFDbGNhMUFONzl5T0dvcGNZUlRFb0pJalBRSndmbEE3bkM2VQprZ1RsK3djdVNKaFpkRk9hY3prTFEyNGlXUUtCZ1FEUGU1Rkp6cDhMRlFTdWdvU1lVTTBjaWVLb2oyUjBzK0Q2CnJmM1dwMkZ2ZEhzeDc4cXFBWDVKUHB5YVgrMXRpUXZCclVTOUExaUc0Vkc3Q0pjV0E4M2RKSG9SWHNkb1BPYnUKUGRLcGpDYnd0dVBuckZ5N0dnR1NhaWZhUi9sdUlKMDJ6eGNoL0VWVVFwUlZPUms3QmhJV3E3TmlaR2M4TWtyRgpYUjlhWEZCVTVRS0JnRjU0ZlNGOWVVTlBUVXowWEVEbVVzOTVrSW9jOEtTMnhQRG9OTlFaZ2dBM05QMW5BM0RGCnIrTG53ZldBVW1rNmdybStIbzdyN094YXZ1ZzB4eHUzd0VoTEUxb1AyYmw4TXBUVjVYV2tuWWVES2pkOGJoc2MKMVdZTStxMVdWbHE2VzJTdG5mWWwzZjR5bEdFdHR5bjU5VUE4TGNsNGdreGsvNjlSY2Y4dmpERnhBb0dBS2RwZgpRR2d4cE9ha2Z4OU01L3pFbzFFZEs2dGhORGxrMUt4c1cvUi9yeC9zQ2ZLNUN2b3FJMVJCK3RJRzd1V0tQWk5hCkhsYWljUExhcmNQWjFsTUdIK25QeGRrOG1FWlF2eFl4ZklvTkFObWp0NFFKWUtTcVZJS2RiMmE5WmYybU9Qd2wKU25HOCtuWkR2YjA2M2JFbnpQTHR5SmRBUytCSlB
PNi8rRlpPemhFQ2dZQTZKU051Tk81UVpqSGx0cUtmeFZNWgo1UHFULzVoS2c5K1Y0elhLTzhvcjhxRkFOYUFQdTBtVEwwN2dSa3Fvem1TM25aeUJ5SzAvczBKK2J4SXhKcWJzCmNUSm1OeDkxejdwSFl0NE1TWnhvQU94dm1UaTlGWlMrRlVnM0tJUEpKVGJTYlBiZHBmQk5GZGhNOXpOZjRwc2UKQ250QVhOQlNDZW5yUXNIKzNMNXRiUT09Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K"

try:
    # We unlock the key and add it to the config
    config["private_key"] = base64.b64decode(scrambled_key).decode('utf-8')

    # Initialize Firebase
    cred = credentials.Certificate(config)

    if not firebase_admin._apps:
        firebase_admin.initialize_app(cred)

    db = firestore.client()
    print("Firestore connected successfully to:", db.project)

except Exception as e:
    print(f"Error: {e}")

In [None]:
# CELL 9: Upload inverted index to Firestore (cloud storage of term -> DocIDs)

# Fix 1: Removed unused 'ArrayUnion' import
# Fix 2: Added 'db' as an argument to link explicitly to Cell 8

def upload_inverted_index(inv_index, db_client, collection_name="inverted_index", batch_size=400):
    """Upload the inverted index to Firestore, one document per term.

    Args:
        inv_index: Object exposing ``to_required_format()``, which returns a
            list of ``{"term": str, "DocIDs": list}`` records.
        db_client: Firestore client (needs ``collection()`` and ``batch()``).
        collection_name: Target collection.
        batch_size: Number of writes committed per batch.

    Each document stores the original ``term``, its ``doc_ids`` and the
    document frequency ``df``. The document ID is the term with ``/`` replaced
    by ``_`` and truncated to at most 1500 UTF-8 bytes.
    """
    col = db_client.collection(collection_name)

    records = inv_index.to_required_format()

    batch = db_client.batch()
    ops = 0

    for r in records:
        term = r["term"]
        doc_ids = r["DocIDs"]

        # Firestore IDs cannot contain '/', so replace it.
        safe_term = term.replace("/", "_")

        # The ID limit is 1500 *bytes*, not characters: truncate the UTF-8
        # encoding and drop any multi-byte character cut in half at the end.
        doc_id = safe_term.encode("utf-8")[:1500].decode("utf-8", errors="ignore")

        ref = col.document(doc_id)
        batch.set(ref, {
            "term": term,         # Store original term inside the document
            "doc_ids": doc_ids,
            "df": len(doc_ids),
        })

        ops += 1
        if ops >= batch_size:
            # Flush a full batch and start a new one.
            batch.commit()
            batch = db_client.batch()
            ops = 0

    # Commit whatever is left in the final, partially filled batch.
    if ops > 0:
        batch.commit()

    print(f"Uploaded {len(records)} terms to Firestore collection '{collection_name}'")

# Execute the upload passing the 'db' from Cell 8
upload_inverted_index(inv_index, db)

In [None]:
# CELL 10: Upload Wikipedia document metadata to Firestore (documents collection)

def upload_wiki_meta(docs_meta, collection_name="documents", batch_size=400, db_client=None):
    """
    Uploads Wikipedia document metadata to Firestore.

    Each document is stored as:
      documents/{doc_id}

    Stored fields:
      - doc_id  : your internal document ID (e.g., wiki_plant-disease)
      - title   : Wikipedia page title
      - url     : Wikipedia page URL
      - source  : "wikipedia"
      - pageid  : Wikipedia page id (if available)

    This does NOT upload the full article text; it only uploads metadata.

    Args:
        docs_meta: Mapping of doc_id -> metadata dict.
        collection_name: Target Firestore collection.
        batch_size: Number of writes committed per batch.
        db_client: Firestore client to use. Defaults to the notebook-level
            ``db`` so existing calls keep working; passing it explicitly
            mirrors ``upload_inverted_index``.
    """
    if db_client is None:
        # Backward-compatible fallback to the global client.
        db_client = db

    col = db_client.collection(collection_name)

    batch = db_client.batch()
    ops = 0

    for doc_id, meta in docs_meta.items():
        ref = col.document(doc_id)
        # merge=True updates these fields without replacing the whole document.
        batch.set(ref, {
            "doc_id": doc_id,
            "title": meta.get("title", ""),
            "url": meta.get("url", ""),
            "source": meta.get("source", "wikipedia"),
            "pageid": meta.get("pageid", None),
        }, merge=True)

        ops += 1
        if ops >= batch_size:
            # Flush a full batch and start a new one.
            batch.commit()
            batch = db_client.batch()
            ops = 0

    # Commit whatever is left in the final, partially filled batch.
    if ops > 0:
        batch.commit()

    print(f"Uploaded {len(docs_meta)} wiki docs to '{collection_name}'")

# Upload metadata for the loaded Wikipedia docs
upload_wiki_meta(docs_meta)


In [None]:
# CELL 11: Embedding-based document retrieval using FAISS (semantic search)

def retrieve_top_docs(query: str, top_k: int = 5):
    """Return a text report of the ``top_k`` documents closest to ``query``.

    The query is embedded with ``embed_model`` (normalized, like the indexed
    vectors) and searched in ``faiss_index``. Each hit contributes its rank,
    document ID, title, similarity score and a 350-character snippet with
    whitespace collapsed. If the index is missing or empty, a short message
    string is returned instead.

    Note: This is retrieval only (no generation / no LLM).
    """
    index_unavailable = faiss_index is None or faiss_index.ntotal == 0
    if index_unavailable:
        return "FAISS index is empty. Build vectors first."

    # Embed and normalize the query
    query_vec = embed_model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype("float32")

    scores, rows = faiss_index.search(query_vec, top_k)

    report = [f"Query: {query}", "=" * 60]

    # Pair each returned row with its score; rank counts positions, so a
    # skipped row still consumes a rank number.
    for rank, (row, sim) in enumerate(zip(rows[0], scores[0]), start=1):
        if row == -1:
            continue

        doc_id = vector_doc_ids[row]
        title = docs_meta.get(doc_id, {}).get("title", "")
        snippet = re.sub(r"\s+", " ", vector_texts[row])[:350]
        score = float(sim)

        report.extend([
            f"{rank}) {doc_id} | {title} | similarity: {score:.4f}",
            f"Snippet: {snippet}...",
            "-" * 60,
        ])

    return "\n".join(report)

print("Retrieval function ready")

In [None]:
# CELL 12: RAG-style output (retrieval + "enriched" answer without OpenAI)
# We will: retrieve top docs, then produce a simple enriched response by extracting key sentences.

def split_sentences(text: str):
    """Split ``text`` into sentences, keeping only those longer than 30 characters.

    Whitespace runs are collapsed to single spaces first; a sentence boundary
    is whitespace that follows ``.``, ``!`` or ``?``.
    """
    collapsed = re.sub(r"\s+", " ", text).strip()
    sentences = []
    for candidate in re.split(r'(?<=[.!?])\s+', collapsed):
        # Very short fragments (headings, abbreviations) are dropped.
        if len(candidate) > 30:
            sentences.append(candidate)
    return sentences

def rag_answer_without_llm(query: str, top_k: int = 3, max_sentences_per_doc: int = 2):
    """Build a retrieval report plus an extractive "answer" for ``query``.

    The ``top_k`` nearest documents are listed with their similarity. For each
    of them, the sentences sharing the most stemmed terms with the query (at
    most ``max_sentences_per_doc``, at least one shared term) are quoted. If
    the index is missing or empty, a short message string is returned instead.
    """
    if faiss_index is None or faiss_index.ntotal == 0:
        return "FAISS index is empty. Build vectors first."

    query_vec = embed_model.encode(
        [query], convert_to_numpy=True, normalize_embeddings=True
    ).astype("float32")
    sims, rows = faiss_index.search(query_vec, top_k)

    out = [f"Query: {query}", "=" * 60, "Top retrieved documents:"]

    # Retrieval section
    hits = []
    for rank, (row, sim) in enumerate(zip(rows[0], sims[0]), start=1):
        if row == -1:
            continue
        doc_id = vector_doc_ids[row]
        title = docs_meta.get(doc_id, {}).get("title", "")
        similarity = float(sim)
        hits.append((doc_id, title, similarity))
        out.append(f"{rank}) {doc_id} | {title} | similarity: {similarity:.4f}")
    out.append("=" * 60)

    # Enriched response (extractive, no LLM)
    out.append("Enriched response (extractive, no LLM):")
    query_stems = set(preprocess_text(query))

    for doc_id, title, _similarity in hits:
        # Score every sentence by how many query stems it shares.
        candidates = []
        for sentence in split_sentences(docs[doc_id]):
            shared = len(query_stems & set(preprocess_text(sentence)))
            if shared > 0:
                candidates.append((shared, sentence))

        # Stable sort: sentences with equal overlap keep document order.
        ranked = sorted(candidates, key=lambda pair: pair[0], reverse=True)
        top_sentences = [sentence for _, sentence in ranked[:max_sentences_per_doc]]

        out.append(f"- Source: {doc_id} | {title}")
        if top_sentences:
            out.extend(f"  ‚Ä¢ {sentence}" for sentence in top_sentences)
        else:
            out.append("  ‚Ä¢ (No strong matching sentences found)")
        out.append("-" * 60)

    return "\n".join(out)

print(" RAG-style (no OpenAI) function ready")


In [None]:
# CELL 13: Quick demo (edit the query text)

print(retrieve_top_docs("how to detect plant diseases using sensors and ai", top_k=3))
print()
print(rag_answer_without_llm("how to detect plant diseases using sensors and ai", top_k=3))


In [None]:
import ipywidgets as widgets
from IPython.display import display

# --- 1. SETUP THE WIDGETS (Standalone) ---
ai_input = widgets.Text(
    placeholder="Ask Gemini about your documents...",
    description="Question:",
    layout=widgets.Layout(width='60%')
)

ai_btn = widgets.Button(
    description="Ask AI",
    button_style='success', # Green
    icon='star'
)

ai_output = widgets.Output()

# --- 2. DEFINE THE LOGIC ---
def on_ai_click(b):
    """Button handler: retrieve context for the typed question and ask Gemini.

    Output is written into ``ai_output``. The question is embedded, the three
    nearest documents are fetched from ``faiss_index`` and their texts are
    passed to ``ask_gemini`` as context.

    Args:
        b: The clicked button (unused).
    """
    ai_output.clear_output()
    with ai_output:
        q = ai_input.value
        if not q:
            print("Please enter a question.")
            return

        print(f"ü§ñ Gemini is thinking about: '{q}'...")

        # Check database (a built-but-empty index is as unusable as a missing one)
        if faiss_index is None or faiss_index.ntotal == 0:
             print("‚ùå Error: Database is empty. Please run the PDF loading cell first.")
             return

        # A. RETRIEVE (Your RAG)
        # Normalize the query like the indexed vectors, matching the other
        # retrieval functions in this notebook.
        q_vec = embed_model.encode(
            [q], convert_to_numpy=True, normalize_embeddings=True
        ).astype('float32')
        D, I = faiss_index.search(q_vec, k=3)

        # FAISS pads with -1 when fewer than k vectors exist; without the
        # lower bound, -1 would silently select the last document.
        found_texts = [vector_texts[i] for i in I[0] if 0 <= i < len(vector_texts)]
        context = "\n".join(found_texts)

        # B. GENERATE (Gemini)
        if context:
            answer = ask_gemini(context, q)
            print("-" * 60)
            print(answer)
            print("-" * 60)
        else:
            print("No relevant info found in the documents.")

ai_btn.on_click(on_ai_click)

# --- 3. DISPLAY IT ALONE ---
# We create a simple box for this tool
ai_tool = widgets.VBox([
    widgets.HTML("<h2>üåø Independent AI Assistant</h2>"),
    widgets.HBox([ai_input, ai_btn]),
    ai_output
], layout=widgets.Layout(padding='20px', border='2px solid #4CAF50', margin='20px 0'))

# Show it now!
display(ai_tool)

In [None]:
#FROM NOW ON. ASAAD'S PART

In [None]:
# Install the packages this part of the notebook needs (quiet mode).
!pip -q install firebase-admin ipywidgets matplotlib

import firebase_admin
from firebase_admin import credentials, firestore
from google.colab import userdata  # Colab secrets accessor
import json

# Check if Firebase is already running to avoid re-initialization error
if not firebase_admin._apps:
    # Use Colab Secrets instead of a file path:
    # the FIREBASE_KEY secret is parsed as the service-account JSON document.
    key_content = userdata.get('FIREBASE_KEY')
    key_dict = json.loads(key_content)
    cred = credentials.Certificate(key_dict)
    firebase_admin.initialize_app(cred)

# Get the client (works even if initialized in previous cells)
db = firestore.client()
# `db` is the notebook-wide Firestore client used by the cells below.

print("‚úÖ Connected to Firestore in project:", db.project)

In [None]:
import requests
import pandas as pd

BASE_URL = "https://server-cloud-v645.onrender.com"

def fetch_history(feed: str, limit: int = 30) -> pd.DataFrame:
    """Fetch IoT history from the course server.

    Args:
        feed: Feed name sent as the ``feed`` query parameter.
        limit: Maximum number of samples requested (converted to ``int``).

    Returns:
        DataFrame sorted by ``created_at``, with ``created_at`` parsed as
        datetimes and ``value`` as numbers. Rows where either fails to parse
        are dropped. If the server returns no samples, an empty DataFrame with
        the ``created_at`` and ``value`` columns is returned.

    Raises:
        requests.HTTPError: On an HTTP error status.
        ValueError: If the response has no ``data`` field.
    """
    resp = requests.get(f"{BASE_URL}/history", params={"feed": feed, "limit": int(limit)}, timeout=120)
    resp.raise_for_status()
    data = resp.json()
    if "data" not in data:
        raise ValueError(f"Server error: {data}")

    df = pd.DataFrame(data["data"])
    if df.empty:
        # An empty sample list yields a frame with no columns; indexing
        # df["created_at"] below would raise KeyError.
        return pd.DataFrame(columns=["created_at", "value"])

    # errors="coerce" turns unparsable entries into NaT/NaN so they can be dropped.
    df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df = df.dropna(subset=["created_at", "value"]).sort_values("created_at")
    return df


In [None]:

# !pip -q install transformers timm pillow torch --upgrade
import io
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
from datetime import datetime, timezone
from PIL import Image
# hugging face import
from transformers import pipeline
# hugging face plant disease model id

MODEL_ID = "linkanjarad/mobilenet_v2_1.0_224-plant-disease-identification"
# load hugging face model here
# clf will be used later to analyze the image
clf = pipeline("image-classification", model=MODEL_ID)
# css for design only
CSS = """
<style>
:root {
    --accent: #2563eb;
    --accent-soft: #eff6ff;
    --bg: #f8fafc;
    --card-bg: #ffffff;
    --text: #1e293b;
}

.jupyter-widgets, .widget-area {
    font-family: 'Segoe UI', system-ui, sans-serif !important;
}

/* App wrapper */
.app-shell {
    background: var(--bg);
    padding: 10px;
    border-radius: 0 0 12px 12px;
}

/* Card */
.modern-card {
    background: var(--card-bg);
    border-radius: 16px;
    padding: 24px;
    border: 1px solid #e2e8f0;
    box-shadow: 0 4px 6px -1px rgba(15,23,42,0.1);
    margin-bottom: 20px;
}
.modern-card h2 {
    color: var(--accent);
    margin: 0 0 8px 0;
    font-size: 20px;
    font-weight: 700;
}
.modern-card p {
    color: #64748b;
    font-size: 14px;
    margin: 0 0 18px 0;
}

/* Tabs */
.p-TabBar-tab {
    background: transparent !important;
    border: none !important;
    color: #94a3b8 !important;
    font-weight: 600 !important;
    padding: 10px 20px !important;
}
.p-TabBar-tab.p-mod-current {
    color: var(--accent) !important;
    background: var(--accent-soft) !important;
    border-radius: 10px !important;
}

/* Primary button */
.btn-primary button {
    background: var(--accent) !important;
    color: white !important;
    border-radius: 10px !important;
    font-weight: 600 !important;
    border: none !important;
    padding: 8px 20px !important;
    transition: transform 0.1s, box-shadow 0.1s;
}
.btn-primary button:active {
    transform: scale(0.97);
    box-shadow: 0 2px 5px rgba(15,23,42,0.15);
}

/* Warning button (Dashboard) */
.btn-warning button {
    background: #f97316 !important;
    color: white !important;
    border-radius: 10px !important;
    font-weight: 600 !important;
    border: none !important;
    padding: 8px 20px !important;
}

/* FileUpload styled as upload zone */
.widget-upload {
    width: 100% !important;
}
.widget-upload > label {
    width: 100%;
    border: 2px dashed #cbd5e1;
    border-radius: 16px;
    background: #f1f5f9;
    padding: 28px 16px;
    text-align: center;
    cursor: pointer;
    font-weight: 600;
    color: var(--accent);
    font-size: 14px;
    transition: all 0.2s ease;
}
.widget-upload > label:hover {
    border-color: var(--accent);
    background: var(--accent-soft);
}

/* Small hint text */
.hint-text {
    color: #64748b;
    font-size: 11px;
    margin-top: 4px;
}

/* Align labels */
.labeled-field .widget-label {
    min-width: 80px;
}
</style>
"""
# function to build cards for ui layout
display(widgets.HTML(CSS))

def create_card(title, subtitle, children):
    """Wrap ``children`` in a vertical box styled as a card.

    An HTML header (``title`` as ``<h2>``, ``subtitle`` as ``<p>``) is placed
    above the given widgets, and the box gets the ``modern-card`` CSS class.

    Args:
        title: Heading text (inserted into the HTML as-is).
        subtitle: Sub-heading text (inserted into the HTML as-is).
        children: List of widgets shown below the header.
    """
    heading_html = f"<h2>{title}</h2><p>{subtitle}</p>"
    card = widgets.VBox([widgets.HTML(heading_html)] + children)
    card.add_class("modern-card")
    return card

# SCREEN A ‚Äî PLANT DIAGNOSTIC
a_out = widgets.Output()
# input for plant name
a_name = widgets.Text(
    placeholder="Plant species (e.g. Tomato)",
    description="Plant:",
    style={'description_width': '70px'},
    layout=widgets.Layout(width='100%')
)
# widget to upload image

a_uploader = widgets.FileUpload(
    accept="image/*",
    multiple=False,
    description="üì∑ Click to upload plant photo"
)
a_uploader.layout = widgets.Layout(width='100%')
a_hint = widgets.HTML("<div class='hint-text'>JPG/PNG ¬∑ One image at a time</div>")

a_btn = widgets.Button(
    description="Run Analysis & Save",
    layout=widgets.Layout(width='100%', height='42px')
)
a_btn.add_class("btn-primary")
# when user uploads an image preview it
def on_plant_upload(change):
    """Observer for the uploader: preview the uploaded image in ``a_out``.

    Args:
        change: Traitlets change notification (unused; the uploader is read directly).
    """
    with a_out:
        clear_output()
        uploads = a_uploader.value
        if not uploads:
            return
        # Only the first uploaded file is previewed.
        fname, f = next(iter(uploads.items()))
        preview = widgets.Image(
            value=f["content"],
            width=320,
            layout=widgets.Layout(border='3px solid #e5efff', border_radius='12px'),
        )
        display(preview)
        print(f"\nReady to analyze: {fname}")

a_uploader.observe(on_plant_upload, names="value")
# function that runs model when user clicks analyze

def run_plant_analysis(_):
    """Button handler: classify the uploaded image and store the result.

    Requires an uploaded image and a non-blank plant name. The image is
    classified with ``clf``; the top prediction is shown in a green banner
    when its label contains "healthy" and a red one otherwise, then written
    to the ``plant_images`` Firestore collection. A failed save is reported
    but does not raise.
    """
    with a_out:
        clear_output()

        # Validate inputs before doing any work.
        uploads = a_uploader.value
        if not uploads:
            print(" Please upload an image first.")
            return
        plant_name = a_name.value.strip()
        if not plant_name:
            print(" Please enter the plant name.")
            return

        fname, f = next(iter(uploads.items()))
        raw_bytes = f["content"]
        img = Image.open(io.BytesIO(raw_bytes)).convert("RGB")

        print("‚è≥ Analyzing plant health...")
        # The first prediction returned by the Hugging Face pipeline is used.
        top = clf(img)[0]

        # Replace the progress message with the image and the verdict.
        clear_output()
        display(widgets.Image(value=raw_bytes, width=320))

        # Green banner for healthy labels, red for everything else.
        if "healthy" in top["label"].lower():
            bg = "#16a34a"
        else:
            bg = "#dc2626"

        result_html = f"""
            <div style="
                background:{bg};
                color:white;
                padding:18px;
                border-radius:12px;
                margin-top:15px;
                box-shadow:0 4px 10px rgba(0,0,0,0.15);
            ">
                <h3 style="margin:0; font-size:18px;">
                    Prediction: {top['label'].replace('_',' ').title()}
                </h3>
                <p style="margin:6px 0 0 0; font-weight:600;">
                    Confidence: {top['score']*100:.2f}%
                </p>
            </div>
        """
        display(widgets.HTML(result_html))

        # Save to Database
        record = {
            "plant": plant_name,
            "file": fname,
            "prediction": top["label"],
            "score": float(top["score"]),
            "time": datetime.now(timezone.utc),
        }
        try:
            db.collection("plant_images").add(record)
            print(" Data saved .")
        except NameError:
            print(" Firestore client 'db' not defined ‚Äî skipping save.")
        except Exception as e:
            print(" Failed to save to Firestore:", e)

a_btn.on_click(run_plant_analysis)

screenA = create_card(
    "Plant Diagnostic",
    "Upload a plant leaf photo, detect possible disease, and store the result.",
    [
        a_name,
        widgets.HTML("<div style='height:8px;'></div>"),
        a_uploader,
        a_hint,
        widgets.HTML("<div style='height:12px;'></div>"),
        a_btn,
        a_out,
    ]
)

# SCREEN B ‚Äî IOT DATA

b_out = widgets.Output()
# choose sensor

b_feed = widgets.Dropdown(
    options=["soil", "humidity", "temperature"],
    value="soil",
    description="Feed:",
    style={'description_width': '70px'},
    layout=widgets.Layout(width='60%')
)
# number of samples slider

b_limit = widgets.IntSlider(
    value=10, min=1, max=100, step=1,
    description="Samples:",
    style={'description_width': '70px'},
    layout=widgets.Layout(width='80%')
)
# button to get data

b_btn = widgets.Button(
    description="Get Data",
    layout=widgets.Layout(width='50%', height='40px')
)
b_btn.add_class("btn-primary")
# fetch sensor history from your function

def fetch_sensor_data(_):
    """Button callback: fetch history for the selected feed and show it in ``b_out``.

    Reads the feed name from ``b_feed`` and the sample count from ``b_limit``,
    calls ``fetch_history`` with them, prints the row count, displays the
    result and then prints the ``value`` / ``created_at`` of its last row.

    Args:
        _: Argument supplied by ``on_click`` (unused).
    """
    with b_out:
        clear_output()
        try:
            df = fetch_history(b_feed.value, b_limit.value)
            print(f"Rows returned: {len(df)}")
            display(df)
            # An empty result has no last row; say so instead of letting
            # ``.iloc[-1]`` raise an IndexError that would be reported as a
            # fetch failure. ``len()`` is used because a DataFrame has no
            # unambiguous truth value.
            if len(df) == 0:
                print("\nNo samples available for this feed.")
                return
            print("\nLatest value:", df["value"].iloc[-1], "| at:", df["created_at"].iloc[-1])
        except NameError:
            print("ℹ 'fetch_history' is not defined.")
        except Exception as e:
            # Top-level UI boundary: report the problem in the output area
            # rather than raising inside the widget callback.
            print(" Error fetching data:", e)

# Wire the fetch button to its click handler.
b_btn.on_click(fetch_sensor_data)

# Screen B card: widgets listed in the order they are shown.
screenB = create_card(
    "IoT Data",
    "Fetch sensor history from the course server.",
    [
        b_feed,
        b_limit,
        widgets.HTML(value="<div style='height:8px;'></div>"),  # vertical spacer
        b_btn,
        b_out,
    ],
)

# SCREEN C — DASHBOARD (Build + status + RAG)

# Output area the dashboard handler renders into.
dash_out = widgets.Output()

# Sensor feed selector.
dash_feed = widgets.Dropdown(
    description="Feed:",
    options=["soil", "humidity", "temperature"],
    value="soil",
    layout=widgets.Layout(width="60%"),
    style={"description_width": "70px"},
)

# Number of samples to plot (10-200 in steps of 10, default 30).
dash_limit = widgets.IntSlider(
    description="Samples:",
    min=10,
    max=200,
    step=10,
    value=30,
    layout=widgets.Layout(width="80%"),
    style={"description_width": "70px"},
)

# Button that triggers the dashboard build.
dash_btn = widgets.Button(
    description="Build Dashboard",
    layout=widgets.Layout(height="40px", width="60%"),
)
dash_btn.add_class("btn-warning")
# Map the latest reading of a feed to a status label

def dashboard_status(feed, latest):
    """Classify the latest reading of a feed as a status label.

    Thresholds:
        soil:        below 30 -> "Critical", below 45 -> "Warning",
                     otherwise "Healthy"
        humidity:    below 30 -> "Warning", otherwise "OK"
        temperature: below 10 or above 35 -> "Warning", otherwise "OK"

    Any other feed name yields "OK" without looking at ``latest``.

    Args:
        feed: Feed name ("soil", "humidity", "temperature", or anything else).
        latest: Most recent reading for that feed.

    Returns:
        One of "Critical", "Warning", "Healthy" or "OK".
    """
    if feed == "soil":
        return "Critical" if latest < 30 else "Warning" if latest < 45 else "Healthy"

    if feed == "humidity":
        too_low = latest < 30
        return "Warning" if too_low else "OK"

    if feed == "temperature":
        out_of_range = latest < 10 or latest > 35
        return "Warning" if out_of_range else "OK"

    # Unknown feeds are never flagged.
    return "OK"
# build plot dashboard

def build_dashboard(_):
    """Button callback: render status, optional insight and a plot in ``dash_out``.

    Steps:
      1. Fetch history for the feed selected in ``dash_feed`` with the sample
         count from ``dash_limit`` via ``fetch_history``.
      2. Classify the last ``value`` with ``dashboard_status`` and print it.
      3. For a "Critical" or "Warning" status, ask ``rag_answer_without_llm``
         for a feed-specific insight and print it (best effort).
      4. Plot ``value`` against ``created_at``.

    Args:
        _: Argument supplied by ``on_click`` (unused).
    """
    with dash_out:
        clear_output()
        feed = dash_feed.value
        try:
            df = fetch_history(feed, dash_limit.value)
        except NameError:
            print(" 'fetch_history' is not defined.")
            return
        except Exception as e:
            print(" Error fetching data:", e)
            return

        # An empty history has no latest row; stop here instead of letting
        # ``.iloc[-1]`` raise an IndexError inside the widget callback.
        # ``len()`` is used because a DataFrame has no unambiguous truth value.
        if len(df) == 0:
            print(" No data returned for this feed.")
            return

        latest = df["value"].iloc[-1]
        status = dashboard_status(feed, latest)
        print(f"Current Status: {status}")

        # Advice feature: only triggered for non-healthy statuses.
        if status in ["Critical", "Warning"]:
            print("\n Generating research-based insight...")
            feed_queries = {
                "soil": "impact of low soil moisture and water stress on plant disease",
                "humidity": "how low humidity affects plant health and fungal growth",
            }
            query = feed_queries.get(
                feed, f"effects of extreme {feed} on plant pathology"
            )

            try:
                insight = rag_answer_without_llm(query, top_k=1)
                print("=" * 60)
                print(" SMART INSIGHT:")
                print(insight)
                print("=" * 60)
            except NameError:
                print(" 'rag_answer_without_llm' not defined, skipping insight.")
            except Exception as e:
                # The insight is optional; a failure must not block the plot.
                print(" Error calling RAG function:", e)

        # Plot values over time.
        plt.figure(figsize=(9, 3))
        plt.plot(df["created_at"], df["value"], marker="o")
        plt.title(f"{feed.title()} Monitoring – Status: {status}")
        plt.xlabel("Time")
        plt.ylabel("Value")
        plt.grid(True, axis="y", linestyle="--", alpha=0.5)
        plt.xticks(rotation=30)
        plt.tight_layout()
        plt.show()

# Wire the dashboard button to its click handler.
dash_btn.on_click(build_dashboard)

# Screen C card: widgets listed in the order they are shown.
screenC = create_card(
    "Dashboard",
    "Build a visual dashboard with status and smart research insight.",
    [
        dash_feed,
        dash_limit,
        widgets.HTML(value="<div style='height:8px;'></div>"),  # vertical spacer
        dash_btn,
        dash_out,
    ],
)


# SCREEN D — SEARCH (FIRESTORE INDEX)

# Output area the search handler renders into.
c_out = widgets.Output()

# Name of the Firestore collection to search.
index_box = widgets.Text(
    description="Index:",
    value="inverted_index",
    layout=widgets.Layout(width="70%"),
    style={"description_width": "80px"},
)

# Term to look up.
query_box = widgets.Text(
    description="Search:",
    value="about",
    layout=widgets.Layout(width="70%"),
    style={"description_width": "80px"},
)

# Button that triggers the search.
search_btn = widgets.Button(
    description="Search database",
    layout=widgets.Layout(height="40px", width="50%", margin="8px 0 0 80px"),
)
search_btn.add_class("btn-primary")
# Look up a term in the Firestore inverted index
def search_inverted_index(index_name: str, term: str):
    """Look up ``term`` in the Firestore collection ``index_name``.

    Two lookups are tried, in order:
      1. the document whose ID equals the term;
      2. the first document whose ``term`` field equals the term.

    Args:
        index_name: Firestore collection to search (whitespace is stripped).
        term: Search term (stripped and lower-cased before the lookup).

    Returns:
        A ``(result, error)`` pair. On a hit, ``result`` is a dict with the
        keys ``"term"``, ``"df"`` and ``"doc_ids"`` and ``error`` is ``None``.
        Otherwise ``result`` is ``None`` and ``error`` is a message for the user.

    Raises:
        NameError: If the global Firestore client ``db`` is not defined.
    """
    index_name = index_name.strip()
    term = term.strip().lower()
    if not index_name or not term:
        return None, "Enter both Index and Search."

    def _as_hit(snapshot):
        # Shape a matching document snapshot into the (result, error) pair.
        data = snapshot.to_dict() or {}
        return {
            "term": term,
            "df": data.get("df"),
            "doc_ids": data.get("doc_ids", []),
        }, None

    collection = db.collection(index_name)

    # A document ID cannot contain "/": passed to ``document()`` it would be
    # read as a nested path (raising ValueError or addressing an unrelated
    # document). Such terms can only match through the field query below.
    if "/" not in term:
        doc = collection.document(term).get()
        if doc.exists:
            return _as_hit(doc)

    matches = list(collection.where("term", "==", term).limit(1).stream())
    if matches:
        return _as_hit(matches[0])

    return None, f"No results for '{term}' in '{index_name}'."

def on_search_click(_):
    """Button callback: run the index search and render the result in ``c_out``.

    Reads the collection name from ``index_box`` and the term from
    ``query_box``, calls ``search_inverted_index`` and shows either its error
    message or the matching term, its ``df`` value and the list of doc IDs.

    Args:
        _: Argument supplied by ``on_click`` (unused).
    """
    # Function-scope import so the handler does not rely on a file-level one.
    from html import escape

    with c_out:
        clear_output()
        try:
            result, err = search_inverted_index(index_box.value, query_box.value)
        except NameError:
            print("ℹ️ Firestore client 'db' is not defined.")
            return
        except Exception as e:
            # Same policy as the other screen handlers: report backend
            # failures in the output area instead of raising in the callback.
            print(" Search failed:", e)
            return

        if err:
            print(err)
            return

        # The term comes from user input and the IDs from the database, so
        # both are escaped before being placed into HTML.
        doc_ids = result.get("doc_ids") or []
        items = "".join(f"<li>{escape(str(did))}</li>" for did in doc_ids)

        # Show a blank instead of the literal text "None" when df is missing.
        df_value = result.get("df")
        df_text = "" if df_value is None else escape(str(df_value))

        display(widgets.HTML(
            f"<p><b>term:</b> {escape(str(result['term']))}</p>"
            f"<p><b>df:</b> {df_text}</p>"
            "<p><b>doc_ids:</b></p>"
            f"<ul style='padding-left:18px;'>{items}</ul>"
        ))

# Wire the search button to its click handler.
search_btn.on_click(on_search_click)

# Screen D card: widgets listed in the order they are shown.
screenD = create_card(
    "Knowledge Base Search",
    "Search terms in the Firestore inverted index.",
    [index_box, query_box, search_btn, c_out],
)

#
# APP SHELL + TABS
# One tab per screen; titles are set by tab position.
tabs = widgets.Tab(children=[screenA, screenB, screenC, screenD])
tabs.set_title(0, "Diagnosis")
tabs.set_title(1, "IoT Data")
tabs.set_title(2, "Dashboard")
tabs.set_title(3, "Search")

# Banner shown above the tabs. The emoji was stored as mis-decoded bytes
# and rendered as garbage; it is restored to the intended seedling here.
header = widgets.HTML(
    "<div style='background:#1e40af; color:white; padding:18px 20px;"
    "border-radius:12px 12px 0 0; font-size:22px; font-weight:700;'>"
    "🌱 PlantCare  Pro</div>"
)

# Container for the tabs, tagged with the "app-shell" CSS class.
app = widgets.VBox([tabs])
app.add_class("app-shell")

display(header)
display(app)
