<a href="https://colab.research.google.com/github/Asaad972/CollabFirstNoteBook/blob/main/HW02_Cloud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CELL 1: Minimal package installation (only if missing)
import importlib.util, sys, subprocess

def ensure(pkg, import_name=None):
    """Install ``pkg`` with pip, but only when its module cannot be located.

    Args:
        pkg: Distribution name handed to ``pip install``.
        import_name: Module name probed with ``importlib.util.find_spec``;
            falls back to ``pkg`` when omitted or falsy.
    """
    module_name = import_name if import_name else pkg
    already_present = importlib.util.find_spec(module_name) is not None
    if already_present:
        return
    # Use the running interpreter so the package lands in the kernel's environment.
    pip_command = [sys.executable, "-m", "pip", "install", "-q", pkg]
    subprocess.check_call(pip_command)

# Usually preinstalled in Colab, but verify anyway to be safe:
ensure("pandas", "pandas")

# Required for your homework plan:
ensure("nltk", "nltk")
ensure("sentence-transformers", "sentence_transformers")
ensure("faiss-cpu", "faiss")

print("‚úÖ Dependencies ready")


‚úÖ Dependencies ready


In [2]:
# CELL 2: Imports + NLTK resources (run once per runtime)

import re
from collections import defaultdict
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sentence_transformers import SentenceTransformer
import faiss

# NLTK downloads (required for stopwords/tokenizer/lemmatizer)
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt_tab")

print("‚úÖ Imports ready + NLTK resources downloaded")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


‚úÖ Imports ready + NLTK resources downloaded


In [3]:
!pip -q install firebase-admin

In [9]:
# CELL 3: Store Classes (Vector Store + Inverted Index)
# =====================================================
"""
🗄️ CELL 3: STORE CLASSES
- SimpleVectorStore: stores embeddings + documents + metadatas + ids (like Tirgul 7)
- InvertedIndexStore: stores required index schema term -> DocIDs (homework requirement)
"""

import numpy as np
from collections import defaultdict

# ---------- Vector Store (similar to Tirgul 7) ----------
class SimpleVectorStore:
    """Simple in-memory vector store (fallback).

    Keeps four parallel lists (embeddings, documents, metadatas, ids) and
    answers nearest-neighbour queries with brute-force cosine similarity.
    """

    def __init__(self):
        self.documents = []
        self.embeddings = []   # list of float32 numpy arrays, one per document
        self.metadatas = []
        self.ids = []
        print("üì¶ SimpleVectorStore initialized")

    def add(self, embeddings, documents, metadatas, ids):
        """Append a batch of records to the store.

        Args:
            embeddings: Iterable of vectors (anything ``np.asarray`` accepts).
            documents: Iterable of document texts.
            metadatas: Iterable of metadata objects.
            ids: Iterable of document identifiers.

        Raises:
            ValueError: If the four inputs do not all have the same length.
                Nothing is stored in that case, so the parallel lists can
                never drift out of sync.
        """
        # Ensure numpy arrays
        embeddings = [np.asarray(e, dtype=np.float32) for e in embeddings]
        documents = list(documents)
        metadatas = list(metadatas)
        ids = list(ids)

        # Validate before mutating: query() indexes all four lists with the
        # same positions, so a length mismatch would corrupt later results.
        if not (len(embeddings) == len(documents) == len(metadatas) == len(ids)):
            raise ValueError(
                "embeddings, documents, metadatas and ids must have the same length; "
                f"got {len(embeddings)}, {len(documents)}, {len(metadatas)}, {len(ids)}"
            )

        self.embeddings.extend(embeddings)
        self.documents.extend(documents)
        self.metadatas.extend(metadatas)
        self.ids.extend(ids)
        print(f"‚úÖ Added {len(documents)} documents to simple vector store")

    def query(self, query_embeddings, n_results=5):
        """Return the most similar stored records for each query vector.

        Args:
            query_embeddings: One vector or a sequence of vectors. A single
                1-D vector is treated as a batch of one.
            n_results: Maximum number of hits per query. ``None`` returns all
                records; negative values are treated as 0.

        Returns:
            Dict with keys ``ids``, ``documents``, ``metadatas`` and
            ``distances``. Each value is a list holding one inner list per
            query vector, ordered from most to least similar. Distances are
            ``1 - cosine_similarity``.
        """
        queries = np.atleast_2d(np.asarray(query_embeddings, dtype=np.float32))

        if not self.embeddings:
            return {
                'ids': [[] for _ in queries],
                'documents': [[] for _ in queries],
                'metadatas': [[] for _ in queries],
                'distances': [[] for _ in queries],
            }

        limit = None if n_results is None else max(0, int(n_results))

        E = np.vstack(self.embeddings)  # shape: (N, d)
        # Loop-invariant: document norms do not depend on the query.
        E_norm = np.linalg.norm(E, axis=1) + 1e-12

        result = {'ids': [], 'documents': [], 'metadatas': [], 'distances': []}
        for q in queries:
            # cosine similarity without sklearn (epsilon guards zero vectors)
            q_norm = np.linalg.norm(q) + 1e-12
            sims = (E @ q) / (E_norm * q_norm)

            top_idx = np.argsort(sims)[::-1][:limit]

            result['ids'].append([self.ids[i] for i in top_idx])
            result['documents'].append([self.documents[i] for i in top_idx])
            result['metadatas'].append([self.metadatas[i] for i in top_idx])
            result['distances'].append([float(1 - sims[i]) for i in top_idx])  # distance-like

        return result

    def count(self):
        """Return the number of stored documents."""
        return len(self.documents)


# ---------- Inverted Index (required by homework) ----------
class InvertedIndexStore:
    """Inverted index in the homework's required shape: term -> DocIDs."""

    def __init__(self):
        # A set per term de-duplicates repeated (term, doc_id) insertions.
        self.term_to_docids = defaultdict(set)
        print("üì¶ InvertedIndexStore initialized")

    def add_occurrence(self, term: str, doc_id: str):
        """Record that ``term`` occurs in the document ``doc_id``."""
        postings = self.term_to_docids[term]
        postings.add(doc_id)

    def get_docids(self, term: str):
        """Return the sorted doc ids for ``term``; an empty list if unknown."""
        # Membership test avoids creating an entry in the defaultdict.
        if term not in self.term_to_docids:
            return []
        return sorted(self.term_to_docids[term])

    def count_terms(self) -> int:
        """Return how many distinct terms are indexed."""
        return len(self.term_to_docids.keys())

    def to_required_format(self):
        """Export the index as ``[{"term": ..., "DocIDs": [...]}, ...]``.

        Records are ordered by term and each DocIDs list is sorted.
        """
        records = []
        for term in sorted(self.term_to_docids):
            doc_ids = sorted(self.term_to_docids[term])
            records.append({"term": term, "DocIDs": doc_ids})
        return records


print("‚úÖ Store classes defined!")
print("üìã Next: Cell 4 (core logic: preprocess + build index + embeddings)")


‚úÖ Store classes defined!
üìã Next: Cell 4 (core logic: preprocess + build index + embeddings)


In [5]:
# CELL 4: Core setup (custom stopwords + stemming + embedding model + FAISS)

# --- Custom stopwords (you define them) ---
# We remove these words because they are very frequent function words (articles, prepositions, pronouns).
# They usually do not add topic meaning, but they increase index size and add noise to retrieval.
CUSTOM_STOPWORDS = {
    "the","a","an","and","or","but",
    "to","of","in","on","at","for","from","by","with","as",
    "is","are","was","were","be","been","being",
    "this","that","these","those",
    "it","its","they","them","their","we","our","you","your",
    "i","me","my","he","him","his","she","her",
    "not","no","do","does","did","doing"
}

stemmer = PorterStemmer()

def preprocess_text(text: str):
    """Turn raw text into a list of index terms.

    Pipeline: lowercase the text, tokenize it with ``word_tokenize``, keep only
    purely alphabetic tokens that are not in ``CUSTOM_STOPWORDS``, then stem
    each survivor with the module-level ``stemmer``.

    Returns:
        The stems, in the order their tokens occur in the text (duplicates kept).
    """
    lowered_tokens = word_tokenize(text.lower())
    return [
        stemmer.stem(token)
        for token in lowered_tokens
        if token.isalpha() and token not in CUSTOM_STOPWORDS
    ]

# --- Embedding model (for semantic retrieval) ---
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- FAISS index (stores embeddings for doc-level retrieval) ---
faiss_index = None
vector_dim = None

# Parallel stores (FAISS row -> doc data)
vector_doc_ids = []   # doc_id
vector_texts = []     # full doc text

print("‚úÖ Core setup ready (custom stopwords + stemming + embeddings + FAISS)")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


‚úÖ Core setup ready (custom stopwords + stemming + embeddings + FAISS)


In [6]:
# CELL 5: Sample Papers (metadata + file names)

sample_papers = [
    {
        "title": "AI-IoT Based Smart Agriculture Pivot for Plant Diseases Detection and Treatment",
        "authors": "AS Ibrahim et al.",
        "journal": "Scientific Reports (Nature)",
        "year": 2025,
        "doi": "10.1038/s41598-025-98454-6",
        "abstract": "Proposes an AI-IoT smart agriculture pivot architecture for detecting and treating plant diseases, including a hardware pilot and mobile-app support.",
        "file": "s41598-025-98454-6.pdf"
    },
    {
        "title": "Infectious Plant Diseases: Etiology, Current Status, Problems and Prospects in Plant Protection",
        "authors": "PA Nazarov et al.",
        "journal": "Acta Naturae",
        "year": 2020,
        "doi": None,
        "abstract": "Review of infectious plant diseases caused by viruses, bacteria, and fungi; current status and prospects for plant protection.",
        "file": "actanaturae_11026.pdf"
    },
    {
        "title": "Recent Approaches towards Control of Fungal Diseases in Plants: An Updated Review",
        "authors": "NA El-Baky, AAAF Amara",
        "journal": "Journal of Fungi (MDPI)",
        "year": 2021,
        "doi": "10.3390/jof7110900",
        "abstract": "Reviews strategies to control plant fungal diseases including biocontrol and other approaches.",
        "file": "jof-07-00900.pdf"
    },
    {
        "title": "The Potential Risk of Plant-Virus Disease Initiation by Infected Tomatoes",
        "authors": "C Klap et al.",
        "journal": "Plants (MDPI)",
        "year": 2020,
        "doi": "10.3390/plants9050623",
        "abstract": "Study on how infected tomatoes can contribute to plant-virus disease spread and transmission risk.",
        "file": "plants-09-00623.pdf"
    },
    {
        "title": "Current status and future perspectives of the diagnostic of plant bacterial pathogens",
        "authors": "X Wang et al.",
        "journal": "Frontiers in Plant Science",
        "year": 2025,
        "doi": None,
        "abstract": "Review of plant bacterial pathogen diagnostics; shift from culture-based to culture-free detection; limitations in real plant extracts and recent progress.",
        "file": "fpls-2025-bacterial-pathogen-diagnostics.pdf"
    }
]

print(f"üìö sample_papers ready: {len(sample_papers)} papers")
for i, p in enumerate(sample_papers, 1):
    print(f"{i}. {p['title']}  -->  {p['file']}")


üìö sample_papers ready: 5 papers
1. AI-IoT Based Smart Agriculture Pivot for Plant Diseases Detection and Treatment  -->  s41598-025-98454-6.pdf
2. Infectious Plant Diseases: Etiology, Current Status, Problems and Prospects in Plant Protection  -->  actanaturae_11026.pdf
3. Recent Approaches towards Control of Fungal Diseases in Plants: An Updated Review  -->  jof-07-00900.pdf
4. The Potential Risk of Plant-Virus Disease Initiation by Infected Tomatoes  -->  plants-09-00623.pdf
5. Current status and future perspectives of the diagnostic of plant bacterial pathogens  -->  fpls-2025-bacterial-pathogen-diagnostics.pdf


In [7]:
# CELL 6: Load PDFs into docs (doc_id -> full text) using sample_papers

import os
import fitz  # PyMuPDF

PDF_FOLDER = "/content"  # change if your PDFs are in a subfolder

def load_docs_from_sample_papers(sample_papers, folder_path):
    """Extract the full text of each listed PDF and collect its metadata.

    Args:
        sample_papers: Iterable of dicts. Each must have a ``"file"`` key (PDF
            file name); ``"title"``, ``"year"``, ``"authors"`` and ``"journal"``
            are optional.
        folder_path: Directory that contains the PDF files.

    Returns:
        ``(docs, docs_meta)`` where ``docs`` maps doc_id -> extracted text and
        ``docs_meta`` maps doc_id -> metadata dict. The doc_id is the file name
        without its extension. Metadata is recorded for every listed paper,
        while ``docs`` only contains papers whose file exists and yields
        non-empty text.
    """
    docs = {}
    docs_meta = {}

    for p in sample_papers:
        fname = p["file"]
        pdf_path = os.path.join(folder_path, fname)

        doc_id = os.path.splitext(fname)[0]  # filename without .pdf
        docs_meta[doc_id] = {
            "title": p.get("title", ""),
            "year": p.get("year", None),
            "authors": p.get("authors", ""),
            "journal": p.get("journal", "")
        }

        if not os.path.exists(pdf_path):
            print(f"‚ùå Missing file: {pdf_path}")
            continue

        # Context manager closes the document even if text extraction raises.
        with fitz.open(pdf_path) as pdf:
            text = "\n".join(page.get_text("text") for page in pdf).strip()

        if text:
            docs[doc_id] = text
        else:
            print(f"‚ö†Ô∏è Empty text extracted: {fname}")

    return docs, docs_meta

docs, docs_meta = load_docs_from_sample_papers(sample_papers, PDF_FOLDER)
print(f"‚úÖ Loaded docs: {len(docs)} / {len(sample_papers)}")


‚úÖ Loaded docs: 5 / 5


In [10]:
# CELL 7: Build the required index (term -> DocIDs) + build FAISS embeddings store (doc-level)

# 1) Build inverted index (term -> DocIDs)
inv_index = InvertedIndexStore()

# Each document contributes every distinct stem exactly once, so the index
# records presence only (no term frequencies or positions).
for doc_id, text in docs.items():
    terms = preprocess_text(text)   # uses custom stopwords + stemming
    for t in set(terms):            # presence only (not frequency)
        inv_index.add_occurrence(t, doc_id)

print(f"‚úÖ Inverted index built. Unique terms: {inv_index.count_terms()}")

# 2) Build embeddings + FAISS (one vector per doc)
# doc_ids and texts are built in the same order, so row i of the FAISS index
# corresponds to doc_ids[i] / texts[i].
doc_ids = list(docs.keys())
texts = [docs[d] for d in doc_ids]

# NOTE(review): each full PDF text is encoded as a single input. Sentence-embedding
# models typically truncate inputs beyond their max sequence length, so presumably
# only the beginning of each paper influences its vector — confirm, and consider
# chunking the documents if later sections should be searchable.
emb = embed_model.encode(texts, convert_to_numpy=True, normalize_embeddings=True).astype("float32")

vector_dim = emb.shape[1]
faiss_index = faiss.IndexFlatIP(vector_dim)  # cosine similarity via normalized embeddings
faiss_index.add(emb)

# parallel arrays for retrieval results
# (rebinds the module-level placeholders created in the core-setup cell)
vector_doc_ids = doc_ids
vector_texts = texts

print(f"‚úÖ FAISS built. Vectors: {faiss_index.ntotal} | dim={vector_dim}")


üì¶ InvertedIndexStore initialized
‚úÖ Inverted index built. Unique terms: 5120
‚úÖ FAISS built. Vectors: 5 | dim=384


In [11]:
# CELL 8: Export + quick preview of the required index format (term + DocIDs)

records = inv_index.to_required_format()

print(f"‚úÖ Index records created: {len(records)} terms")
print("Preview (first 10):")
for row in records[:10]:
    print(row)


‚úÖ Index records created: 5120 terms
Preview (first 10):
{'term': 'aab', 'DocIDs': ['plants-09-00623']}
{'term': 'aamra', 'DocIDs': ['jof-07-00900']}
{'term': 'abad', 'DocIDs': ['jof-07-00900', 'plants-09-00623']}
{'term': 'abbrevi', 'DocIDs': ['actanaturae_11026']}
{'term': 'abd', 'DocIDs': ['actanaturae_11026', 'jof-07-00900']}
{'term': 'abdallah', 'DocIDs': ['jof-07-00900']}
{'term': 'abdel', 'DocIDs': ['jof-07-00900']}
{'term': 'abdelkhalek', 'DocIDs': ['jof-07-00900']}
{'term': 'abdellatef', 'DocIDs': ['jof-07-00900']}
{'term': 'abdelrahman', 'DocIDs': ['jof-07-00900']}


In [14]:
# CELL 8: Sanity checks + index preview (NO PlantDiseaseIndexRAG)

import pandas as pd

print("‚úÖ Sanity checks:")

# 1) Documents
print("Docs loaded:", len(docs))
assert len(docs) > 0, "No documents loaded!"

# 2) Inverted index
num_terms = inv_index.count_terms()
print("Unique terms in index:", num_terms)
assert num_terms > 0, "Index is empty!"

# 3) FAISS
print("FAISS vectors:", faiss_index.ntotal)
assert faiss_index.ntotal == len(docs), "FAISS vectors != docs count"

# 4) Export index in REQUIRED schema
records = inv_index.to_required_format()
df_index = pd.DataFrame(records)

print("\n‚úÖ Index preview (first 5 rows):")
display(df_index.head(5))

print("\n‚úÖ CELL 8 completed successfully")


‚úÖ Sanity checks:
Docs loaded: 5
Unique terms in index: 5120
FAISS vectors: 5

‚úÖ Index preview (first 5 rows):


Unnamed: 0,term,DocIDs
0,aab,[plants-09-00623]
1,aamra,[jof-07-00900]
2,abad,"[jof-07-00900, plants-09-00623]"
3,abbrevi,[actanaturae_11026]
4,abd,"[actanaturae_11026, jof-07-00900]"



‚úÖ CELL 8 completed successfully


In [15]:
# CELL 9: Embedding-based retrieval (FAISS) for a user query (no OpenAI yet)

def retrieve_top_docs(query: str, top_k: int = 5):
    """Search the FAISS index for ``query`` and format the hits as text.

    Args:
        query: Free-text user query.
        top_k: Number of neighbours requested from FAISS.

    Returns:
        A multi-line report with one entry per hit (rank, doc id, title,
        similarity score and a 350-character snippet), or an explanatory
        message when the index has not been built.
    """
    index_is_empty = faiss_index is None or faiss_index.ntotal == 0
    if index_is_empty:
        return "FAISS index is empty. Build vectors first."

    query_vec = embed_model.encode(
        [query], convert_to_numpy=True, normalize_embeddings=True
    ).astype("float32")
    scores, hits = faiss_index.search(query_vec, top_k)

    report = [f"Query: {query}", "=" * 60]

    for position, (row, raw_score) in enumerate(zip(hits[0], scores[0]), start=1):
        # -1 marks an unfilled result slot; skip it.
        if row == -1:
            continue
        doc_id = vector_doc_ids[row]
        title = docs_meta.get(doc_id, {}).get("title", "")
        snippet = re.sub(r"\s+", " ", vector_texts[row])[:350]
        similarity = float(raw_score)

        report.extend([
            f"{position}) {doc_id} | {title} | similarity: {similarity:.4f}",
            f"Snippet: {snippet}...",
            "-" * 60,
        ])

    return "\n".join(report)

print("‚úÖ Retrieval function ready")


‚úÖ Retrieval function ready


In [16]:
# CELL 10: RAG-style output (retrieval + "enriched" answer without OpenAI)
# We will: retrieve top docs, then produce a simple enriched response by extracting key sentences.

def split_sentences(text: str):
    """Split ``text`` into sentences, dropping fragments of 30 chars or fewer.

    Whitespace runs are collapsed to single spaces first; the text is then cut
    at whitespace that follows ``.``, ``!`` or ``?``.
    """
    collapsed = re.sub(r"\s+", " ", text).strip()
    candidates = re.split(r'(?<=[.!?])\s+', collapsed)
    kept = []
    for sentence in candidates:
        if len(sentence) > 30:
            kept.append(sentence)
    return kept

def rag_answer_without_llm(query: str, top_k: int = 3, max_sentences_per_doc: int = 2):
    """Build an extractive, LLM-free answer from the top retrieved documents.

    Args:
        query: Free-text user query.
        top_k: Number of neighbours requested from FAISS.
        max_sentences_per_doc: Maximum number of sentences quoted per document.

    Returns:
        A multi-line report: the ranked documents with similarity scores,
        followed, for each document, by the sentences sharing the most stems
        with the query. Returns an explanatory message when the index has not
        been built.
    """
    if faiss_index is None or faiss_index.ntotal == 0:
        return "FAISS index is empty. Build vectors first."

    query_vec = embed_model.encode(
        [query], convert_to_numpy=True, normalize_embeddings=True
    ).astype("float32")
    scores, hits = faiss_index.search(query_vec, top_k)

    out = [f"Query: {query}", "=" * 60]

    # Retrieval section
    out.append("Top retrieved documents:")
    retrieved = []
    for position, (row, raw_score) in enumerate(zip(hits[0], scores[0]), start=1):
        # -1 marks an unfilled result slot; skip it.
        if row == -1:
            continue
        doc_id = vector_doc_ids[row]
        title = docs_meta.get(doc_id, {}).get("title", "")
        similarity = float(raw_score)
        retrieved.append((doc_id, title, similarity))
        out.append(f"{position}) {doc_id} | {title} | similarity: {similarity:.4f}")
    out.append("=" * 60)

    # Enriched response (extractive, no LLM)
    out.append("Enriched response (extractive, no LLM):")
    query_stems = set(preprocess_text(query))

    for doc_id, title, _similarity in retrieved:
        # Keep only sentences that share at least one stem with the query.
        candidates = []
        for sentence in split_sentences(docs[doc_id]):
            shared = len(query_stems & set(preprocess_text(sentence)))
            if shared > 0:
                candidates.append((shared, sentence))

        # Stable sort: sentences with equal overlap stay in document order.
        ranked = sorted(candidates, key=lambda pair: pair[0], reverse=True)
        top_sentences = [sentence for _, sentence in ranked[:max_sentences_per_doc]]

        out.append(f"- Source: {doc_id} | {title}")
        if not top_sentences:
            out.append("  ‚Ä¢ (No strong matching sentences found)")
        else:
            out.extend(f"  ‚Ä¢ {picked}" for picked in top_sentences)
        out.append("-" * 60)

    return "\n".join(out)

print("‚úÖ RAG-style (no OpenAI) function ready")


‚úÖ RAG-style (no OpenAI) function ready


In [17]:
# CELL 11: Quick demo (edit the query text)

print(retrieve_top_docs("how to detect plant diseases using sensors and ai", top_k=3))
print()
print(rag_answer_without_llm("how to detect plant diseases using sensors and ai", top_k=3))


Query: how to detect plant diseases using sensors and ai
1) s41598-025-98454-6 | AI-IoT Based Smart Agriculture Pivot for Plant Diseases Detection and Treatment | similarity: 0.5921
Snippet: AI-IoT based smart agriculture pivot for plant diseases detection and treatment Amin S. Ibrahim1, Saeed Mohsen 2,3ÔÄ™, I. M. Selim4, Roobaea Alroobaea 5, Majed Alsafyani5, Abdullah M. Baqasah6 & Mohamed Eassa7,8 There are some key problems faced in modern agriculture that IoT-based smart farming. These problems such shortage of water, plant diseases,...
------------------------------------------------------------
2) actanaturae_11026 | Infectious Plant Diseases: Etiology, Current Status, Problems and Prospects in Plant Protection | similarity: 0.4710
Snippet: 46 | ACTA NATURAE | VOL. 12 ‚Ññ 3 (46) 2020 REVIEWS ABSTRACT In recent years, there has been an increase in the number of diseases caused by bacterial, fungal, and viral infections. Infections affect plants at different stages of agricultural 

In [18]:
# CELL 12: Evaluation / sanity checks (index + FAISS + stopwords + stemming)

def evaluate_system():
    """Run quick sanity checks on the pipeline and return them as a text report.

    Checks, in order: loaded documents, inverted-index size, FAISS vector
    count, preprocessing on a sample document, stopword removal on a fixed
    sentence, and a two-result retrieval. Stops early (returning the partial
    report) when no documents are loaded or the index is empty.
    """
    report = ["=== EVALUATION (Sanity Checks) ==="]

    # Docs
    num_docs = 0
    if isinstance(docs, dict):
        num_docs = len(docs)
    report.append(f"Docs loaded: {num_docs}")
    if num_docs == 0:
        report.append("‚ùå No documents loaded. Check PDF_FOLDER path and filenames in sample_papers.")
        return "\n".join(report)

    # Index
    num_terms = 0
    if 'inv_index' in globals():
        num_terms = inv_index.count_terms()
    report.append(f"Unique terms in inverted index: {num_terms}")
    if num_terms == 0:
        report.append("‚ùå Index is empty. Check preprocess_text() and PDF text extraction.")
        return "\n".join(report)

    # FAISS
    faiss_total = 0 if faiss_index is None else faiss_index.ntotal
    report.append(f"FAISS vectors: {faiss_total}")
    if faiss_total != num_docs:
        report.append(f"‚ö†Ô∏è FAISS vectors ({faiss_total}) != docs ({num_docs}). Check embedding build step.")

    # Stopwords + stemming check on a tiny sample (first document, first 800 chars)
    sample_doc_id = next(iter(docs))
    sample_full_text = docs[sample_doc_id]
    terms = preprocess_text(sample_full_text[:800])

    report.extend([
        f"Sample doc_id: {sample_doc_id}",
        f"Sample extracted chars (first 80): {repr(sample_full_text[:80])}",
        f"Preprocess produced {len(terms)} terms from first 800 chars.",
        f"First 25 terms (stems): {terms[:25]}",
    ])

    # Check a few stopwords are removed
    test_sentence = "This is a simple test of the system and the index."
    test_terms = preprocess_text(test_sentence)
    report.append(f"Stopword test input: {test_sentence}")
    report.append(f"Stopword test output terms: {test_terms}")
    leaked = [w for w in ("the", "is", "and", "this") if w in test_terms]
    if leaked:
        report.append("‚ö†Ô∏è Some stopwords may still be appearing. Check CUSTOM_STOPWORDS and token filtering.")
    else:
        report.append("‚úÖ Stopwords appear to be removed (basic check).")

    # Quick retrieval check
    preview = retrieve_top_docs("plant disease detection", top_k=2)
    report.append("Retrieval check (top 2):")
    report.append(preview)

    return "\n".join(report)

print(evaluate_system())


=== EVALUATION (Sanity Checks) ===
Docs loaded: 5
Unique terms in inverted index: 5120
FAISS vectors: 5
Sample doc_id: s41598-025-98454-6
Sample extracted chars (first 80): 'AI-IoT based smart agriculture \npivot for plant diseases detection \nand treatmen'
Preprocess produced 75 terms from first 800 chars.
First 25 terms (stems): ['base', 'smart', 'agricultur', 'pivot', 'plant', 'diseas', 'detect', 'treatment', 'amin', 'saeed', 'mohsen', 'roobaea', 'alroobaea', 'maje', 'abdullah', 'moham', 'there', 'some', 'key', 'problem', 'face', 'modern', 'agricultur', 'smart', 'farm']
Stopword test input: This is a simple test of the system and the index.
Stopword test output terms: ['simpl', 'test', 'system', 'index']
‚úÖ Stopwords appear to be removed (basic check).
Retrieval check (top 2):
Query: plant disease detection
1) actanaturae_11026 | Infectious Plant Diseases: Etiology, Current Status, Problems and Prospects in Plant Protection | similarity: 0.5681
Snippet: 46 | ACTA NATURAE | VOL. 12 