<a href="https://colab.research.google.com/github/Asaad972/CollabFirstNoteBook/blob/main/HW02_Cloud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CELL 1: Package Installation with progress tracking
import sys, subprocess, time

def pip_install(pkg: str):
    print(f"üì¶ Installing {pkg} ...")
    start = time.time()
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
    print(f"‚úÖ Installed {pkg} ({time.time() - start:.1f}s)")

# Required for reading PDFs per page
pip_install("pymupdf")

# Useful for preview tables (index preview)
pip_install("pandas")

# Optional (only if you plan to use Cell 11)
# pip_install("gradio")


üì¶ Installing pymupdf ...
‚úÖ Installed pymupdf (6.1s)
üì¶ Installing pandas ...
‚úÖ Installed pandas (7.1s)


In [2]:
# CELL 2: Import Libraries with fallback detection

import re
from collections import defaultdict
import pandas as pd

print("üîç Checking available libraries...")

# PDF reader (PyMuPDF)
try:
    import fitz  # pymupdf
    PYMUPDF_AVAILABLE = True
    print("‚úÖ PyMuPDF: Available")
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("‚ùå PyMuPDF: Not available")

# Gradio (optional, used later only if available)
try:
    import gradio as gr
    GRADIO_AVAILABLE = True
    print("‚úÖ Gradio: Available")
except ImportError:
    GRADIO_AVAILABLE = False
    print("‚ùå Gradio: Not available (optional)")

print("\nüìã System Status:")
print(f"   PDF extraction: {'PyMuPDF' if PYMUPDF_AVAILABLE else 'Unavailable'}")
print(f"   UI interface: {'Gradio' if GRADIO_AVAILABLE else 'Simple text'}")

print("\nüéØ Ready for Cell 3!")


üîç Checking available libraries...
‚úÖ PyMuPDF: Available
‚úÖ Gradio: Available

üìã System Status:
   PDF extraction: PyMuPDF
   UI interface: Gradio

üéØ Ready for Cell 3!


In [3]:
!pip -q install firebase-admin

In [4]:
# CELL 3: Store Classes (Simple fallback)
# For this assignment, we store an inverted index:
# term -> set of (docId, page)

class SimpleIndexStore:
    def __init__(self):
        self.posting_sets = defaultdict(set)  # term -> set[(docId, page)]
        print("üì¶ SimpleIndexStore initialized")

    def add_occurrence(self, term: str, docId: str, page: int):
        self.posting_sets[term].add((docId, page))

    def get_postings(self, term: str):
        term = term.lower()
        return sorted(self.posting_sets.get(term, set()))

    def count_terms(self) -> int:
        return len(self.posting_sets)

print("‚úÖ Store class defined!")
print("üìã Next: Cell 4 (Core class)")

‚úÖ Store class defined!
üìã Next: Cell 4 (Core class)


In [5]:
# CELL 4: Core System Class + STOP_WORDS + STEMMING

# --- NEW: stemming ---
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# Explicit stop-words list (required by assignment)
STOP_WORDS = {
    "the","a","an","and","or","but","if","while","with","without","to","from","of","in","on","at","by","for",
    "is","are","was","were","be","been","being","this","that","these","those","it","its","as","into","than",
    "we","our","you","your","they","their","he","she","his","her","them","us",
    "can","could","should","would","may","might","must","will","also",
    "et","al","figure","fig","table","section","introduction","conclusion","references",
    "method","methods","result","results","study","paper","data","analysis",
    "using","based","approach"
}

class PlantDiseaseIndexRAG:
    def __init__(self):
        self.store = SimpleIndexStore()
        self.docs_pages = {}   # docId -> list of page texts
        self.docs_meta  = {}   # docId -> metadata (title/file/etc.)
        print("üéâ PlantDiseaseIndexRAG initialized")

    def preprocess_text(self, text: str) -> str:
        if not text:
            return ""
        text = text.lower()
        text = re.sub(r"[^a-z\s]", " ", text)   # keep letters/spaces only
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def tokenize(self, text: str):
        text = self.preprocess_text(text)
        raw_tokens = [w for w in text.split() if w not in STOP_WORDS and len(w) >= 3]

        # --- NEW: apply stemming ---
        tokens = [stemmer.stem(w) for w in raw_tokens]
        return tokens

print("‚úÖ Core class + STOP_WORDS + stemming ready!")
print("üìã Next: Cell 5 (PDF loading)")


‚úÖ Core class + STOP_WORDS + stemming ready!
üìã Next: Cell 5 (PDF loading)


In [6]:
# CELL 5: Data Loading Methods (PDF -> page texts)

def add_loading_methods():
    def load_pdfs(self, papers):
        """
        papers: list of dicts like:
          {"title":..., "file":..., ...}
        Saves:
          self.docs_pages[docId] = [page1_text, page2_text, ...]
          self.docs_meta[docId]  = paper metadata
        """
        if not PYMUPDF_AVAILABLE:
            raise RuntimeError("PyMuPDF not available. Install pymupdf in Cell 1.")

        self.docs_pages = {}
        self.docs_meta = {}

        for i, paper in enumerate(papers, start=1):
            docId = f"doc{i}"
            pdf_path = paper["file"]

            doc = fitz.open(pdf_path)
            pages = []
            for p in range(len(doc)):
                pages.append(doc[p].get_text("text") or "")

            self.docs_pages[docId] = pages

            meta = dict(paper)
            meta["_id"] = docId
            self.docs_meta[docId] = meta

        print(f"‚úÖ Loaded {len(self.docs_pages)} PDFs into memory (doc1..doc{len(self.docs_pages)})")

    PlantDiseaseIndexRAG.load_pdfs = load_pdfs

add_loading_methods()
print("‚úÖ Loading methods added!")
print("üìã Next: Cell 6 (build index + query)")


‚úÖ Loading methods added!
üìã Next: Cell 6 (build index + query)


In [7]:
# CELL 6: Search and Query Methods (Index building + export + RAG output)

def add_search_methods():
    def build_index(self):
        """
        Build inverted index:
        term -> DocIDs (doc/page links)
        NOTE: terms are STEMS because we enabled stemming in tokenize().
        """
        self.store = SimpleIndexStore()  # reset store

        for docId, pages in self.docs_pages.items():
            for page_num, page_text in enumerate(pages, start=1):
                terms_on_page = set(self.tokenize(page_text))  # presence on that page
                for term in terms_on_page:
                    self.store.add_occurrence(term, docId, page_num)

        print(f"‚úÖ Index built. Unique terms: {self.store.count_terms()}")

    def export_index_records(self):
        """
        Return list of dicts with EXACT field names required:
        - term
        - DocIDs (list of links/IDs to pages containing the term)
        """
        records = []
        for term, postings in self.store.posting_sets.items():
            docids = [{"doc": d, "page": p} for (d, p) in sorted(postings)]
            records.append({"term": term, "DocIDs": docids})
        return records

    def rag_query(self, query, top_k=5):
        """
        Retrieval: rank (doc,page) by overlap with query terms
        Generation: rich output with doc/page + snippet
        """
        q_terms = list(dict.fromkeys(self.tokenize(query)))  # stems
        scores = defaultdict(int)

        for t in q_terms:
            for (docId, page) in self.store.get_postings(t):
                scores[(docId, page)] += 1

        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

        lines = []
        lines.append(f"Query: {query}")
        lines.append(f"Query terms (stems): {q_terms}")
        lines.append("=" * 60)

        if not ranked:
            lines.append("No matches found.")
            return "\n".join(lines)

        for (docId, page), score in ranked:
            text = self.docs_pages[docId][page - 1]
            snippet = re.sub(r"\s+", " ", text)[:350]
            title = self.docs_meta[docId].get("title", "")
            lines.append(f"[Score {score}] {docId} | page {page} | {title}")
            lines.append(f"Snippet: {snippet}...")
            lines.append("-" * 60)

        return "\n".join(lines)

    PlantDiseaseIndexRAG.build_index = build_index
    PlantDiseaseIndexRAG.export_index_records = export_index_records
    PlantDiseaseIndexRAG.rag_query = rag_query

add_search_methods()
print("‚úÖ Index build/export/query methods added!")
print("üìã Next: Cell 7 (sample_papers list)")


‚úÖ Index build/export/query methods added!
üìã Next: Cell 7 (sample_papers list)


In [8]:
# CELL 7: Sample Papers (metadata + file names)

sample_papers = [
    {
        "title": "AI-IoT Based Smart Agriculture Pivot for Plant Diseases Detection and Treatment",
        "authors": "AS Ibrahim et al.",
        "journal": "Scientific Reports (Nature)",
        "year": 2025,
        "doi": "10.1038/s41598-025-98454-6",
        "abstract": "Proposes an AI-IoT smart agriculture pivot architecture for detecting and treating plant diseases, including a hardware pilot and mobile-app support.",
        "file": "s41598-025-98454-6.pdf"
    },
    {
        "title": "Infectious Plant Diseases: Etiology, Current Status, Problems and Prospects in Plant Protection",
        "authors": "PA Nazarov et al.",
        "journal": "Acta Naturae",
        "year": 2020,
        "doi": None,
        "abstract": "Review of infectious plant diseases caused by viruses, bacteria, and fungi; current status and prospects for plant protection.",
        "file": "actanaturae_11026.pdf"
    },
    {
        "title": "Recent Approaches towards Control of Fungal Diseases in Plants: An Updated Review",
        "authors": "NA El-Baky, AAAF Amara",
        "journal": "Journal of Fungi (MDPI)",
        "year": 2021,
        "doi": "10.3390/jof7110900",
        "abstract": "Reviews strategies to control plant fungal diseases including biocontrol and other approaches.",
        "file": "jof-07-00900.pdf"
    },
    {
        "title": "The Potential Risk of Plant-Virus Disease Initiation by Infected Tomatoes",
        "authors": "C Klap et al.",
        "journal": "Plants (MDPI)",
        "year": 2020,
        "doi": "10.3390/plants9050623",
        "abstract": "Study on how infected tomatoes can contribute to plant-virus disease spread and transmission risk.",
        "file": "plants-09-00623.pdf"
    },
    {
        "title": "Current status and future perspectives of the diagnostic of plant bacterial pathogens",
        "authors": "X Wang et al.",
        "journal": "Frontiers in Plant Science",
        "year": 2025,
        "doi": None,
        "abstract": "Review of plant bacterial pathogen diagnostics; shift from culture-based to culture-free detection; limitations in real plant extracts and recent progress.",
        "file": "fpls-2025-bacterial-pathogen-diagnostics.pdf"
    }
]

print(f"üìö sample_papers ready: {len(sample_papers)} papers")
for i, p in enumerate(sample_papers, 1):
    print(f"{i}. {p['title']}  -->  {p['file']}")


üìö sample_papers ready: 5 papers
1. AI-IoT Based Smart Agriculture Pivot for Plant Diseases Detection and Treatment  -->  s41598-025-98454-6.pdf
2. Infectious Plant Diseases: Etiology, Current Status, Problems and Prospects in Plant Protection  -->  actanaturae_11026.pdf
3. Recent Approaches towards Control of Fungal Diseases in Plants: An Updated Review  -->  jof-07-00900.pdf
4. The Potential Risk of Plant-Virus Disease Initiation by Infected Tomatoes  -->  plants-09-00623.pdf
5. Current status and future perspectives of the diagnostic of plant bacterial pathogens  -->  fpls-2025-bacterial-pathogen-diagnostics.pdf


In [9]:
# CELL 8: Initialize RAG + Load PDFs + Build Index (+ optional Firestore upload)

# 1) Create system
rag_system = PlantDiseaseIndexRAG()

# 2) Load PDFs using the metadata list from Cell 7
rag_system.load_pdfs(sample_papers)

# 3) Build inverted index
rag_system.build_index()

# 4) Quick sanity checks
print("\n‚úÖ Sanity checks:")
print("Docs loaded:", len(rag_system.docs_pages))
print("Unique terms:", rag_system.store.count_terms())

# 5) Preview index schema (term, DocIDs)
records = rag_system.export_index_records()
df_index = pd.DataFrame(records)
print("\n‚úÖ Index preview (first 5 rows):")
display(df_index.head(5))

# 6) OPTIONAL: upload to Firestore (set to True only after you confirm everything works)
UPLOAD_TO_FIRESTORE = False

if UPLOAD_TO_FIRESTORE:
    # This assumes you already ran your Firestore init block after Cell 2 and have `db`
    # Also assumes you added upload_index_to_firestore in Cell 6 (we can add it next if not yet)
    rag_system.upload_index_to_firestore(db, collection="inverted_index")


üì¶ SimpleIndexStore initialized
üéâ PlantDiseaseIndexRAG initialized
‚úÖ Loaded 5 PDFs into memory (doc1..doc5)
üì¶ SimpleIndexStore initialized
‚úÖ Index built. Unique terms: 5470

‚úÖ Sanity checks:
Docs loaded: 5
Unique terms: 5470

‚úÖ Index preview (first 5 rows):


Unnamed: 0,term,DocIDs
0,system,"[{'doc': 'doc1', 'page': 1}, {'doc': 'doc1', '..."
1,natur,"[{'doc': 'doc1', 'page': 1}, {'doc': 'doc1', '..."
2,drop,"[{'doc': 'doc1', 'page': 1}, {'doc': 'doc3', '..."
3,requir,"[{'doc': 'doc1', 'page': 1}, {'doc': 'doc1', '..."
4,eng,"[{'doc': 'doc1', 'page': 1}, {'doc': 'doc1', '..."


In [10]:
# CELL 9: Simple Query Interface

def single_query(question, top_k=5):
    print(rag_system.rag_query(question, top_k=top_k))

def query_interface():
    print("PLANT DISEASE INDEX - QUERY INTERFACE")
    print("=" * 60)
    print("Type 'quit' to exit.")
    print("=" * 60)

    while True:
        q = input("\nYour question: ").strip()
        if q.lower() == "quit":
            break
        if not q:
            print("Please type a question.")
            continue
        print("\n" + rag_system.rag_query(q, top_k=5))

# Quick test
single_query("fungal disease symptoms leaf", top_k=5)


Query: fungal disease symptoms leaf
Query terms (stems): ['fungal', 'diseas', 'symptom', 'leaf']
[Score 4] doc2 | page 1 | Infectious Plant Diseases: Etiology, Current Status, Problems and Prospects in Plant Protection
Snippet: 46 | ACTA NATURAE | VOL. 12 ‚Ññ 3 (46) 2020 REVIEWS ABSTRACT In recent years, there has been an increase in the number of diseases caused by bacterial, fungal, and viral infections. Infections affect plants at different stages of agricultural production. Depending on weather conditions and the phytosanitary condition of crops, the prevalence of disea...
------------------------------------------------------------
[Score 4] doc2 | page 8 | Infectious Plant Diseases: Etiology, Current Status, Problems and Prospects in Plant Protection
Snippet: REVIEWS VOL. 12 ‚Ññ 3 (46) 2020 | ACTA NATURAE | 53 phytoplasmosis and growth retardation. Like myco- plasmas, a related genus of bacteria, phytoplasmas are apparently one of the most primitive and autonomously reproducing l

In [11]:
# CELL 10: Load Your Own Papers (optional)

def load_new_papers(new_papers_list):
    """
    new_papers_list: same structure as sample_papers (list of dicts with 'file', 'title', etc.)
    This will reload PDFs and rebuild the index.
    """
    global rag_system
    rag_system = PlantDiseaseIndexRAG()
    rag_system.load_pdfs(new_papers_list)
    rag_system.build_index()
    print("‚úÖ New papers loaded and index rebuilt.")

# Example usage (uncomment when you have new files uploaded):
# my_papers = [
#     {"title": "Paper A", "authors": "...", "journal": "...", "year": 2024, "doi": None, "abstract": "...", "file": "paperA.pdf"},
#     {"title": "Paper B", "authors": "...", "journal": "...", "year": 2023, "doi": None, "abstract": "...", "file": "paperB.pdf"},
# ]
# load_new_papers(my_papers)

print("‚úÖ Cell 10 ready. Use load_new_papers(...) when needed.")


‚úÖ Cell 10 ready. Use load_new_papers(...) when needed.


In [12]:
# CELL 11: Gradio Web Interface (optional)

if 'gr' in globals():
    def gradio_query(q):
        return rag_system.rag_query(q, top_k=5)

    interface = gr.Interface(
        fn=gradio_query,
        inputs=gr.Textbox(label="Ask about plant diseases"),
        outputs=gr.Textbox(label="Result"),
        title="Plant Disease Index (RAG)",
        description="Simple retrieval over indexed plant disease articles"
    )

    interface.launch()
else:
    print("‚ö†Ô∏è Gradio not available. Skipping Cell 11.")


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b9793975fc695d8569.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [13]:
# CELL 12: Analytics and Evaluation

def index_statistics():
    total_terms = rag_system.store.count_terms()
    total_docs = len(rag_system.docs_pages)

    pages_per_doc = {
        doc: len(pages) for doc, pages in rag_system.docs_pages.items()
    }

    return {
        "total_documents": total_docs,
        "total_terms": total_terms,
        "pages_per_document": pages_per_doc
    }

stats = index_statistics()
print("üìä INDEX STATISTICS")
for k, v in stats.items():
    print(f"{k}: {v}")


üìä INDEX STATISTICS
total_documents: 5
total_terms: 5470
pages_per_document: {'doc1': 16, 'doc2': 14, 'doc3': 17, 'doc4': 15, 'doc5': 11}


In [14]:
# CELL 13: Advanced Query Features

def suggest_terms(prefix, limit=10):
    """
    Suggest indexed terms that start with a prefix (after stemming)
    """
    prefix = prefix.lower()
    matches = [t for t in rag_system.store.posting_sets.keys() if t.startswith(prefix)]
    return matches[:limit]

# Example
print("Suggestions for 'fung':", suggest_terms("fung"))


Suggestions for 'fung': ['fungi', 'fungal', 'fungu', 'fungicid', 'fungicola']


In [15]:
# CELL 14: System Summary and Testing

def system_summary():
    print("SYSTEM SUMMARY")
    print("=" * 50)
    print(f"Documents indexed: {len(rag_system.docs_pages)}")
    print(f"Unique terms: {rag_system.store.count_terms()}")
    print("Database: In-memory (Firestore optional)")
    print("Text processing: stop-words + stemming")
    print("=" * 50)

system_summary()

print("\nFINAL TEST QUERY\n")
print(rag_system.rag_query("bacterial plant disease detection", top_k=5))


SYSTEM SUMMARY
Documents indexed: 5
Unique terms: 5470
Database: In-memory (Firestore optional)
Text processing: stop-words + stemming

FINAL TEST QUERY

Query: bacterial plant disease detection
Query terms (stems): ['bacteri', 'plant', 'diseas', 'detect']
[Score 4] doc1 | page 4 | AI-IoT Based Smart Agriculture Pivot for Plant Diseases Detection and Treatment
Snippet: Fig. 1. The proposed AI-IoT system architecture. Reference Methodology Dataset Strengths Limitations 21 Proposed FL-EfficientNet CNN utilizing Neural Architecture Search 10 diseases across 5 crop types Fast convergence (4.7 h for 15 epochs), effective for real-time applications There is no mobile deployment 22 Enhanced CNN based on VGG16, Inception...
------------------------------------------------------------
[Score 4] doc1 | page 9 | AI-IoT Based Smart Agriculture Pivot for Plant Diseases Detection and Treatment
Snippet: diseases treatment. It operates the water pump to pull down the liquid from the fertilization liqu