In [3]:
# 🧰 System Dependencies (Poppler + Urdu OCR)
!apt-get -y install poppler-utils tesseract-ocr tesseract-ocr-urd

# 🧪 Python Libraries
!pip install -q \
    gradio \
    groq \
    langchain \
    langchain-community \
    langchain-huggingface \
    pytesseract \
    pdf2image \
    requests \
    Pillow \
    torch \
    sentence-transformers \
    faiss-cpu

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
tesseract-ocr-urd is already the newest version (1:4.00~git30-7274cfa-1.1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.9).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
# =======================
# 🌾 Crop Advisory — build once, reuse FAISS
# =======================
import os, re, json, shutil
from io import BytesIO
import requests
from pdf2image import convert_from_bytes
import pytesseract
import gradio as gr

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from groq import Groq

# ---------- Config ----------
# Directory the FAISS index and its manifest.json are saved to and loaded from.
INDEX_DIR   = "/content/faiss_urdu_index"
# Embedding model name; the same value is used when building and when loading the index.
EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Groq chat model that writes the advisory answer.
# NOTE(review): confirm this model id is still served by Groq — TODO verify.
GROQ_MODEL  = "llama3-70b-8192"

# 🔐 Read key from environment (recommended: set with os.environ["GROQ_API_KEY"]="...")
# The value is captured once, when this cell runs; "" means the key was not set.
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")

# Replace with your own Drive links
# Maps a display title to a Google Drive share link. Titles only appear in
# OCR failure messages; they are also the default content of the Admin textbox.
DRIVE_LINKS = {
    "PDF 1": "https://drive.google.com/file/d/16VRkuegHXbhQPPH6kB3jTcdlg1eh95Og/view?usp=sharing",
    "PDF 2": "https://drive.google.com/file/d/1e4Zi9vYXHEtuU_mkpBKDjZ-s0fIFqdzO/view?usp=sharing",
    "PDF 3": "https://drive.google.com/file/d/149Js-w01KO085cRibqXyra9_oPNRAYqZ/view?usp=sharing"
}

# ---------- Helpers ----------
def _download_pdf_from_drive(drive_link: str):
    try:
        file_id = drive_link.split("/d/")[1].split("/")[0]
        url = f"https://drive.google.com/uc?export=download&id={file_id}"
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        if r.content[:4] != b"%PDF":
            raise ValueError("Not a PDF (Drive interstitial?)")
        return BytesIO(r.content)
    except Exception as e:
        print("Download error:", e); return None

def _clean(t: str) -> str:
    t = re.sub(r"\.{2,}", ".", t)
    t = re.sub(r"\n+", "\n", t)
    t = re.sub(r" +", " ", t)
    return t.strip()

def extract_text_urdu(links: dict, max_pages=2) -> str:
    """OCR the first `max_pages` pages of each PDF in Urdu and return cleaned text.

    Args:
        links: Mapping of display title to Google Drive share link.
        max_pages: Number of leading pages to OCR per PDF; ``None`` means all pages.

    Returns:
        The OCR text of all processed pages, one page per line group, passed
        through ``_clean``. Sources that fail to download are skipped; a
        source whose rendering or OCR raises is reported and the pages
        already recognised from it are kept.
    """
    page_texts = []
    for title, url in links.items():
        pdf_buffer = _download_pdf_from_drive(url)
        if pdf_buffer is None:
            continue
        try:
            # Let pdf2image render only the requested pages instead of
            # rasterizing the whole document and slicing the result.
            pages = convert_from_bytes(
                pdf_buffer.read(), first_page=1, last_page=max_pages
            )
            for img in pages:
                page_texts.append(pytesseract.image_to_string(img, lang="urd") + "\n")
        except Exception as e:
            print(f"OCR failed for {title}:", e)
    return _clean("".join(page_texts))

def chunk_text(text: str):
    """Split `text` into documents using chunk_size=300 and chunk_overlap=50."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    return splitter.create_documents([text])

def index_exists() -> bool:
    """Report whether INDEX_DIR is a directory that contains ``index.faiss``."""
    if not os.path.isdir(INDEX_DIR):
        return False
    faiss_file = os.path.join(INDEX_DIR, "index.faiss")
    return os.path.exists(faiss_file)

def build_index(links: dict) -> str:
    """OCR the linked PDFs, embed the chunks and save a FAISS index to INDEX_DIR.

    The new index is fully built in memory before the previous one on disk is
    removed, so a failed download, OCR or embedding run leaves the existing
    index usable.

    Args:
        links: Mapping of display title to Google Drive share link.

    Returns:
        A human-readable status line: a success message with the chunk count,
        a "no text" message, or a build-error message carrying the exception.
    """
    try:
        text = extract_text_urdu(links)
        if not text.strip():
            return "❌ No text extracted. Check OCR/links."

        docs = chunk_text(text)
        embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
        db = FAISS.from_documents(docs, embedding=embeddings)

        # Only discard the old index once its replacement exists in memory.
        if index_exists():
            shutil.rmtree(INDEX_DIR)
        db.save_local(INDEX_DIR)

        # Small sidecar file read by the Admin tab's "Check Index" button.
        manifest_path = os.path.join(INDEX_DIR, "manifest.json")
        with open(manifest_path, "w", encoding="utf-8") as f:
            json.dump({"docs": len(docs), "model": EMBED_MODEL}, f, ensure_ascii=False, indent=2)
        return f"✅ Built index: {len(docs)} chunks"
    except Exception as e:
        return f"❌ Build error: {e}"

def load_index():
    """Load the FAISS index stored in INDEX_DIR with EMBED_MODEL embeddings."""
    # NOTE(review): allow_dangerous_deserialization=True opts in to LangChain's
    # pickle-based loading; only point INDEX_DIR at an index built by this app.
    return FAISS.load_local(
        INDEX_DIR,
        HuggingFaceEmbeddings(model_name=EMBED_MODEL),
        allow_dangerous_deserialization=True,
    )

# Module-level cache for the loaded index; filled on first successful get_db().
_db = None
def get_db():
    """Return the cached FAISS index, loading it from disk on first use.

    Returns ``None`` while no index exists on disk.
    """
    global _db
    if _db is not None:
        return _db
    if index_exists():
        _db = load_index()
    return _db

# ---------- QA ----------
def answer_query(q: str) -> str:
    """Answer a farming question from the indexed Urdu documents via Groq.

    Retrieves the 3 most similar chunks from the FAISS index, embeds them in
    a bilingual bullet-point prompt and returns the model's reply.

    Args:
        q: The user's question.

    Returns:
        The model's answer, or a message starting with "❌" when the question
        is blank, the API key is missing, no index exists, nothing relevant
        was retrieved, or the Groq call raised.
    """
    # Reject blank input up front: no point running retrieval or an LLM call.
    if not q or not q.strip():
        return "❌ Please enter a question."

    # GROQ_API_KEY is captured when the module loads; also consult the live
    # environment so a key exported afterwards is still picked up.
    api_key = GROQ_API_KEY or os.getenv("GROQ_API_KEY", "")
    if not api_key:
        return "❌ GROQ_API_KEY not set. Please set it in the environment."
    db = get_db()
    if db is None:
        return "❌ No index found. Go to Admin tab and Build."
    hits = db.similarity_search(q, k=3)
    if not hits:
        return "❌ No relevant information found."

    context = "\n\n".join(h.page_content for h in hits)

    # ▶️ Pointer-style bilingual prompt (English + Urdu)
    prompt = f"""
You are a helpful agricultural advisor. Use ONLY the following Urdu context to answer the user's question.
Respond in BOTH English and Urdu. Format your answer as clear bullet points (•) with short, actionable guidance.
Avoid long paragraphs. Where appropriate, add quantities/doses.

---
{context}
---

Question: {q}

Answer in bullet points, first English then Urdu:
"""

    try:
        client = Groq(api_key=api_key)
        resp = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
        )
        return resp.choices[0].message.content.strip()
    except Exception as e:
        # Shown in the UI textbox instead of raising into Gradio.
        return f"❌ Groq error: {e}"

# ---------- Gradio ----------
# Two-tab UI: "Ask" runs answer_query; "Admin" rebuilds and inspects the index.
with gr.Blocks(title="Crop Advisory") as demo:
    gr.Markdown("## 🌾 Crop Advisory\nAsk crop care, pest control, irrigation, and fertilizer questions (English or Roman Urdu).")

    with gr.Tab("🧠 Ask"):
        q = gr.Textbox(label="💬 Your question", lines=2, placeholder="e.g., Best time to irrigate wheat in winter?")
        a = gr.Textbox(label="📘 Advisory (Bulleted • English → Urdu)", lines=12, show_copy_button=True)
        gr.Button("🔎 Get Advisory").click(answer_query, q, a)

    with gr.Tab("🛠 Admin"):
        gr.Markdown("Build / Rebuild the FAISS index from Google Drive PDFs (Urdu OCR).")
        links_in = gr.Textbox(label="Google Drive links (JSON dict)", value=json.dumps(DRIVE_LINKS, indent=2), lines=8)
        status = gr.Textbox(label="Status")

        def build_action(json_links):
            """Parse the links textbox as JSON and rebuild the index; return a status line."""
            global _db
            try:
                links = json.loads(json_links)
            except Exception as e:
                return f"❌ Invalid JSON: {e}"
            # Valid JSON that is not an object (list, string, number) cannot
            # be iterated as title -> link pairs.
            if not isinstance(links, dict):
                return "❌ Invalid JSON: expected an object mapping titles to Drive links"
            _db = None  # reset cache so the next query reloads from disk
            return build_index(links)

        gr.Button("⚙️ Build / Rebuild Index").click(build_action, links_in, status)

        def check():
            """Describe the on-disk index using manifest.json when it is present."""
            if not index_exists():
                return "❌ No index on disk."
            man_path = os.path.join(INDEX_DIR, "manifest.json")
            man = {}
            if os.path.exists(man_path):
                # Read with the encoding the manifest was written in, and
                # close the handle deterministically.
                with open(man_path, encoding="utf-8") as fh:
                    man = json.load(fh)
            return f"✅ Index present\nChunks: {man.get('docs','?')}\nModel: {man.get('model','?')}\nPath: {INDEX_DIR}"
        gr.Button("🔎 Check Index").click(check, outputs=status)

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0ea4393ac1990202dd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


