In [None]:

!pip install gradio==4.44.0 \
    openai-whisper \
    gTTS \
    groq \
    requests \
    pandas \
    PyPDF2 \
    python-docx \
    nltk \
    sentence-transformers \
    faiss-cpu \
    torch \
    transformers \
    langchain==0.2.16 \
    langchain-community==0.2.10


Collecting gradio==4.44.0
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting groq
  Downloading groq-0.31.0-py3-none-any.whl.metadata (16 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting langchain==0.2.16
  Downloading langchain-0.2.16-py3-none-any.whl

In [None]:
!pip install -U "gradio==4.44.1"
# (gradio_client will update to a compatible version automatically)


Collecting gradio==4.44.1
  Downloading gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Downloading gradio-4.44.1-py3-none-any.whl (18.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gradio
  Attempting uninstall: gradio
    Found existing installation: gradio 4.44.0
    Uninstalling gradio-4.44.0:
      Successfully uninstalled gradio-4.44.0
Successfully installed gradio-4.44.1


In [None]:
import os
import re
import requests
from io import BytesIO
from pdf2image import convert_from_bytes
import pytesseract
import gradio as gr

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from groq import Groq

# 🔐 API keys
# Read keys from the environment first so real credentials never have to be
# saved in the notebook; the placeholder literals are only a fallback for a
# local edit. The client is built from GROQ_API_KEY itself: previously it
# indexed os.environ directly, which ignored the constant and raised KeyError
# on a fresh kernel where the variable had not been exported.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "YOUR_GROQ_KEY_HERE")            # paste locally
WEATHER_API_KEY = os.environ.get("WEATHER_API_KEY", "YOUR_OPENWEATHER_KEY_HERE")
client = Groq(api_key=GROQ_API_KEY)

# 📎 PDF Drive Links
# Maps a display title to a Google Drive share URL. download_pdf_from_drive()
# extracts the file id from the "/d/<file_id>/" segment of each URL.
drive_links = {
    "PDF 1": "https://drive.google.com/file/d/16VRkuegHXbhQPPH6kB3jTcdlg1eh95Og/view?usp=sharing",
    "PDF 2": "https://drive.google.com/file/d/1e4Zi9vYXHEtuU_mkpBKDjZ-s0fIFqdzO/view?usp=sharing",
    "PDF 3": "https://drive.google.com/file/d/149Js-w01KO085cRibqXyra9_oPNRAYqZ/view?usp=sharing"
}

# 📥 Download PDF
def download_pdf_from_drive(drive_link, timeout=30):
    """Download a PDF from a Google Drive share link.

    Args:
        drive_link: Share URL containing a ``/d/<file_id>/`` segment.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``BytesIO`` with the PDF bytes, or ``None`` when the link cannot be
        parsed, the request fails, or the payload does not start with the
        ``%PDF`` magic bytes. Failures are printed, never raised.
    """
    try:
        file_id = drive_link.split("/d/")[1].split("/")[0]
        url = f"https://drive.google.com/uc?export=download&id={file_id}"
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        if response.content[:4] != b"%PDF":
            raise ValueError("Invalid PDF format")
        return BytesIO(response.content)
    except Exception as e:
        print(f"❌ Error downloading PDF: {e}")
        return None

# 🧼 Clean OCR text
def clean_ocr_text(text):
    """Tidy raw OCR output.

    Collapses runs of two or more dots into one dot, runs of newlines into a
    single newline and runs of spaces into a single space, then trims leading
    and trailing whitespace.
    """
    substitutions = (
        (r"\.{2,}", "."),
        (r"\n+", "\n"),
        (r" +", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# 🧠 Extract Urdu text from all PDFs
def extract_all_texts_from_drive(drive_links, max_pages=2):
    """OCR the first pages of every linked PDF and return the cleaned text.

    Args:
        drive_links: Mapping of display title -> Google Drive share URL.
        max_pages: Maximum number of leading pages to OCR per PDF.

    Returns:
        The Urdu OCR text of all processed pages, one page per line group,
        passed through ``clean_ocr_text``. PDFs that fail to download or OCR
        are reported with ``print`` and skipped.
    """
    # Ask pdf2image to rasterise only the pages that will be OCR'd instead of
    # rendering the whole document and discarding most of it. For values that
    # are not a positive int, fall back to the slice alone (original behavior).
    last_page = max_pages if isinstance(max_pages, int) and max_pages > 0 else None

    page_texts = []
    for title, url in drive_links.items():
        file = download_pdf_from_drive(url)
        if not file:
            continue
        try:
            images = convert_from_bytes(file.read(), last_page=last_page)[:max_pages]
            for img in images:
                page_texts.append(pytesseract.image_to_string(img, lang="urd") + "\n")
        except Exception as e:
            print(f"❌ OCR failed for {title}: {e}")
    return clean_ocr_text("".join(page_texts))

# 📚 Chunk text
def chunk_text(text):
    """Split `text` into overlapping chunks (300 characters, 50 overlap).

    Returns whatever ``RecursiveCharacterTextSplitter.create_documents``
    produces for the single input string.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50,
    ).create_documents([text])

# 💾 Create FAISS index
def create_faiss_index(docs):
    """Embed `docs` with the multilingual MiniLM model and index them in FAISS.

    Raises:
        ValueError: If `docs` is empty.
    """
    if docs:
        embedder = HuggingFaceEmbeddings(
            model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        )
        return FAISS.from_documents(docs, embedding=embedder)
    raise ValueError("❌ No chunks created from text.")

# 🤖 Query with Groq
def query_vector_db(query, db, model="llama-3.3-70b-versatile"):
    """Answer `query` from the top-3 most similar chunks using a Groq chat model.

    Args:
        query: The user's question.
        db: Vector store exposing ``similarity_search(query, k=...)`` whose
            results carry a ``page_content`` attribute.
        model: Groq model id. The previously hard-coded ``llama3-70b-8192`` is
            believed retired by Groq (confirm on Groq's deprecations page), so
            the id is now a parameter defaulting to a current 70B model.

    Returns:
        The model's answer, or a "❌ ..." message when nothing relevant is
        found or the Groq call fails.
    """
    results = db.similarity_search(query, k=3)
    if not results:
        return "❌ No relevant information found."

    context = "\n\n".join(doc.page_content for doc in results)
    prompt = f"""Use the following Urdu text to answer the user's question. Provide your answer in BOTH English and Urdu.

---
{context}
---

Question: {query}
Answer:"""
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}]
        )
        # `content` can be None; treat that as an empty answer rather than
        # surfacing an AttributeError as a Groq error.
        return (response.choices[0].message.content or "").strip()
    except Exception as e:
        return f"❌ Error from Groq: {e}"

# 🚀 Preprocessing once
# Runs at cell execution time: downloads and OCRs the PDFs in `drive_links`,
# chunks the text and builds the FAISS index that answer_question() queries.
print("📦 Processing Urdu PDFs...")
extracted_text = extract_all_texts_from_drive(drive_links)
print(f"✅ Extracted {len(extracted_text)} characters of Urdu text")

documents = chunk_text(extracted_text)
print(f"📄 {len(documents)} chunks created")

# create_faiss_index() raises ValueError when `documents` is empty.
vector_db = create_faiss_index(documents)
print("✅ FAISS index created")

# 🎛 Gradio UI
def answer_question(query):
    """Gradio callback: answer `query` from the module-level `vector_db` index."""
    return query_vector_db(query, vector_db)

# Wrap answer_question() in a one-textbox-in / one-textbox-out UI and serve it.
# share=True asks Gradio for a public share URL (see the cell output below).
gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(label="💬 Ask your question (in English or Roman Urdu)"),
    outputs=gr.Textbox(label="📘 Answer"),
    title="📚 Urdu PDF QnA (Groq + FAISS)",
    description="Ask questions from 3 Urdu PDFs using OCR, FAISS, and Groq's LLaMA model.",
    theme="default"
).launch(share=True)


📦 Processing Urdu PDFs...
✅ Extracted 14642 characters of Urdu text
📄 59 chunks created


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ FAISS index created
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://87ce9fc6545328eee8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
!pip install -U "pydantic==2.10.6"
# (optional if it fights you)
# !pip install --force-reinstall --no-cache-dir "pydantic==2.7.1"

# IMPORTANT: restart runtime so the new version is used


Collecting pydantic==2.10.6
  Downloading pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting pydantic-core==2.27.2 (from pydantic==2.10.6)
  Downloading pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading pydantic-2.10.6-py3-none-any.whl (431 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.7/431.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydantic-core, pydantic
  Attempting uninstall: pydantic-core
    Found existing installation: pydantic_core 2.33.2
    Uninstalling pydantic_core-2.33.2:
      Successfully uninstalled pydantic_core-2.33.2
  Attempting uninstall: pydantic
    Found existing installation: pydantic 2.11.7
    Uninstallin

In [None]:
import pydantic; print(pydantic.__version__)



2.10.6


In [None]:
# System deps for pdf2image & Tesseract OCR
!apt-get -y install poppler-utils tesseract-ocr tesseract-ocr-urd

# Python packages
!pip install -U gradio==4.44.1 groq pdf2image pytesseract gTTS \
  sentence-transformers transformers faiss-cpu \
  langchain==0.2.* langchain-community==0.2.* langchain-huggingface==0.0.* \
  openai-whisper


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  poppler-utils tesseract-ocr-urd
0 upgraded, 2 newly installed, 0 to remove and 35 not upgraded.
Need to get 1,186 kB of archives.
After this operation, 2,110 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.9 [186 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-urd all 1:4.00~git30-7274cfa-1.1 [1,000 kB]
Fetched 1,186 kB in 1s (1,395 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126284 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.9_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.9) ...
Selecting previously unselected package tesseract-ocr-urd.
Preparing to unpack

In [None]:
# --- Standard library
import os
import re
import tempfile
from io import BytesIO

# --- Third-party
import requests
import gradio as gr
from pdf2image import convert_from_bytes
import pytesseract
import whisper
from gtts import gTTS
from groq import Groq

# --- LangChain (v0.2.x layout)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/95.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.5/95.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/58.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.3/73.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently

--------


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://ef1ab371d70386f049.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
# app.py — Smart Zameen Dost (updated for LangChain 0.2.x)
# --------------------------------------------------------
# - Uses langchain_core.documents.Document
# - Uses langchain_huggingface for embeddings
# - Robust membership tests, metadata handling
# - Safe Google Drive PDF download + PyPDF2 extraction

import os
import io
import re
import json
import tempfile
from datetime import datetime

import gradio as gr
import requests
import nltk
import PyPDF2
from gtts import gTTS
import whisper
import torch  # noqa: F401

from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# ----------------------------
# Keys (⚠️ hard-code only if you accept the risk)
# ----------------------------
# NOTE(review): keys are embedded as literals and then exported to the process
# environment; load them from a secret store before sharing this notebook.
GROQ_API_KEY = "YOUR_GROQ_KEY_HERE"            # paste locally
WEATHER_API_KEY = "YOUR_OPENWEATHER_KEY_HERE"  # paste locally
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
os.environ["WEATHER_API_KEY"] = WEATHER_API_KEY

# Fail fast on blank values. The placeholder literals above are non-empty, so
# these checks only catch a key that was cleared to an empty/whitespace string.
if not GROQ_API_KEY.strip():
    raise RuntimeError("GROQ_API_KEY not set.")
if not WEATHER_API_KEY.strip():
    raise RuntimeError("WEATHER_API_KEY not set.")

# Shared Groq client used by get_enhanced_ai_response().
groq_client = Groq(api_key=GROQ_API_KEY)

# ----------------------------
# NLTK (quiet)
# ----------------------------
# Best-effort download of NLTK resources: a failure must not stop the app, but
# it is reported instead of being silently swallowed. nltk.download() signals
# most failures through its boolean return value rather than an exception.
for _nltk_resource in ("punkt", "stopwords"):
    try:
        if not nltk.download(_nltk_resource, quiet=True):
            print(f"⚠️ NLTK resource '{_nltk_resource}' could not be downloaded; continuing without it.")
    except Exception as exc:
        print(f"⚠️ NLTK download of '{_nltk_resource}' failed: {exc}")

# ----------------------------
# Models / Embeddings
# ----------------------------
# Both models are loaded once when this cell runs and reused afterwards:
# `whisper_model` by voice_to_text(), `multilingual_embeddings` by
# AdvancedPakistaniAgriRAG.
print("🤖 Loading Whisper model...")
whisper_model = whisper.load_model("base")
print("✅ Whisper model loaded.")

print("🔤 Loading multilingual sentence embeddings...")
multilingual_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
print("✅ Multilingual embeddings loaded.")

# ----------------------------
# Predefined Google Drive PDFs
# ----------------------------
PREDEFINED_PDF_LINKS = [
    "https://drive.google.com/file/d/1H7b-1PG2SLB99gjogfSl7QTOmLd1iGX0/view?usp=sharing",
]

# ----------------------------
# Helpers
# ----------------------------
def safe_str(x) -> str:
    """Return ``str(x)``, mapping ``None`` to the empty string."""
    return "" if x is None else str(x)

def normalize_mixed_text(text: str) -> str:
    """Normalize mixed Urdu/English text to a single clean line.

    Characters outside the Arabic-script blocks, word characters, whitespace
    and basic punctuation are replaced by a space; whitespace is then
    collapsed and the result trimmed.

    Two fixes over the previous version:
    - ``$``, ``%`` and ``/`` are preserved. They used to be stripped, which
      removed the very markers ``extract_numerical_data`` searches for when it
      is run on normalized text.
    - Whitespace is collapsed *after* the substitution, so replaced characters
      no longer leave double spaces or a leading/trailing space behind.

    Args:
        text: Any value; ``None`` is treated as the empty string.

    Returns:
        The normalized string.
    """
    replaced = re.sub(
        r"[^\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF\w\s.,;:!?()\-$%/]",
        " ",
        safe_str(text),
    )
    return re.sub(r"\s+", " ", replaced).strip()

def extract_numerical_data(text: str):
    """Pull prices, percentages and crop yields out of free text.

    Args:
        text: Any value; ``None`` is treated as the empty string.

    Returns:
        A dict containing only the keys that had matches:
        - ``"prices"``: list of number strings that follow ``$`` or ``Rs``/``Rs.``
        - ``"percentages"``: list of number strings that precede ``%``
        - ``"yields"``: list of ``(value, unit, area)`` tuples
    """
    t = safe_str(text)
    info = {}
    # The currency marker must be "$" or the token "Rs"/"Rs.". The previous
    # pattern used the character class [\$Rs\.], which treated any single
    # ".", "s" or "R" before a digit as a currency marker (e.g. "6.0mm" -> "0").
    prices = re.findall(r"(?:\$|\bRs\.?)\s*(\d+(?:,\d{3})*(?:\.\d{2})?)", t)
    if prices:
        info["prices"] = prices
    perc = re.findall(r"(\d+(?:\.\d+)?)\s*%", t)
    if perc:
        info["percentages"] = perc
    yields = re.findall(
        r"(\d+(?:\.\d+)?)\s*(tons?|kg|quintals?|maunds?)\s*(?:per|/)?\s*(acre|hectare|ایکڑ)",
        t,
        re.IGNORECASE,
    )
    if yields:
        info["yields"] = yields
    return info

# ----------------------------
# Google Drive PDF Processor
# ----------------------------
class GoogleDrivePDFProcessor:
    """Download PDFs shared through Google Drive links and extract their text.

    All methods are static and report failures through their return values
    instead of raising.
    """

    @staticmethod
    def convert_gdrive_link(share_link: str):
        """Turn a Drive share link into a direct-download URL.

        Returns:
            The ``uc?export=download`` URL, or ``None`` if no file id is found.
        """
        patterns = [r"/file/d/([a-zA-Z0-9\-_]+)", r"id=([a-zA-Z0-9\-_]+)", r"/d/([a-zA-Z0-9\-_]+)"]
        link = safe_str(share_link)
        for pat in patterns:
            m = re.search(pat, link)
            if m:
                return f"https://drive.google.com/uc?export=download&id={m.group(1)}"
        return None

    @staticmethod
    def _looks_like_pdf(data) -> bool:
        """Return True when `data` carries the ``%PDF`` marker near its start."""
        return isinstance(data, (bytes, bytearray)) and b"%PDF" in bytes(data[:1024])

    @staticmethod
    def download_pdf_from_gdrive(gdrive_link: str):
        """Download the PDF behind a Drive share link.

        Returns:
            ``(pdf_bytes, "Success")`` on success, otherwise ``(None, message)``
            describing why the download failed.
        """
        try:
            download_link = GoogleDrivePDFProcessor.convert_gdrive_link(gdrive_link)
            if not download_link:
                return None, "Invalid Google Drive link format"

            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
            resp = requests.get(download_link, headers=headers, timeout=60)
            data = resp.content

            # Only a non-PDF body can be Drive's HTML confirmation page, so the
            # binary PDF is never decoded as text (the old `resp.text` access
            # ran charset detection over the whole file).
            if not GoogleDrivePDFProcessor._looks_like_pdf(data):
                page = (data or b"").decode("utf-8", errors="replace")
                if ("confirm=" in page) or ("virus scan warning" in page.lower()):
                    token = re.search(r"confirm=([^&]+)", page)
                    if token:
                        confirmed = f"{download_link}&confirm={token.group(1)}"
                        resp = requests.get(confirmed, headers=headers, timeout=60)
                        data = resp.content

            if resp.status_code != 200:
                return None, f"Download failed: HTTP {resp.status_code}"
            # An HTTP 200 can still be a sign-in or error page; do not hand
            # that to the PDF parser as if it were a document.
            if not GoogleDrivePDFProcessor._looks_like_pdf(data):
                return None, "Downloaded content is not a PDF (is the file shared publicly?)"
            return data, "Success"
        except Exception as e:
            return None, f"Download error: {e}"

    @staticmethod
    def extract_text_from_pdf(pdf_content: bytes):
        """Extract text page by page with PyPDF2.

        Returns:
            ``(text, page_count)``. Pages with text are prefixed by a
            ``--- Page N ---`` marker; pages that raise get an error marker.
            If the PDF cannot be opened at all, returns
            ``("PDF text extraction error: ...", 0)``.
        """
        try:
            reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
            pages = len(reader.pages)
            out = []
            for i in range(pages):
                try:
                    pg = reader.pages[i]
                    t = (pg.extract_text() or "").strip()
                    if t:
                        out.append(f"\n--- Page {i+1} ---\n{t}\n")
                except Exception:
                    out.append(f"\n--- Page {i+1} (Error extracting) ---\n")
            return "".join(out), pages
        except Exception as e:
            return f"PDF text extraction error: {e}", 0

# ----------------------------
# Knowledge Base / RAG
# ----------------------------
class AdvancedPakistaniAgriRAG:
    """FAISS-backed knowledge base for Pakistani agriculture (Urdu + English).

    On construction it indexes a small built-in seed corpus and then every PDF
    listed in ``PREDEFINED_PDF_LINKS``. ``processed_documents`` holds one
    record per configured PDF (successful or not) with the keys
    ``id``, ``pages``, ``chunks``, ``source`` and ``status``.
    """

    def __init__(self):
        self.embeddings = multilingual_embeddings
        self.vector_store = None
        self.gdrive = GoogleDrivePDFProcessor()
        self.processed_documents = []
        self._setup_seed_knowledge()
        self._auto_process_predefined_pdfs()

    @staticmethod
    def _make_splitter():
        """Build the text splitter shared by the seed corpus and the PDFs."""
        return RecursiveCharacterTextSplitter(
            chunk_size=800, chunk_overlap=100, separators=["\n\n", "\n", "۔", ".", ":", ";", " "], length_function=len
        )

    @staticmethod
    def _short_link(link: str) -> str:
        """Truncate a link to 50 characters (plus an ellipsis) for display."""
        return link[:50] + "..." if len(link) > 50 else link

    def _record_document(self, doc_id, link, pages, chunks, status):
        """Append one per-PDF outcome record to ``processed_documents``."""
        self.processed_documents.append(
            {"id": doc_id, "pages": pages, "chunks": chunks, "source": self._short_link(link), "status": status}
        )

    def _setup_seed_knowledge(self):
        """Index the built-in seed documents and create the vector store."""
        seed = [
            {
                "content": """Punjab Wheat Varieties for Export:
                اعلیٰ قسم کی گندم کی اقسام:
                - Anmol-91: Yield 45-50 maunds/acre, Export price $320-350/ton
                - Faisalabad-2008: High protein 12-14%, Premium export variety
                - Galaxy-2013: Disease resistant, Suitable for UAE market
                - Punjab-2011: Good for bread making, Export to Afghanistan
                Urdu: یہ اقسام برآمد کے لیے بہترین ہیں اور زیادہ قیمت ملتی ہے""",
                "metadata": {"type": "crop_varieties", "region": "Punjab", "crop": "wheat", "language": "mixed"},
            },
            {
                "content": """Rice Export Opportunities - چاول کی برآمدات:
                Basmati Varieties with International Prices:
                - Super Basmati: $900-1200/ton (UAE, Saudi Arabia)
                - Basmati 385: Premium grade, $1000-1300/ton
                - IRRI-6: $450-550/ton (Philippines, Malaysia)
                - Kainaat: $700-850/ton (Middle East markets)

                Export Requirements:
                - Moisture: Maximum 14%
                - Broken grains: Less than 5%
                - Length: Minimum 6.0mm for Basmati

                اردو میں: بسمتی چاول کی برآمد سب سے زیادہ منافع بخش ہے""",
                "metadata": {"type": "export_markets", "crop": "rice", "price_range": "450-1300", "language": "mixed"},
            },
            {
                "content": """Government Support Schemes - حکومتی اسکیمز:
                Kisan Card Program:
                - 25% subsidy on fertilizers
                - 20% discount on certified seeds
                - Easy loan access through banks

                Solar Tube Well Scheme:
                - 60% government subsidy
                - Remaining 40% through easy installments
                - Electricity bill savings: Rs. 50,000+ annually

                Crop Insurance Program:
                - Premium: 5% of sum insured
                - Government pays 75% of premium
                - Coverage: Natural disasters, pest attacks

                کسان ڈویلپمنٹ پروگرام سے مفت تربیت اور مشورے""",
                "metadata": {"type": "government_schemes", "schemes": "kisan_card,solar_tubewell,crop_insurance", "language": "mixed"},
            },
        ]

        docs = []
        for item in seed:
            content = normalize_mixed_text(item["content"])
            meta = dict(item.get("metadata") or {})
            # Merge any extracted prices/percentages/yields into the metadata.
            nums = extract_numerical_data(content)
            if nums:
                meta.update(nums)
            docs.append(Document(page_content=content, metadata=meta))

        pieces = self._make_splitter().split_documents(docs)
        self.vector_store = FAISS.from_documents(pieces, self.embeddings)
        print("✅ Seed agricultural knowledge initialized with", len(pieces), "chunks.")

    def _auto_process_predefined_pdfs(self):
        """Download, extract, chunk and index every PDF in PREDEFINED_PDF_LINKS.

        Every configured link ends up with exactly one record in
        ``processed_documents``. Previously only successes and raised
        exceptions were recorded, so download and extraction failures were
        missing from the statistics. PDFs that yield no chunks (for example
        scanned, image-only files) are skipped with an explicit status
        instead of being handed to the vector store as an empty batch.
        """
        if not PREDEFINED_PDF_LINKS:
            print("ℹ️ No predefined PDFs configured.")
            return

        print(f"🚀 Auto-processing {len(PREDEFINED_PDF_LINKS)} Google Drive PDF(s)...")
        splitter = self._make_splitter()
        ok = 0
        for i, link in enumerate(PREDEFINED_PDF_LINKS, start=1):
            try:
                blob, msg = self.gdrive.download_pdf_from_gdrive(link)
                if blob is None:
                    self._record_document(i, link, 0, 0, f"❌ Error: {msg}")
                    print(f"❌ Doc {i}: {msg}")
                    continue

                text, pages = self.gdrive.extract_text_from_pdf(blob)
                # extract_text_from_pdf reports a failure as an error string.
                if "pdf text extraction error" in safe_str(text).lower():
                    self._record_document(i, link, 0, 0, f"❌ Error: {text}")
                    print(f"❌ Doc {i}: {text}")
                    continue

                if len(safe_str(text).strip()) < 100:
                    print(f"⚠️ Doc {i}: likely image-based or encrypted; minimal text.")

                processed = normalize_mixed_text(text)
                numbers = extract_numerical_data(processed)

                doc = Document(
                    page_content=processed,
                    metadata={
                        "type": "auto_processed_pdf",
                        "source": f"Auto PDF {i}",
                        "pages": pages,
                        "numerical_data": numbers,
                        "processing_date": datetime.now().strftime("%Y-%m-%d %H:%M"),
                        "original_link": self._short_link(link),
                    },
                )

                chunks = splitter.split_documents([doc])
                if not chunks:
                    self._record_document(i, link, pages, 0, "⚠️ Skipped: no extractable text")
                    print(f"⚠️ Doc {i}: no extractable text; nothing indexed.")
                    continue

                if self.vector_store:
                    self.vector_store.add_documents(chunks)
                else:
                    self.vector_store = FAISS.from_documents(chunks, self.embeddings)

                self._record_document(i, link, pages, len(chunks), "✅ Success")
                print(f"✅ Doc {i}: {pages} pages → {len(chunks)} chunks")
                ok += 1
            except Exception as e:
                self._record_document(i, link, 0, 0, f"❌ Error: {e}")
                print(f"❌ Doc {i}: {e}")

        print(f"🎉 Finished: {ok}/{len(PREDEFINED_PDF_LINKS)} document(s) processed.")

    def get_stats_html(self) -> str:
        """Return a plain-text summary of what the knowledge base contains."""
        if not self.processed_documents:
            return "📊 Knowledge Base: Seed Pakistani agricultural data only (no PDFs yet)"
        total_chunks = sum(d.get("chunks", 0) for d in self.processed_documents)
        total_pages = sum(d.get("pages", 0) for d in self.processed_documents)
        return f"""📊 Knowledge Base Statistics:

🗂️ Auto-processed Documents: {len(self.processed_documents)}
📄 Total Pages Processed: {total_pages}
🧩 Total Text Chunks: {total_chunks}
📚 Seed Knowledge: Pakistani agriculture (Urdu + English)
🔍 Search Capability: Multilingual (English + Urdu)
✅ Status: Ready for queries
"""

    def get_relevant_info(self, query: str, k: int = 4) -> str:
        """Retrieve the `k` most similar chunks and format them as LLM context.

        Returns:
            The numbered chunk texts, preceded by a summary of any prices,
            percentages or yields found in their metadata; or a short message
            when the store is missing, nothing matches, or retrieval fails.
        """
        if not self.vector_store:
            return "Knowledge base not available"
        try:
            q = safe_str(query)
            hits = self.vector_store.similarity_search(q, k=k)

            context = ""
            nums_summary = []

            for i, doc in enumerate(hits, start=1):
                context += f"معلومات {i}: {doc.page_content}\n\n"

                meta = doc.metadata or {}
                if not isinstance(meta, dict):
                    meta = {}

                # PDF chunks nest the numbers under "numerical_data"; seed
                # chunks carry them at the top level. Flatten both layouts.
                nd = meta.get("numerical_data")
                if isinstance(nd, dict):
                    meta = {**meta, **nd}

                if isinstance(meta.get("prices"), list) and meta["prices"]:
                    nums_summary.append(f"💰 قیمتیں: {', '.join(map(safe_str, meta['prices']))}")
                if isinstance(meta.get("percentages"), list) and meta["percentages"]:
                    nums_summary.append(f"📊 فیصد: {', '.join(map(safe_str, meta['percentages']))}%")
                if isinstance(meta.get("yields"), list) and meta["yields"]:
                    y_fmt = []
                    for y in meta["yields"]:
                        try:
                            val, unit, per = y
                            y_fmt.append(f"{val} {unit} per {per}")
                        except Exception:
                            # Not a (value, unit, area) triple; show it as-is.
                            y_fmt.append(safe_str(y))
                    nums_summary.append(f"🌾 پیداوار: {', '.join(y_fmt)}")

            if nums_summary:
                context = "📈 اہم اعداد و شمار:\n" + "\n".join(nums_summary) + "\n\n" + context
            return context or "No relevant information found."
        except Exception as e:
            return f"Error retrieving information: {e}"

# Initialize RAG
# Constructing the object indexes the seed corpus and then processes every
# link in PREDEFINED_PDF_LINKS (both are called from __init__).
print("🧠 Initializing Advanced Pakistani Agricultural Knowledge Base...")
pak_agri_rag = AdvancedPakistaniAgriRAG()

# ----------------------------
# Voice, Weather, AI
# ----------------------------
def voice_to_text(audio_file_path):
    """Transcribe an audio file with Whisper, forcing Urdu as the language.

    Returns:
        The normalized transcript, ``""`` when no path is given, or an Urdu
        error message (including the exception text) if transcription fails.
    """
    if audio_file_path:
        try:
            transcription = whisper_model.transcribe(audio_file_path, language="ur")
            raw_text = transcription.get("text", "")
            return normalize_mixed_text(raw_text)
        except Exception as e:
            return f"آواز سمجھ نہیں آئی: {e}"
    return ""

def _farming_advice(temp, humidity, wind_speed) -> str:
    """Pick one Urdu advisory for the readings; the first matching rule wins."""
    if temp > 35:
        return f"⚠️ زیادہ گرمی ({temp}°C): صبح 6-8 بجے پانی دیں، دوپہر میں نہیں۔ پانی کی مقدار 20% بڑھائیں۔"
    if humidity > 80:
        return f"🌧️ زیادہ نمی ({humidity}%): فنگیسائیڈ سپرے کریں۔ Mancozeb 2g/لیٹر یا Copper Oxychloride 3g/لیٹر۔"
    if temp < 10:
        return f"❄️ سردی ({temp}°C): پودوں کو ڈھانپیں، پانی 50% کم دیں۔ Frost protection ضروری۔"
    if wind_speed > 5:
        return f"💨 تیز ہوا ({wind_speed} m/s): کیڑے مار دوا کا سپرے نہ کریں۔ Wind barriers لگائیں۔"
    return f"✅ موسم اچھا ہے ({temp}°C, {humidity}% نمی): کھیتی کے کام کر سکتے ہیں۔"


def get_weather_with_farming_advice(city="Lahore"):
    """Fetch current weather for a Pakistani city and attach farming advice.

    Args:
        city: City name; blank or ``None`` falls back to "Lahore".

    Returns:
        An Urdu summary line followed by one advisory, or an Urdu error
        message when the request fails, the response is not JSON, or the
        temperature/humidity/wind readings are missing.
    """
    try:
        city = safe_str(city).strip() or "Lahore"
        # Pass the query through `params` so city names containing spaces or
        # reserved characters are URL-encoded, and use HTTPS so the API key is
        # not sent in clear text.
        resp = requests.get(
            "https://api.openweathermap.org/data/2.5/weather",
            params={"q": f"{city},PK", "appid": WEATHER_API_KEY, "units": "metric"},
            timeout=20,
        )
        try:
            data = resp.json()
        except Exception:
            # The first letter used to be Devanagari "म"; it is now Urdu "م".
            return "موسمی JSON درست نہیں۔"

        main = data.get("main") or {}
        wind = data.get("wind") or {}
        weather_l = data.get("weather") or [{}]

        temp = main.get("temp")
        humidity = main.get("humidity")
        wind_speed = wind.get("speed")
        description = weather_l[0].get("description", "")

        # API error payloads (e.g. unknown city) carry none of these readings.
        if any(v is None for v in (temp, humidity, wind_speed)):
            return "موسمی معلومات مکمل نہیں مل سکیں۔"

        advice = _farming_advice(temp, humidity, wind_speed)
        return f"آج {city} میں {temp}°C، نمی {humidity}%، ہوا {wind_speed} m/s، موسم {description}\n\n{advice}"
    except Exception as e:
        return f"موسمی معلومات نہیں مل سکیں: {e}"

def text_to_voice(text):
    """Synthesize Urdu speech for `text` and return the path of the MP3 file.

    Text longer than 500 characters is truncated and suffixed with a note
    telling the listener to read the full answer.

    Returns:
        Path to a temporary ``.mp3`` file, or ``None`` when there is nothing
        to speak or synthesis fails. The caller owns the file; it is not
        deleted here.
    """
    try:
        clean = normalize_mixed_text(text)
        # Nothing to speak: skip the TTS call instead of relying on it to fail.
        if not clean.strip():
            return None
        if len(clean) > 500:
            clean = clean[:500] + "... مکمل جواب اوپر پڑھیں"
        tts = gTTS(text=clean, lang="ur", slow=False)
        # Reserve a unique path, then close the handle before gTTS writes to
        # it; saving while the handle is still open fails on platforms that
        # lock open files.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            mp3_path = tmp.name
        tts.save(mp3_path)
        return mp3_path
    except Exception as e:
        print(f"TTS Error: {e}")
        return None

def get_enhanced_ai_response(user_message: str, location: str = "") -> str:
    """Ask the Groq chat model for advice, grounded in retrieved context.

    The system prompt embeds the knowledge-base context for `user_message`
    and the farmer's `location`; the user message is sent as the user turn.

    Returns:
        The model's reply, or an Urdu apology containing the exception text
        if the Groq call fails.
    """
    kb_context = pak_agri_rag.get_relevant_info(user_message)
    farmer_region = safe_str(location)
    system_prompt = f"""
آپ "زمین دوست" ہیں - پاکستانی کسانوں کے ماہر مشیر۔

آپ کے پاس پاکستانی زراعت کی معلومات (English اور Urdu میں) ہیں:
{kb_context}

کسان کا علاقہ: {farmer_region}

آپ کا کام:
1) پاکستانی حالات کے مطابق مشورہ دینا
2) برآمدی فصلوں کی تجویز دینا (numerical data کے ساتھ)
3) مقامی اقسام اور قیمتوں کا ذکر کرنا
4) حکومتی اسکیموں کی معلومات دینا
5) نقصان سے بچاؤ کے طریقے بتانا
6) اعداد و شمار استعمال کرنا (prices, yields, percentages)

Guidelines:
- ہمیشہ "بھائی" کہہ کر شروع کریں
- آسان اردو استعمال کریں
- Numbers اور prices ضرور بتائیں
- Export opportunities highlight کریں
- Government schemes mention کریں
""".strip()

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": safe_str(user_message)},
    ]
    try:
        completion = groq_client.chat.completions.create(
            messages=conversation,
            model="llama-3.1-8b-instant",
            max_tokens=1200,
            temperature=0.7,
        )
    except Exception as e:
        return f"معذرت، AI سے رابطہ نہیں ہو سکا: {e}"
    return completion.choices[0].message.content

# ----------------------------
# Main chat handler (robust)
# ----------------------------
def zameen_dost_advanced_chat(audio_input, text_input, city_name, focus_area):
    """Handle one question from the UI and produce the three output values.

    Audio input takes precedence over typed text. The question is enriched
    with the selected focus area and, when it mentions a weather/irrigation
    keyword, with current weather advice for the city.

    Args:
        audio_input: Path of the recorded audio file, or a falsy value.
        text_input: Typed question, or a falsy value.
        city_name: City used for the weather lookup and the prompt
            (falls back to "Lahore" for the weather lookup).
        focus_area: Selected interest area; "عام سوال" means no specific focus.

    Returns:
        Tuple ``(input_display, voice_response, ai_response)``: the echoed
        question, the path of the spoken reply (or ``None``), and the reply text.
    """
    user_message = ""
    input_display = ""

    if audio_input:
        user_message = voice_to_text(audio_input)
        input_display = f"💬 آپ نے کہا: {user_message}"
    elif text_input:
        user_message = safe_str(text_input)
        input_display = f"💬 آپ نے لکھا: {user_message}"

    if not isinstance(user_message, str) or not user_message.strip():
        return "کرپیا کوئی سوال پوچھیں", None, "❌ کوئی سوال نہیں ملا"

    enhanced = user_message
    if focus_area and safe_str(focus_area) != "عام سوال":
        enhanced += f" (کسان کی دلچسپی: {focus_area})"

    terms = ["موسم", "بارش", "پانی", "weather", "irrigation", "spray", "سپرے"]
    # Compare in lower case so "Weather" / "SPRAY" also trigger the lookup;
    # lower() leaves the Urdu keywords unchanged.
    message_lower = user_message.lower()
    if any(t in message_lower for t in terms):
        weather_info = get_weather_with_farming_advice(city_name or "Lahore")
        enhanced += f"\n\nموسمی حالات: {weather_info}"

    ai_response = get_enhanced_ai_response(enhanced, city_name or "")
    voice_response = text_to_voice(ai_response)
    return input_display, voice_response, ai_response

# ----------------------------
# UI
# ----------------------------
# Build the Gradio interface: a header banner, an input column (microphone,
# text box, city, focus area), an output column, and a knowledge-base stats box.
with gr.Blocks(
    title="Smart Zameen Dost - زمین دوست",
    theme=gr.themes.Base(),
    css="""
    .gradio-container { background: linear-gradient(135deg, #f8fdff 0%, #e8f7f8 100%); font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
    .header-box { background: white; padding: 20px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); margin: 10px 0; border-left: 4px solid #2E8B57; }
    .stats-box { background: linear-gradient(45deg, #e8f5e8, #f0f8e8); padding: 15px; border-radius: 8px; border: 1px solid #c8e6c9; margin: 10px 0; font-size: 0.9em; }
    """
) as app:
    # Page banner (styled by the .header-box rule above)
    gr.HTML("""
        <div class='header-box'>
          <div style='text-align: center;'>
            <h1 style='color: #2E8B57; font-size: 2.2em; margin: 0 0 8px 0;'>🌾 Smart Zameen Dost</h1>
            <p style='color: #666; font-size: 1.1em; margin: 0;'>پاکستانی کسانوں کا ذہین مشیر</p>
          </div>
        </div>
    """)

    with gr.Row():
        # Left column: question inputs
        with gr.Column(scale=1):
            gr.Markdown("### 🎤 اپنا سوال پوچھیں")
            # type="filepath": the handler receives the recording as a file path
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="آواز میں پوچھیں")
            text_input = gr.Textbox(label="یا یہاں لکھیں (اردو/English)", placeholder="مثال: کون سی فصل زیادہ منافع دے گی؟", lines=2)
            with gr.Row():
                city_input = gr.Textbox(label="آپ کا شہر", placeholder="Lahore, Karachi, Faisalabad", value="Lahore", scale=1)
                # The default "عام سوال" is the value the chat handler treats as "no specific focus"
                focus_area = gr.Dropdown(
                    label="دلچسپی کا شعبہ",
                    choices=["عام سوال","برآمدی فصلیں","گندم کی کاشت","چاول کی کاشت","کپاس کی کاشت","سبزیوں کی کاشت","پھلوں کی کاشت","کھاد اور بیج","بیماریوں کا علاج","حکومتی اسکیمز","منڈی کی قیمتیں"],
                    value="عام سوال",
                    scale=1,
                )
            chat_btn = gr.Button("🚀 جواب حاصل کریں", variant="primary", size="lg")

        # Right column: echoed question, spoken answer, written answer
        with gr.Column(scale=1):
            gr.Markdown("### 🧠 ذہین جواب")
            input_display = gr.Textbox(label="آپ کا سوال", lines=2, interactive=False)
            audio_output = gr.Audio(label="🔊 آواز میں جواب")
            text_output = gr.Textbox(label="📝 تفصیلی جواب", lines=10, interactive=False, show_copy_button=True)

    with gr.Row():
        # Rendered once when the UI is built, from the stats of the already-initialized knowledge base
        kb_stats = gr.HTML(value=pak_agri_rag.get_stats_html(), elem_classes=["stats-box"])

    # The handler returns (input_display, voice path, reply text), in the same order as `outputs`
    chat_btn.click(
        zameen_dost_advanced_chat,
        inputs=[audio_input, text_input, city_input, focus_area],
        outputs=[input_display, audio_output, text_output],
    )

print("🎉 App ready!")
# NOTE: this reports the number of configured links, not how many were processed successfully
print(f"✅ Auto-processed {len(PREDEFINED_PDF_LINKS)} Google Drive PDF link(s)")
print("🔍 Multilingual RAG + Voice + Weather integrated")

# Presumably shuts down Gradio servers left over from earlier runs of this cell — confirm against Gradio docs
gr.close_all()
# share=True publishes a temporary public *.gradio.live URL; debug=True keeps the cell running to show logs
app.launch(share=True, debug=True, show_api=False)


🤖 Loading Whisper model...
✅ Whisper model loaded.
🔤 Loading multilingual sentence embeddings...
✅ Multilingual embeddings loaded.
🧠 Initializing Advanced Pakistani Agricultural Knowledge Base...
✅ Seed agricultural knowledge initialized with 3 chunks.
🚀 Auto-processing 1 Google Drive PDF(s)...
✅ Doc 1: 322 pages → 1516 chunks
🎉 Finished: 1/1 document(s) processed.
🎉 App ready!
✅ Auto-processed 1 Google Drive PDF link(s)
🔍 Multilingual RAG + Voice + Weather integrated
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://aff99ca7b6b3e43a3c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://aff99ca7b6b3e43a3c.gradio.live




In [None]:
# app.py — Smart Zameen Dost (trimmed context to avoid 413/TPM)
# --------------------------------------------------------------
# - Uses langchain_core.documents.Document (LangChain 0.2.x)
# - Caps retrieval + context length to stay under Groq limits
# - Robust membership tests, metadata handling
# - Safe Google Drive PDF download + PyPDF2 extraction

import os
import io
import re
import json
import tempfile
from datetime import datetime

import gradio as gr
import requests
import nltk
import PyPDF2
from gtts import gTTS
import whisper
import torch  # noqa: F401

from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# =========================
# 🔑 Keys (⚠️ hard-code only if you accept the risk)
# =========================
GROQ_API_KEY = "YOUR_GROQ_KEY_HERE"            # paste locally
WEATHER_API_KEY = "YOUR_OPENWEATHER_KEY_HERE"  # paste locally

# A key pasted above wins. An untouched placeholder falls back to the
# environment, so a key that is already exported is not overwritten by it.
if GROQ_API_KEY.strip() in ("", "YOUR_GROQ_KEY_HERE"):
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if WEATHER_API_KEY.strip() in ("", "YOUR_OPENWEATHER_KEY_HERE"):
    WEATHER_API_KEY = os.environ.get("WEATHER_API_KEY", "")
GROQ_API_KEY = GROQ_API_KEY.strip()
WEATHER_API_KEY = WEATHER_API_KEY.strip()

# Fail fast, before the slow model loads. The environment may still hold the
# placeholder from an earlier run of this cell, so it counts as "not set" too.
if not GROQ_API_KEY or GROQ_API_KEY == "YOUR_GROQ_KEY_HERE":
    raise RuntimeError("GROQ_API_KEY not set.")
if not WEATHER_API_KEY or WEATHER_API_KEY == "YOUR_OPENWEATHER_KEY_HERE":
    raise RuntimeError("WEATHER_API_KEY not set.")

os.environ["GROQ_API_KEY"] = GROQ_API_KEY
os.environ["WEATHER_API_KEY"] = WEATHER_API_KEY

groq_client = Groq(api_key=GROQ_API_KEY)

# =========================
# 🔧 Global limits to keep prompt small
# =========================
K_RETRIEVE = 3               # fewer chunks retrieved
PER_DOC_CHARS = 700          # per-chunk char cap (~175 tokens)
MAX_CONTEXT_CHARS = 4000     # total context cap
MAX_OUTPUT_TOKENS = 512      # shorter generations

def _limit_chars(s: str, n: int) -> str:
    s = str(s or "")
    return s if len(s) <= n else (s[:n] + " …")

def _clip_context(snippets, max_chars: int) -> str:
    out, used = [], 0
    for snip in snippets:
        snip = str(snip or "")
        if used + len(snip) > max_chars:
            snip = snip[: max(0, max_chars - used)]
        if snip:
            out.append(snip)
            used += len(snip)
        if used >= max_chars:
            break
    return "\n\n".join(out)

# =========================
# 🔇 NLTK (quiet)
# =========================
# Best effort: a missing NLTK resource must not stop the app, but report it
# instead of swallowing the failure, and try each resource independently.
for _nltk_resource in ("punkt", "stopwords"):
    try:
        if not nltk.download(_nltk_resource, quiet=True):
            print(f"⚠️ NLTK resource '{_nltk_resource}' could not be downloaded; continuing without it.")
    except Exception as exc:
        print(f"⚠️ NLTK download of '{_nltk_resource}' failed: {exc}")

# =========================
# 🤖 Models / Embeddings
# =========================
print("🤖 Loading Whisper model...")
whisper_model = whisper.load_model("base")
print("✅ Whisper model loaded.")

print("🔤 Loading multilingual sentence embeddings...")
multilingual_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
print("✅ Multilingual embeddings loaded.")

# =========================
# 📄 Predefined Google Drive PDFs
# =========================
# Share links ingested into the knowledge base when it is constructed
PREDEFINED_PDF_LINKS = [
    "https://drive.google.com/file/d/1H7b-1PG2SLB99gjogfSl7QTOmLd1iGX0/view?usp=sharing",

]

# =========================
# 🧰 Helpers
# =========================
def safe_str(x) -> str:
    """Return ``str(x)``, mapping ``None`` to the empty string."""
    return "" if x is None else str(x)

def normalize_mixed_text(text: str) -> str:
    """Clean mixed Urdu/English text down to letters, digits and basic marks.

    Every character outside the Arabic-script blocks, word characters,
    whitespace and ``. , ; : ! ? ( ) - $ % /`` is replaced by a space, then
    runs of whitespace are collapsed to one space and the ends are trimmed.

    ``$``, ``%`` and ``/`` are kept on purpose: they carry the meaning of
    prices, percentages and "per" units ("$320/ton", "25%"), and the numeric
    extraction that runs on normalized text needs them.

    Args:
        text: Any value; ``None`` is treated as empty text.

    Returns:
        The cleaned single-line string.
    """
    cleaned = re.sub(
        r"[^\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF\w\s.,;:!?()\-$%/]",
        " ",
        safe_str(text),
    )
    # Collapse AFTER substitution so removed symbols do not leave double spaces
    return re.sub(r"\s+", " ", cleaned).strip()

def extract_numerical_data(text: str):
    """Pull prices, percentages and yield figures out of free text.

    Args:
        text: Any value; it is converted with ``safe_str`` first.

    Returns:
        A dict that contains only the keys for which something was found:
        ``"prices"`` (number strings that follow ``$`` or ``Rs``/``Rs.``),
        ``"percentages"`` (number strings that precede ``%``) and
        ``"yields"`` (``(value, unit, area)`` tuples).
    """
    t = safe_str(text)
    info = {}
    # The currency marker is "$" or the whole token "Rs"/"Rs.". A character
    # class such as [$Rs.] would accept any single ".", "R" or "s" before
    # digits and report e.g. the "0" of "6.0mm" as a price.
    prices = re.findall(r"(?:\$|\bRs\.?)\s*(\d+(?:,\d{3})*(?:\.\d+)?)", t)
    if prices:
        info["prices"] = prices
    perc = re.findall(r"(\d+(?:\.\d+)?)\s*%", t)
    if perc:
        info["percentages"] = perc
    yields = re.findall(
        r"(\d+(?:\.\d+)?)\s*(tons?|kg|quintals?|maunds?)\s*(?:per|/)?\s*(acre|hectare|ایکڑ)",
        t,
        re.IGNORECASE,
    )
    if yields:
        info["yields"] = yields
    return info

# =========================
# 📥 Google Drive PDF Processor
# =========================
class GoogleDrivePDFProcessor:
    """Download a shared Google Drive PDF and extract its text with PyPDF2."""

    @staticmethod
    def convert_gdrive_link(share_link: str):
        """Turn a Drive share link into a direct-download URL.

        Returns:
            The ``uc?export=download`` URL, or ``None`` when no file id can
            be found in the link.
        """
        patterns = [r"/file/d/([a-zA-Z0-9\-_]+)", r"id=([a-zA-Z0-9\-_]+)", r"/d/([a-zA-Z0-9\-_]+)"]
        file_id = None
        link = safe_str(share_link)
        for pat in patterns:
            m = re.search(pat, link)
            if m:
                file_id = m.group(1)
                break
        if not file_id:
            return None
        return f"https://drive.google.com/uc?export=download&id={file_id}"

    @staticmethod
    def download_pdf_from_gdrive(gdrive_link: str):
        """Download the file behind a Drive share link.

        Returns:
            ``(content_bytes, "Success")`` on success, otherwise
            ``(None, message)`` describing what went wrong. Never raises.
        """
        try:
            download_link = GoogleDrivePDFProcessor.convert_gdrive_link(gdrive_link)
            if not download_link:
                return None, "Invalid Google Drive link format"

            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
            resp = requests.get(download_link, headers=headers, stream=True, timeout=60)

            # Only decode the body as text when it is not already a PDF:
            # running charset detection over a large binary file is slow and
            # the confirmation page we look for is HTML anyway.
            if not resp.content.startswith(b"%PDF"):
                txt = safe_str(resp.text)
                if ("confirm=" in txt) or ("virus scan warning" in txt.lower()):
                    token = re.search(r"confirm=([^&]+)", txt)
                    if token:
                        confirmed = f"{download_link}&confirm={token.group(1)}"
                        resp = requests.get(confirmed, headers=headers, stream=True, timeout=60)

            if resp.status_code != 200:
                return None, f"Download failed: HTTP {resp.status_code}"
            # An HTML page with status 200 (sign-in, quota, warning page) is
            # not the document; report it here instead of as a parse error later.
            if "text/html" in resp.headers.get("Content-Type", "").lower():
                return None, "Download failed: Google Drive returned an HTML page instead of a PDF (check sharing permissions)"
            return resp.content, "Success"
        except Exception as e:
            return None, f"Download error: {e}"

    @staticmethod
    def extract_text_from_pdf(pdf_content: bytes):
        """Extract text page by page.

        Returns:
            ``(text, page_count)``. Pages with text are wrapped in
            ``--- Page N ---`` markers. When the PDF cannot be opened the
            text is a "PDF text extraction error: ..." message and the page
            count is 0.
        """
        try:
            reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
            pages = len(reader.pages)
            out = []
            for i in range(pages):
                # A single bad page must not abort the whole document
                try:
                    pg = reader.pages[i]
                    t = (pg.extract_text() or "").strip()
                    if t:
                        out.append(f"\n--- Page {i+1} ---\n{t}\n")
                except Exception:
                    out.append(f"\n--- Page {i+1} (Error extracting) ---\n")
            return "".join(out), pages
        except Exception as e:
            return f"PDF text extraction error: {e}", 0

# =========================
# 🧠 Knowledge Base / RAG
# =========================
class AdvancedPakistaniAgriRAG:
    """FAISS-backed knowledge base of Pakistani agriculture text (Urdu + English).

    On construction the index is seeded with three built-in documents and then
    extended with the PDFs listed in ``PREDEFINED_PDF_LINKS``.
    """

    def __init__(self):
        """Build the seed index, then ingest the predefined PDFs."""
        self.embeddings = multilingual_embeddings
        self.vector_store = None
        self.gdrive = GoogleDrivePDFProcessor()
        # One status dict per PDF that reached the indexing stage (or raised)
        self.processed_documents = []
        self._setup_seed_knowledge()
        self._auto_process_predefined_pdfs()

    def _setup_seed_knowledge(self):
        """Create the vector store from the built-in seed documents."""
        seed = [
            {
                "content": """Punjab Wheat Varieties for Export:
                اعلیٰ قسم کی گندم کی اقسام:
                - Anmol-91: Yield 45-50 maunds/acre, Export price $320-350/ton
                - Faisalabad-2008: High protein 12-14%, Premium export variety
                - Galaxy-2013: Disease resistant, Suitable for UAE market
                - Punjab-2011: Good for bread making, Export to Afghanistan
                Urdu: یہ اقسام برآمد کے لیے بہترین ہیں اور زیادہ قیمت ملتی ہے""",
                "metadata": {"type": "crop_varieties", "region": "Punjab", "crop": "wheat", "language": "mixed"},
            },
            {
                "content": """Rice Export Opportunities - چاول کی برآمدات:
                Basmati Varieties with International Prices:
                - Super Basmati: $900-1200/ton (UAE, Saudi Arabia)
                - Basmati 385: Premium grade, $1000-1300/ton
                - IRRI-6: $450-550/ton (Philippines, Malaysia)
                - Kainaat: $700-850/ton (Middle East markets)

                Export Requirements:
                - Moisture: Maximum 14%
                - Broken grains: Less than 5%
                - Length: Minimum 6.0mm for Basmati

                اردو میں: بسمتی چاول کی برآمد سب سے زیادہ منافع بخش ہے""",
                "metadata": {"type": "export_markets", "crop": "rice", "price_range": "450-1300", "language": "mixed"},
            },
            {
                "content": """Government Support Schemes - حکومتی اسکیمز:
                Kisan Card Program:
                - 25% subsidy on fertilizers
                - 20% discount on certified seeds
                - Easy loan access through banks

                Solar Tube Well Scheme:
                - 60% government subsidy
                - Remaining 40% through easy installments
                - Electricity bill savings: Rs. 50,000+ annually

                Crop Insurance Program:
                - Premium: 5% of sum insured
                - Government pays 75% of premium
                - Coverage: Natural disasters, pest attacks

                کسان ڈویلپمنٹ پروگرام سے مفت تربیت اور مشورے""",
                "metadata": {"type": "government_schemes", "schemes": "kisan_card,solar_tubewell,crop_insurance", "language": "mixed"},
            },
        ]

        docs = []
        for item in seed:
            content = normalize_mixed_text(item["content"])
            meta = dict(item.get("metadata") or {})
            # Numbers found in the text are stored as top-level metadata keys
            nums = extract_numerical_data(content)
            if nums:
                meta.update(nums)
            docs.append(Document(page_content=content, metadata=meta))

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=800, chunk_overlap=100, separators=["\n\n", "\n", "۔", ".", ":", ";", " "], length_function=len
        )
        pieces = splitter.split_documents(docs)
        self.vector_store = FAISS.from_documents(pieces, self.embeddings)
        print("✅ Seed agricultural knowledge initialized with", len(pieces), "chunks.")

    def _auto_process_predefined_pdfs(self):
        """Download, extract, chunk and index every link in ``PREDEFINED_PDF_LINKS``.

        Failures are printed and skipped; an unexpected exception is also
        recorded in ``self.processed_documents``.
        """
        if not PREDEFINED_PDF_LINKS:
            print("ℹ️ No predefined PDFs configured.")
            return

        print(f"🚀 Auto-processing {len(PREDEFINED_PDF_LINKS)} Google Drive PDF(s)...")
        ok = 0
        for i, link in enumerate(PREDEFINED_PDF_LINKS, start=1):
            try:
                blob, msg = self.gdrive.download_pdf_from_gdrive(link)
                if blob is None:
                    print(f"❌ Doc {i}: {msg}")
                    continue

                text, pages = self.gdrive.extract_text_from_pdf(blob)
                # The extractor reports failure in-band as an error message
                if "pdf text extraction error" in safe_str(text).lower():
                    print(f"❌ Doc {i}: {text}")
                    continue

                if len(safe_str(text).strip()) < 100:
                    print(f"⚠️ Doc {i}: likely image-based or encrypted; minimal text.")

                processed = normalize_mixed_text(text)

                doc = Document(
                    page_content=processed,
                    metadata={
                        "type": "auto_processed_pdf",
                        "source": f"Auto PDF {i}",
                        "pages": pages,
                        "processing_date": datetime.now().strftime("%Y-%m-%d %H:%M"),
                        "original_link": link[:50] + "..." if len(link) > 50 else link,
                    },
                )

                splitter = RecursiveCharacterTextSplitter(
                    chunk_size=800, chunk_overlap=100, separators=["\n\n", "\n", "۔", ".", ":", ";", " "]
                )
                chunks = splitter.split_documents([doc])

                # Attach numbers per chunk. Storing the whole document's
                # numbers on the parent would copy them onto every chunk, and
                # the numeric summary built at query time could then fill the
                # context budget and push the retrieved text out.
                for chunk in chunks:
                    chunk_numbers = extract_numerical_data(chunk.page_content)
                    if chunk_numbers:
                        chunk.metadata["numerical_data"] = chunk_numbers

                if self.vector_store:
                    self.vector_store.add_documents(chunks)
                else:
                    self.vector_store = FAISS.from_documents(chunks, self.embeddings)

                self.processed_documents.append(
                    {"id": i, "pages": pages, "chunks": len(chunks), "source": doc.metadata["original_link"], "status": "✅ Success"}
                )
                print(f"✅ Doc {i}: {pages} pages → {len(chunks)} chunks")
                ok += 1
            except Exception as e:
                self.processed_documents.append(
                    {"id": i, "pages": 0, "chunks": 0, "source": link[:50] + "..." if len(link) > 50 else link, "status": f"❌ Error: {e}"}
                )
                print(f"❌ Doc {i}: {e}")

        print(f"🎉 Finished: {ok}/{len(PREDEFINED_PDF_LINKS)} document(s) processed.")

    def get_stats_html(self) -> str:
        """Return a short text summary of the processed PDFs for the UI."""
        if not self.processed_documents:
            return "📊 Knowledge Base: Seed Pakistani agricultural data only (no PDFs yet)"
        total_chunks = sum(d.get("chunks", 0) for d in self.processed_documents)
        total_pages = sum(d.get("pages", 0) for d in self.processed_documents)
        return f"""📊 Knowledge Base Statistics:

🗂️ Auto-processed Documents: {len(self.processed_documents)}
📄 Total Pages Processed: {total_pages}
🧩 Total Text Chunks: {total_chunks}
📚 Seed Knowledge: Pakistani agriculture (Urdu + English)
🔍 Search Capability: Multilingual (English + Urdu)
✅ Status: Ready for queries
"""

    def get_relevant_info(self, query: str, k: int = K_RETRIEVE) -> str:
        """Retrieve the ``k`` most similar chunks and format them as prompt context.

        Each hit is capped at ``PER_DOC_CHARS``; numbers found in the hits'
        metadata are summarised first; the result is clipped to
        ``MAX_CONTEXT_CHARS``.

        Returns:
            The context string, or a plain message when the store is missing,
            nothing is found, or retrieval raises.
        """
        if not self.vector_store:
            return "Knowledge base not available"
        try:
            q = safe_str(query)
            hits = self.vector_store.similarity_search(q, k=k)

            snippets = []
            nums_summary = []

            for i, doc in enumerate(hits, start=1):
                body = _limit_chars(doc.page_content, PER_DOC_CHARS)
                snippets.append(f"معلومات {i}: {body}")

                meta = doc.metadata or {}
                if not isinstance(meta, dict):
                    meta = {}
                # PDF chunks nest their numbers under "numerical_data"; seed
                # chunks keep them at the top level. Flatten both into one view.
                nd = meta.get("numerical_data")
                if isinstance(nd, dict):
                    meta = {**meta, **nd}

                if isinstance(meta.get("prices"), list) and meta["prices"]:
                    nums_summary.append(f"💰 قیمتیں: {', '.join(map(safe_str, meta['prices']))}")
                if isinstance(meta.get("percentages"), list) and meta["percentages"]:
                    nums_summary.append(f"📊 فیصد: {', '.join(map(safe_str, meta['percentages']))}%")
                if isinstance(meta.get("yields"), list) and meta["yields"]:
                    y_fmt = []
                    for y in meta["yields"]:
                        try:
                            val, unit, per = y
                            y_fmt.append(f"{val} {unit} per {per}")
                        except Exception:
                            # Entry is not a 3-item sequence; show it verbatim
                            y_fmt.append(safe_str(y))
                    nums_summary.append(f"🌾 پیداوار: {', '.join(y_fmt)}")

            context = "\n\n".join(snippets)
            if nums_summary:
                context = "📈 اہم اعداد و شمار:\n" + "\n".join(nums_summary) + "\n\n" + context

            return _clip_context([context], MAX_CONTEXT_CHARS) or "No relevant information found."
        except Exception as e:
            return f"Error retrieving information: {e}"

# =========================
# 🚀 Initialize RAG
# =========================
# Build the shared knowledge base once for the whole cell. The constructor
# seeds the index and then downloads and indexes the predefined PDFs, so this
# line does network and embedding work.
print("🧠 Initializing Advanced Pakistani Agricultural Knowledge Base...")
pak_agri_rag = AdvancedPakistaniAgriRAG()

# =========================
# 🎙️ Voice, 🌦️ Weather, 🤝 AI
# =========================
def voice_to_text(audio_file_path):
    """Transcribe an Urdu recording with the loaded Whisper model.

    Args:
        audio_file_path: Path of the audio file, or a falsy value.

    Returns:
        The normalized transcript; ``""`` when no path is given; an Urdu
        error message containing the exception text when transcription fails.
    """
    if audio_file_path:
        try:
            transcript = whisper_model.transcribe(audio_file_path, language="ur")
            return normalize_mixed_text(transcript.get("text", ""))
        except Exception as err:
            return f"آواز سمجھ نہیں آئی: {err}"
    return ""

def get_weather_with_farming_advice(city="Lahore"):
    """Fetch current weather for a Pakistani city and add a farming tip.

    Args:
        city: City name typed by the user; blank values fall back to "Lahore".

    Returns:
        An Urdu string with temperature, humidity, wind, description and one
        piece of advice, or an Urdu error message. Never raises.
    """
    try:
        city = safe_str(city).strip() or "Lahore"
        # Pass the query through `params` so requests URL-encodes the
        # user-typed city; interpolating it into the URL lets characters such
        # as "&" or "#" alter the request.
        resp = requests.get(
            "https://api.openweathermap.org/data/2.5/weather",
            params={"q": f"{city},PK", "appid": WEATHER_API_KEY, "units": "metric"},
            timeout=20,
        )
        try:
            data = resp.json()
        except Exception:
            return "موسمی JSON درست نہیں۔"

        main = data.get("main") or {}
        wind = data.get("wind") or {}
        weather_l = data.get("weather") or [{}]

        temp = main.get("temp")
        humidity = main.get("humidity")
        wind_speed = wind.get("speed")
        description = weather_l[0].get("description", "")

        # Error payloads (e.g. unknown city) carry no readings
        if any(v is None for v in (temp, humidity, wind_speed)):
            return "موسمی معلومات مکمل نہیں مل سکیں۔"

        # First matching condition wins: heat, then humidity, cold, wind
        if temp > 35:
            advice = f"⚠️ زیادہ گرمی ({temp}°C): صبح 6-8 بجے پانی دیں، دوپہر میں نہیں۔ پانی کی مقدار 20% بڑھائیں۔"
        elif humidity > 80:
            advice = f"🌧️ زیادہ نمی ({humidity}%): فنگیسائیڈ سپرے کریں۔ Mancozeb 2g/لیٹر یا Copper Oxychloride 3g/لیٹر۔"
        elif temp < 10:
            advice = f"❄️ سردی ({temp}°C): پودوں کو ڈھانپیں، پانی 50% کم دیں۔ Frost protection ضروری۔"
        elif wind_speed > 5:
            advice = f"💨 تیز ہوا ({wind_speed} m/s): کیڑے مار دوا کا سپرے نہ کریں۔ Wind barriers لگائیں۔"
        else:
            advice = f"✅ موسم اچھا ہے ({temp}°C, {humidity}% نمی): کھیتی کے کام کر سکتے ہیں۔"

        return f"آج {city} میں {temp}°C، نمی {humidity}%، ہوا {wind_speed} m/s، موسم {description}\n\n{advice}"
    except Exception as e:
        # requests' error text can include the full URL, i.e. the API key.
        # This string is shown in the UI and sent to the LLM, so report only
        # the error type.
        return f"موسمی معلومات نہیں مل سکیں: {type(e).__name__}"

def text_to_voice(text):
    """Synthesize an Urdu MP3 of ``text`` with gTTS and return its file path.

    The text is cleaned with ``normalize_mixed_text`` and capped at 500
    characters; when it is cut, a short Urdu notice is appended.

    Args:
        text: Reply text to speak.

    Returns:
        Path of the generated ``.mp3`` temporary file, or ``None`` when there
        is nothing to speak or synthesis fails.
    """
    path = None
    try:
        clean = normalize_mixed_text(text)
        if not clean.strip():
            # Nothing to speak; skip the gTTS call
            return None
        if len(clean) > 500:
            clean = clean[:500] + "... مکمل جواب اوپر پڑھیں"
        tts = gTTS(text=clean, lang="ur", slow=False)
        # Reserve a unique name, then close the handle BEFORE gTTS reopens the
        # path: writing to a still-open NamedTemporaryFile fails on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            path = tmp.name
        tts.save(path)
        return path
    except Exception as e:
        print(f"TTS Error: {e}")
        # Do not leave an empty/partial temp file behind after a failed save
        if path and os.path.exists(path):
            try:
                os.remove(path)
            except OSError:
                pass
        return None

def get_enhanced_ai_response(user_message: str, location: str = "") -> str:
    """Answer a question with the Groq chat model using retrieved context.

    The retrieved context, location and question go into the user turn; the
    system turn only carries the short behaviour instructions.

    Returns:
        The model's reply, a friendly Urdu notice for rate-limit or
        request-too-large errors, or an Urdu apology with the exception text.
    """
    relevant_context = pak_agri_rag.get_relevant_info(user_message)

    instruction_sentences = [
        "You are Zameen Dost, a Pakistani agriculture advisor.",
        "Answer in simple Urdu, start with 'بھائی', use numbers when available,",
        "and keep it concise and actionable. If weather is included, integrate it.",
        "Only use the provided context; do not invent facts.",
    ]
    system_prompt = " ".join(instruction_sentences)

    place = safe_str(location)
    question = safe_str(user_message)
    prompt_user = "Context:\n" + relevant_context + "\n\n" + "Location: " + place + "\n" + "Question: " + question

    try:
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt_user},
        ]
        completion = groq_client.chat.completions.create(
            messages=conversation,
            model="llama-3.1-8b-instant",
            max_tokens=MAX_OUTPUT_TOKENS,
            temperature=0.5,
        )
        return completion.choices[0].message.content
    except Exception as e:
        msg = safe_str(e)
        # Quota / payload-size failures get a friendlier message
        limit_markers = ("rate_limit", "tokens per minute", "Request too large")
        if any(marker in msg for marker in limit_markers):
            return "معذرت، پیغام بڑا تھا یا رفتار حد سے زیادہ تھی۔ براہِ کرم چھوٹا سوال کریں، یا دوبارہ کوشش کریں۔"
        return f"معذرت، AI سے رابطہ نہیں ہو سکا: {e}"

# =========================
# 💬 Main chat handler
# =========================
def zameen_dost_advanced_chat(audio_input, text_input, city_name, focus_area):
    """Handle one question from the UI and produce the three output values.

    Audio input takes precedence over typed text. The question is enriched
    with the selected focus area and, when it mentions a weather/irrigation
    keyword, with current weather advice for the city.

    Args:
        audio_input: Path of the recorded audio file, or a falsy value.
        text_input: Typed question, or a falsy value.
        city_name: City used for the weather lookup and the prompt
            (falls back to "Lahore" for the weather lookup).
        focus_area: Selected interest area; "عام سوال" means no specific focus.

    Returns:
        Tuple ``(input_display, voice_response, ai_response)``: the echoed
        question, the path of the spoken reply (or ``None``), and the reply text.
    """
    user_message = ""
    input_display = ""

    if audio_input:
        user_message = voice_to_text(audio_input)
        input_display = f"💬 آپ نے کہا: {user_message}"
    elif text_input:
        user_message = safe_str(text_input)
        input_display = f"💬 آپ نے لکھا: {user_message}"

    if not isinstance(user_message, str) or not user_message.strip():
        return "کرپیا کوئی سوال پوچھیں", None, "❌ کوئی سوال نہیں ملا"

    enhanced = user_message
    if focus_area and safe_str(focus_area) != "عام سوال":
        enhanced += f" (کسان کی دلچسپی: {focus_area})"

    terms = ["موسم", "بارش", "پانی", "weather", "irrigation", "spray", "سپرے"]
    # Compare in lower case so "Weather" / "SPRAY" also trigger the lookup;
    # lower() leaves the Urdu keywords unchanged.
    message_lower = user_message.lower()
    if any(t in message_lower for t in terms):
        weather_info = get_weather_with_farming_advice(city_name or "Lahore")
        enhanced += f"\n\nموسمی حالات: {weather_info}"

    ai_response = get_enhanced_ai_response(enhanced, city_name or "")
    voice_response = text_to_voice(ai_response)
    return input_display, voice_response, ai_response

# =========================
# 🖥️ UI
# =========================
# Build the Gradio interface: a header banner, an input column (microphone,
# text box, city, focus area), an output column, and a knowledge-base stats box.
with gr.Blocks(
    title="Smart Zameen Dost - زمین دوست",
    theme=gr.themes.Base(),
    css="""
    .gradio-container { background: linear-gradient(135deg, #f8fdff 0%, #e8f7f8 100%); font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
    .header-box { background: white; padding: 20px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); margin: 10px 0; border-left: 4px solid #2E8B57; }
    .stats-box { background: linear-gradient(45deg, #e8f5e8, #f0f8e8); padding: 15px; border-radius: 8px; border: 1px solid #c8e6c9; margin: 10px 0; font-size: 0.9em; }
    """
) as app:
    # Page banner (styled by the .header-box rule above)
    gr.HTML("""
        <div class='header-box'>
          <div style='text-align: center;'>
            <h1 style='color: #2E8B57; font-size: 2.2em; margin: 0 0 8px 0;'>🌾 Smart Zameen Dost</h1>
            <p style='color: #666; font-size: 1.1em; margin: 0;'>پاکستانی کسانوں کا ذہین مشیر</p>
          </div>
        </div>
    """)

    with gr.Row():
        # Left column: question inputs
        with gr.Column(scale=1):
            gr.Markdown("### 🎤 اپنا سوال پوچھیں")
            # type="filepath": the handler receives the recording as a file path
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="آواز میں پوچھیں")
            text_input = gr.Textbox(label="یا یہاں لکھیں (اردو/English)", placeholder="مثال: کون سی فصل زیادہ منافع دے گی؟", lines=2)
            with gr.Row():
                city_input = gr.Textbox(label="آپ کا شہر", placeholder="Lahore, Karachi, Faisalabad", value="Lahore", scale=1)
                # The default "عام سوال" is the value the chat handler treats as "no specific focus"
                focus_area = gr.Dropdown(
                    label="دلچسپی کا شعبہ",
                    choices=["عام سوال","برآمدی فصلیں","گندم کی کاشت","چاول کی کاشت","کپاس کی کاشت","سبزیوں کی کاشت","پھلوں کی کاشت","کھاد اور بیج","بیماریوں کا علاج","حکومتی اسکیمز","منڈی کی قیمتیں"],
                    value="عام سوال",
                    scale=1,
                )
            chat_btn = gr.Button("🚀 جواب حاصل کریں", variant="primary", size="lg")

        # Right column: echoed question, spoken answer, written answer
        with gr.Column(scale=1):
            gr.Markdown("### 🧠 ذہین جواب")
            input_display = gr.Textbox(label="آپ کا سوال", lines=2, interactive=False)
            audio_output = gr.Audio(label="🔊 آواز میں جواب")
            text_output = gr.Textbox(label="📝 تفصیلی جواب", lines=10, interactive=False, show_copy_button=True)

    with gr.Row():
        # Rendered once when the UI is built, from the stats of the already-initialized knowledge base
        kb_stats = gr.HTML(value=pak_agri_rag.get_stats_html(), elem_classes=["stats-box"])

    # The handler returns (input_display, voice path, reply text), in the same order as `outputs`
    chat_btn.click(
        zameen_dost_advanced_chat,
        inputs=[audio_input, text_input, city_input, focus_area],
        outputs=[input_display, audio_output, text_output],
    )

print("🎉 App ready!")
# NOTE: this reports the number of configured links, not how many were processed successfully
print(f"✅ Auto-processed {len(PREDEFINED_PDF_LINKS)} Google Drive PDF link(s)")
print("🔍 Multilingual RAG + Voice + Weather integrated")

# Presumably shuts down Gradio servers left over from earlier runs of this cell — confirm against Gradio docs
gr.close_all()
# share=True publishes a temporary public *.gradio.live URL; debug=True keeps the cell running to show logs
app.launch(share=True, debug=True, show_api=False)


🤖 Loading Whisper model...
✅ Whisper model loaded.
🔤 Loading multilingual sentence embeddings...
✅ Multilingual embeddings loaded.
🧠 Initializing Advanced Pakistani Agricultural Knowledge Base...
✅ Seed agricultural knowledge initialized with 3 chunks.
🚀 Auto-processing 1 Google Drive PDF(s)...
✅ Doc 1: 322 pages → 1516 chunks
🎉 Finished: 1/1 document(s) processed.
🎉 App ready!
✅ Auto-processed 1 Google Drive PDF link(s)
🔍 Multilingual RAG + Voice + Weather integrated
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://2663e74ed3d1b0e2b5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://2663e74ed3d1b0e2b5.gradio.live




In [None]:
# app.py — Smart Zameen Dost (trimmed context to avoid 413/TPM)
# --------------------------------------------------------------
# - Uses langchain_core.documents.Document (LangChain 0.2.x)
# - Caps retrieval + context length to stay under Groq limits
# - Robust membership tests, metadata handling
# - Safe Google Drive PDF download + PyPDF2 extraction

import os
import io
import re
import json
import tempfile
from datetime import datetime

import gradio as gr
import requests
import nltk
import PyPDF2
from gtts import gTTS
import whisper
import torch  # noqa: F401

from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# =========================
# 🔑 Keys (⚠️ hard-code only if you accept the risk)
# =========================
GROQ_API_KEY = "YOUR_GROQ_KEY_HERE"            # paste locally
WEATHER_API_KEY = "YOUR_OPENWEATHER_KEY_HERE"  # paste locally

# A key pasted above wins. An untouched placeholder falls back to the
# environment, so a key that is already exported is not overwritten by it.
if GROQ_API_KEY.strip() in ("", "YOUR_GROQ_KEY_HERE"):
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if WEATHER_API_KEY.strip() in ("", "YOUR_OPENWEATHER_KEY_HERE"):
    WEATHER_API_KEY = os.environ.get("WEATHER_API_KEY", "")
GROQ_API_KEY = GROQ_API_KEY.strip()
WEATHER_API_KEY = WEATHER_API_KEY.strip()

# Fail fast, before the slow model loads. The environment may still hold the
# placeholder from an earlier run of this cell, so it counts as "not set" too.
if not GROQ_API_KEY or GROQ_API_KEY == "YOUR_GROQ_KEY_HERE":
    raise RuntimeError("GROQ_API_KEY not set.")
if not WEATHER_API_KEY or WEATHER_API_KEY == "YOUR_OPENWEATHER_KEY_HERE":
    raise RuntimeError("WEATHER_API_KEY not set.")

os.environ["GROQ_API_KEY"] = GROQ_API_KEY
os.environ["WEATHER_API_KEY"] = WEATHER_API_KEY

groq_client = Groq(api_key=GROQ_API_KEY)

# =========================
# 🔧 Global limits to keep prompt small
# =========================
K_RETRIEVE = 3               # fewer chunks retrieved
PER_DOC_CHARS = 700          # per-chunk char cap (~175 tokens)
MAX_CONTEXT_CHARS = 4000     # total context cap
MAX_OUTPUT_TOKENS = 512      # shorter generations

def _limit_chars(s: str, n: int) -> str:
    s = str(s or "")
    return s if len(s) <= n else (s[:n] + " …")

def _clip_context(snippets, max_chars: int) -> str:
    out, used = [], 0
    for snip in snippets:
        snip = str(snip or "")
        if used + len(snip) > max_chars:
            snip = snip[: max(0, max_chars - used)]
        if snip:
            out.append(snip)
            used += len(snip)
        if used >= max_chars:
            break
    return "\n\n".join(out)

# =========================
# 🔇 NLTK (quiet)
# =========================
# Best effort: a missing NLTK resource must not stop the app, but report it
# instead of swallowing the failure, and try each resource independently.
for _nltk_resource in ("punkt", "stopwords"):
    try:
        if not nltk.download(_nltk_resource, quiet=True):
            print(f"⚠️ NLTK resource '{_nltk_resource}' could not be downloaded; continuing without it.")
    except Exception as exc:
        print(f"⚠️ NLTK download of '{_nltk_resource}' failed: {exc}")

# =========================
# 🤖 Models / Embeddings
# =========================
print("🤖 Loading Whisper model...")
whisper_model = whisper.load_model("base")
print("✅ Whisper model loaded.")

print("🔤 Loading multilingual sentence embeddings...")
multilingual_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
print("✅ Multilingual embeddings loaded.")

# =========================
# 📄 Predefined Google Drive PDFs
# =========================
# Share links ingested into the knowledge base when it is constructed
PREDEFINED_PDF_LINKS = [
    "https://drive.google.com/file/d/1H7b-1PG2SLB99gjogfSl7QTOmLd1iGX0/view?usp=sharing",

]

# =========================
# 🧰 Helpers
# =========================
def safe_str(x) -> str:
    """Return ``str(x)``, mapping ``None`` to the empty string."""
    return "" if x is None else str(x)

def normalize_mixed_text(text: str) -> str:
    """Flatten whitespace and blank out symbols in mixed Urdu/English text.

    Whitespace runs are collapsed to one space and the ends trimmed first;
    afterwards every character that is not in the listed Arabic-script ranges,
    a word character, whitespace, or one of ``.,;:!?()-`` is replaced by a
    space (so the result may contain consecutive spaces).
    """
    disallowed = r"[^\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF\w\s.,;:!?()\-]"
    collapsed = re.sub(r"\s+", " ", safe_str(text))
    trimmed = collapsed.strip()
    return re.sub(disallowed, " ", trimmed)

def extract_numerical_data(text: str):
    """Pull prices, percentages and crop yields out of free text.

    Args:
        text: Any value; ``None`` is treated as empty text and other
            non-string values are converted with ``str``.

    Returns:
        dict containing only the categories that matched:
        ``"prices"`` (number strings that follow ``$`` or ``Rs``/``Rs.``),
        ``"percentages"`` (number strings that precede ``%``) and
        ``"yields"`` (``(amount, unit, area)`` tuples).
    """
    t = "" if text is None else str(text)
    info = {}
    # The currency marker must be "$" or the token "Rs"/"Rs.". The earlier
    # character class [\$Rs\.] also accepted a lone ".", "R" or "s", so the
    # fractional part of any decimal ("4.5 tons") was reported as a price.
    prices = re.findall(r"(?:\$|\bRs\.?)\s*(\d+(?:,\d{3})*(?:\.\d+)?)", t)
    if prices:
        info["prices"] = prices
    perc = re.findall(r"(\d+(?:\.\d+)?)\s*%", t)
    if perc:
        info["percentages"] = perc
    yields = re.findall(
        r"(\d+(?:\.\d+)?)\s*(tons?|kg|quintals?|maunds?)\s*(?:per|/)?\s*(acre|hectare|ایکڑ)",
        t,
        re.IGNORECASE,
    )
    if yields:
        info["yields"] = yields
    return info

# =========================
# 📥 Google Drive PDF Processor
# =========================
class GoogleDrivePDFProcessor:
    """Stateless helpers that fetch a shared Google Drive PDF and read its text."""

    # Tried in order; the first pattern that matches supplies the file id.
    _FILE_ID_PATTERNS = (r"/file/d/([a-zA-Z0-9\-_]+)", r"id=([a-zA-Z0-9\-_]+)", r"/d/([a-zA-Z0-9\-_]+)")

    @staticmethod
    def convert_gdrive_link(share_link: str):
        """Turn a Drive share link into a direct-download URL.

        Returns:
            The ``uc?export=download`` URL, or ``None`` when no file id is found.
        """
        link = safe_str(share_link)
        for pat in GoogleDrivePDFProcessor._FILE_ID_PATTERNS:
            m = re.search(pat, link)
            if m:
                return f"https://drive.google.com/uc?export=download&id={m.group(1)}"
        return None

    @staticmethod
    def _looks_like_pdf(content: bytes) -> bool:
        """True when the PDF signature appears near the start of the payload."""
        return b"%PDF" in (content or b"")[:1024]

    @staticmethod
    def download_pdf_from_gdrive(gdrive_link: str):
        """Download the PDF behind a Drive share link.

        Returns:
            ``(pdf_bytes, "Success")`` on success, otherwise ``(None, reason)``.
            Exceptions are reported through ``reason`` rather than raised.
        """
        try:
            download_link = GoogleDrivePDFProcessor.convert_gdrive_link(gdrive_link)
            if not download_link:
                return None, "Invalid Google Drive link format"

            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
            # The whole body is needed anyway, so streaming adds nothing here.
            resp = requests.get(download_link, headers=headers, timeout=60)

            # Only decode the body as text when it is not already a PDF: decoding
            # a large binary is wasteful and may match "confirm=" by accident.
            if not GoogleDrivePDFProcessor._looks_like_pdf(resp.content):
                txt = safe_str(resp.text)
                if ("confirm=" in txt) or ("virus scan warning" in txt.lower()):
                    token = re.search(r"confirm=([^&]+)", txt)
                    if token:
                        confirmed = f"{download_link}&confirm={token.group(1)}"
                        resp = requests.get(confirmed, headers=headers, timeout=60)

            if resp.status_code != 200:
                return None, f"Download failed: HTTP {resp.status_code}"
            # An HTML page (private file, unresolved confirmation) can still come
            # back with HTTP 200; reject it here instead of reporting "Success".
            if not GoogleDrivePDFProcessor._looks_like_pdf(resp.content):
                return None, "Download failed: response is not a PDF (file may be private or need confirmation)"
            return resp.content, "Success"
        except Exception as e:
            return None, f"Download error: {e}"

    @staticmethod
    def extract_text_from_pdf(pdf_content: bytes):
        """Extract embedded text page by page.

        Returns:
            ``(text, page_count)``. Pages with text are wrapped in
            ``--- Page N ---`` markers, pages that fail get an error marker, and
            pages with no text are omitted. If the PDF cannot be opened the
            result is ``("PDF text extraction error: ...", 0)``.
        """
        try:
            reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
            pages = len(reader.pages)
            out = []
            for i in range(pages):
                # Page access stays inside the per-page try so one broken page
                # does not abort the whole document.
                try:
                    pg = reader.pages[i]
                    t = (pg.extract_text() or "").strip()
                    if t:
                        out.append(f"\n--- Page {i+1} ---\n{t}\n")
                except Exception:
                    out.append(f"\n--- Page {i+1} (Error extracting) ---\n")
            return "".join(out), pages
        except Exception as e:
            return f"PDF text extraction error: {e}", 0

# =========================
# 🧠 Knowledge Base / RAG
# =========================
class AdvancedPakistaniAgriRAG:
    """FAISS-backed knowledge base: bilingual seed notes plus predefined Drive PDFs.

    Constructing an instance builds the seed index and then downloads and
    indexes every link in ``PREDEFINED_PDF_LINKS``.

    Attributes:
        embeddings: Embedding model shared with the vector store.
        vector_store: FAISS index, or ``None`` until the seed data is indexed.
        gdrive: Helper used to download and read the PDFs.
        processed_documents: One status dict per PDF that reached the indexing
            stage (``id``, ``pages``, ``chunks``, ``source``, ``status``).
    """

    def __init__(self):
        self.embeddings = multilingual_embeddings
        self.vector_store = None
        self.gdrive = GoogleDrivePDFProcessor()
        self.processed_documents = []
        self._setup_seed_knowledge()
        self._auto_process_predefined_pdfs()

    @staticmethod
    def _make_splitter():
        """Build the one chunking configuration used for seed text and PDFs."""
        return RecursiveCharacterTextSplitter(
            chunk_size=800, chunk_overlap=100, separators=["\n\n", "\n", "۔", ".", ":", ";", " "], length_function=len
        )

    @staticmethod
    def _short_link(link: str) -> str:
        """Shorten a link to 50 characters plus "..." for display and metadata."""
        return link[:50] + "..." if len(link) > 50 else link

    def _setup_seed_knowledge(self):
        """Index the built-in wheat, rice and government-scheme notes."""
        seed = [
            {
                "content": """Punjab Wheat Varieties for Export:
                اعلیٰ قسم کی گندم کی اقسام:
                - Anmol-91: Yield 45-50 maunds/acre, Export price $320-350/ton
                - Faisalabad-2008: High protein 12-14%, Premium export variety
                - Galaxy-2013: Disease resistant, Suitable for UAE market
                - Punjab-2011: Good for bread making, Export to Afghanistan
                Urdu: یہ اقسام برآمد کے لیے بہترین ہیں اور زیادہ قیمت ملتی ہے""",
                "metadata": {"type": "crop_varieties", "region": "Punjab", "crop": "wheat", "language": "mixed"},
            },
            {
                "content": """Rice Export Opportunities - چاول کی برآمدات:
                Basmati Varieties with International Prices:
                - Super Basmati: $900-1200/ton (UAE, Saudi Arabia)
                - Basmati 385: Premium grade, $1000-1300/ton
                - IRRI-6: $450-550/ton (Philippines, Malaysia)
                - Kainaat: $700-850/ton (Middle East markets)

                Export Requirements:
                - Moisture: Maximum 14%
                - Broken grains: Less than 5%
                - Length: Minimum 6.0mm for Basmati

                اردو میں: بسمتی چاول کی برآمد سب سے زیادہ منافع بخش ہے""",
                "metadata": {"type": "export_markets", "crop": "rice", "price_range": "450-1300", "language": "mixed"},
            },
            {
                "content": """Government Support Schemes - حکومتی اسکیمز:
                Kisan Card Program:
                - 25% subsidy on fertilizers
                - 20% discount on certified seeds
                - Easy loan access through banks

                Solar Tube Well Scheme:
                - 60% government subsidy
                - Remaining 40% through easy installments
                - Electricity bill savings: Rs. 50,000+ annually

                Crop Insurance Program:
                - Premium: 5% of sum insured
                - Government pays 75% of premium
                - Coverage: Natural disasters, pest attacks

                کسان ڈویلپمنٹ پروگرام سے مفت تربیت اور مشورے""",
                "metadata": {"type": "government_schemes", "schemes": "kisan_card,solar_tubewell,crop_insurance", "language": "mixed"},
            },
        ]

        docs = []
        for item in seed:
            content = normalize_mixed_text(item["content"])
            meta = dict(item.get("metadata") or {})
            # Read the numbers from the raw text: normalisation replaces "$",
            # "%" and "/" with spaces, which hides prices and percentages.
            nums = extract_numerical_data(item["content"])
            if nums:
                meta.update(nums)
            docs.append(Document(page_content=content, metadata=meta))

        pieces = self._make_splitter().split_documents(docs)
        self.vector_store = FAISS.from_documents(pieces, self.embeddings)
        print("✅ Seed agricultural knowledge initialized with", len(pieces), "chunks.")

    def _auto_process_predefined_pdfs(self):
        """Download, extract, chunk and index every predefined Drive PDF.

        Failures are printed and never raised, so one bad document does not
        stop the rest from being processed.
        """
        if not PREDEFINED_PDF_LINKS:
            print("ℹ️ No predefined PDFs configured.")
            return

        print(f"🚀 Auto-processing {len(PREDEFINED_PDF_LINKS)} Google Drive PDF(s)...")
        splitter = self._make_splitter()
        ok = 0
        for i, link in enumerate(PREDEFINED_PDF_LINKS, start=1):
            short_link = self._short_link(link)
            try:
                blob, msg = self.gdrive.download_pdf_from_gdrive(link)
                if blob is None:
                    print(f"❌ Doc {i}: {msg}")
                    continue

                text, pages = self.gdrive.extract_text_from_pdf(blob)
                if "pdf text extraction error" in safe_str(text).lower():
                    print(f"❌ Doc {i}: {text}")
                    continue

                if len(safe_str(text).strip()) < 100:
                    print(f"⚠️ Doc {i}: likely image-based or encrypted; minimal text.")

                processed = normalize_mixed_text(text)
                # Raw text, for the same reason as in the seed data.
                numbers = extract_numerical_data(text)

                doc = Document(
                    page_content=processed,
                    metadata={
                        "type": "auto_processed_pdf",
                        "source": f"Auto PDF {i}",
                        "pages": pages,
                        "numerical_data": numbers,
                        "processing_date": datetime.now().strftime("%Y-%m-%d %H:%M"),
                        "original_link": short_link,
                    },
                )

                chunks = splitter.split_documents([doc])
                if not chunks:
                    # Nothing to embed; record the outcome rather than passing an
                    # empty batch on to the vector store.
                    self.processed_documents.append(
                        {"id": i, "pages": pages, "chunks": 0, "source": short_link, "status": "⚠️ No extractable text"}
                    )
                    print(f"⚠️ Doc {i}: no text chunks produced; skipped.")
                    continue

                if self.vector_store is not None:
                    self.vector_store.add_documents(chunks)
                else:
                    self.vector_store = FAISS.from_documents(chunks, self.embeddings)

                self.processed_documents.append(
                    {"id": i, "pages": pages, "chunks": len(chunks), "source": short_link, "status": "✅ Success"}
                )
                print(f"✅ Doc {i}: {pages} pages → {len(chunks)} chunks")
                ok += 1
            except Exception as e:
                self.processed_documents.append(
                    {"id": i, "pages": 0, "chunks": 0, "source": short_link, "status": f"❌ Error: {e}"}
                )
                print(f"❌ Doc {i}: {e}")

        print(f"🎉 Finished: {ok}/{len(PREDEFINED_PDF_LINKS)} document(s) processed.")

    def get_stats_html(self) -> str:
        """Return a short text summary of the processed PDFs for the UI."""
        if not self.processed_documents:
            return "📊 Knowledge Base: Seed Pakistani agricultural data only (no PDFs yet)"
        total_chunks = sum(d.get("chunks", 0) for d in self.processed_documents)
        total_pages = sum(d.get("pages", 0) for d in self.processed_documents)
        return f"""📊 Knowledge Base Statistics:

🗂️ Auto-processed Documents: {len(self.processed_documents)}
📄 Total Pages Processed: {total_pages}
🧩 Total Text Chunks: {total_chunks}
📚 Seed Knowledge: Pakistani agriculture (Urdu + English)
🔍 Search Capability: Multilingual (English + Urdu)
✅ Status: Ready for queries
"""

    @staticmethod
    def _numeric_summary_lines(meta: dict):
        """Format the prices / percentages / yields found in one metadata dict."""
        lines = []
        prices = meta.get("prices")
        if isinstance(prices, list) and prices:
            lines.append(f"💰 قیمتیں: {', '.join(map(safe_str, prices))}")
        percentages = meta.get("percentages")
        if isinstance(percentages, list) and percentages:
            # Every value gets its own "%"; previously only the last one did.
            joined = ", ".join(f"{safe_str(p)}%" for p in percentages)
            lines.append(f"📊 فیصد: {joined}")
        yields = meta.get("yields")
        if isinstance(yields, list) and yields:
            y_fmt = []
            for y in yields:
                try:
                    val, unit, per = y
                    y_fmt.append(f"{val} {unit} per {per}")
                except Exception:
                    y_fmt.append(safe_str(y))
            lines.append(f"🌾 پیداوار: {', '.join(y_fmt)}")
        return lines

    def get_relevant_info(self, query: str, k: int = K_RETRIEVE) -> str:
        """Retrieve up to ``k`` chunks for ``query`` and format them as prompt context.

        Returns:
            Numbered snippets, preceded by a numeric summary when the hits carry
            one, capped at ``MAX_CONTEXT_CHARS``. Search failures are returned
            as an error string rather than raised.
        """
        if not self.vector_store:
            return "Knowledge base not available"
        try:
            q = safe_str(query)
            hits = self.vector_store.similarity_search(q, k=k)

            snippets = []
            nums_summary = []

            for i, doc in enumerate(hits, start=1):
                body = _limit_chars(doc.page_content, PER_DOC_CHARS)
                snippets.append(f"معلومات {i}: {body}")

                meta = doc.metadata if isinstance(doc.metadata, dict) else {}
                nd = meta.get("numerical_data")
                if isinstance(nd, dict):
                    meta = {**meta, **nd}

                # PDF chunks all carry the same document-level numbers, so skip
                # summary lines that an earlier hit already produced.
                for line in self._numeric_summary_lines(meta):
                    if line not in nums_summary:
                        nums_summary.append(line)

            context = "\n\n".join(snippets)
            if nums_summary:
                # Bound the summary so a number-heavy document cannot push the
                # retrieved snippets out of the context budget.
                summary = _limit_chars("\n".join(nums_summary), PER_DOC_CHARS)
                context = "📈 اہم اعداد و شمار:\n" + summary + "\n\n" + context

            return _clip_context([context], MAX_CONTEXT_CHARS) or "No relevant information found."
        except Exception as e:
            return f"Error retrieving information: {e}"

# =========================
# 🚀 Initialize RAG
# =========================
# Constructing the knowledge base indexes the seed notes and then the predefined
# PDFs (network downloads), so this line can take a while.
print("🧠 Initializing Advanced Pakistani Agricultural Knowledge Base...")
pak_agri_rag = AdvancedPakistaniAgriRAG()

# =========================
# 🎙️ Voice, 🌦️ Weather, 🤝 AI
# =========================
def voice_to_text(audio_file_path):
    """Transcribe an audio file as Urdu and return the normalised transcript.

    Returns an empty string when no path is given, and an Urdu error message
    (not an exception) when transcription fails.
    """
    if audio_file_path:
        try:
            transcription = whisper_model.transcribe(audio_file_path, language="ur")
            spoken = transcription.get("text", "")
            return normalize_mixed_text(spoken)
        except Exception as e:
            return f"آواز سمجھ نہیں آئی: {e}"
    return ""

def get_weather_with_farming_advice(city="Lahore"):
    """Fetch current weather for a Pakistani city and attach one piece of farming advice.

    Args:
        city: City name; blank or ``None`` falls back to "Lahore".

    Returns:
        An Urdu string with temperature, humidity, wind and description followed
        by the advice, or an Urdu error message. Never raises.
    """
    try:
        city = safe_str(city).strip() or "Lahore"
        # HTTPS keeps the API key off the wire in clear text, and ``params``
        # URL-encodes the city (spaces, non-ASCII names).
        resp = requests.get(
            "https://api.openweathermap.org/data/2.5/weather",
            params={"q": f"{city},PK", "appid": WEATHER_API_KEY, "units": "metric"},
            timeout=20,
        )
        try:
            data = resp.json()
        except Exception:
            return "موسمی JSON درست نہیں۔"
        if not isinstance(data, dict):
            data = {}

        main = data.get("main") or {}
        wind = data.get("wind") or {}
        weather_l = data.get("weather") or [{}]

        temp = main.get("temp")
        humidity = main.get("humidity")
        wind_speed = wind.get("speed")
        description = weather_l[0].get("description", "")

        # Error payloads (unknown city, bad key) carry no readings.
        if any(v is None for v in (temp, humidity, wind_speed)):
            return "موسمی معلومات مکمل نہیں مل سکیں۔"

        # First matching rule wins: heat, then humidity, then cold, then wind.
        if temp > 35:
            advice = f"⚠️ زیادہ گرمی ({temp}°C): صبح 6-8 بجے پانی دیں، دوپہر میں نہیں۔ پانی کی مقدار 20% بڑھائیں۔"
        elif humidity > 80:
            advice = f"🌧️ زیادہ نمی ({humidity}%): فنگیسائیڈ سپرے کریں۔ Mancozeb 2g/لیٹر یا Copper Oxychloride 3g/لیٹر۔"
        elif temp < 10:
            advice = f"❄️ سردی ({temp}°C): پودوں کو ڈھانپیں، پانی 50% کم دیں۔ Frost protection ضروری۔"
        elif wind_speed > 5:
            advice = f"💨 تیز ہوا ({wind_speed} m/s): کیڑے مار دوا کا سپرے نہ کریں۔ Wind barriers لگائیں۔"
        else:
            advice = f"✅ موسم اچھا ہے ({temp}°C, {humidity}% نمی): کھیتی کے کام کر سکتے ہیں۔"

        return f"آج {city} میں {temp}°C، نمی {humidity}%، ہوا {wind_speed} m/s، موسم {description}\n\n{advice}"
    except Exception as e:
        # Connection errors can include the request URL (and so the appid); this
        # text is shown in the UI, so mask the key before returning it.
        detail = safe_str(e)
        if WEATHER_API_KEY:
            detail = detail.replace(WEATHER_API_KEY, "***")
        return f"موسمی معلومات نہیں مل سکیں: {detail}"

def text_to_voice(text):
    """Synthesize Urdu speech for ``text`` and return the path of a temp ``.mp3``.

    The text is normalised and truncated to 500 characters (with a note pointing
    to the full written answer). Returns ``None`` when there is nothing to speak
    or synthesis fails; a partially written file is removed in that case.
    """
    import os  # local import so this function does not depend on the cell's import list

    path = None
    try:
        clean = normalize_mixed_text(text)
        if not clean.strip():
            return None
        if len(clean) > 500:
            clean = clean[:500] + "... مکمل جواب اوپر پڑھیں"
        tts = gTTS(text=clean, lang="ur", slow=False)
        # Reserve a name, then close the handle before gTTS writes to the path:
        # writing while the handle is still open fails on some platforms.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            path = tmp.name
        tts.save(path)
        return path
    except Exception as e:
        print(f"TTS Error: {e}")
        if path and os.path.exists(path):
            try:
                os.remove(path)
            except OSError:
                pass
        return None

def get_enhanced_ai_response(user_message: str, location: str = "") -> str:
    """Answer a farmer's question with the Groq chat model, grounded in retrieved context.

    Args:
        user_message: The question, possibly already enriched with focus/weather text.
        location: City name included in the prompt.

    Returns:
        The model's reply, or an Urdu apology when the request fails (with a
        specific message for rate-limit / request-too-large errors).
    """
    context_block = pak_agri_rag.get_relevant_info(user_message)

    messages = [
        {
            "role": "system",
            "content": (
                "You are Zameen Dost, a Pakistani agriculture advisor. "
                "Answer in simple Urdu, start with 'بھائی', use numbers when available, "
                "and keep it concise and actionable. If weather is included, integrate it. "
                "Only use the provided context; do not invent facts."
            ),
        },
        {
            "role": "user",
            "content": (
                f"Context:\n{context_block}\n\n"
                f"Location: {safe_str(location)}\n"
                f"Question: {safe_str(user_message)}"
            ),
        },
    ]

    try:
        completion = groq_client.chat.completions.create(
            messages=messages,
            model="llama-3.1-8b-instant",
            max_tokens=MAX_OUTPUT_TOKENS,
            temperature=0.5,
        )
        return completion.choices[0].message.content
    except Exception as e:
        detail = safe_str(e)
        # Substrings that mark a throttled or oversized request in the error text.
        overload_markers = ("rate_limit", "tokens per minute", "Request too large")
        if any(marker in detail for marker in overload_markers):
            return "معذرت، پیغام بڑا تھا یا رفتار حد سے زیادہ تھی۔ براہِ کرم چھوٹا سوال کریں، یا دوبارہ کوشش کریں۔"
        return f"معذرت، AI سے رابطہ نہیں ہو سکا: {e}"

# =========================
# 💬 Main chat handler
# =========================
def zameen_dost_advanced_chat(audio_input, text_input, city_name, focus_area):
    """Gradio handler: turn a voice or typed question into a written and spoken answer.

    Args:
        audio_input: Path of the recorded audio, or a falsy value.
        text_input: Typed question; used when there is no usable transcript.
        city_name: City for the weather lookup (defaults to "Lahore").
        focus_area: Dropdown value appended to the question unless it is the
            general-question choice.

    Returns:
        ``(echo_of_question, audio_path_or_None, answer_text)``.
    """
    user_message = ""
    input_display = ""

    typed = safe_str(text_input).strip()
    transcript = voice_to_text(audio_input) if audio_input else ""
    # voice_to_text reports failures as a message with this prefix; it must not
    # be sent to the model as though the farmer had said it.
    stt_failed = transcript.startswith("آواز سمجھ نہیں آئی")

    if transcript.strip() and not stt_failed:
        user_message = transcript
        input_display = f"💬 آپ نے کہا: {user_message}"
    elif typed:
        # Also reached when audio was supplied but produced no usable transcript.
        user_message = typed
        input_display = f"💬 آپ نے لکھا: {user_message}"
    elif stt_failed:
        return transcript, None, "❌ کوئی سوال نہیں ملا"

    if not user_message.strip():
        return "کرپیا کوئی سوال پوچھیں", None, "❌ کوئی سوال نہیں ملا"

    enhanced = user_message
    if focus_area and safe_str(focus_area) != "عام سوال":
        enhanced += f" (کسان کی دلچسپی: {focus_area})"

    terms = ["موسم", "بارش", "پانی", "weather", "irrigation", "spray", "سپرے"]
    # Lower-casing lets "Weather" or "SPRAY" trigger the lookup too; it does not
    # affect the Urdu keywords.
    lowered = user_message.lower()
    if any(t in lowered for t in terms):
        weather_info = get_weather_with_farming_advice(city_name or "Lahore")
        enhanced += f"\n\nموسمی حالات: {weather_info}"

    ai_response = get_enhanced_ai_response(enhanced, city_name or "")
    voice_response = text_to_voice(ai_response)
    return input_display, voice_response, ai_response

# =========================
# 🖥️ UI
# =========================
# Two-column layout: question inputs on the left, answer outputs on the right,
# with the knowledge-base statistics underneath.
with gr.Blocks(
    title="Smart Zameen Dost - زمین دوست",
    theme=gr.themes.Base(),
    css="""
    .gradio-container { background: linear-gradient(135deg, #f8fdff 0%, #e8f7f8 100%); font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
    .header-box { background: white; padding: 20px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); margin: 10px 0; border-left: 4px solid #2E8B57; }
    .stats-box { background: linear-gradient(45deg, #e8f5e8, #f0f8e8); padding: 15px; border-radius: 8px; border: 1px solid #c8e6c9; margin: 10px 0; font-size: 0.9em; }
    """
) as app:
    # Header banner.
    gr.HTML("""
        <div class='header-box'>
          <div style='text-align: center;'>
            <h1 style='color: #2E8B57; font-size: 2.2em; margin: 0 0 8px 0;'>🌾 Smart Zameen Dost</h1>
            <p style='color: #666; font-size: 1.1em; margin: 0;'>پاکستانی کسانوں کا ذہین مشیر</p>
          </div>
        </div>
    """)

    with gr.Row():
        # Left column: microphone, text box, city and focus-area inputs.
        with gr.Column(scale=1):
            gr.Markdown("### 🎤 اپنا سوال پوچھیں")
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="آواز میں پوچھیں")
            text_input = gr.Textbox(label="یا یہاں لکھیں (اردو/English)", placeholder="مثال: کون سی فصل زیادہ منافع دے گی؟", lines=2)
            with gr.Row():
                city_input = gr.Textbox(label="آپ کا شہر", placeholder="Lahore, Karachi, Faisalabad", value="Lahore", scale=1)
                # The first choice is the "general question" value that the chat
                # handler treats as "no focus area".
                focus_area = gr.Dropdown(
                    label="دلچسپی کا شعبہ",
                    choices=["عام سوال","برآمدی فصلیں","گندم کی کاشت","چاول کی کاشت","کپاس کی کاشت","سبزیوں کی کاشت","پھلوں کی کاشت","کھاد اور بیج","بیماریوں کا علاج","حکومتی اسکیمز","منڈی کی قیمتیں"],
                    value="عام سوال",
                    scale=1,
                )
            chat_btn = gr.Button("🚀 جواب حاصل کریں", variant="primary", size="lg")

        # Right column: echoed question, spoken answer and written answer.
        with gr.Column(scale=1):
            gr.Markdown("### 🧠 ذہین جواب")
            input_display = gr.Textbox(label="آپ کا سوال", lines=2, interactive=False)
            audio_output = gr.Audio(label="🔊 آواز میں جواب")
            text_output = gr.Textbox(label="📝 تفصیلی جواب", lines=10, interactive=False, show_copy_button=True)

    with gr.Row():
        # Computed once while the UI is built; it is not refreshed afterwards.
        kb_stats = gr.HTML(value=pak_agri_rag.get_stats_html(), elem_classes=["stats-box"])

    # The output order matches the handler's (display, audio, text) return tuple.
    chat_btn.click(
        zameen_dost_advanced_chat,
        inputs=[audio_input, text_input, city_input, focus_area],
        outputs=[input_display, audio_output, text_output],
    )

print("🎉 App ready!")
# Report how many PDFs were actually indexed, not how many links were configured.
_pdf_ok = sum(
    1 for _d in pak_agri_rag.processed_documents if safe_str(_d.get("status")).startswith("✅")
)
print(f"✅ Auto-processed {_pdf_ok}/{len(PREDEFINED_PDF_LINKS)} Google Drive PDF link(s)")
print("🔍 Multilingual RAG + Voice + Weather integrated")

# Close any Gradio apps left over from earlier runs of this cell before launching.
gr.close_all()
app.launch(share=True, debug=True, show_api=False)


In [None]:
import os
import re
import tempfile
import requests
from io import BytesIO
from pdf2image import convert_from_bytes
import pytesseract
import gradio as gr

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from groq import Groq

# 🔊 NEW: STT & TTS
import whisper
from gtts import gTTS

# =========================
# 🔐 API Key (use env var)
# =========================
# The environment variable wins; the literal is only a local fallback. Before,
# the client read os.environ directly and raised KeyError on a fresh kernel
# even when a key had been pasted here.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or "YOUR_GROQ_KEY_HERE"            # paste locally
WEATHER_API_KEY = os.environ.get("WEATHER_API_KEY") or "YOUR_OPENWEATHER_KEY_HERE"

if GROQ_API_KEY.strip() in ("", "YOUR_GROQ_KEY_HERE"):
    raise RuntimeError("GROQ_API_KEY not set.")

client = Groq(api_key=GROQ_API_KEY)
# =========================
# 🎙️ Load Whisper STT model
# =========================
WHISPER_MODEL_NAME = os.getenv("WHISPER_MODEL", "base")
try:
    stt_model = whisper.load_model(WHISPER_MODEL_NAME)
except Exception as e:
    # Say why the requested model was not used before falling back.
    print(f"⚠️ Could not load Whisper model '{WHISPER_MODEL_NAME}': {e}. Falling back to 'tiny'.")
    stt_model = whisper.load_model("tiny")  # fallback if base isn't available

# 📎 PDF Drive Links
drive_links = {
    "PDF 1": "https://drive.google.com/file/d/16VRkuegHXbhQPPH6kB3jTcdlg1eh95Og/view?usp=sharing",
    "PDF 2": "https://drive.google.com/file/d/1e4Zi9vYXHEtuU_mkpBKDjZ-s0fIFqdzO/view?usp=sharing",
    "PDF 3": "https://drive.google.com/file/d/149Js-w01KO085cRibqXyra9_oPNRAYqZ/view?usp=sharing"
}

# 📥 Download PDF
def download_pdf_from_drive(drive_link):
    """Download a PDF from a Google Drive ``.../d/<id>/...`` share link.

    Returns:
        ``BytesIO`` with the PDF bytes, or ``None`` when the link has no file
        id, the request fails, or the payload does not start with ``%PDF``.
        The reason is printed; nothing is raised.
    """
    try:
        file_id = drive_link.split("/d/")[1].split("/")[0]
        url = f"https://drive.google.com/uc?export=download&id={file_id}"
        # Without a timeout a stalled connection would hang the cell indefinitely.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        # Drive answers with an HTML page for private or unconfirmed files.
        if response.content[:4] != b"%PDF":
            raise ValueError("Invalid PDF format")
        return BytesIO(response.content)
    except Exception as e:
        print(f"❌ Error downloading PDF: {e}")
        return None

# 🧼 Clean OCR text
def clean_ocr_text(text):
    """Tidy OCR output: squeeze dot runs, newline runs and space runs, then trim."""
    # Applied in order: dots first, then newlines, then spaces.
    rules = (
        (r"\.{2,}", "."),
        (r"\n+", "\n"),
        (r" +", " "),
    )
    cleaned = text
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# 🧠 Extract Urdu text from all PDFs
def extract_all_texts_from_drive(drive_links, max_pages=2):
    """OCR the first pages of every PDF in ``drive_links`` and return the cleaned text.

    Args:
        drive_links: Mapping of title -> Google Drive share link.
        max_pages: Number of leading pages to OCR per PDF (``None`` for all).

    Returns:
        One cleaned string with the Urdu OCR text of all PDFs. PDFs that fail
        to download or OCR are reported with a print and skipped.
    """
    pieces = []
    # Stop rasterising at max_pages instead of rendering the whole PDF and
    # slicing afterwards; the slice stays so unusual values behave as before.
    render_kwargs = {"last_page": max_pages} if isinstance(max_pages, int) and max_pages > 0 else {}
    for title, url in drive_links.items():
        file = download_pdf_from_drive(url)
        if not file:
            continue
        try:
            images = convert_from_bytes(file.read(), **render_kwargs)[:max_pages]
            for img in images:
                pieces.append(pytesseract.image_to_string(img, lang="urd") + "\n")
        except Exception as e:
            print(f"❌ OCR failed for {title}: {e}")
    return clean_ocr_text("".join(pieces))

# 📚 Chunk text
def chunk_text(text):
    """Split ``text`` into documents of at most 300 characters with 50 characters of overlap."""
    return RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50).create_documents([text])

# 💾 Create FAISS index
def create_faiss_index(docs):
    """Embed ``docs`` with the multilingual MiniLM model and return a FAISS index.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    if docs:
        embedder = HuggingFaceEmbeddings(
            model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        )
        return FAISS.from_documents(docs, embedding=embedder)
    raise ValueError("❌ No chunks created from text.")

# 🤖 Query with Groq — returns *bilingual* with clear section headers
def query_vector_db(query, db):
    """Answer ``query`` from the three most similar chunks in ``db`` via Groq.

    Returns:
        The model's reply in the requested "English: / Urdu:" layout, a
        "no relevant information" message when the search returns nothing, or
        an error message when the Groq call fails.
    """
    hits = db.similarity_search(query, k=3)
    if not hits:
        return "❌ No relevant information found."

    context = "\n\n".join(hit.page_content for hit in hits)
    prompt = f"""You are a helpful assistant. Use ONLY the context to answer.
Return your reply in EXACTLY this format:

English:
<2–4 sentences in clear English.>

Urdu:
<2–4 sentences in Urdu script.>

Context:
{context}

Question: {query}
"""
    messages = [{"role": "user", "content": prompt}]
    try:
        completion = client.chat.completions.create(model="llama3-70b-8192", messages=messages)
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as e:
        return f"❌ Error from Groq: {e}"

# =========================
# 🎙️ Speech-to-Text helper
# =========================
def transcribe_audio(audio_path: str) -> str:
    """Transcribe a spoken question with Whisper; return "" if there is no audio or it fails."""
    if audio_path:
        try:
            transcription = stt_model.transcribe(audio_path, task="transcribe")
            spoken = transcription.get("text") or ""
            return spoken.strip()
        except Exception as e:
            print(f"❌ STT error: {e}")
    return ""

# =========================
# 🔊 Text-to-Speech helper
# =========================
def tts_to_file(text: str, lang_code: str):
    """Synthesize TTS and return a temporary .mp3 file path or None.

    Args:
        text: Text to speak; blank or ``None`` returns ``None`` immediately.
        lang_code: gTTS language code (the callers pass "en" or "ur").

    Returns:
        Path of the generated ``.mp3``, or ``None`` on failure. A file that was
        created before the failure is removed.
    """
    if not text or not text.strip():
        return None
    path = None
    try:
        # Reserve a unique name and close the handle straight away; the open
        # handle used to be left dangling for the life of the process.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            path = tmp.name
        tts = gTTS(text=text.strip(), lang=lang_code, slow=False)
        tts.save(path)
        return path
    except Exception as e:
        print(f"❌ TTS error ({lang_code}): {e}")
        # Do not leave an empty or partial temp file behind on failure.
        if path is not None:
            try:
                os.remove(path)
            except OSError:
                pass
        return None

def split_bilingual(answer_text: str):
    """Split a structured answer into its English and Urdu sections.

    Returns:
        ``(english, urdu)``. When neither section header is found, the whole
        stripped answer is returned as the English part and Urdu is empty.
    """
    english_match = re.search(r"English:\s*(.+?)(?:\n\s*Urdu:|$)", answer_text, flags=re.S)
    urdu_match = re.search(r"Urdu:\s*(.+)$", answer_text, flags=re.S)

    english = english_match.group(1).strip() if english_match else ""
    urdu = urdu_match.group(1).strip() if urdu_match else ""

    if english or urdu:
        return english, urdu
    # No recognisable sections: treat the whole answer as English.
    return answer_text.strip(), urdu

# 🚀 Preprocessing once
# Step 1: download the PDFs and OCR them (network + OCR, so this can be slow).
print("📦 Processing Urdu PDFs...")
extracted_text = extract_all_texts_from_drive(drive_links)
print(f"✅ Extracted {len(extracted_text)} characters of Urdu text")

# Step 2: split the combined text into overlapping chunks.
documents = chunk_text(extracted_text)
print(f"📄 {len(documents)} chunks created")

# Step 3: embed the chunks; create_faiss_index raises ValueError if there are none.
vector_db = create_faiss_index(documents)
print("✅ FAISS index created")

# =========================
# 🧩 Gradio pipeline (voice + text)
# =========================
def answer_with_voice(audio_path, typed_query, speak_answer, speak_lang_choice):
    """Gradio handler: answer a typed or spoken question, optionally with audio.

    - Typed text takes precedence; audio is transcribed only when no text was typed.
    - The query is answered from the vector DB through Groq.
    - English and/or Urdu TTS is produced when ``speak_answer`` is set.

    Args:
        audio_path: Path of the recorded or uploaded audio, or a falsy value.
        typed_query: Typed question, may be empty or ``None``.
        speak_answer: Whether to synthesize audio at all.
        speak_lang_choice: "Both", "English only" or "Urdu only".

    Returns:
        ``(answer_text, english_audio_path_or_None, urdu_audio_path_or_None)``.
    """
    typed = (typed_query or "").strip()
    if typed:
        # The transcript would be discarded anyway, so skip the Whisper pass.
        query = typed
    elif audio_path:
        query = transcribe_audio(audio_path)
    else:
        query = ""

    if not query:
        return "❌ Please ask a question (type or record).", None, None

    answer = query_vector_db(query, vector_db)
    en_text, ur_text = split_bilingual(answer)

    en_audio = ur_audio = None
    if speak_answer:
        if speak_lang_choice in ("Both", "English only") and en_text:
            en_audio = tts_to_file(en_text, "en")
        if speak_lang_choice in ("Both", "Urdu only") and ur_text:
            ur_audio = tts_to_file(ur_text, "ur")

    return answer, en_audio, ur_audio

# 🎛 Gradio UI (Blocks)
# Single-page UI: voice/text inputs, audio options, then the answer and its audio.
with gr.Blocks(title="📚 Urdu PDF QnA (Groq + FAISS + Voice)") as demo:
    gr.Markdown("## 📚 Urdu PDF QnA — Ask by voice or text\nUses OCR ➜ FAISS ➜ Groq ➜ optional voice answer.")

    with gr.Row():
        audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="🎙️ Speak or upload audio")
        text_in = gr.Textbox(label="💬 Or type your question (English or Roman Urdu)")

    with gr.Row():
        speak_toggle = gr.Checkbox(value=True, label="🔊 Speak the answer")
        # These choice strings are compared literally inside answer_with_voice.
        speak_lang = gr.Radio(
            choices=["Both", "English only", "Urdu only"],
            value="Both",
            label="Audio language"
        )

    ask_btn = gr.Button("Ask")
    answer_out = gr.Textbox(label="📘 Answer", lines=8)
    en_audio_out = gr.Audio(label="🔊 English audio", type="filepath")
    ur_audio_out = gr.Audio(label="🔊 Urdu audio", type="filepath")

    # The output order matches the handler's (answer, en_audio, ur_audio) return tuple.
    ask_btn.click(
        fn=answer_with_voice,
        inputs=[audio_in, text_in, speak_toggle, speak_lang],
        outputs=[answer_out, en_audio_out, ur_audio_out]
    )

# share=True asks Gradio for a public share link in addition to the local server.
demo.launch(share=True)




In [None]:
# app.py — Smart Zameen Dost (trimmed context + OCR fallback)
# -----------------------------------------------------------
# - LangChain 0.2.x (langchain_core.documents.Document)
# - Caps retrieval + context to avoid Groq 413/TPM
# - OCR fallback (pdf2image + pytesseract) for image-based PDFs

import os
import io
import re
import json
import tempfile
from datetime import datetime

import gradio as gr
import requests
import nltk
import PyPDF2
from gtts import gTTS
import whisper
import torch  # noqa: F401

from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# NEW: OCR libs
from pdf2image import convert_from_bytes
import pytesseract

# =========================
# 🔑 Keys (⚠️ hard-code only if you accept the risk)
# =========================
# Prefer keys already in the environment; the literals are only a local
# fallback. Before, the placeholder literal overwrote any real environment
# value and also passed the "not set" check below.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or "YOUR_GROQ_KEY_HERE"            # paste locally
WEATHER_API_KEY = os.environ.get("WEATHER_API_KEY") or "YOUR_OPENWEATHER_KEY_HERE"

if GROQ_API_KEY.strip() in ("", "YOUR_GROQ_KEY_HERE"):
    raise RuntimeError("GROQ_API_KEY not set.")
if WEATHER_API_KEY.strip() in ("", "YOUR_OPENWEATHER_KEY_HERE"):
    raise RuntimeError("WEATHER_API_KEY not set.")

# Export the resolved keys so code that reads os.environ sees the same values.
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
os.environ["WEATHER_API_KEY"] = WEATHER_API_KEY

groq_client = Groq(api_key=GROQ_API_KEY)

# =========================
# 🔧 Global limits to keep prompt small
# =========================
K_RETRIEVE = 3               # default number of chunks fetched per similarity search
PER_DOC_CHARS = 700          # per-chunk character cap
MAX_CONTEXT_CHARS = 4000     # total context cap
MAX_OUTPUT_TOKENS = 512      # intended cap on generated tokens

def _limit_chars(s: str, n: int) -> str:
    s = str(s or "")
    return s if len(s) <= n else (s[:n] + " …")

def _clip_context(snippets, max_chars: int) -> str:
    out, used = [], 0
    for snip in snippets:
        snip = str(snip or "")
        if used + len(snip) > max_chars:
            snip = snip[: max(0, max_chars - used)]
        if snip:
            out.append(snip)
            used += len(snip)
        if used >= max_chars:
            break
    return "\n\n".join(out)

# =========================
# 🔇 NLTK (quiet)
# =========================
# Best-effort download: startup continues either way, but a failure is now
# reported instead of being silently discarded.
try:
    for _nltk_resource in ("punkt", "stopwords"):
        # nltk.download returns False (rather than raising) when a fetch fails.
        if not nltk.download(_nltk_resource, quiet=True):
            print(f"⚠️ NLTK resource '{_nltk_resource}' could not be downloaded.")
except Exception as e:
    print(f"⚠️ NLTK download skipped: {e}")

# =========================
# 🤖 Models / Embeddings
# =========================
# Speech-to-text model, loaded once at import time.
print("🤖 Loading Whisper model...")
whisper_model = whisper.load_model("base")
print("✅ Whisper model loaded.")

# Multilingual sentence-embedding model instance.
print("🔤 Loading multilingual sentence embeddings...")
multilingual_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
print("✅ Multilingual embeddings loaded.")

# =========================
# 📄 Predefined Google Drive PDFs
# =========================
# Google Drive share links for the predefined PDFs.
PREDEFINED_PDF_LINKS = [
    "https://drive.google.com/file/d/1H7b-1PG2SLB99gjogfSl7QTOmLd1iGX0/view?usp=sharing",
    "https://drive.google.com/file/d/16VRkuegHXbhQPPH6kB3jTcdlg1eh95Og/view?usp=sharing",
    "https://drive.google.com/file/d/1e4Zi9vYXHEtuU_mkpBKDjZ-s0fIFqdzO/view?usp=sharing",
    "https://drive.google.com/file/d/149Js-w01KO085cRibqXyra9_oPNRAYqZ/view?usp=sharing"
]

# =========================
# 🧰 Helpers
# =========================
def safe_str(x) -> str:
    """Convert any value to ``str``; ``None`` becomes the empty string."""
    return "" if x is None else str(x)

def normalize_mixed_text(text: str) -> str:
    """Flatten whitespace and blank out symbols in mixed Urdu/English text.

    Whitespace runs are collapsed to one space and the ends trimmed first;
    afterwards every character that is not in the listed Arabic-script ranges,
    a word character, whitespace, or one of ``.,;:!?()-`` is replaced by a
    space (so the result may contain consecutive spaces).
    """
    disallowed = r"[^\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF\w\s.,;:!?()\-]"
    collapsed = re.sub(r"\s+", " ", safe_str(text))
    trimmed = collapsed.strip()
    return re.sub(disallowed, " ", trimmed)

def extract_numerical_data(text: str):
    """Pull prices, percentages and crop yields out of free text.

    Args:
        text: Any value; ``None`` is treated as empty text and other
            non-string values are converted with ``str``.

    Returns:
        dict containing only the categories that matched:
        ``"prices"`` (number strings that follow ``$`` or ``Rs``/``Rs.``),
        ``"percentages"`` (number strings that precede ``%``) and
        ``"yields"`` (``(amount, unit, area)`` tuples).
    """
    t = "" if text is None else str(text)
    info = {}
    # The currency marker must be "$" or the token "Rs"/"Rs.". The earlier
    # character class [\$Rs\.] also accepted a lone ".", "R" or "s", so the
    # fractional part of any decimal ("4.5 tons") was reported as a price.
    prices = re.findall(r"(?:\$|\bRs\.?)\s*(\d+(?:,\d{3})*(?:\.\d+)?)", t)
    if prices:
        info["prices"] = prices
    perc = re.findall(r"(\d+(?:\.\d+)?)\s*%", t)
    if perc:
        info["percentages"] = perc
    yields = re.findall(
        r"(\d+(?:\.\d+)?)\s*(tons?|kg|quintals?|maunds?)\s*(?:per|/)?\s*(acre|hectare|ایکڑ)",
        t,
        re.IGNORECASE,
    )
    if yields:
        info["yields"] = yields
    return info

# =========================
# 🖼️ OCR config
# =========================
OCR_LANGS = "urd+eng"   # requires tesseract-ocr-urd installed
OCR_DPI = 300
MAX_OCR_PAGES = None    # set an int to cap pages for speed (e.g., 10)

def ocr_pdf_bytes(pdf_content: bytes, dpi: int = OCR_DPI, langs: str = OCR_LANGS, max_pages=MAX_OCR_PAGES):
    """OCR full/partial PDF bytes -> text, pages_processed.

    Args:
        pdf_content: Raw PDF bytes.
        dpi: Rasterisation resolution handed to pdf2image.
        langs: Tesseract language string.
        max_pages: OCR only the first ``max_pages`` pages; ``None`` means all.

    Returns:
        ``(text, pages_processed)``. Each page contributes an
        ``--- OCR Page N ---`` section; pages with no text or an OCR error get
        a marker line instead, so one bad page does not abort the document.

    Raises:
        Whatever ``convert_from_bytes`` raises when the PDF cannot be rendered.
    """
    render_kwargs = {"dpi": dpi}
    # Stop rasterising at the cap: rendering every page at this DPI only to
    # discard most of them costs time and memory on long PDFs.
    if isinstance(max_pages, int) and max_pages > 0:
        render_kwargs["last_page"] = max_pages
    images = convert_from_bytes(pdf_content, **render_kwargs)
    if max_pages is not None:
        images = images[:max_pages]
    out = []
    for i, img in enumerate(images, start=1):
        try:
            txt = pytesseract.image_to_string(img, lang=langs)
            txt = (txt or "").strip()
            if txt:
                out.append(f"\n--- OCR Page {i} ---\n{txt}\n")
            else:
                out.append(f"\n--- OCR Page {i} (no text) ---\n")
        except Exception as e:
            out.append(f"\n--- OCR Page {i} (error: {e}) ---\n")
    return "".join(out), len(images)

# =========================
# 📥 Google Drive PDF Processor (with OCR fallback)
# =========================
class GoogleDrivePDFProcessor:
    """Fetch PDFs from Google Drive share links and extract their text.

    Every method is static; the class only serves as a namespace.
    """

    # Real (non-marker, non-whitespace) characters required before native
    # extraction is trusted, and before an OCR result is accepted.
    MIN_NATIVE_CHARS = 100
    MIN_OCR_CHARS = 20

    @staticmethod
    def convert_gdrive_link(share_link: str):
        """Turn a Drive share link into a direct-download URL.

        Recognises ``/file/d/<id>``, ``id=<id>`` and ``/d/<id>`` forms.

        Returns:
            The ``uc?export=download`` URL, or ``None`` when no file id is found.
        """
        patterns = [r"/file/d/([a-zA-Z0-9\-_]+)", r"id=([a-zA-Z0-9\-_]+)", r"/d/([a-zA-Z0-9\-_]+)"]
        link = safe_str(share_link)
        for pat in patterns:
            m = re.search(pat, link)
            if m:
                return f"https://drive.google.com/uc?export=download&id={m.group(1)}"
        return None

    @staticmethod
    def download_pdf_from_gdrive(gdrive_link: str):
        """Download the PDF behind a Drive share link.

        Returns:
            ``(content_bytes, "Success")`` on success, otherwise
            ``(None, message)`` describing why the download failed. This
            method never raises; unexpected errors are reported in the message.
        """
        try:
            download_link = GoogleDrivePDFProcessor.convert_gdrive_link(gdrive_link)
            if not download_link:
                return None, "Invalid Google Drive link format"

            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
            # One session for both requests so cookies set by the first
            # response are sent along with the confirm retry.
            with requests.Session() as session:
                resp = session.get(download_link, headers=headers, timeout=60)

                # Only decode the body as text when the server says it is HTML;
                # decoding a large binary PDF just to search it is wasteful.
                content_type = safe_str(resp.headers.get("Content-Type")).lower()
                if "text/html" in content_type:
                    page = safe_str(resp.text)
                    if ("confirm=" in page) or ("virus scan warning" in page.lower()):
                        token = re.search(r"confirm=([^&]+)", page)
                        if token:
                            confirmed = f"{download_link}&confirm={token.group(1)}"
                            resp = session.get(confirmed, headers=headers, timeout=60)

                if resp.status_code != 200:
                    return None, f"Download failed: HTTP {resp.status_code}"
                content = resp.content

            # A 200 response can still be an HTML sign-in or quota page.
            if b"%PDF" not in content[:1024]:
                return None, "Download failed: response is not a PDF (is the file shared publicly?)"
            return content, "Success"
        except Exception as e:
            return None, f"Download error: {e}"

    @staticmethod
    def _meaningful_chars(text) -> int:
        """Count non-whitespace characters, ignoring ``--- ... ---`` page-marker lines."""
        body = re.sub(r"(?m)^--- .* ---$", "", safe_str(text))
        return len(re.sub(r"\s+", "", body))

    @staticmethod
    def extract_text_from_pdf(pdf_content: bytes):
        """
        Extract text with PyPDF2; if too little text, fallback to OCR.
        Returns (text, page_count).

        If PyPDF2 cannot open the file, ``text`` starts with
        ``"PDF text extraction error:"`` and ``page_count`` is 0, unless the
        OCR fallback succeeds, in which case the OCR result is returned.
        """
        # 1) Try text extraction
        try:
            reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
            pages = len(reader.pages)
            out = []
            for i in range(pages):
                try:
                    pg = reader.pages[i]
                    t = (pg.extract_text() or "").strip()
                    if t:
                        out.append(f"\n--- Page {i+1} ---\n{t}\n")
                except Exception:
                    out.append(f"\n--- Page {i+1} (Error extracting) ---\n")
            text = "".join(out)
        except Exception as e:
            text = f"PDF text extraction error: {e}"
            pages = 0

        # 2) Decide whether to OCR
        # Count only real content: the page markers and error placeholders
        # added above must not make a scanned or broken PDF look text-rich.
        native_chars = GoogleDrivePDFProcessor._meaningful_chars(text)
        if (pages == 0) or (native_chars < GoogleDrivePDFProcessor.MIN_NATIVE_CHARS):
            try:
                ocr_text, ocr_pages = ocr_pdf_bytes(pdf_content, dpi=OCR_DPI, langs=OCR_LANGS, max_pages=MAX_OCR_PAGES)
                if GoogleDrivePDFProcessor._meaningful_chars(ocr_text) >= GoogleDrivePDFProcessor.MIN_OCR_CHARS:
                    return ocr_text, (ocr_pages or pages or 0)
            except Exception as ocr_err:
                # Best effort: keep the native result, but say why OCR was skipped.
                print(f"⚠️ OCR fallback failed: {ocr_err}")

        return text, pages

# =========================
# 🧠 Knowledge Base / RAG
# =========================
class AdvancedPakistaniAgriRAG:
    """Multilingual (Urdu + English) retrieval store for Pakistani agriculture.

    Constructing an instance builds a FAISS index from a few hard-coded seed
    facts, then downloads, extracts and indexes every PDF listed in
    ``PREDEFINED_PDF_LINKS``. Numeric facts (prices, percentages, yields) are
    extracted per chunk, so a retrieved chunk only carries the numbers that
    actually occur in its own text.
    """

    def __init__(self):
        # Shared module-level embedding model.
        self.embeddings = multilingual_embeddings
        # FAISS index; created by _setup_seed_knowledge().
        self.vector_store = None
        self.gdrive = GoogleDrivePDFProcessor()
        # One dict per attempted PDF: id, pages, chunks, source, status.
        self.processed_documents = []
        self._setup_seed_knowledge()
        self._auto_process_predefined_pdfs()

    @staticmethod
    def _split_into_chunks(docs):
        """Split documents into chunks of up to 800 characters with 100 characters of overlap."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=800, chunk_overlap=100, separators=["\n\n", "\n", "۔", ".", ":", ";", " "], length_function=len
        )
        return splitter.split_documents(docs)

    @staticmethod
    def _shorten_link(link):
        """Truncate links longer than 50 characters (adding ``...``) for display."""
        link = safe_str(link)
        return link[:50] + "..." if len(link) > 50 else link

    def _record_failure(self, doc_id, link, reason):
        """Log a PDF that could not be indexed and remember it in ``processed_documents``."""
        self.processed_documents.append(
            {"id": doc_id, "pages": 0, "chunks": 0, "source": self._shorten_link(link), "status": f"❌ Error: {reason}"}
        )
        print(f"❌ Doc {doc_id}: {reason}")

    def _setup_seed_knowledge(self):
        """Create the FAISS index from the built-in seed documents."""
        seed = [
            {
                "content": """Punjab Wheat Varieties for Export:
                اعلیٰ قسم کی گندم کی اقسام:
                - Anmol-91: Yield 45-50 maunds/acre, Export price $320-350/ton
                - Faisalabad-2008: High protein 12-14%, Premium export variety
                - Galaxy-2013: Disease resistant, Suitable for UAE market
                - Punjab-2011: Good for bread making, Export to Afghanistan
                Urdu: یہ اقسام برآمد کے لیے بہترین ہیں اور زیادہ قیمت ملتی ہے""",
                "metadata": {"type": "crop_varieties", "region": "Punjab", "crop": "wheat", "language": "mixed"},
            },
            {
                "content": """Rice Export Opportunities - چاول کی برآمدات:
                Basmati Varieties with International Prices:
                - Super Basmati: $900-1200/ton (UAE, Saudi Arabia)
                - Basmati 385: Premium grade, $1000-1300/ton
                - IRRI-6: $450-550/ton (Philippines, Malaysia)
                - Kainaat: $700-850/ton (Middle East markets)

                Export Requirements:
                - Moisture: Maximum 14%
                - Broken grains: Less than 5%
                - Length: Minimum 6.0mm for Basmati

                اردو میں: بسمتی چاول کی برآمد سب سے زیادہ منافع بخش ہے""",
                "metadata": {"type": "export_markets", "crop": "rice", "price_range": "450-1300", "language": "mixed"},
            },
            {
                "content": """Government Support Schemes - حکومتی اسکیمز:
                Kisan Card Program:
                - 25% subsidy on fertilizers
                - 20% discount on certified seeds
                - Easy loan access through banks

                Solar Tube Well Scheme:
                - 60% government subsidy
                - Remaining 40% through easy installments
                - Electricity bill savings: Rs. 50,000+ annually

                Crop Insurance Program:
                - Premium: 5% of sum insured
                - Government pays 75% of premium
                - Coverage: Natural disasters, pest attacks

                کسان ڈویلپمنٹ پروگرام سے مفت تربیت اور مشورے""",
                "metadata": {"type": "government_schemes", "schemes": "kisan_card,solar_tubewell,crop_insurance", "language": "mixed"},
            },
        ]

        docs = [
            Document(page_content=normalize_mixed_text(item["content"]), metadata=dict(item.get("metadata") or {}))
            for item in seed
        ]

        pieces = self._split_into_chunks(docs)
        for piece in pieces:
            # Seed chunks keep their numbers as flat metadata keys
            # ("prices", "percentages", "yields").
            piece.metadata.update(extract_numerical_data(piece.page_content))

        self.vector_store = FAISS.from_documents(pieces, self.embeddings)
        print("✅ Seed agricultural knowledge initialized with", len(pieces), "chunks.")

    def _auto_process_predefined_pdfs(self):
        """Download, extract, chunk and index every link in ``PREDEFINED_PDF_LINKS``.

        Every attempt, successful or not, is recorded in
        ``self.processed_documents``; one bad document never stops the rest.
        """
        if not PREDEFINED_PDF_LINKS:
            print("ℹ️ No predefined PDFs configured.")
            return

        print(f"🚀 Auto-processing {len(PREDEFINED_PDF_LINKS)} Google Drive PDF(s)...")
        ok = 0
        for i, link in enumerate(PREDEFINED_PDF_LINKS, start=1):
            try:
                blob, msg = self.gdrive.download_pdf_from_gdrive(link)
                if blob is None:
                    self._record_failure(i, link, msg)
                    continue

                text, pages = self.gdrive.extract_text_from_pdf(blob)
                if "pdf text extraction error" in safe_str(text).lower():
                    self._record_failure(i, link, text)
                    continue

                if len(safe_str(text).strip()) < 100:
                    # `text` is already the final result (native or OCR).
                    print(f"ℹ️ Doc {i}: very little text extracted, even after the OCR fallback.")

                processed = normalize_mixed_text(text)
                short_link = self._shorten_link(link)

                doc = Document(
                    page_content=processed,
                    metadata={
                        "type": "auto_processed_pdf",
                        "source": f"Auto PDF {i}",
                        "pages": pages,
                        "processing_date": datetime.now().strftime("%Y-%m-%d %H:%M"),
                        "original_link": short_link,
                    },
                )

                chunks = self._split_into_chunks([doc])
                if not chunks:
                    # Nothing to embed; handing FAISS an empty batch is not useful.
                    self._record_failure(i, link, "no extractable text")
                    continue

                for chunk in chunks:
                    # Per-chunk numbers: attaching the whole document's numbers
                    # to every chunk would flood the retrieved context.
                    chunk.metadata["numerical_data"] = extract_numerical_data(chunk.page_content)

                if self.vector_store is not None:
                    self.vector_store.add_documents(chunks)
                else:
                    self.vector_store = FAISS.from_documents(chunks, self.embeddings)

                self.processed_documents.append(
                    {"id": i, "pages": pages, "chunks": len(chunks), "source": short_link, "status": "✅ Success"}
                )
                print(f"✅ Doc {i}: {pages} pages → {len(chunks)} chunks")
                ok += 1
            except Exception as e:
                self._record_failure(i, link, e)

        print(f"🎉 Finished: {ok}/{len(PREDEFINED_PDF_LINKS)} document(s) processed.")

    def get_stats_html(self) -> str:
        """Return a short text summary of what has been indexed.

        Only successfully indexed PDFs are counted; when there are none, the
        seed-only message is returned.
        """
        succeeded = [d for d in self.processed_documents if safe_str(d.get("status")).startswith("✅")]
        if not succeeded:
            return "📊 Knowledge Base: Seed Pakistani agricultural data only (no PDFs yet)"
        total_chunks = sum(d.get("chunks", 0) for d in succeeded)
        total_pages = sum(d.get("pages", 0) for d in succeeded)
        return f"""📊 Knowledge Base Statistics:

🗂️ Auto-processed Documents: {len(succeeded)}
📄 Total Pages Processed: {total_pages}
🧩 Total Text Chunks: {total_chunks}
📚 Seed Knowledge: Pakistani agriculture (Urdu + English)
🔍 Search Capability: Multilingual (English + Urdu)
✅ Status: Ready for queries
"""

    def get_relevant_info(self, query: str, k: int = K_RETRIEVE) -> str:
        """Retrieve the ``k`` most similar chunks and format them as LLM context.

        The result starts with a summary of the numbers found in the retrieved
        chunks (when any), followed by the chunk texts. Chunk bodies are passed
        through ``_limit_chars`` and the final context through ``_clip_context``.

        Returns:
            The context string, ``"No relevant information found."`` when the
            clipped context is empty, or an error message string. This method
            does not raise for retrieval errors.
        """
        if self.vector_store is None:
            return "Knowledge base not available"
        try:
            q = safe_str(query)
            hits = self.vector_store.similarity_search(q, k=k)

            snippets = []
            nums_summary = []

            for i, doc in enumerate(hits, start=1):
                body = _limit_chars(doc.page_content, PER_DOC_CHARS)
                snippets.append(f"معلومات {i}: {body}")

                meta = doc.metadata or {}
                if not isinstance(meta, dict):
                    meta = {}
                # PDF chunks nest their numbers under "numerical_data"; seed
                # chunks store them as flat keys. Merge both into one view.
                nd = meta.get("numerical_data")
                if isinstance(nd, dict):
                    meta = {**meta, **nd}

                if isinstance(meta.get("prices"), list) and meta["prices"]:
                    nums_summary.append(f"💰 قیمتیں: {', '.join(map(safe_str, meta['prices']))}")
                if isinstance(meta.get("percentages"), list) and meta["percentages"]:
                    # Every value gets its own "%" sign, not just the last one.
                    pct = ", ".join(f"{safe_str(p)}%" for p in meta["percentages"])
                    nums_summary.append(f"📊 فیصد: {pct}")
                if isinstance(meta.get("yields"), list) and meta["yields"]:
                    y_fmt = []
                    for y in meta["yields"]:
                        try:
                            val, unit, per = y
                            y_fmt.append(f"{val} {unit} per {per}")
                        except Exception:
                            y_fmt.append(safe_str(y))
                    nums_summary.append(f"🌾 پیداوار: {', '.join(y_fmt)}")

            # Overlapping chunks can yield identical lines; keep the first of each.
            nums_summary = list(dict.fromkeys(nums_summary))

            context = "\n\n".join(snippets)
            if nums_summary:
                context = "📈 اہم اعداد و شمار:\n" + "\n".join(nums_summary) + "\n\n" + context

            return _clip_context([context], MAX_CONTEXT_CHARS) or "No relevant information found."
        except Exception as e:
            return f"Error retrieving information: {e}"

# =========================
# 🚀 Initialize RAG
# =========================
# Constructing the knowledge base does all of its work up front: __init__
# builds the seed index and then processes PREDEFINED_PDF_LINKS, so this line
# performs network downloads and embedding before the UI is built.
print("🧠 Initializing Advanced Pakistani Agricultural Knowledge Base...")
pak_agri_rag = AdvancedPakistaniAgriRAG()

# =========================
# 🎙️ Voice, 🌦️ Weather, 🤝 AI
# =========================
def voice_to_text(audio_file_path):
    """Transcribe an audio file with Whisper (Urdu) and normalise the transcript.

    Returns an empty string when no path is given. If transcription raises,
    the return value is an Urdu error message that embeds the exception text.
    """
    if audio_file_path:
        try:
            transcription = whisper_model.transcribe(audio_file_path, language="ur")
            raw_text = transcription.get("text", "")
            return normalize_mixed_text(raw_text)
        except Exception as err:
            return f"آواز سمجھ نہیں آئی: {err}"
    return ""

def get_weather_with_farming_advice(city="Lahore"):
    """Fetch current weather for a Pakistani city and attach one farming tip.

    Args:
        city: City name; blank or ``None`` falls back to ``"Lahore"``.

    Returns:
        An Urdu string with temperature, humidity, wind and description,
        followed by a blank line and one piece of advice. The advice is chosen
        by the first matching rule: heat (> 35°C), humidity (> 80%), cold
        (< 10°C), wind (> 5 m/s), otherwise "good weather". On any failure an
        Urdu error message is returned instead; this function does not raise.
    """
    try:
        city = safe_str(city).strip() or "Lahore"
        # HTTPS keeps the API key off the wire in clear text, and `params`
        # lets requests encode the user-supplied city, so spaces or "&" in the
        # name cannot corrupt or inject query parameters.
        resp = requests.get(
            "https://api.openweathermap.org/data/2.5/weather",
            params={"q": f"{city},PK", "appid": WEATHER_API_KEY, "units": "metric"},
            timeout=20,
        )
        try:
            data = resp.json()
        except Exception:
            return "موسمی JSON درست نہیں۔"
        if not isinstance(data, dict):
            return "موسمی JSON درست نہیں۔"

        main = data.get("main") or {}
        wind = data.get("wind") or {}
        weather_l = data.get("weather") or [{}]

        temp = main.get("temp")
        humidity = main.get("humidity")
        wind_speed = wind.get("speed")
        description = weather_l[0].get("description", "")

        # Error payloads (e.g. unknown city) lack these fields.
        if any(v is None for v in (temp, humidity, wind_speed)):
            return "موسمی معلومات مکمل نہیں مل سکیں۔"

        if temp > 35:
            advice = f"⚠️ زیادہ گرمی ({temp}°C): صبح 6-8 بجے پانی دیں، دوپہر میں نہیں۔ پانی کی مقدار 20% بڑھائیں۔"
        elif humidity > 80:
            advice = f"🌧️ زیادہ نمی ({humidity}%): فنگیسائیڈ سپرے کریں۔ Mancozeb 2g/لیٹر یا Copper Oxychloride 3g/لیٹر۔"
        elif temp < 10:
            advice = f"❄️ سردی ({temp}°C): پودوں کو ڈھانپیں، پانی 50% کم دیں۔ Frost protection ضروری۔"
        elif wind_speed > 5:
            advice = f"💨 تیز ہوا ({wind_speed} m/s): کیڑے مار دوا کا سپرے نہ کریں۔ Wind barriers لگائیں۔"
        else:
            advice = f"✅ موسم اچھا ہے ({temp}°C, {humidity}% نمی): کھیتی کے کام کر سکتے ہیں۔"

        return f"آج {city} میں {temp}°C، نمی {humidity}%، ہوا {wind_speed} m/s، موسم {description}\n\n{advice}"
    except Exception as e:
        return f"موسمی معلومات نہیں مل سکیں: {e}"

def text_to_voice(text):
    """Synthesize Urdu speech for ``text`` and return the path of an MP3 file.

    The text is normalised first and truncated to 500 characters (with a
    spoken note pointing to the full written answer).

    Returns:
        Path to a temporary ``.mp3`` file that the caller owns, or ``None``
        when there is nothing to speak or synthesis fails. This function does
        not raise.
    """
    mp3_path = None
    try:
        clean = normalize_mixed_text(text)
        if not clean.strip():
            # Nothing to say; skip the TTS call instead of relying on it to fail.
            return None
        if len(clean) > 500:
            clean = clean[:500] + "... مکمل جواب اوپر پڑھیں"
        tts = gTTS(text=clean, lang="ur", slow=False)
        # Reserve a unique file name, then close the handle before gTTS writes
        # to it: re-opening a still-open temporary file by name is not
        # portable across platforms.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            mp3_path = tmp.name
        tts.save(mp3_path)
        return mp3_path
    except Exception as e:
        print(f"TTS Error: {e}")
        # Do not leave an empty or partial temp file behind on failure.
        if mp3_path and os.path.exists(mp3_path):
            try:
                os.remove(mp3_path)
            except OSError:
                pass
        return None

def get_enhanced_ai_response(user_message: str, location: str = "") -> str:
    """Answer a farmer's question with the LLM, grounded in retrieved context.

    Context is fetched from ``pak_agri_rag`` for ``user_message`` and sent,
    together with the location and the question, to the Groq chat completion
    endpoint. API failures are turned into Urdu apology strings rather than
    raised; rate-limit / size errors get their own message.
    """
    context_text = pak_agri_rag.get_relevant_info(user_message)

    instructions = " ".join(
        [
            "You are Zameen Dost, a Pakistani agriculture advisor.",
            "Answer in simple Urdu, start with 'بھائی', use numbers when available,",
            "and keep it concise and actionable. If weather is included, integrate it.",
            "Only use the provided context; do not invent facts.",
        ]
    )
    question_block = f"Context:\n{context_text}\n\nLocation: {safe_str(location)}\nQuestion: {safe_str(user_message)}"

    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": question_block},
    ]

    try:
        completion = groq_client.chat.completions.create(
            messages=conversation,
            model="llama-3.1-8b-instant",
            max_tokens=MAX_OUTPUT_TOKENS,
            temperature=0.5,
        )
        return completion.choices[0].message.content
    except Exception as err:
        detail = safe_str(err)
        # Substrings that identify throttling or over-long requests.
        throttle_markers = ("rate_limit", "tokens per minute", "Request too large")
        if any(marker in detail for marker in throttle_markers):
            return "معذرت، پیغام بڑا تھا یا رفتار حد سے زیادہ تھی۔ براہِ کرم چھوٹا سوال کریں، یا دوبارہ کوشش کریں۔"
        return f"معذرت، AI سے رابطہ نہیں ہو سکا: {err}"

# =========================
# 💬 Main chat handler
# =========================
def zameen_dost_advanced_chat(audio_input, text_input, city_name, focus_area):
    """Gradio handler: turn one voice or text question into a text + audio answer.

    Args:
        audio_input: Path of a recorded audio file, or a falsy value.
        text_input: Typed question, or a falsy value.
        city_name: City used for the weather lookup and passed to the LLM.
        focus_area: Dropdown value; anything other than the generic choice is
            appended to the question as the farmer's interest.

    Returns:
        ``(input_display, audio_path_or_None, answer_text)``, matching the
        three output components wired to the button.

    Audio takes precedence over text, but when the recording produces no
    usable transcript the typed question is used instead of rejecting the
    request. Weather keywords are matched case-insensitively.
    """
    user_message = ""
    input_display = ""

    if audio_input:
        spoken = voice_to_text(audio_input)
        if isinstance(spoken, str) and spoken.strip():
            user_message = spoken
            input_display = f"💬 آپ نے کہا: {user_message}"

    if not user_message and text_input:
        user_message = safe_str(text_input)
        input_display = f"💬 آپ نے لکھا: {user_message}"

    if not isinstance(user_message, str) or not user_message.strip():
        return "کرپیا کوئی سوال پوچھیں", None, "❌ کوئی سوال نہیں ملا"

    enhanced = user_message
    if focus_area and safe_str(focus_area) != "عام سوال":
        enhanced += f" (کسان کی دلچسپی: {focus_area})"

    terms = ["موسم", "بارش", "پانی", "weather", "irrigation", "spray", "سپرے"]
    # Lower-case once so "Weather" / "SPRAY" trigger the lookup too; the Urdu
    # keywords are unaffected by lower().
    lowered = user_message.lower()
    if any(t in lowered for t in terms):
        weather_info = get_weather_with_farming_advice(city_name or "Lahore")
        enhanced += f"\n\nموسمی حالات: {weather_info}"

    ai_response = get_enhanced_ai_response(enhanced, city_name or "")
    voice_response = text_to_voice(ai_response)
    return input_display, voice_response, ai_response

# =========================
# 🖥️ UI
# =========================
# Build the Gradio interface. Components are only declared here; the app is
# started by the launch call further down.
with gr.Blocks(
    title="Smart Zameen Dost - زمین دوست",
    theme=gr.themes.Base(),
    # Custom CSS: page background/font, the header card, and the stats panel.
    css="""
    .gradio-container { background: linear-gradient(135deg, #f8fdff 0%, #e8f7f8 100%); font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
    .header-box { background: white; padding: 20px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); margin: 10px 0; border-left: 4px solid #2E8B57; }
    .stats-box { background: linear-gradient(45deg, #e8f5e8, #f0f8e8); padding: 15px; border-radius: 8px; border: 1px solid #c8e6c9; margin: 10px 0; font-size: 0.9em; }
    """
) as app:
    # Header card with the app title and Urdu tagline.
    gr.HTML("""
        <div class='header-box'>
          <div style='text-align: center;'>
            <h1 style='color: #2E8B57; font-size: 2.2em; margin: 0 0 8px 0;'>🌾 Smart Zameen Dost</h1>
            <p style='color: #666; font-size: 1.1em; margin: 0;'>پاکستانی کسانوں کا ذہین مشیر</p>
          </div>
        </div>
    """)

    with gr.Row():
        # Left column: question inputs (microphone or text), city and focus area.
        with gr.Column(scale=1):
            gr.Markdown("### 🎤 اپنا سوال پوچھیں")
            # type="filepath": the handler receives a path to the recording.
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="آواز میں پوچھیں")
            text_input = gr.Textbox(label="یا یہاں لکھیں (اردو/English)", placeholder="مثال: کون سی فصل زیادہ منافع دے گی؟", lines=2)
            with gr.Row():
                city_input = gr.Textbox(label="آپ کا شہر", placeholder="Lahore, Karachi, Faisalabad", value="Lahore", scale=1)
                # The default choice is the value zameen_dost_advanced_chat
                # treats as "no specific focus".
                focus_area = gr.Dropdown(
                    label="دلچسپی کا شعبہ",
                    choices=["عام سوال","برآمدی فصلیں","گندم کی کاشت","چاول کی کاشت","کپاس کی کاشت","سبزیوں کی کاشت","پھلوں کی کاشت","کھاد اور بیج","بیماریوں کا علاج","حکومتی اسکیمز","منڈی کی قیمتیں"],
                    value="عام سوال",
                    scale=1,
                )
            chat_btn = gr.Button("🚀 جواب حاصل کریں", variant="primary", size="lg")

        # Right column: echoed question, spoken answer and written answer.
        with gr.Column(scale=1):
            gr.Markdown("### 🧠 ذہین جواب")
            input_display = gr.Textbox(label="آپ کا سوال", lines=2, interactive=False)
            audio_output = gr.Audio(label="🔊 آواز میں جواب")
            text_output = gr.Textbox(label="📝 تفصیلی جواب", lines=10, interactive=False, show_copy_button=True)

    with gr.Row():
        # The stats text is computed once, when the UI is built.
        kb_stats = gr.HTML(value=pak_agri_rag.get_stats_html(), elem_classes=["stats-box"])

    # The handler's 3-tuple return maps, in order, onto the three outputs.
    chat_btn.click(
        zameen_dost_advanced_chat,
        inputs=[audio_input, text_input, city_input, focus_area],
        outputs=[input_display, audio_output, text_output],
    )

print("🎉 App ready!")
# Report how many PDFs were actually indexed, not merely how many links are
# configured: a failed download must not be announced as "auto-processed".
indexed_ok = sum(
    1 for entry in pak_agri_rag.processed_documents if safe_str(entry.get("status")).startswith("✅")
)
print(f"✅ Auto-processed {indexed_ok}/{len(PREDEFINED_PDF_LINKS)} Google Drive PDF link(s)")
print("🔍 Multilingual RAG + Voice + Weather integrated")

# Close any Gradio servers left over from earlier runs of this cell, then
# start the app with a public share link. debug=True keeps the cell running
# so errors and logs stay visible.
gr.close_all()
app.launch(share=True, debug=True, show_api=False)


🤖 Loading Whisper model...
✅ Whisper model loaded.
🔤 Loading multilingual sentence embeddings...
✅ Multilingual embeddings loaded.
🧠 Initializing Advanced Pakistani Agricultural Knowledge Base...
✅ Seed agricultural knowledge initialized with 3 chunks.
🚀 Auto-processing 4 Google Drive PDF(s)...
✅ Doc 1: 322 pages → 1516 chunks
✅ Doc 2: 4 pages → 17 chunks
✅ Doc 3: 4 pages → 10 chunks
✅ Doc 4: 4 pages → 16 chunks
🎉 Finished: 4/4 document(s) processed.
🎉 App ready!
✅ Auto-processed 4 Google Drive PDF link(s)
🔍 Multilingual RAG + Voice + Weather integrated
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://14e474dde1316cb860.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
