In [None]:
!pip install -q langchain faiss-cpu PyMuPDF sentence-transformers groq

In [None]:
from google.colab import userdata


In [None]:
import os
import requests
from typing import List, Optional
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms.base import LLM
from pydantic import Field
from groq import Groq

# 🔑 Read the Groq API key from the Colab secret named 'groq-apikey' and expose
# it through the environment variable that GroqLLM reads.
os.environ["GROQ_API_KEY"] = userdata.get('groq-apikey')

# 📥 Download sample PDF (skipped when a local copy already exists)
pdf_url = "https://www.lkouniv.ac.in/site/writereaddata/siteContent/202005012116016435Ranvijay-Pratap-Singh-Environmental-Pollution.pdf"
pdf_path = "Pollution.pdf"
if not os.path.exists(pdf_path):
    # Fetch and validate BEFORE opening the output file: otherwise a failed
    # request leaves an empty (or error-page) file behind, and the exists()
    # check above would silently reuse that broken file on every later run.
    pdf_response = requests.get(pdf_url, timeout=60)
    pdf_response.raise_for_status()
    with open(pdf_path, "wb") as f:
        f.write(pdf_response.content)

# 📄 Load the PDF and split it into overlapping chunks
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

# 🧠 Embed the chunks and index them in an in-memory FAISS vector store
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(chunks, embeddings)

# 🧠 Groq-compatible LLM wrapper (Pydantic-safe)
class GroqLLM(LLM):
    """LangChain ``LLM`` wrapper that answers prompts via Groq chat completions.

    Each prompt is sent as a single ``user`` message; the text of the first
    returned choice is the completion.

    Fields:
        model: Groq model identifier passed to the API.
        temperature: Sampling temperature passed to the API.
        groq_api_key: API key; defaults to the ``GROQ_API_KEY`` environment
            variable as it is set when the instance is created.
    """

    model: str = "llama3-8b-8192"
    temperature: float = 0.3
    # Optional[...] because os.environ.get() returns None when the variable is
    # unset; the previous plain `str` annotation did not match that default.
    groq_api_key: Optional[str] = Field(default_factory=lambda: os.environ.get("GROQ_API_KEY"))

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[object] = None,
        **kwargs: object,
    ) -> str:
        """Send ``prompt`` to Groq and return the generated text.

        Args:
            prompt: Full prompt text, sent as one ``user`` chat message.
            stop: Optional stop sequences; forwarded to the API when given.
            run_manager: Callback manager supplied by LangChain; accepted to
                match the base ``LLM._call`` signature and not used here.
            **kwargs: Extra call-time options from LangChain; accepted so that
                they do not raise ``TypeError``, and not forwarded to Groq.

        Returns:
            The content of the first choice, or ``""`` if it has no content.
        """
        request = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": self.temperature,
        }
        # Previously `stop` was silently dropped; only include it when set so
        # the request is unchanged for callers that pass no stop sequences.
        if stop:
            request["stop"] = stop

        client = Groq(api_key=self.groq_api_key)
        response = client.chat.completions.create(**request)
        # Guard against a missing content field so the declared `str` return
        # type always holds.
        return response.choices[0].message.content or ""

    @property
    def _llm_type(self) -> str:
        """Short identifier LangChain uses to label this LLM implementation."""
        return "groq"

# 💬 Prompt text with {context} and {question} placeholders
qa_template = """
You are a helpful assistant. Use the following context to answer the question.

Context:
{context}

Question: {question}
Answer:"""
prompt = PromptTemplate(template=qa_template, input_variables=["context", "question"])

# 🔧 Build the RetrievalQA chain from the Groq LLM and the vector-store retriever
llm = GroqLLM()
retriever = vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
)

# ❓ Ask multiple questions
questions = [
    "Summarize the paper in simple terms.",
    # Fixed the misspelled "polutions" so the retriever and the model receive
    # a well-formed query.
    "What types of pollution does this paper discuss?",
    "Who are the authors and what institution are they from?",
    "Is there any way to avoid pollution?",
]

for q in questions:
    # The chain returns a dict with 'query' and 'result' keys; print only the
    # answer text instead of the raw dict.
    answer = qa.invoke({"query": q})
    print("Q:", q)
    print("A:", answer["result"])
    print("-" * 50)


Q: Summarize the paper in simple terms.
A: {'query': 'Summarize the paper in simple terms.', 'result': "Here's a summary of the paper in simple terms:\n\nThe paper discusses the importance of preventing pollution in the environment, specifically in soil and water. Soil pollution can occur due to excessive irrigation, industrial waste, and other factors, which can reduce crop yield and affect human health. To prevent soil pollution, measures such as proper tree plantation, treating industrial waste, and using organic fertilizers can be taken.\n\nWater pollution can also occur due to point source pollution, such as industrial waste and municipal sources, as well as other factors. This can harm living organisms and affect the environment. To prevent water pollution, measures such as proper monitoring of sea water, campaigns to prohibit marine pollution, and using technologies such as scrubbers and electrostatic scrubbers can be taken.\n\nOverall, the paper emphasizes the importance of tak