In [None]:
# Step 1: Install dependencies
!pip install PyMuPDF langchain langchain-community chromadb sentence-transformers transformers gradio -qq

# Step 2: Import necessary libraries
import fitz  # PyMuPDF
import gradio as gr
import tempfile
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Step 3: Define functions to load PDF, create embeddings, and answer questions

# Function to load and extract text from a PDF file
def load_pdf(pdf_path) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages concatenated in page order, with nothing
        inserted between pages.
    """
    with fitz.open(pdf_path) as doc:
        # Build the result with a single join rather than repeated ``+=`` so
        # the cost stays linear in the amount of text for long documents.
        return "".join(page.get_text() for page in doc)

# Load the embedding model and the LLM once at module level so every call to
# answer_question reuses them instead of reloading them per request.
# (The vector store itself is NOT created here; it is built per call.)
# Sentence-transformers model used to embed the PDF text chunks.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Hugging Face checkpoint used for both the tokenizer and the seq2seq model.
model_id = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
# Text-to-text generation pipeline; max_length=512 is forwarded to the pipeline.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
# Wrap the transformers pipeline so LangChain chains can use it as their LLM.
llm = HuggingFacePipeline(pipeline=pipe)

# Main function to process PDF, generate embeddings, and answer a question
def answer_question(pdf_file, question) -> str:
    """Answer a question about an uploaded PDF with a retrieval QA chain.

    The PDF text is split into overlapping chunks, embedded into a Chroma
    store that lives in a temporary directory for the duration of the call,
    and the question is answered by a "stuff" RetrievalQA chain over it.

    Args:
        pdf_file: The upload handed over by the Gradio File input. Either a
            path string or an object exposing the path as ``.name``;
            ``None`` when nothing has been uploaded.
        question: The user's question.

    Returns:
        The answer produced by the chain, or a short message describing what
        is missing when the inputs cannot be processed.
    """
    # Guard clauses: fail with a readable message instead of an
    # AttributeError on a missing upload or a pointless LLM call on a
    # blank question.
    if pdf_file is None:
        return "Please upload a PDF file."
    cleaned_question = (question or "").strip()
    if not cleaned_question:
        return "Please enter a question."

    # Accept both a plain path string and a file-like object with ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Load and split the PDF text
    pdf_text = load_pdf(pdf_path)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(pdf_text)
    if not texts:
        # e.g. a scanned/image-only PDF: there is nothing to embed or retrieve.
        return "No extractable text was found in this PDF."

    # Generate embeddings and store them in a new ChromaDB instance.
    # A fresh temporary directory per call avoids conflicts between uploads.
    with tempfile.TemporaryDirectory() as temp_dir:
        vectorstore = Chroma.from_texts(texts, embedding_model, persist_directory=temp_dir)
        qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever())

        # Run the Q&A chain with the user's question
        answer = qa_chain.run(cleaned_question)
    return answer

# Step 4: Set up the Gradio interface
# Input and output widgets. The order of `inputs` in gr.Interface below must
# match the (pdf_file, question) parameter order of answer_question.
pdf_input = gr.File(label="Upload PDF")
question_input = gr.Textbox(label="Enter your question")
answer_output = gr.Textbox(label="Answer")

# Wire the widgets to answer_question: its return value is shown in the
# "Answer" textbox.
interface = gr.Interface(
    fn=answer_question,
    inputs=[pdf_input, question_input],
    outputs=answer_output,
    title="PDF Question Answering",
    description="Upload a PDF and ask any question to get detailed answers from the document."
)

# Launch the Gradio interface
interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9410af1ea2c2789609.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


