In [None]:
!pip install reportlab
!pip install docx2pdf
!pip install Pillow
!pip install markdown2
!pip install weasyprint
!pip install python-pptx
!pip install PyMuPDF
!pip install textract
!pip install openpyxl
!pip install beautifulsoup4
!pip install python-docx
!pip install openpyxl
!pip install gradio

In [None]:
import gradio as gr
import fitz  # PyMuPDF
import docx2txt
import os
import requests
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains.question_answering import load_qa_chain
from langchain import PromptTemplate

# Function to extract text from PDF, DOCX, or TXT files
def extract_text_from_file(file_path):
    """Return the plain text contained in a PDF, DOCX, or TXT file.

    The file type is chosen from the file extension, compared
    case-insensitively so that e.g. ``REPORT.PDF`` is handled like
    ``report.pdf``.

    Args:
        file_path: Path of the file to read (``str`` or ``os.PathLike``).

    Returns:
        The extracted text, or an empty string when the extension is not one
        of ``.pdf``, ``.docx`` or ``.txt``.
    """
    lowered = os.fspath(file_path).lower()

    if lowered.endswith(".pdf"):
        with fitz.open(file_path) as doc:
            # Join once instead of growing a string page by page.
            return "".join(page.get_text() for page in doc)

    if lowered.endswith(".docx"):
        return docx2txt.process(file_path)

    if lowered.endswith(".txt"):
        # Decode explicitly as UTF-8 rather than the platform default, and
        # replace undecodable bytes so one stray byte does not abort the upload.
        with open(file_path, "r", encoding="utf-8", errors="replace") as file:
            return file.read()

    return ""

# Function to load documents and set up question-answering pipeline
def ask_question(question, files):
    """Answer ``question`` from the text of the uploaded ``files``.

    The text of every file is extracted, split into overlapping chunks,
    embedded into a Chroma vector store, and the chunks most relevant to the
    question are passed to a Gemini chat model through a "stuff" QA chain.

    Args:
        question: The user's question as plain text.
        files: The uploaded files, either as path strings or as objects that
            expose the path through a ``name`` attribute. May be ``None`` or
            empty when nothing has been uploaded.

    Returns:
        A dict with a single ``"output_text"`` key holding the answer. When
        the request cannot be processed (no files, blank question, no
        extractable text) the same key holds an explanatory message.
    """
    if not files:
        return {"output_text": "Please upload at least one document before asking a question."}
    if not question or not question.strip():
        return {"output_text": "Please type in a question."}

    # Load text from uploaded files; accept both path strings and file
    # objects carrying the path in ``.name``.
    paths = [getattr(item, "name", item) for item in files]
    texts = [extract_text_from_file(path) for path in paths]

    # split_text() works on one string, not a list, so merge the documents
    # first and skip files that yielded no text.
    content = "\n\n".join(text for text in texts if text)
    if not content.strip():
        return {"output_text": "No text could be extracted from the uploaded documents."}

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_text(content)

    # Set up prompt for question-answering
    prompt_template = """
    Please answer the question in as much detail as possible based on the provided context.
    Ensure to include all relevant details. If the answer is not available in the provided context,
    kindly respond with "The answer is not available in the context." Please avoid providing incorrect answers.

    Context:\n {context}?\n
    Question: \n{question}\n

    Answer:
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    # Load question-answering model
    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)

    # Initialize embeddings and vector store. A fresh collection per request
    # keeps chunks from earlier uploads out of the results, because the
    # default collection name is shared by every call in the same process.
    import uuid

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vector_store = Chroma.from_texts(
        chunks,
        embeddings,
        collection_name=f"chatdocs-{uuid.uuid4().hex}",
    )
    try:
        # Get relevant documents for the question
        docs = vector_store.as_retriever().get_relevant_documents(question)
    finally:
        # The collection is only needed for this one lookup.
        vector_store.delete_collection()

    # Get response from question-answering model
    response = chain({"input_documents": docs, "question": question}, return_only_outputs=True)

    return response

# Interface setup
html = """
<div style="text-align:center; max-width: 700px;">
    <h1>ChatDocuments</h1>
    <p>Upload documents (PDF, DOCX, or TXT), type in your question, then click on Submit query.<br>
    The answer is generated from the content of the uploaded documents =)</p>
</div>"""
# NOTE(review): `container` is a bare element selector, so this rule probably
# matches nothing in the rendered page; `.gradio-container` may have been
# intended — confirm before changing the layout.
css = """container{max-width:700px; margin-left:auto; margin-right:auto; padding:20px}"""


def _answer_text(question, files):
    """Run ``ask_question`` and return only the answer text.

    ``ask_question`` returns a dict keyed by ``"output_text"``; passing that
    dict straight to a textbox would display its repr instead of the answer.
    """
    result = ask_question(question, files)
    if isinstance(result, dict):
        return result.get("output_text", result)
    return result


with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
    gr.HTML(html)
    with gr.Column():
        gr.Markdown('ChatDocuments')
        file_upload = gr.Files(label="Upload documents", file_types=['.pdf', '.docx', '.txt'])
        input_text = gr.Textbox(label="Type in your question")
        output_text = gr.Textbox(label="Answer")
        submit_query_button = gr.Button("Submit query")

        # The question and the uploaded files go in; the answer text comes out.
        submit_query_button.click(_answer_text, inputs=[input_text, file_upload], outputs=output_text)

demo.launch()