<a href="https://colab.research.google.com/github/Anupam-chand/Multi_Doc_QnA_with_RAG/blob/main/Gardio_for_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Installing Required Libraries
%pip install python-docx
%pip install python-pptx
%pip install PyPDF2
%pip install langchain
!pip install pandas
!pip install openpyxl
!pip install -q langchain_core
%pip install langchain_community
%pip install langchain_text_splitters
%pip install sentence-transformers
%pip install faiss-cpu
%pip install cohere

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/253.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0
Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading xlsxwriter-3.2.5-py3-none-any.whl.metadata (2.7 kB)
Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownload

In [3]:
import gradio as gr
import os
import pandas as pd
from docx import Document
from PyPDF2 import PdfReader
from pptx import Presentation
from langchain_community.llms import Cohere
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate

In [7]:
def process_uploaded_files(files):
    """Extract and concatenate the text of every uploaded document.

    Supported extensions are .pdf, .docx, .xlsx/.xls, .pptx and .txt. The
    extension is read from each upload's ``name`` attribute, or from the
    upload itself when it is a plain path string.

    Args:
        files: Iterable of uploads. Each item is either an object exposing
            ``name`` (plus ``read()`` when it is not backed by a file on
            disk) or a path string. ``None`` is treated as "no uploads".

    Returns:
        str: The text of all files, each file followed by a blank line.
        An empty string when no files were supplied.

    Raises:
        ValueError: If a file has an unsupported extension. (Previously a
            ``(None, message)`` tuple was returned here, which callers
            expecting a string could not distinguish from extracted text.)
    """
    sections = []
    for file_obj in files or []:
        file_name = str(getattr(file_obj, "name", file_obj))
        ext = os.path.splitext(file_name)[-1].lower()
        # Prefer the on-disk path when it exists and fall back to the object
        # itself otherwise.
        # NOTE(review): depending on the Gradio version / File `type`,
        # uploads may be path strings or temp-file wrappers — confirm.
        source = file_name if os.path.isfile(file_name) else file_obj

        if ext == '.pdf':
            pdf_reader = PdfReader(source)
            # Treat a falsy extract_text() result (e.g. a page without a
            # text layer) as empty instead of failing on str + None.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        elif ext == '.docx':
            doc = Document(source)
            text = "".join(paragraph.text + '\n' for paragraph in doc.paragraphs)
        elif ext in ('.xlsx', '.xls'):
            # Only the sheet pandas reads by default is used; every row is
            # rendered with its pandas Series string representation.
            df = pd.read_excel(source)
            text = "".join(str(row) + '\n' for _, row in df.iterrows())
        elif ext == '.pptx':
            ppt = Presentation(source)
            lines = []
            for slide in ppt.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        lines.append(shape.text + '\n')
            text = "".join(lines)
        elif ext == '.txt':
            if isinstance(source, str):
                with open(source, "rb") as handle:
                    raw = handle.read()
            else:
                raw = source.read()
            # read() may already return str for text-mode handles.
            text = raw.decode('utf-8') if isinstance(raw, bytes) else raw
        else:
            raise ValueError(f"Unsupported file format: {ext}")

        sections.append(text + "\n\n")

    return "".join(sections)

def process_files(files):
    """Index the uploaded documents and return a retriever for Q&A.

    The extracted text is split into overlapping chunks, embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model and stored in a FAISS
    index, from which a similarity retriever (k=6) is created.

    Args:
        files: The uploads to index, passed through to
            ``process_uploaded_files``.

    Returns:
        tuple: ``(state, status_message)``. ``state`` is
        ``{"retriever": retriever}`` on success and ``None`` on failure;
        ``status_message`` is a human-readable status string.
    """
    if not files:
        return None, "Please upload at least one document before processing."

    # Report extraction problems as a status message instead of letting the
    # exception escape the UI callback (same style as answer_question).
    try:
        document_text = process_uploaded_files(files)
    except Exception as exc:
        return None, f"An error occurred: {exc}"

    # An extraction failure may also be reported as a (None, message) tuple;
    # forward it unchanged rather than treating the tuple as document text.
    if isinstance(document_text, tuple):
        return document_text

    if not document_text or not document_text.strip():
        return None, "Failed to extract text from uploaded files."

    # Separators are tried in order, so the coarsest one (paragraph break)
    # must come first; listing '\n' before '\n\n' means paragraph breaks are
    # never preferred as split points.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=['\n\n', '\n', ' ', ''],
    )
    chunks = text_splitter.split_text(text=document_text)
    if not chunks:
        # Building a vector store from zero texts would fail.
        return None, "Failed to extract text from uploaded files."

    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    vectorstore = FAISS.from_texts(chunks, embedding=embeddings)
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

    state = {
        "retriever": retriever,
    }
    return state, "All files processed successfully! You can now ask questions."

# Answer a question using the retriever produced by process_files
def answer_question(state, question):
    """Answer a question from the indexed documents with a Cohere LLM.

    Args:
        state: The dict produced by ``process_files``; must contain a
            ``"retriever"`` entry. ``None`` means nothing has been indexed.
        question: The user's question.

    Returns:
        str: The model's answer, or a human-readable message when documents
        have not been processed, the question is empty, the API key is
        missing, or the chain raised an error.
    """
    if state is None or "retriever" not in state:
        return "Please upload and process documents first."

    if not question or not question.strip():
        return "Please enter a question."

    # SECURITY: the API key is read from the environment instead of being
    # embedded in the source. Any key that was previously committed here
    # should be considered exposed and rotated.
    api_key = os.getenv("COHERE_API_KEY")
    if not api_key:
        return "COHERE_API_KEY is not set. Set it in the environment before asking questions."

    cohere_llm = Cohere(model="command", temperature=0.1, cohere_api_key=api_key)

    # The prompt no longer asks the model to "do web search" (this chain has
    # no search tool) and the stray "?" after the context was removed.
    prompt_template = """Answer the question as precisely as possible using only the provided context. If the answer is not contained in the context, say "answer not available in context".

Context:
{context}
Question:
{question}
Answer:"""
    prompt = PromptTemplate.from_template(template=prompt_template)

    def format_docs(docs):
        """Join the retrieved documents' text into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    # The question is sent both to the retriever (to build the context) and,
    # unchanged, to the prompt.
    rag_chain = (
        {"context": state["retriever"] | format_docs, "question": RunnablePassthrough()}
        | prompt
        | cohere_llm
        | StrOutputParser()
    )
    try:
        answer = rag_chain.invoke(question)
    except Exception as e:
        # UI boundary: show the failure to the user instead of crashing.
        return f"An error occurred: {str(e)}"

    return answer

# Gradio front end: upload several documents, index them, then ask questions.
with gr.Blocks() as iface:
    # Receives the first value returned by process_files and is handed back
    # to answer_question on every question.
    state = gr.State()
    gr.Markdown("# Multi-Document Q&A Chatbot - Multiple File Upload")

    # Upload widget and its trigger button share one row.
    with gr.Row():
        file_input = gr.File(
            file_count="multiple",
            file_types=[".pdf", ".docx", ".pptx", ".xlsx", ".xls", ".txt"],
            label="Upload your documents",
        )
        upload_btn = gr.Button("Process Documents")

    upload_status = gr.Textbox(label="Upload Status")
    question_input = gr.Textbox(label="Ask your question here")
    answer_output = gr.Textbox(label="Answer")

    # Clicking the button indexes the uploads and reports the status.
    upload_btn.click(process_files, file_input, [state, upload_status])

    # Submitting the question box runs the Q&A chain.
    question_input.submit(answer_question, [state, question_input], answer_output)

# share=True requests a public share link in addition to the local server.
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d5a787ef2bf3693405.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# Fix for .xlsx and .pptx file handling


In [9]:
def process_uploaded_files(files):
    """Extract and concatenate the text of every uploaded document.

    Supported extensions are .pdf, .docx, .xlsx/.xls, .pptx and .txt. The
    extension is read from each upload's ``name`` attribute, or from the
    upload itself when it is a plain path string. Excel workbooks are
    rendered sheet by sheet as markdown-style tables; presentations are
    rendered slide by slide under a heading line.

    Args:
        files: Iterable of uploads. Each item is either an object exposing
            ``name`` (plus ``read()`` when it is not backed by a file on
            disk) or a path string. ``None`` is treated as "no uploads".

    Returns:
        str: The text of all files in upload order. An empty string when no
        files were supplied.

    Raises:
        ValueError: If a file has an unsupported extension. (Previously a
            ``(None, message)`` tuple was returned here, which callers
            expecting a string could not distinguish from extracted text.)
    """
    sections = []
    for file_obj in files or []:
        file_name = str(getattr(file_obj, "name", file_obj))
        ext = os.path.splitext(file_name)[-1].lower()
        # Prefer the on-disk path when it exists and fall back to the object
        # itself otherwise.
        # NOTE(review): depending on the Gradio version / File `type`,
        # uploads may be path strings or temp-file wrappers — confirm.
        source = file_name if os.path.isfile(file_name) else file_obj

        if ext == '.pdf':
            pdf_reader = PdfReader(source)
            # Treat a falsy extract_text() result (e.g. a page without a
            # text layer) as empty instead of failing on str + None.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
            sections.append(text + "\n\n")
        elif ext == '.docx':
            doc = Document(source)
            text = "".join(paragraph.text + '\n' for paragraph in doc.paragraphs)
            sections.append(text + "\n\n")
        elif ext in ('.xlsx', '.xls'):
            excel_lines = []
            # The context manager closes the workbook once all sheets are read.
            with pd.ExcelFile(source) as xls:
                for sheet_name in xls.sheet_names:
                    df = xls.parse(sheet_name)
                    excel_lines.append(f"Sheet: {sheet_name}\n")
                    # Header row followed by the markdown separator row.
                    excel_lines.append(" | ".join(map(str, df.columns)) + "\n")
                    excel_lines.append(" | ".join(["---"] * len(df.columns)) + "\n")
                    # Only the first 100 data rows per sheet are kept to
                    # bound the amount of text produced.
                    for _, row in df.head(100).iterrows():
                        excel_lines.append(" | ".join(str(cell) for cell in row) + "\n")
                    excel_lines.append("\n\n")
            sections.append("".join(excel_lines))
        elif ext == '.pptx':
            ppt = Presentation(source)
            ppt_lines = []
            for slide_num, slide in enumerate(ppt.slides, start=1):
                # Heading: text of the first non-empty text shape, else the
                # slide number. That shape's text is emitted again below
                # together with the rest of the slide's text.
                slide_title = next(
                    (
                        shape.text.strip()
                        for shape in slide.shapes
                        if shape.has_text_frame and shape.text.strip()
                    ),
                    None,
                ) or f"Slide {slide_num}"
                ppt_lines.append(f"{slide_title}\n")
                for shape in slide.shapes:
                    if hasattr(shape, 'text') and shape.text.strip():
                        ppt_lines.append(shape.text + "\n")
                ppt_lines.append("\n")
            sections.append("".join(ppt_lines) + "\n\n")
        elif ext == '.txt':
            if isinstance(source, str):
                with open(source, "rb") as handle:
                    raw = handle.read()
            else:
                raw = source.read()
            # read() may already return str for text-mode handles.
            text = raw.decode('utf-8') if isinstance(raw, bytes) else raw
            sections.append(text + "\n\n")
        else:
            raise ValueError(f"Unsupported file format: {ext}")

    return "".join(sections)


def process_files(files):
    """Index the uploaded documents and return a retriever for Q&A.

    The extracted text is split into overlapping chunks, embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model and stored in a FAISS
    index, from which a similarity retriever (k=6) is created.

    Args:
        files: The uploads to index, passed through to
            ``process_uploaded_files``.

    Returns:
        tuple: ``(state, status_message)``. ``state`` is
        ``{"retriever": retriever}`` on success and ``None`` on failure;
        ``status_message`` is a human-readable status string.
    """
    if not files:
        return None, "Please upload at least one document before processing."

    # Report extraction problems as a status message instead of letting the
    # exception escape the UI callback (same style as answer_question).
    try:
        document_text = process_uploaded_files(files)
    except Exception as exc:
        return None, f"An error occurred: {exc}"

    # An extraction failure may also be reported as a (None, message) tuple;
    # forward it unchanged rather than treating the tuple as document text.
    if isinstance(document_text, tuple):
        return document_text

    if not document_text or not document_text.strip():
        return None, "Failed to extract text from uploaded files."

    # Separators are tried in order, so the coarsest one (paragraph break)
    # must come first; listing '\n' before '\n\n' means paragraph breaks are
    # never preferred as split points.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=['\n\n', '\n', ' ', ''],
    )
    chunks = text_splitter.split_text(text=document_text)
    if not chunks:
        # Building a vector store from zero texts would fail.
        return None, "Failed to extract text from uploaded files."

    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    vectorstore = FAISS.from_texts(chunks, embedding=embeddings)
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

    state = {
        "retriever": retriever,
    }
    return state, "All files processed successfully! You can now ask questions."


# The answer_question function remains the same as before
def answer_question(state, question):
    """Answer a question from the indexed documents with a Cohere LLM.

    Args:
        state: The dict produced by ``process_files``; must contain a
            ``"retriever"`` entry. ``None`` means nothing has been indexed.
        question: The user's question.

    Returns:
        str: The model's answer, or a human-readable message when documents
        have not been processed, the question is empty, the API key is
        missing, or the chain raised an error.
    """
    if state is None or "retriever" not in state:
        return "Please upload and process documents first."

    if not question or not question.strip():
        return "Please enter a question."

    # SECURITY: the API key is read from the environment instead of being
    # embedded in the source. Any key that was previously committed here
    # should be considered exposed and rotated.
    api_key = os.getenv("COHERE_API_KEY")
    if not api_key:
        return "COHERE_API_KEY is not set. Set it in the environment before asking questions."

    cohere_llm = Cohere(model="command", temperature=0.1, cohere_api_key=api_key)

    # The prompt no longer asks the model to "do web search" (this chain has
    # no search tool) and the stray "?" after the context was removed.
    prompt_template = """Answer the question as precisely as possible using only the provided context. If the answer is not contained in the context, say "answer not available in context".

Context:
{context}
Question:
{question}
Answer:"""
    prompt = PromptTemplate.from_template(template=prompt_template)

    def format_docs(docs):
        """Join the retrieved documents' text into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    # The question is sent both to the retriever (to build the context) and,
    # unchanged, to the prompt.
    rag_chain = (
        {"context": state["retriever"] | format_docs, "question": RunnablePassthrough()}
        | prompt
        | cohere_llm
        | StrOutputParser()
    )
    try:
        answer = rag_chain.invoke(question)
    except Exception as e:
        # UI boundary: show the failure to the user instead of crashing.
        return f"An error occurred: {str(e)}"

    return answer


# Gradio front end: upload several documents, index them, then ask questions.
with gr.Blocks() as iface:
    # Receives the first value returned by process_files and is handed back
    # to answer_question on every question.
    state = gr.State()
    gr.Markdown("# Multi-Document Q&A Chatbot - Multiple File Upload")

    # Upload widget and its trigger button share one row.
    with gr.Row():
        file_input = gr.File(
            file_count="multiple",
            file_types=[".pdf", ".docx", ".pptx", ".xlsx", ".xls", ".txt"],
            label="Upload your documents",
        )
        upload_btn = gr.Button("Process Documents")

    upload_status = gr.Textbox(label="Upload Status")
    question_input = gr.Textbox(label="Ask your question here")
    answer_output = gr.Textbox(label="Answer")

    # Clicking the button indexes the uploads and reports the status.
    upload_btn.click(process_files, file_input, [state, upload_status])

    # Submitting the question box runs the Q&A chain.
    question_input.submit(answer_question, [state, question_input], answer_output)

# share=True requests a public share link in addition to the local server.
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://91e461c88a37ad8b83.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


