<a href="https://colab.research.google.com/github/AdopleAIOrg/Document-Question-Answering/blob/main/Document_QA_Product.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -r /content/requirements.txt

In [None]:
%%writefile app.py
import streamlit as st
import cv2
import os
from PIL import Image
import PyPDF2
import re
import fitz
import docx
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

class DocumentQA:
    """Streamlit question-answering app over plain text or uploaded documents.

    Answers are produced by a Hugging Face ``question-answering`` pipeline
    built from the ``deepset/roberta-base-squad2`` checkpoint. Text can be
    typed in directly or extracted from an uploaded TXT, PDF or DOCX file.
    """

    # Runs of ASCII letters. The previous pattern ``[aA-zZ]`` contained the
    # range ``A-z``, which also matches ``[ \ ] ^ _`` and the backtick.
    _WORD_PATTERN = re.compile(r"[A-Za-z]+")

    def __init__(self):
        """Create the question-answering pipeline used by every request."""
        self.nlp = pipeline('question-answering', model='deepset/roberta-base-squad2', tokenizer='deepset/roberta-base-squad2')

    def _generate_answer(self, plain_text: str, ques_text: str) -> str:
        """
        Generates the answer for the given question based on the provided plain text.

        Args:
            plain_text (str): The plain text to extract the answer from.
            ques_text (str): The question to be answered.

        Returns:
            str: The ``'answer'`` field of the pipeline result.
        """
        question_set = {
            'question': ques_text,
            'context': plain_text,
        }
        results = self.nlp(question_set)
        return results['answer']

    def _extract_text_txt(self, uploaded_txt_file: str, downloaded_txt_file: str) -> str:
        """
        Extracts the words from the uploaded text file.

        Only runs of ASCII letters are kept; digits, punctuation and any other
        characters act as separators and are dropped. The words are also written
        to ``downloaded_txt_file``, one per line.

        Args:
            uploaded_txt_file (str): Path to the uploaded text file.
            downloaded_txt_file (str): Path to save the processed text file.

        Returns:
            str: The extracted words joined by single spaces.
        """
        # Decode explicitly so the result does not depend on the platform's
        # default encoding; undecodable bytes become U+FFFD, which the word
        # pattern treats as a separator.
        with open(uploaded_txt_file, encoding="utf-8", errors="replace") as intxt:
            data = intxt.read()

        words = self._WORD_PATTERN.findall(data)
        with open(downloaded_txt_file, 'w', encoding="utf-8") as outtxt:
            outtxt.write('\n'.join(words))
        return ' '.join(words)

    def _extract_text_pdf(self, uploaded_pdf_file: str) -> str:
        """
        Extracts the text from the uploaded PDF file.

        Args:
            uploaded_pdf_file (str): Path to the uploaded PDF file.

        Returns:
            str: The text of all pages concatenated in page order.
        """
        page_texts = []
        with fitz.open(uploaded_pdf_file) as pdf:
            for page in pdf:
                # Prefer the snake_case API; fall back to the legacy camelCase
                # name so older PyMuPDF installations keep working.
                extract = getattr(page, "get_text", None) or page.getText
                page_texts.append(extract())
        return "".join(page_texts)

    def _extract_text_docx(self, uploaded_docx_file: str) -> str:
        """
        Extracts the text from the uploaded DOCX file.

        Args:
            uploaded_docx_file (str): Path to the uploaded DOCX file.

        Returns:
            str: The text of every paragraph joined by single spaces.
        """
        doc = docx.Document(uploaded_docx_file)
        return ' '.join(para.text for para in doc.paragraphs)

    def streamlit_interface(self) -> None:
        """
        Defines the Streamlit user interface and logic.

        The sidebar selects between answering over typed-in text and answering
        over an uploaded document. In both modes the answer is shown after the
        "Run" button is pressed, and a warning is shown when an input is missing.
        """
        st.set_page_config(
            page_title="Q&A",
            page_icon="✨",
            layout="centered",
            initial_sidebar_state="auto",
        )

        # Empty paths mean "current working directory".
        upload_path = ""
        download_path = ""

        format_type = st.sidebar.selectbox('Apply Q&A for? ', ["Plain Text", "Documents"])

        st.title("Q&A System ")

        if format_type == "Plain Text":
            text = st.text_area("Enter your text here: ", height=300)
            ques_text = st.text_area("Enter your Question:", height=50)
            if st.button("Run"):
                has_text = bool(text and text.strip())
                has_question = bool(ques_text and ques_text.strip())
                if has_text and has_question:
                    with st.spinner("Getting your Answer..."):
                        ans = self._generate_answer(text, ques_text)
                        st.markdown("Here's the answer")
                        st.success(ans)
                elif has_text:
                    st.warning('Please enter your question!')
                elif has_question:
                    st.warning('Please enter your plain text!')
                else:
                    st.warning('Text fields missing! ')

        if format_type == "Documents":
            st.info('Supports all popular document formats - TXT, PDF, DOCX')
            # Stays None until a document has been uploaded and parsed, so the
            # "Run" handler can tell "no document" apart from "empty document".
            txt = None
            uploaded_file = st.file_uploader("Upload Document", type=["txt", "pdf", "docx"])
            if uploaded_file is not None:
                # Keep only the final path component so a crafted file name
                # cannot write outside the upload directory.
                file_name = os.path.basename(uploaded_file.name)
                uploaded_path = os.path.abspath(os.path.join(upload_path, file_name))
                with open(uploaded_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())

                # Compare the extension case-insensitively (.txt, .TXT, .Txt, ...).
                extension = os.path.splitext(file_name)[1].lower()
                with st.spinner("Working..."):
                    if extension == '.txt':
                        downloaded_txt_file = os.path.abspath(
                            os.path.join(download_path, "processed_" + file_name)
                        )
                        txt = self._extract_text_txt(uploaded_path, downloaded_txt_file)
                    elif extension == '.pdf':
                        txt = self._extract_text_pdf(uploaded_path)
                    elif extension == '.docx':
                        txt = self._extract_text_docx(uploaded_path)
            else:
                st.warning('Please upload your document')

            ques_text = st.text_area("Enter your Question: ", height=50)
            if st.button("Run"):
                if txt is None:
                    # Previously this path raised NameError because ``txt`` was
                    # never assigned when no document had been uploaded.
                    st.warning('Please upload a supported document before running.')
                elif not txt.strip():
                    st.warning('No text could be extracted from the document.')
                elif ques_text and ques_text.strip():
                    with st.spinner("Getting your Answer... "):
                        ans = self._generate_answer(str(txt), ques_text)
                        st.markdown("Here's the answer ")
                        st.success(ans)
                else:
                    st.warning('Please enter your question!')

if __name__ == "__main__":
    # Script entry point: build the app object and render the Streamlit UI.
    DocumentQA().streamlit_interface()

In [None]:
!streamlit run app.py & npx localtunnel --port 8501