In [2]:
import re
import time
from io import BytesIO
from typing import Any, Dict, List

import openai
import streamlit as st
from pypdf import PdfReader

In [3]:
from langchain import LLMChain, OpenAI
from langchain.agents import AgentExecutor, Tool, ZeroShotAgent
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import VectorStore
from langchain.vectorstores.faiss import FAISS

#### Local helper functions

In [4]:
@st.cache_data
def parse_pdf(file: BytesIO) -> List[str]:
    """Extract and clean the text of every page in a PDF file object.

    Each page's text is normalised in three steps: words split by a hyphen
    at a line break are re-joined, single newlines inside a paragraph are
    turned into spaces, and runs of blank lines are collapsed to one.

    Args:
        file: Binary file-like object holding the PDF, handed to ``PdfReader``.

    Returns:
        A list with one cleaned text string per page, in page order.
    """
    pdf_reader = PdfReader(file)

    output_str = []

    # ``pages`` is an attribute (a list-like sequence of pages), not a method;
    # calling it as ``pages()`` raises TypeError.
    for page in pdf_reader.pages:
        # Fall back to an empty string if extraction yields nothing, so the
        # regex substitutions below always receive a str.
        txt = page.extract_text() or ""
        txt = re.sub(r"(\w+)-\n(\w+)", r"\1\2", txt)           # Merge hyphenated words
        txt = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", txt.strip()) # Fix newlines in the middle of sentences
        txt = re.sub(r"\n\s*\n", "\n\n", txt)                  # Collapse multiple newlines into one blank line
        output_str.append(txt)

    return output_str

