In [1]:
# Initial imports
from dotenv import load_dotenv
import glob
import os
from IPython.display import display, Markdown
import chromadb
import json

In [15]:
# Load the variables from the .env file. load_dotenv() exports them into
# os.environ itself, so no manual re-assignment is needed (the user may also
# set the keys in the environment before starting the notebook).
load_dotenv()

# Report missing keys up front. The former `os.environ[name] = os.getenv(name)`
# pattern raised an opaque "TypeError: str expected, not NoneType" as soon as
# any one key was absent, even for a provider this run does not use.
_expected_env_vars = (
    "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY",
    "MATHPIX_API_KEY",
    "HUGGINGFACE_API_KEY",
)
_missing_env_vars = [name for name in _expected_env_vars if not os.getenv(name)]
if _missing_env_vars:
    print(f"Warning: missing environment variables: {', '.join(_missing_env_vars)}")

# This cell used to copy MATHPIX_API_KEY into MATHPIX_API_ID unconditionally,
# which overwrote a correctly configured app id. Keep the copy only as a
# fallback for .env files that define the key alone.
# NOTE(review): Mathpix normally issues a separate app id - confirm the .env
# file defines MATHPIX_API_ID explicitly.
if not os.getenv("MATHPIX_API_ID") and os.getenv("MATHPIX_API_KEY"):
    os.environ["MATHPIX_API_ID"] = os.environ["MATHPIX_API_KEY"]

# Langchain framework
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel # for RAG with source
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter

## The following loaders are used for options
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import PyPDFium2Loader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import MathpixPDFLoader
from langchain_community.document_loaders import PDFMinerLoader
from langchain.document_loaders import PyPDFLoader

In [16]:
## Core configuration: embedding model, vector-store directory and chat model
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-large")
db_directory = "../data/chroma_semantic"
USE_Anthropic = True  # set to False to use the OpenAI chat model instead

# Both chat models are created with temperature 0.
# (Author's note: GPT 3.5 Turbo has a 16k context window.)
llm = (
    ChatAnthropic(model_name="claude-3-sonnet-20240229", temperature=0)
    if USE_Anthropic
    else ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
)

# Collections to work with (left empty here); for a quick test run the
# original author used ["PBR"].
collection_list = []

In [None]:
# Helper functions: extract a file name from a path, load PDFs from a folder,
# convert PDFs to Markdown files, and load Markdown files from a folder
def get_file_name(source_path: str) -> str:
    """Return the final component (the file name) of ``source_path``.

    ``os.path.basename`` is used instead of splitting on ``'/'`` so that the
    separators of the running platform are honoured. On Windows, ``glob``
    returns backslash-separated paths, for which a plain ``split('/')`` would
    leave part of the directory in the result.

    Args:
        source_path: A file path, relative or absolute.

    Returns:
        The text after the last path separator; an empty string when the path
        ends with a separator.
    """
    return os.path.basename(source_path)

def load_pdfs_from_folder(folder_path: str, loader_option: int) -> list:
    """Load every ``*.pdf`` directly inside ``folder_path`` into documents.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.pdf`` files.
        loader_option: Selects the PDF loader:
            1 = PyPDFLoader,
            2 = PyPDFium2Loader,
            3 = PyMuPDFLoader,
            4 = PDFMinerLoader with ``concatenate_pages=True``.

    Returns:
        One flat list with the documents the loader produced for all files.
        Each document's ``metadata['source']`` is overwritten with the bare
        file name (no directory part).

    Raises:
        ValueError: If ``loader_option`` is not one of the supported values.
    """
    # Fail fast with a clear message; previously an unknown option left
    # `loader` unassigned and surfaced as an UnboundLocalError mid-loop.
    if loader_option not in (1, 2, 3, 4):
        raise ValueError(
            f"Unsupported loader_option {loader_option!r}; expected 1, 2, 3 or 4."
        )

    docs = []
    # Sorted so the document order is reproducible across runs and machines
    # (glob itself returns files in arbitrary filesystem order).
    for pdf_file in sorted(glob.glob(f"{folder_path}/*.pdf")):
        file_name = get_file_name(pdf_file)

        if loader_option == 1:
            loader = PyPDFLoader(pdf_file)
        elif loader_option == 2:
            # Author's note: PyPDFium2Loader is known to be faster than PyPDFLoader
            loader = PyPDFium2Loader(pdf_file)
        elif loader_option == 3:
            # Author's note: PyMuPDFLoader is general purpose with rich metadata
            loader = PyMuPDFLoader(pdf_file)
        else:  # loader_option == 4
            # Concatenates the pages of a PDF automatically
            loader = PDFMinerLoader(pdf_file, concatenate_pages=True)

        loaded_docs = loader.load()

        # Replace the loader-provided source with just the file name.
        for doc in loaded_docs:
            doc.metadata['source'] = file_name

        docs.extend(loaded_docs)
    return docs

def pdf_to_md(folder_path: str, download_path: str, loader_option: int) -> None:
    """Extract the text of every ``*.pdf`` in ``folder_path`` into ``.md`` files.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.pdf`` files.
        download_path: Directory the ``.md`` files are written to; created if
            it does not exist yet.
        loader_option: Selects the PDF loader:
            1 = PyPDFLoader,
            2 = PyPDFium2Loader,
            3 = PyMuPDFLoader,
            4 = PDFMinerLoader with ``concatenate_pages=True``,
            5 = MathpixPDFLoader (author's notes: OCR for formulas and tables,
                slower but higher quality, needs Mathpix credentials, about
                3 cents per PDF page).

    Output naming:
        Options 1-3 write one file per loaded document, named
        ``<base><NNN>.md`` with a 1-based, zero-padded index. Options 4-5
        write ``<base>.md``; if such a loader returned several documents,
        later ones would overwrite earlier ones.

    Raises:
        ValueError: If ``loader_option`` is not one of the supported values.
    """
    # Fail fast with a clear message; previously an unknown option left
    # `loader` unassigned and surfaced as an UnboundLocalError mid-loop.
    if loader_option not in (1, 2, 3, 4, 5):
        raise ValueError(
            f"Unsupported loader_option {loader_option!r}; expected an integer from 1 to 5."
        )

    # open(..., 'w') does not create missing directories.
    os.makedirs(download_path, exist_ok=True)

    for pdf_file in sorted(glob.glob(f"{folder_path}/*.pdf")):
        file_name = get_file_name(pdf_file)
        # Strip only the extension; str.replace('.pdf', '') would also remove
        # a '.pdf' occurring in the middle of the name.
        base_name = os.path.splitext(file_name)[0]

        if loader_option == 1:
            loader = PyPDFLoader(pdf_file)
        elif loader_option == 2:
            # Author's note: PyPDFium2Loader is known to be faster than PyPDFLoader
            loader = PyPDFium2Loader(pdf_file)
        elif loader_option == 3:
            # Author's note: PyMuPDFLoader is general purpose with rich metadata
            loader = PyMuPDFLoader(pdf_file)
        elif loader_option == 4:
            # Concatenates the pages of a PDF automatically
            loader = PDFMinerLoader(pdf_file, concatenate_pages=True)
        else:  # loader_option == 5
            loader = MathpixPDFLoader(pdf_file)

        loaded_docs = loader.load()

        for i, doc in enumerate(loaded_docs):
            doc.metadata['source'] = file_name
            if loader_option > 3:
                md_file_name = f"{download_path}/{base_name}.md"
            else:
                md_file_name = f"{download_path}/{base_name}{i+1:03d}.md"
            with open(md_file_name, 'w', encoding='utf-8') as md_file:
                md_file.write(doc.page_content)

def load_mds_from_folder(folder_path: str) -> list:
    """Load every ``*.md`` directly inside ``folder_path`` into documents.

    Each file is read with ``UnstructuredMarkdownLoader``. The documents'
    ``metadata['source']`` is set to the Markdown file's name with its
    extension swapped for ``.pdf``.
    NOTE(review): for per-page files named ``<base><NNN>.md`` this yields
    ``<base><NNN>.pdf``, not the original PDF name - confirm this is intended.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.md`` files.

    Returns:
        One flat list with the documents loaded from all Markdown files.
    """
    docs = []
    # Sorted so the document order is reproducible across runs and machines
    # (glob itself returns files in arbitrary filesystem order).
    for md_file in sorted(glob.glob(f"{folder_path}/*.md")):
        file_name = get_file_name(md_file)
        # Strip only the extension; str.replace('.md', '') would also remove
        # a '.md' occurring in the middle of the name.
        base_name = os.path.splitext(file_name)[0]
        pdf_file_name = f"{base_name}.pdf"

        loaded_docs = UnstructuredMarkdownLoader(md_file).load()
        for doc in loaded_docs:
            doc.metadata['source'] = pdf_file_name
        # Progress output: one line per Markdown file processed.
        print(pdf_file_name)
        docs.extend(loaded_docs)

    return docs
