# Converting the PDF files into vector database  

# 1. Initial setup
## 1.1. Imports
This setup includes loading environment variables from a `.env` file, setting the required environment variables, and importing the necessary modules for further processing. It ensures that the code has access to the required APIs and functions for the subsequent tasks.

In [1]:
# Initial imports
from dotenv import load_dotenv
import glob
import os
from IPython.display import display, Markdown
import chromadb
import json

In [21]:

# Load the variables from .env file and set the API key (or user may manually set the API key)
load_dotenv()  

os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
os.environ["ANTHROPIC_API_KEY"] = os.getenv('ANTHROPIC_API_KEY')
os.environ["MATHPIX_API_ID"] = os.getenv('MATHPIX_API_KEY')
#openai.api_key = os.getenv('OPENAI_API_KEY')

# Langchain framework
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel # for RAG with source
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter

## The following loaders are used for options
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import PyPDFium2Loader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import MathpixPDFLoader
from langchain_community.document_loaders import PDFMinerLoader
from langchain.document_loaders import PyPDFLoader

## 1.2. Initial variable setup

In [22]:
## Initial variable setup
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-large")
db_directory = "./data/chroma_semantic"
USE_Anthropic = True

if USE_Anthropic:
    llm = ChatAnthropic(model_name="claude-3-sonnet-20240229", temperature=0)
else:
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) # context window size 16k for GPT 3.5 Turbo

collection_list=[
    "ASOP_life",
    "Bermuda",
    "CFT",
    "PBR",
    "VM21",
    "VM22",
    "Asset",
    "IFRS17"
]
#collection_list = ["PBR"] # for testing

# 2. Load PDF files and convert to a vector DB
## 2.1. Define functions

In [43]:
# Define a function to load and extract text from PDFs in a folder
def get_file_name(source_path):
    return source_path.split('/')[-1]

def load_pdfs_from_folder(folder_path, loader_option):
    # Get a list of PDF files in the specified folder
    pdf_files = glob.glob(f"{folder_path}/*.pdf")
    docs = []
    for pdf_file in pdf_files:
        file_name = get_file_name(pdf_file)
        
        if loader_option == 1:
            # Load the PDF file using the PyPDFLoader
            loader = PyPDFLoader(pdf_file)
        elif loader_option == 2:
            # PyPDFium2Loader is known to be faster than PyPDFLoader
            loader = PyPDFium2Loader(pdf_file)
        elif loader_option == 3:
            # PyMuPDFLoader is known to be general purpose, rich metadata
            loader = PyMuPDFLoader(pdf_file)
        elif loader_option == 4:
            # Allows automated concatenate pages
            loader = PDFMinerLoader(pdf_file, concatenate_pages=True)
        
        loaded_docs = loader.load()
        
        for doc in loaded_docs:
            doc.metadata['source'] = file_name
        
        docs.extend(loaded_docs)
    return docs

def pdf_to_md(folder_path, download_path, loader_option):
    # Get a list of PDF files in the specified folder
    pdf_files = glob.glob(f"{folder_path}/*.pdf")
    for pdf_file in pdf_files:
        file_name = get_file_name(pdf_file)
        base_name = file_name.replace('.pdf', '')
        
        if loader_option == 1:
            # Load the PDF file using the PyPDFLoader
            loader = PyPDFLoader(pdf_file)
        elif loader_option == 2:
            # PyPDFium2Loader is known to be faster than PyPDFLoader
            loader = PyPDFium2Loader(pdf_file)
        elif loader_option == 3:
            # PyMuPDFLoader is known to be general purpose, rich metadata
            loader = PyMuPDFLoader(pdf_file)
        elif loader_option == 4:
            # Allows automated concatenate pages
            loader = PDFMinerLoader(pdf_file, concatenate_pages=True)
        elif loader_option == 5:
            # Use Mathpix OCR to load formula, tables
            # may be slower, but higher quality than all above
            # Require Mathpix API ID - 3 cents per pdf page
            loader = MathpixPDFLoader(pdf_file)
        
        loaded_docs = loader.load()
        
        for i, doc in enumerate(loaded_docs):
            doc.metadata['source'] = file_name
            if loader_option > 3:
                md_file_name = f"{download_path}/{base_name}.md"
            else:
                md_file_name = f"{download_path}/{base_name}{i+1:03d}.md"
            with open(md_file_name, 'w', encoding='utf-8') as md_file:
                md_file.write(doc.page_content)

def load_mds_from_folder(folder_path):
    # Get a list of md files in the specified folder
    md_files = glob.glob(f"{folder_path}/*.md")
    docs = []
    for md_file in md_files:
        file_name = get_file_name(md_file)
        base_name = file_name.replace('.md', '')
        pdf_file_name = f"{base_name}.pdf"
        
        loader = UnstructuredMarkdownLoader(md_file)
        loaded_docs = loader.load()
        print(loaded_docs)
        for doc in loaded_docs:
            doc.metadata['source'] = pdf_file_name
            print(pdf_file_name)
        docs.extend(loaded_docs)
        
    return docs


## 2.2. Convert PDFs to markdown files and then convert to vector database

In [30]:
############################################################################
# Run only to convert pdf to markdown files
############################################################################

collection_list=[
    #"Cayman",
    # "AI_BigData",
    #"ASOP_life",
    #"Bermuda",
    # "CFT",
    # "GAAP",
    # "RiskFinance",
    # "PBR",
    # "VM21",
    # "VM22",
    # "Asset",
    "ifrs17",
]
for collection_name in collection_list:
    # Put new files in the upload subfolder
    folder_path = '../data/pdf/'+collection_name
    download_path = '../data/md/'+collection_name
    os.makedirs(download_path, exist_ok=True)

    # Use loader option 5 to use Mathpix OCR to load formula, tables
    pdf_to_md(folder_path, download_path, loader_option = 5)

Status: loaded, waiting for processing to complete
Status: loaded, waiting for processing to complete
Status: split, waiting for processing to complete


KeyboardInterrupt: 

In [44]:
############################################################################
# Run to load markdown files to vector database
############################################################################
collection_list=[
    #"Cayman",
    # "AI_BigData",
    #"ASOP_life",
    #"Bermuda",
    # "CFT",
    # "GAAP",
    # "RiskFinance",
    # "PBR",
    # "VM21",
    # "VM22",
    # "Asset",
    "ifrs17",
]
for collection_name in collection_list:
    # Put new files in the upload subfolder
    folder_path = '../data/md/'+collection_name
    
    # Call the function to load and extract text from PDFs in the specified folder
    docs = load_mds_from_folder(folder_path)
    
    
    # Create a text splitter object with specified parameters
    # text_splitter = RecursiveCharacterTextSplitter(
    #     chunk_size=1000, # 1000 splits a page into roughly 3 chunks
    #     chunk_overlap=200,
    #     length_function=len,)

    # Use semantic chunker to increase meaningfulness of the chunks
    text_splitter = SemanticChunker(embeddings_model)

    # Split the documents into chunks using the text splitter
    splits = text_splitter.split_documents(docs)

    # Create a Chroma vector database from the document splits
    Chroma.from_documents(
        documents=splits, 
        embedding=embeddings_model, 
        persist_directory=db_directory,
        collection_name=collection_name,
    )

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emilioaguiar/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[Document(page_content="IFRS 17\n\nInsurance Contracts\n\nIn March 2004 the International Accounting Standards Board (Board) issued IFRS 4 Insurance Contracts. IFRS 4 was an interim standard which was meant to be in place until the Board completed its project on insurance contracts. IFRS 4 permitted entities to use a wide variety of accounting practices for insurance contracts, reflecting national accounting requirements and variations of those requirements, subject to limited improvements and specified disclosures.\n\nIn May 2017, the Board completed its project on insurance contracts with the issuance of IFRS 17 Insurance Contracts. IFRS 17 replaces IFRS 4 and sets out principles for the recognition, measurement, presentation and disclosure of insurance contracts within the scope of IFRS 17.\n\nIn June 2020, the Board issued Amendments to IFRS 17. The objective of the amendments is to assist entities implementing the Standard, while not unduly disrupting implementation or diminishing

## 2.3. (NOT USED) Convert PDFs to vector database directly

In [13]:
############################################################################
# Run to load pdf files to vector database
############################################################################

for collection_name in collection_list:
    # Put new files in the upload subfolder
    folder_path = './data/upload/pdf/'+collection_name

    # Call the function to load and extract text from PDFs in the specified folder
    docs = load_pdfs_from_folder(folder_path, loader_option = 1)
    
    # Create a text splitter object with specified parameters
    # text_splitter = RecursiveCharacterTextSplitter(
    #     chunk_size=1000, # 1000 splits a page into roughly 3 chunks
    #     chunk_overlap=200,
    #     length_function=len,)

    # Use semantic chunker to increase meaningfulness of the chunks
    text_splitter = SemanticChunker(embeddings_model)

    # Split the documents into chunks using the text splitter
    splits = text_splitter.split_documents(docs)

    # Create a Chroma vector database from the document splits
    Chroma.from_documents(
        documents=splits, 
        embedding=embeddings_model, 
        persist_directory=db_directory,
        collection_name=collection_name,
    )

ValueError: Expected IDs to be a non-empty list, got []

# 3. For test purposes
## 3.1. Define vector store from vector database 

In [14]:
## a user may choose different collection name from the list
# ["ASOP_life", "Bermuda", "CFT", "VM21", "VM22", "Asset", "IFRS17"]
collection_name = collection_list[0] 

# Get a Chroma vector database with specified parameters
vectorstore = Chroma(embedding_function=embeddings_model, 
                     persist_directory=db_directory,
                     collection_name=collection_name)

## 3.2. Retrieve from the vector store

In [15]:
## Retrieve and RAG chain
# Create a retriever using the vector database as the search source
# You may choose a specific document to filter the search
retriever = vectorstore.as_retriever(search_type="mmr", 
                                     search_kwargs={
                                        'k': 6, 
                                        'lambda_mult': 0.5,
                                        # 'filter': {'source': '201611-Guidance-Notes-for-Commercial-Insurers-and-Groups-Statutory-Reporting-Regime-30-Nov-2016.pdf'}
                                        }
                                    ) 
# Use MMR (Maximum Marginal Relevance) to find a set of documents that are both similar to the input query and diverse among themselves
# Increase the number of documents to get, and increase diversity (lambda mult 0.5 being default, 0 being the most diverse, 1 being the least)

# Load the RAG (Retrieval-Augmented Generation) prompt
qa_system_prompt = """You are a helpful assistant to help actuaries with question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
ASOP or asop means Actuarial Standards of Practice. \
CFT means Cash Flow Testing. AAT means Asset Adequacy Testing. \
BMA means Bermuda Monetary Authority. \
SBA means scenario-based approach. BEL means best estimate liabilities.\
After you answer, provide the sources you used to answer the question. \
If you don't know the answer, just say that you don't know. \

{context}"""
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        ("human", "{question}"),
    ]
)

# Define a function to format the documents with their sources and pages
def format_docs_with_sources(docs):
    formatted_docs = "\n\n".join(doc.page_content for doc in docs)
    #sources_pages = "\n".join(f"{doc.metadata['source']} (Page {doc.metadata['page'] + 1})" for doc in docs)
    sources_pages = "\n".join(f"{doc.metadata['source']})" for doc in docs)
    # Added 1 to the page number assuming 'page' starts at 0 and we want to present it in a user-friendly way

    return f"Documents:\n{formatted_docs}\n\nSources and Pages:\n{sources_pages}"

# Create a RAG chain using the formatted documents as the context
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs_with_sources(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

# Create a parallel chain for retrieving and generating answers
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

## 3.3. Generate Q&A Function

In [16]:
def generate_output():
    # Prompt the user for a question on ASOP
    usr_input = input("What is your question on ASOP?: ")

    # Invoke the RAG chain with the user input as the question
    output = rag_chain_with_source.invoke(usr_input)

    # Generate the Markdown output with the question, answer, and context
    markdown_output = "### Question\n{}\n\n### Answer\n{}\n\n### Context\n".format(output['question'], output['answer'])

    last_page_content = None  # Variable to store the last page content
    i = 1 # Source indicator

    # Iterate over the context documents to format and include them in the output
    for doc in output['context']:
        current_page_content = doc.page_content.replace('\n', '  \n')  # Get the current page content
        
        # Check if the current content is different from the last one
        if current_page_content != last_page_content:
            #markdown_output += "- **Source {}**: {}, page {}:\n\n{}\n".format(i, doc.metadata['source'], doc.metadata['page'], current_page_content)
            markdown_output += "- **Source {}**: {}:\n\n{}\n".format(i, doc.metadata['source'], current_page_content)
            i = i + 1
        last_page_content = current_page_content  # Update the last page content
    
    # Display the Markdown output
    display(Markdown(markdown_output))

### Example questions related to ASOPs
- explain ASOP No. 14
- How are expenses relfected in cash flow testing based on ASOP No. 22?
- What is catastrophe risk?
- When do I update assumptions?
- What should I do when I do not have credible data to develop non-economic assumptions?

In [17]:
generate_output()

BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'messages.0: all messages must have non-empty content except for the optional final assistant message'}}

# 4. References
- https://www.actuarialstandardsboard.org/standards-of-practice/
- https://python.langchain.com/docs/use_cases/question_answering/quickstart
- https://python.langchain.com/docs/use_cases/question_answering/sources
- https://python.langchain.com/docs/integrations/text_embedding/
- https://python.langchain.com/docs/integrations/vectorstores/chroma
- https://docs.gpt4all.io/gpt4all_python_embedding.html#gpt4all.gpt4all.Embed4All
- https://chat.langchain.com/
- https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.chroma.Chroma.html

# 5. Management of the vector database

In [None]:
client = chromadb.PersistentClient(path=db_directory)

In [None]:
#client.delete_collection(name="VM20") # Delete a collection and all associated files

In [None]:
collection = client.get_collection(name="AI_BigData") 
collection.count()


In [None]:
collection.peek(1)

In [None]:
#collection.modify(name="PBR") rename

In [None]:
from collections import defaultdict

# Get all the metadatas
metadatas = collection.get()['metadatas']

# Create a dictionary to store distinct source names
distinct_sources = defaultdict(set)

# Iterate through metadatas and add source names to the dictionary
for metadata in metadatas:
    source = metadata.get('source', None)
    if source:
        distinct_sources[source].add(source)

# Get the distinct source names as a list
distinct_source_names = list(distinct_sources.keys())

print(distinct_source_names)

In [None]:
collection.get(
    where={"source": "2023_BMA-Supervision-and-Regulation-of-Private-Equity-Insurers.pdf"}
)

In [None]:
collection.delete(
    #ids=["id1", "id2", "id3",...],
    where={"source": "VA_PN_Supplement_Exposure_Draft.pdf"}
)

# Rename process

In [None]:
# Rename process
folder_name = "VM22"
old_file_name = "VM-22 Subgroup Draft July 2023 Clean.pdf"
new_file_name = "202307-NAIC-VM-22 Subgroup Draft.pdf"

# Chroma DB update
collection = client.get_collection(name=folder_name)
results = collection.get(where={"source": old_file_name})
ids_to_update = results['ids']
print(ids_to_update)
new_metadatas = [{"source": new_file_name} for _ in ids_to_update]
collection.update(ids=ids_to_update, metadatas=new_metadatas)


In [None]:

# json file summary update
with open('summary.json', 'r') as f:
    data = json.load(f)
# Update the existing data with the new key-value pair
data[new_file_name] = data.pop(old_file_name)

with open('summary.json', 'w') as f:
    json.dump(data, f, indent=4)


In [None]:

# md and pdf file update
# Construct the paths to the folders
md_folder = os.path.join("data/md", folder_name)
pdf_folder = os.path.join("data/pdf", folder_name)

# Construct the old and new file paths
old_file_name_md = old_file_name.replace(".pdf", ".md")
new_file_name_md = new_file_name.replace(".pdf", ".md")
old_md_path = os.path.join(md_folder, old_file_name_md)
new_md_path = os.path.join(md_folder, new_file_name_md)

# Check if the old MD file exists before renaming
if os.path.exists(old_md_path):
    # Rename the MD file
    os.rename(old_md_path, new_md_path)
else:
    print(f"Skipping MD file rename: {old_md_path} does not exist.")

# Construct the old and new file paths
old_pdf_path = os.path.join(pdf_folder, old_file_name)
new_pdf_path = os.path.join(pdf_folder, new_file_name)

# Rename the file
os.rename(old_pdf_path, new_pdf_path)