# Transforming PDF files into a vector database

# 1. Initial Setup
This setup includes loading environment variables from a `.env` file, setting the required environment variables, and importing the necessary modules for further processing. It ensures that the code has access to the required APIs and functions for the subsequent tasks.


In [34]:
# Initial imports
from dotenv import load_dotenv
import glob
import os
import openai
from IPython.display import display, Markdown
import chromadb

# Load the variables from the .env file into os.environ (or the user may set
# the API keys manually before running this cell). load_dotenv() already
# exports the entries it finds, so the keys only need to be checked here.
load_dotenv()

# Warn about missing keys instead of assigning None into os.environ, which
# raises "TypeError: str expected, not NoneType".
for _required_key in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "MATHPIX_API_KEY"):
    if not os.getenv(_required_key):
        print(f"Warning: environment variable {_required_key} is not set.")

# Keep a Mathpix app ID that is already configured. Only when no ID is defined
# at all fall back to the previous behaviour of reusing the API key value.
# NOTE(review): the Mathpix loader presumably expects a distinct app ID in
# MATHPIX_API_ID - confirm and add it to the .env file.
if not os.getenv("MATHPIX_API_ID") and os.getenv("MATHPIX_API_KEY"):
    os.environ["MATHPIX_API_ID"] = os.environ["MATHPIX_API_KEY"]
#openai.api_key = os.getenv('OPENAI_API_KEY')

# Langchain framework
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel # for RAG with source
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter

## The following loaders are used for options
from langchain_community.document_loaders import PyPDFium2Loader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import MathpixPDFLoader
from langchain_community.document_loaders import PDFMinerLoader
from langchain.document_loaders import PyPDFLoader


In [41]:
## Initial variable setup
# Embedding model used for splitting and for the Chroma collections
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-large")
# Directory in which the Chroma collections are persisted
db_directory = "./data/chroma_semantic"
# Switch between the Anthropic and the OpenAI chat model
USE_Anthropic = True

# Both chat models are created with temperature=0.
# (context window size 16k for GPT 3.5 Turbo)
llm = (
    ChatAnthropic(model_name="claude-3-sonnet-20240229", temperature=0)
    if USE_Anthropic
    else ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
)

# 2. Load PDF Files and Convert to a Vector DB

In [39]:
# Define a function to load and extract text from PDFs in a folder
def get_file_name(source_path):
    """Return the final path component (the file name) of ``source_path``.

    ``os.path.basename`` is used instead of splitting on ``'/'`` so that the
    platform's own path separator is honoured as well (for example the
    backslash-separated paths that ``glob`` returns on Windows).
    """
    return os.path.basename(source_path)

def load_pdfs_from_folder(folder_path, loader_option=5):
    """Load every ``*.pdf`` file located directly inside ``folder_path``.

    Args:
        folder_path: Folder that is searched (non-recursively) for PDF files.
        loader_option: Selects the document loader:
            1 - PyPDFLoader
            2 - PyPDFium2Loader (known to be faster than PyPDFLoader)
            3 - PyMuPDFLoader (general purpose, rich metadata)
            4 - PDFMinerLoader with ``concatenate_pages=True``
            5 - MathpixPDFLoader (OCR for formulas and tables; may be slower
                but higher quality than the others; requires the Mathpix
                API ID)

    Returns:
        A list with the documents of all files. The ``metadata['source']`` of
        every document is replaced by the bare file name.

    Raises:
        ValueError: If ``loader_option`` is not one of the options above.
    """
    # The factories are lambdas so that a loader class is only looked up when
    # its option is actually selected.
    loader_factories = {
        1: lambda path: PyPDFLoader(path),
        2: lambda path: PyPDFium2Loader(path),
        3: lambda path: PyMuPDFLoader(path),
        4: lambda path: PDFMinerLoader(path, concatenate_pages=True),
        5: lambda path: MathpixPDFLoader(path),
    }
    # Fail early with a clear message; previously an unknown option surfaced
    # as an UnboundLocalError on the first PDF file.
    if loader_option not in loader_factories:
        raise ValueError(
            f"Unknown loader_option {loader_option!r}; "
            f"expected one of {sorted(loader_factories)}"
        )
    make_loader = loader_factories[loader_option]

    docs = []
    # glob returns files in arbitrary order; sort for reproducible results.
    for pdf_file in sorted(glob.glob(f"{folder_path}/*.pdf")):
        file_name = get_file_name(pdf_file)
        loaded_docs = make_loader(pdf_file).load()

        # Store only the file name (not the full path) as the source
        for doc in loaded_docs:
            doc.metadata['source'] = file_name

        docs.extend(loaded_docs)
    return docs

In [42]:
############################################################################
# Original Chunk
############################################################################
# Names of the collections to build; each one is also the name of a
# sub-folder of ./data that holds the PDFs of that collection.
collection_list=[
    "ASOP_life",
    "Bermuda",
    "CFT",
    "VM21",
    "VM22",
    "Asset",
    "IFRS17"
]

# The splitter does not depend on the collection, so it is created once.
# Alternative splitter: RecursiveCharacterTextSplitter(chunk_size=1000,
# chunk_overlap=200, length_function=len); a chunk_size of 1000 splits a page
# into roughly 3 chunks.
text_splitter = SemanticChunker(embeddings_model)

# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collections again - confirm, or delete the collections first.
for collection_name in collection_list:
    folder_path = './data/'+collection_name

    # Load and extract text from the PDFs in the collection folder
    docs = load_pdfs_from_folder(folder_path, loader_option = 5)
    if not docs:
        # Nothing to split or embed; do not hand empty input to Chroma
        print(f"No PDF documents found in {folder_path}; skipping collection {collection_name}.")
        continue

    # Split the documents into chunks
    splits = text_splitter.split_documents(docs)

    # Create a Chroma vector database from the document splits, using OpenAIEmbeddings for embedding
    Chroma.from_documents(
        documents=splits,
        embedding=embeddings_model,
        persist_directory=db_directory,
        collection_name=collection_name,
    )

Status: loaded, waiting for processing to complete
Status: split, waiting for processing to complete
Status: loaded, waiting for processing to complete
Status: split, waiting for processing to complete
Status: loaded, waiting for processing to complete
Status: split, waiting for processing to complete
Status: loaded, waiting for processing to complete
Status: split, waiting for processing to complete
Status: loaded, waiting for processing to complete
Status: loaded, waiting for processing to complete
Status: split, waiting for processing to complete
Status: loaded, waiting for processing to complete
Status: split, waiting for processing to complete
Status: loaded, waiting for processing to complete
Status: split, waiting for processing to complete
Status: loaded, waiting for processing to complete
Status: split, waiting for processing to complete
Status: loaded, waiting for processing to complete
Status: split, waiting for processing to complete
Status: loaded, waiting for processing t

In [35]:
# Test
# Collection used for the loader experiment in this cell
collection_name = "formula"

# Folder below ./data that holds the PDFs of that collection
folder_path = f"./data/{collection_name}"

def load_pdfs_from_folder_test(folder_path, loader_option = 5):
    """Load the PDFs in ``folder_path`` for the test cell.

    Delegates to ``load_pdfs_from_folder`` so that the loader selection logic
    exists only once instead of as a second, identical copy.

    Args:
        folder_path: Folder that is searched for ``*.pdf`` files.
        loader_option: Loader selector (1-5), passed through unchanged.

    Returns:
        The list of documents returned by ``load_pdfs_from_folder``.
    """
    return load_pdfs_from_folder(folder_path, loader_option)

# Load the test folder with loader option 5 (Mathpix) and show the number of
# documents that were returned.
docs5 = load_pdfs_from_folder_test(folder_path, loader_option = 5)
len(docs5)


Status: loaded, waiting for processing to complete


1

In [37]:
# Show the raw representation of the loaded documents
docs5

[Document(page_content='c) For any fixed income instruments for which a credit rating is not available from any of the rating agencies named in step (a) or step (b) (as per the insurer\'s selection), the insurer may elect to either leave the assets as unrated (i.e. BSCR rating 8) or obtain the BSCR rating from the National Association of Insurance Commissioners (NAIC) Securities Valuation Office (SVO) rating.\n\nC2.3j The table below contains, for each credit rating agency, the rating categories for which their ratings are allowed for BSCR purposes (marked with \' $\\mathrm{X}$ \'). Additionally, the NAIC SVO ratings may be applied for otherwise unrated assets in accordance with the previous paragraph.\n\n| Credit Rating <br> Agency | Principal <br> Office | Financial <br> institutions, <br> brokers and <br> dealers | Insurance <br> companies | Corporate <br> issuers | Issuers of <br> asset-backed <br> securities | Government <br> securities, municipal <br> securities, foreign <br> gov

In [36]:
# Render the text of the first loaded document as Markdown
display(Markdown(docs5[0].page_content))

c) For any fixed income instruments for which a credit rating is not available from any of the rating agencies named in step (a) or step (b) (as per the insurer's selection), the insurer may elect to either leave the assets as unrated (i.e. BSCR rating 8) or obtain the BSCR rating from the National Association of Insurance Commissioners (NAIC) Securities Valuation Office (SVO) rating.

C2.3j The table below contains, for each credit rating agency, the rating categories for which their ratings are allowed for BSCR purposes (marked with ' $\mathrm{X}$ '). Additionally, the NAIC SVO ratings may be applied for otherwise unrated assets in accordance with the previous paragraph.

| Credit Rating <br> Agency | Principal <br> Office | Financial <br> institutions, <br> brokers and <br> dealers | Insurance <br> companies | Corporate <br> issuers | Issuers of <br> asset-backed <br> securities | Government <br> securities, municipal <br> securities, foreign <br> government <br> securities |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Moody's Investor's <br> Service* | U.S. | $x$ | $X$ | $x$ | $x$ | $x$ |
| Standard and <br> Poor's $^{*}$ | U.S. | $x$ | $x$ | $x$ | $x$ | $x$ |
| Fitch Ratings* | U.S. | $x$ | $x$ | $x$ | $x$ | $x$ |
| Dominion Bond <br> Rating Service* | U.S. | $x$ | $x$ | $x$ | $x$ | $x$ |
| A.M. Best <br> Company* | U.S. |  | $x$ | $x$ | $x$ |  |
| Kroll Bond Rating <br> Agency* | U.S. | $x$ | $x$ | $x$ | $x$ | $x$ |
| Egan Jones Rating <br> Company* | U.S. | $x$ | $x$ | $x$ |  |  |
| Japan Credit Rating <br> Agency* $^{*}$ | Japan | $x$ | $x$ | $x$ |  | $x$ |

$\left.{ }^{*}\right)$ As determined by the SEC.

C2.3k The mapping of credit ratings to BSCR ratings are:

| BSCR <br> Rating | Standard and <br> Poor's | Moody's | AM Best | Fitch |
| :---: | :---: | :---: | :---: | :---: |
| 1 | AAA | Aaa | aaa | AAA |
| 2 | AA+ to AA- | Aa1 to Aa3 | aa+ to aa- | AA+ to AA- |
| 3 | A+ to A- | A1 to A3 | a+ to a- | A+ to A- |
| 4 | BBB+ to BBB- | Baa1 to Baa3 | bbb+ to bbb- | BBB+ to BBB- |
| 5 | BB+ to BB- | Ba1 to Ba3 | bb+ to bb- | BB+ to BB- |
| 6 | B+ to B- | B1 to B3 | b+ to b- | B+ to B- |
| 7 | CCC+ to CCC- | Caa1 to Caa3 | ccc+ to ccc- | CCC+ to CCC- |
| 8 | Below CCC- | Below Caa3 | Below ccc- | Below CCC- |


| BSCR <br> Rating | KBRA | DBRS | Egan-Jones | Japan Credit <br> Rating Agency |
| :---: | :---: | :---: | :---: | :---: |
| 1 | AAA | AAA | AAA | AAA |
| 2 | AA+ to AA- | AA (High) to AA (Low) | AA+ to AA- | AA+ to AA- |
| 3 | A+ to A- | A (High) to A (Low) | A+ to A- | A+ to A- |
| 4 | BBB+ to BBB- | BBB (High) to BBB (Low) | BBB+ to BBB- | BBB+ to BBB- |
| 5 | BB+ to BB- | BB (High) to BB (Low) | BB+ to BB- | BB+ to BB- |
| 6 | B+ to B- | B (High) to B (Low) | B+ to B- | B+ to B- |
| 7 | CCC+ to CCC- | CCC (High) to CCC (Low) | CCC+ to CCC- | CCC+ to CCC- |
| 8 | Below CCC- | Below CCC (Low) | Below CCC- | Below CCC- |


| BSCR Rating | NAIC SVO |
| :---: | :---: |
| 1 | - |
| 2 | - |
| 3 | 1 |
| 4 | 2 |
| 5 | 3 |
| 6 | 4 |
| 7 | 5 |
| 8 | 6 |

C2.3l A BSCR rating of 0 (not included above) has been provided for certain high-quality fixed-income investments, specifically sovereign bonds and bond mutual funds.

## Additional Guidance

Figures are to be reported in thousand units ('000s)

C2.ii. Although the insurer does not need to prepare its financial statements in thousands units, the Authority requires insurers to report its statutory financial statements and economic balance sheet in thousands as this impacts the capital charges calculated in the BSCR model. The insurer may attach its Statutory Financial Return under "Other Attachments" to provide the BMA with a more accurate financial position

## Applying a BSCR Rating to unquoted internally rated investments

C2.iii. Insurers that have developed an internal rating for unquoted investments shall apply a BSCR rating of 8 . Nevertheless, if an insurer would like to request permission to use an equivalent scale of these investments in their BSCR filing, the insurer shall request such permission in writing to the BMA and include details on the internal rating assessment and a proposed equivalent scale to the BSCR Rating scale. Only upon approval shall the insurer reclassify an unquoted investment from BSCR Rating 8.

Government National Mortgage Association, Federal National Mortgage Association and Federal Home Loan Mortgage Corporation are not eligible for BSCR Rating 0

## D.SUMMARY

## D1. FEATURES - BSCR ON CURRENT BASIS

## Background

D1.1 The Summary exhibit has seven key features: Required Capital and Surplus, Available Statutory Capital and Surplus, MSM, ECR and TCL, Ratios, Solvency Capital Distribution chart and Regulatory Action Level graph. Each feature is described below. At the bottom of the page the BSCR formula for combining the various risk capital charges is displayed. The only financial data input into the Summary exhibit is the BMA-approved Capital Contribution of the insurer.

## Required Capital and Surplus

D1.2 The BSCR is determined according to the following formula:

![](https://cdn.mathpix.com/cropped/2024_03_14_87cc6d329b33981d47dag-3.jpg?height=417&width=1616&top_left_y=1062&top_left_x=342)

Where:

$C_{f i} \quad=$ capital charge in respect of fixed income investment risk;

$C_{e q} \quad=$ capital charge in respect of equity investment risk capital;

$C_{\text {LTint }}=$ capital charge in respect of interest rate and liquidity risk;

$C_{\text {curr }}=$ capital charge in respect of currency risk;

$C_{\text {conc }}=$ capital charge in respect of concentration risk;

$C_{\text {LTcred }}=$ capital charge in respect of credit risk capital;

$C_{\text {LTmort }}=$ capital charge in respect of long-term insurance risk - mortality;

$C_{L T s l}=$ capital charge in respect of long-term insurance risk - stop loss;

$C_{L T r}=$ capital charge in respect of long-term insurance risk - riders;

$C_{\text {LTmorb }}=$ capital charge in respect of long-term insurance risk - morbidity and disability;

## D2. FIXED INCOME INVESTMENT RISK

## Background

D2.1 There are various categories of assets comprising of bonds, loans and other miscellaneous investments that are used to determine the Fixed Income Investment Risk capital charge.

D2.2 Where applicable, the amounts must reconcile to the appropriate line(s) of the insurer's Form 4 EBS or to the schedules prescribed by or under the Rules for the relevant year.

## Fixed Income Investment Risk Capital Charge

D2.3 The fixed income investment risk charge calculation can be summarised by the following formula:

$C_{f i}=\sum_{i} \chi_{i} \times \text{FIastclass}_{i} \times \mu_{r}$, where:

$i \quad=$ ranges over the classes set out below;

$\chi_{i} \quad=$ BMA supplied asset class capital charge factor for type of fixed income asset class $i$;

$\text{FIastclass}_{i} \quad=$ value of investment in fixed income asset class $i$; and

$\mu_{r} \quad=$ additional diversification adjustment factor applied to cash and cash equivalent balances, or 1 for other asset classes.

## Items

a) Corporate and Sovereign Bonds

| Line Item |  | Statement Source - The Rules |
| :--- | :--- | :--- |
| 1 | BSCR rating 0 | Based on Schedule II EBS and IIA EBS, line 1, column (1). |
| 2 | BSCR rating 1 | Based on Schedule II EBS and IIA EBS, line 2, column (1). |
| 3 | BSCR rating 2 | Based on Schedule II EBS and IIA EBS, line 3, column (1). |
| 4 | BSCR rating 3 | Based on Schedule II EBS and IIA EBS, line 4, column (1). |
| 5 | BSCR rating 4 | Based on Schedule II EBS and IIA EBS, line 5, column (1). |
| 6 | BSCR rating 5 | Based on Schedule II EBS and IIA EBS, line 6, column (1). |
| 7 | BSCR rating 6 | Based on Schedule II EBS and IIA EBS, line 7, column (1). |
| 8 | BSCR rating 7 | Based on Schedule II EBS and IIA EBS, line 8, column (1). |
| 9 | BSCR rating 8 | Based on Schedule II EBS and IIA EBS, line 9, column (1). |

b) Residential Mortgage-Backed Securities



In [None]:
# Split the documents into chunks with a SemanticChunker that is driven by the
# embedding model.
# NOTE(review): this splits `docs`, not the `docs5` loaded in the test cell
# above - confirm that this is the intended input.
text_splitter = SemanticChunker(embeddings_model)
splits = text_splitter.split_documents(docs)

In [None]:
# Inspect a single chunk. The index 312 is hard-coded, so this fails when
# fewer chunks were produced.
splits[312]

In [None]:
# Embed the chunks and store them in the collection named by `collection_name`
# inside the persisted Chroma directory.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings_model,
    persist_directory=db_directory,
    collection_name=collection_name,
)

In [None]:
# Show the collection name that is currently in use
print(collection_name)

# 3. Retrieve from the Vector DB 

In [None]:
## FYI - not used
# Pulls the "rlm/rag-prompt" prompt from the LangChain hub for reference only;
# `prompt` is reassigned when the custom chat prompt is built further below.
prompt = hub.pull("rlm/rag-prompt")
prompt

In [None]:
# Open the persisted "Bermuda" collection as the vector store for retrieval.
# A user may choose a different collection name from the list.
vectorstore = Chroma(
    collection_name="Bermuda",
    persist_directory=db_directory,
    embedding_function=embeddings_model,
)

In [None]:
## Retrieve and RAG chain
# Create a retriever that searches the vector database with MMR (Maximum
# Marginal Relevance) to find a set of documents that are both similar to the
# input query and diverse among themselves.
#   k           - number of documents to get (increased)
#   lambda_mult - diversity (0.5 being default, 0 being the most diverse, 1 being the least)
# To restrict the search to a specific document, add a filter entry, e.g.
#   filter={'source': '201611-Guidance-Notes-for-Commercial-Insurers-and-Groups-Statutory-Reporting-Regime-30-Nov-2016.pdf'}
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=6, lambda_mult=0.5),
)

# Load the RAG (Retrieval-Augmented Generation) prompt
#prompt = hub.pull("rlm/rag-prompt")

# System prompt of the QA chain. Every line ends with a space followed by a
# line continuation, so the sentences are joined into one paragraph separated
# by single spaces; the retrieved context is inserted on its own line at the
# end. (The space was missing after "best estimate liabilities.", which glued
# two sentences together.)
qa_system_prompt = """You are a helpful assistant to help actuaries with question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
ASOP or asop means Actuarial Standards of Practice. \
CFT means Cash Flow Testing. AAT means Asset Adequacy Testing. \
BMA means Bermuda Monetary Authority. \
SBA means scenario-based approach. BEL means best estimate liabilities. \
After you answer, provide the sources you used to answer the question. \
If you don't know the answer, just say that you don't know. \

{context}"""
# Chat prompt: the system message carries the instructions and the retrieved
# {context}; the human message carries the {question}.
prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    ("human", "{question}"),
])

# Define a function to format the documents with their sources and pages
def format_docs_with_sources(docs):
    """Build the context text that is handed to the prompt.

    Args:
        docs: Iterable of documents exposing ``page_content`` and a
            ``metadata`` mapping with a ``'source'`` entry. An integer
            ``'page'`` entry is optional and assumed to start at 0.

    Returns:
        A string with the document contents separated by blank lines,
        followed by one line per document naming its source and, when the
        page is known, the page number presented 1-based.
    """
    def describe_source(doc):
        source = doc.metadata['source']
        page = doc.metadata.get('page')
        # Not every loader records a page, so it is only shown when present.
        # bool is excluded because it is a subclass of int.
        if isinstance(page, int) and not isinstance(page, bool):
            return f"{source} (Page {page + 1})"
        # Previously a stray ")" was appended to every source name.
        return f"{source}"

    docs = list(docs)  # allow one-shot iterables; docs are traversed twice
    formatted_docs = "\n\n".join(doc.page_content for doc in docs)
    sources_pages = "\n".join(describe_source(doc) for doc in docs)

    return f"Documents:\n{formatted_docs}\n\nSources and Pages:\n{sources_pages}"

# Create a RAG chain using the formatted documents as the context
def _context_to_text(inputs):
    """Replace the retrieved documents under "context" by their formatted text."""
    return format_docs_with_sources(inputs["context"])


# context formatting -> prompt -> chat model -> plain string answer
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=_context_to_text)
    | prompt
    | llm
    | StrOutputParser()
)

# Create a parallel chain: the retriever fills "context" while the question is
# passed through unchanged; the generated answer is then added under "answer".
rag_chain_with_source = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

# 4. Generate Q&A Function

In [None]:
def generate_output(question=None):
    """Answer a question with the RAG chain and display the result as Markdown.

    Args:
        question: The question to ask. When omitted (``None``), the user is
            prompted interactively, which is the original behaviour.

    The displayed Markdown contains the question, the answer and the retrieved
    context documents with their sources. A context document whose content is
    identical to the one directly before it is listed only once.
    """
    if question is None:
        # Prompt the user for a question on ASOP
        question = input("What is your question on ASOP?: ")

    # Invoke the RAG chain with the question
    output = rag_chain_with_source.invoke(question)

    # Collect the Markdown pieces and join them once at the end
    sections = [
        "### Question\n{}\n\n### Answer\n{}\n\n### Context\n".format(output['question'], output['answer'])
    ]

    previous_content = None  # content of the previously visited document
    source_number = 1  # running number of the listed sources

    for doc in output['context']:
        # Two trailing spaces force a Markdown line break at every newline
        content = doc.page_content.replace('\n', '  \n')

        # Skip a document that merely repeats the one directly before it
        if content != previous_content:
            sections.append(
                "- **Source {}**: {}:\n\n{}\n".format(source_number, doc.metadata['source'], content)
            )
            source_number += 1
        previous_content = content

    # Display the Markdown output
    display(Markdown("".join(sections)))

# Example questions related to ASOPs
- explain ASOP No. 14
- How are expenses reflected in cash flow testing based on ASOP No. 22?
- What is catastrophe risk?
- When do I update assumptions?
- What should I do when I do not have credible data to develop non-economic assumptions?

In [None]:
# Ask for a question interactively and display the answer with its context
generate_output()

# 5. References
- https://www.actuarialstandardsboard.org/standards-of-practice/
- https://python.langchain.com/docs/use_cases/question_answering/quickstart
- https://python.langchain.com/docs/use_cases/question_answering/sources
- https://python.langchain.com/docs/integrations/text_embedding/
- https://python.langchain.com/docs/integrations/vectorstores/chroma
- https://docs.gpt4all.io/gpt4all_python_embedding.html#gpt4all.gpt4all.Embed4All
- https://chat.langchain.com/
- https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.chroma.Chroma.html

# Management of the vector database

In [None]:
# Open the persisted Chroma directory directly with the chromadb client
client = chromadb.PersistentClient(path=db_directory)

In [None]:
#client.delete_collection(name="CFT") # Delete a collection and all associated embeddings, documents, and metadata (not reversible)

In [None]:
# Look up the "CFT" collection and inspect it.
collection = client.get_collection(name="CFT")
# Only the last expression of a notebook cell is displayed, so the item count
# has to be printed explicitly to be visible.
print(collection.count())
# Show a sample of the stored items
collection.peek()