# Transforming the PDF files into vector database  

# 1. Initial Setup
This setup includes loading environment variables from a `.env` file, setting the required environment variables, and importing the necessary modules for further processing. It ensures that the code has access to the required APIs and functions for the subsequent tasks.


In [91]:
# Initial set up
from dotenv import load_dotenv
import os
import openai
# Load the variables from .env file and set the API key (or user may manually set the API key)
load_dotenv()  # This loads the variables from .env (not part of repo)
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    # Assigning None to os.environ would raise an opaque "str expected, not NoneType"
    # TypeError, so stop here with a message that says what to fix.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )
# Expose the key both through the environment and on the openai module
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key

# Import the necessary modules
from langchain import hub
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.llms import Ollama
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel # for RAG with source
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from IPython.display import display, Markdown, Latex
import glob
import chromadb
## 
from langchain_community.document_loaders import PyPDFium2Loader
# from langchain_community.document_loaders import PyMuPDFLoader ## not used
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.document_loaders import PDFMinerLoader
from langchain_core.prompts import ChatPromptTemplate

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor


In [104]:
## Initial variable setup
# Embedding function passed to Chroma when the vector stores are built and reopened below
embeddings_model = OpenAIEmbeddings()
# Directory in which the Chroma collections are persisted
db_directory = "./data/chroma"
# Chat model used at the end of the RAG chain; temperature=0 is set explicitly
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) # context window size 16k for GPT 3.5 Turbo    

# 2. Load PDF Files and Convert to a Vector DB

In [102]:
# Define a function to load and extract text from PDFs in a folder
def get_file_name(source_path):
    """Return the final component (file name) of ``source_path``.

    ``os.path.basename`` is used instead of splitting on ``'/'`` so that the
    platform's own path separators are honoured (on Windows, ``glob`` returns
    backslash-separated paths that a plain ``split('/')`` would not strip).

    Args:
        source_path: Path to a file, relative or absolute.

    Returns:
        The file name without any directory part; an empty string when the
        path ends with a separator.
    """
    return os.path.basename(source_path)

def load_pdfs_from_folder(folder_path):
    """Load every ``*.pdf`` directly inside ``folder_path`` as documents.

    Each file is read with ``PDFMinerLoader(..., concatenate_pages=True)`` and
    the ``source`` metadata of every returned document is overwritten with the
    bare file name (see ``get_file_name``) instead of the full path.

    Args:
        folder_path: Directory to scan; sub-directories are not searched.

    Returns:
        A list with the documents of all PDFs, ordered by file path. The list
        is empty when the folder contains no PDF (or does not exist).
    """
    # glob returns matches in a filesystem-dependent order; sort them so that
    # repeated runs process the files in the same order.
    pdf_files = sorted(glob.glob(f"{folder_path}/*.pdf"))
    docs = []
    for pdf_file in pdf_files:
        file_name = get_file_name(pdf_file)

        # Load the PDF file using PDFMinerLoader
        # (alternative kept for reference: PyPDFLoader(pdf_file))
        loader = PDFMinerLoader(pdf_file, concatenate_pages=True)
        loaded_docs = loader.load()

        # Keep only the file name as the source shown to the user
        for doc in loaded_docs:
            doc.metadata['source'] = file_name

        docs.extend(loaded_docs)
    return docs

In [103]:
############################################################################
# Original Chunk
############################################################################
# One Chroma collection is built per entry; each name is also the sub-folder
# of ./data that holds the PDFs for that collection.
collection_list=[
    "ASOP_life",
    "Bermuda",
    "CFT",
    "VM21",
    "VM22",
    "Asset",
    "IFRS17"
]

#collection_list=["CFT"]

# The splitter does not depend on the collection, so it is created once and
# reuses the shared embeddings_model instead of building a new OpenAIEmbeddings
# client on every iteration.
# Alternative kept for reference (fixed-size chunks):
#   RecursiveCharacterTextSplitter(chunk_size=1000,  # 1000 splits a page into roughly 3 chunks
#                                  chunk_overlap=200, length_function=len)
text_splitter = SemanticChunker(embeddings_model)

for collection_name in collection_list:
    # Example folder path
    folder_path = './data/'+collection_name

    # Call the function to load and extract text from PDFs in the specified folder
    docs = load_pdfs_from_folder(folder_path)
    if not docs:
        # Nothing to embed: skip instead of handing an empty list to Chroma
        print(f"No PDF documents found in {folder_path}; skipping collection '{collection_name}'.")
        continue

    # Split the documents into chunks using the text splitter
    splits = text_splitter.split_documents(docs)

    # Create a Chroma vector database from the document splits, using OpenAIEmbeddings for embedding
    vectorstore = Chroma.from_documents(documents=splits,
                                        embedding=embeddings_model,
                                        persist_directory=db_directory,
                                        collection_name=collection_name)

In [78]:
# Test
# Collection used for the experiments in the following cells
collection_name = "Bermuda"

# Folder that holds the PDFs of that collection
folder_path = f"./data/{collection_name}"
#file_name = "201709 Asset_Adequacy_PracticeNote.pdf"

def load_pdfs_from_folder_test(folder_path):
    """Experimental twin of the PDF loader, used to try out different loaders.

    Reads every ``*.pdf`` directly inside ``folder_path`` with
    ``PDFMinerLoader(..., concatenate_pages=True)``, replaces each document's
    ``source`` metadata with the bare file name, and returns all documents in
    one list (empty when no PDF matches).
    """
    collected = []
    for path in glob.glob(f"{folder_path}/*.pdf"):
        base_name = get_file_name(path)

        # Loader under test. Alternatives tried here:
        #   PyPDFLoader(path)
        #   PyPDFium2Loader(path)  - known to be faster than PyPDFLoader
        #   PyMuPDFLoader(path)    - general purpose, rich metadata
        file_docs = PDFMinerLoader(path, concatenate_pages=True).load()

        # Show only the file name, not the whole path, as the source
        for item in file_docs:
            item.metadata['source'] = base_name

        collected += file_docs
    return collected

# Load the PDFs of the selected test folder and show how many documents were returned
docs = load_pdfs_from_folder_test(folder_path)
len(docs)


8

In [None]:
# Inspect the first loaded document
docs[0]

In [80]:
# Split the loaded test documents into chunks with a SemanticChunker that is
# given its own OpenAIEmbeddings instance
text_splitter = SemanticChunker(OpenAIEmbeddings())
splits = text_splitter.split_documents(docs)

In [85]:
# Inspect one chunk by index to check what the splitter produced
splits[312]

Document(page_content='Specifics of the method are provided below. 7. The  Authority has  developed  a  set  of  interest  rate  scenarios  that  have  been  calibrated \nusing an economic scenario generator to develop deviations that are approximately one \nstandard deviation away from  the  mean  so  as  to  target  events  that  may reasonably be \nexpected  to  occur. More  extreme  scenarios  would  be  captured  in  the  capital \nrequirement. These scenarios cover a number of  different  interest  rate  patterns  (such \nas  increasing,  decreasing,  increasing  and  decreasing, twists where the long and short \nterm rates behave differently, etc.)  The specific scenarios are as follows: \n\na. All  rates  decrease  annually  to  total  decrease  of  1.5%  in  tenth  year;  unchanged \n\nthereafter. b. All  rates  increase  annually  to  total  increase  of  1.5%  in  tenth  year;  unchanged \n\nthereafter. c. All  rates  decrease  annually  to  total  decrease  of  1.5%  in  fi

In [95]:
# Embed the test splits with embeddings_model and store them in the Chroma
# collection named by collection_name, persisted under db_directory
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=embeddings_model, 
                                    persist_directory=db_directory,collection_name=collection_name)

In [100]:
# Show which collection name is currently bound to collection_name
print(collection_name)

IFRS17


# 3. Retrieve from the Vector DB 

In [52]:
## FYI - not used
# Pull the "rlm/rag-prompt" template from the LangChain hub and display it for
# reference only; the RAG chain below uses a ChatPromptTemplate built from qa_system_prompt
prompt = hub.pull("rlm/rag-prompt")
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [96]:
# Get a Chroma vector database with specified parameters:
# the same embedding function and persist directory that were used to build the
# collections, plus the name of the collection to query
vectorstore = Chroma(embedding_function=embeddings_model, 
                     persist_directory=db_directory,
                     collection_name="Bermuda")
## A user may choose a different collection name from collection_list

In [97]:
## Retrieve and RAG chain
# Create a retriever using the vector database as the search source
# You may choose a specific document to filter the search (see the commented 'filter' entry)
retriever = vectorstore.as_retriever(search_type="mmr", 
                                     search_kwargs={
                                        'k': 6, 
                                        'lambda_mult': 0.5,
                                        # 'filter': {'source': '201611-Guidance-Notes-for-Commercial-Insurers-and-Groups-Statutory-Reporting-Regime-30-Nov-2016.pdf'}
                                        }
                                    ) 
# MMR (Maximal Marginal Relevance) selects documents that are both similar to the input query and diverse among themselves
# 'k' is the number of documents to return; 'lambda_mult' controls diversity (0.5 is the default, 0 is the most diverse, 1 the least)

# Load the RAG (Retrieval-Augmented Generation) prompt
#prompt = hub.pull("rlm/rag-prompt")

# System prompt for the actuarial assistant. A trailing backslash joins a line
# with the next one, so every joined line must end with a space to keep the
# sentences apart ("liabilities." was previously glued to "After you answer").
# The retrieved context is substituted for {context}.
qa_system_prompt = """You are a helpful assistant to help actuaries with question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
ASOP or asop means Actuarial Standards of Practice. \
CFT means Cash Flow Testing. AAT means Asset Adequacy Testing. \
BMA means Bermuda Monetary Authority. \
SBA means scenario-based approach. BEL means best estimate liabilities. \
After you answer, provide the sources you used to answer the question. \
If you don't know the answer, just say that you don't know. \

{context}"""
# Two-message chat prompt: the system instructions above plus the user's question
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        ("human", "{question}"),
    ]
)

# Define a function to format the documents together with their sources
def format_docs_with_sources(docs):
    """Build the context text handed to the prompt.

    Args:
        docs: Iterable of documents exposing ``page_content`` (str) and a
            ``metadata`` mapping that contains a ``'source'`` entry.

    Returns:
        A string with all page contents separated by blank lines, followed by
        one source name per line.
    """
    docs = list(docs)  # the documents are traversed twice below
    formatted_docs = "\n\n".join(doc.page_content for doc in docs)
    # Page numbers are not included; an earlier variant of this line appended
    # "(Page n)" and left a stray ")" after every source name, which is removed here.
    sources_pages = "\n".join(f"{doc.metadata['source']}" for doc in docs)

    return f"Documents:\n{formatted_docs}\n\nSources and Pages:\n{sources_pages}"

# Create a RAG chain using the formatted documents as the context:
# the "context" entry (a list of documents) is replaced by the text produced by
# format_docs_with_sources, then passed through the prompt, the LLM and a string parser
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs_with_sources(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

# Create a parallel chain for retrieving and generating answers:
# the input question is sent to the retriever ("context") and passed through unchanged
# ("question"); the generated text is then added under "answer", so the result
# keeps the retrieved documents next to the answer
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

# 4. Generate Q&A Function

In [98]:
def _format_context_markdown(context_docs):
    """Render the retrieved documents as numbered Markdown bullets, skipping consecutive repeats."""
    sections = []
    last_page_content = None  # Content of the previously visited document
    for doc in context_docs:
        # Two trailing spaces before a newline keep the line breaks in rendered Markdown
        current_page_content = doc.page_content.replace('\n', '  \n')

        # Only emit the document when its content differs from the one just before it
        if current_page_content != last_page_content:
            sections.append(
                "- **Source {}**: {}:\n\n{}\n".format(
                    len(sections) + 1, doc.metadata['source'], current_page_content
                )
            )
        last_page_content = current_page_content
    return "".join(sections)


def generate_output(question=None):
    """Ask a question through the RAG chain and display the result as Markdown.

    Args:
        question: Question to ask. When omitted (``None``), the user is
            prompted interactively, as before.

    Returns:
        None. The question, the answer and the retrieved context are rendered
        with ``display(Markdown(...))``. A blank question is reported and
        skipped without invoking the chain.
    """
    if question is None:
        # Prompt the user for a question on ASOP
        question = input("What is your question on ASOP?: ")

    # Do not spend a retrieval and an LLM call on an empty question
    if not question.strip():
        print("No question entered; nothing to answer.")
        return

    # Invoke the RAG chain with the question
    output = rag_chain_with_source.invoke(question)

    # Generate the Markdown output with the question, answer, and context
    markdown_output = "### Question\n{}\n\n### Answer\n{}\n\n### Context\n".format(output['question'], output['answer'])
    markdown_output += _format_context_markdown(output['context'])

    # Display the Markdown output
    display(Markdown(markdown_output))

# Example questions related to ASOPs
- explain ASOP No. 14
- How are expenses reflected in cash flow testing based on ASOP No. 22?
- What is catastrophe risk?
- When do I update assumptions?
- What should I do when I do not have credible data to develop non-economic assumptions?

In [99]:
generate_output()

### Question
explain eight interest rate scenarios used for Scenario-based approach.

### Answer
The eight interest rate scenarios used for the Scenario-based approach are as follows:

1. All rates decrease annually to a total decrease of 1.5% in the tenth year, unchanged thereafter.
2. All rates increase annually to a total increase of 1.5% in the tenth year, unchanged thereafter.
3. All rates decrease annually to a total decrease of 1.5% in the fifth year, then back up again by the tenth year.
4. All rates increase annually to a total increase of 1.5% in the fifth year, then back down again by the tenth year.
5. Decrease with a positive twist to the following net change after ten years (interpolate for other durations): -1.5%.
   - Year 1 spot rate
   - Year 10 spot rate
   - Year 30 spot rate
6. Decrease with a negative twist to the following net change after ten years (interpolate for other durations): -0.5%.
   - Year 1 spot rate
   - Year 10 spot rate
   - Year 30 spot rate
7. Increase with a positive twist to the following net change after ten years.
   - Year 1 spot rate: +0.5%
   - Year 10 spot rate: +1.0%
   - Year 30 spot rate: +1.5%
8. Increase with a negative twist to the following net change after ten years.
   - Year 1 spot rate: +1.5%
   - Year 10 spot rate: +1.0%
   - Year 30 spot rate: +0.5%

These scenarios are designed to reflect different interest rate patterns and behaviors, allowing for a comprehensive assessment of interest rate risk under the Scenario-based approach.

Sources:
BMA 202312-Supervision-and-Regulation-of-Private-Equity-Insurers.pdf

### Context
- **Source 1**: 201611-Guidance-Notes-for-Commercial-Insurers-and-Groups-Statutory-Reporting-Regime-30-Nov-2016.pdf:

Specifics of the method (including calculation details) are provided below. 257. The Authority has developed a set of interest rate scenarios to be used in this method. a. These   scenarios   will   cover   a   number   of   different   interest   patterns   (such   as   
increasing  decreasing,  increasing  and  decreasing,  twists  where  the  long  and  short   
term rates behave differently etc.)   
  
b. These  scenarios  have  been  calibrated  using  an  economic  scenario  generator  such   
that  the  deviations  are  approximately  one  standard  deviation  away  from  the  mean   
so  as  target  events  that  may  reasonably  be  expected  to  occur. More  extreme   
scenarios would be reflected in the capital requirement. c.
- **Source 2**: 2023-12-20-11-08-13-2023-Year-End-Long-Term-Instructions-Handbook.pdf:

and the base scenario under the scenario -based approach, according to the formula   
specified in the Prudential Rules.    
C28.1g  As a simplification companies using the scenario -based approach may calculate the capital   
charge for interest rate risk (before the application of offset) based on shocks to the   
balance sheet as if the base scenario had been applied. In this case the offse t is to be   
calculated as the difference between the ‘worst ’ scenario and the base scenario before the   
application of shocks.
- **Source 3**: 2023-12-20-13-15-07-2023-Year-End-Stress-and-Scenario-Instructions-for-Class-C-D--E.pdf:

R8. Inflation   
and Monetary   
Policy Risk   
  
Inflation risk stems from the general uncertainty of prices. Higher than expected inflation decreases the real yield on loans and debts while it may increase   
the value of indemnities, claims and expenses. Simulate a scenario similar to the 2022   
inflationary scenario. The (re)insurer should apply each inflation scenario for three years assuming no initial action to   
curb inflation from central banks. In year four, the central bank changes stance and increases   
rates to restore the current real interest rate. From year five onwards, inflation and interest rates   
return to current levels. Scenario   
  
Moderate   
Inflation   
Severe Inflation   
  
Change in   
inflation   
rate (Y1)   
  
Change in   
inflation   
rate (Y2)   
  
+5.0%   
  
+5.0%   
  
Change in   
inflation   
rate (Y3)   
+5.0%   
  
Change in   
inflation and   
interest rate (Y4)   
+5.0%   
  
+10.0%   
  
+10.0%   
  
+10.0%   
  
+10.0%   
  
To clarify, these stresses should be additively applied to the prevailing annual inflation/interest   
rate assumption used in valuing asset and liabilities (e.g., if the prevailing assumption is 3% p.a. then the moderate stressed assumption should be 8% p.a. for the first four years before returning   
to 3% p.a.). Scenario   
Deflation scenario   
  
Stressed inflation rate   
-1.0%   
  
This stress should replace the prevailing annual inflation rate assumption used in valuing assets   
and liabilities (e.g., if the prevailing assumption is 3% p.a. then the deflation scenario   
assumption should be -1% p.a., i.e., a 4% p.a. reduction in expectations). The interest rate   
assumption in year four should mirror the change in the inflation rate (i.e., -4% p.a. in the   
previous example).
- **Source 4**: BMA 202312-Supervision-and-Regulation-of-Private-Equity-Insurers.pdf:

interest rate scenarios, i.e., the SBA, again by design, accepts that there is no single truth about the   
future of interest rates (hence discount curves) and reflects this uncertainty and its potential impact   
on asset  and liability cashflows in the BEL calculation. Where a mismatch exists, because of asset and   
liability cashflow dynamics, the SBA assigns an explicit cost by picking the worst of the eight scenarios   
to determine the BEL. Over the past two years, Bermuda insurers using the SBA have withstood even   
higher interest rate shocks than  those  shown in the illustrative example above.    
   
As part of its supervisory process, the Authority carries out several assessments, which include   
requiring insurers to demonstrate the degree of matching quantitatively and qualitatively for the   
insurer’s existing asset and liability portfolios for which the SBA is used or proposed to be used.
- **Source 5**: 2023-07-28-16-11-59-Consultation-Paper---Proposed-Enhancements-to-the-Regulatory-Regime-and-Fees-for-Commercial-Insurers.pdf:

assessed over different time horizons with a focus on those horizons over which particular   
risks are expected to arise. Insurer -specific and market -wide scen arios should be   
considered , including their combinations. The scenarios should cover fast -moving and   
more sustained scenarios where the insurer’s liquidity position deteriorates slowly. Tests   
should also be carried out to test the insurer’s liquidity break ing point (i.e., liquidity reverse   
stress tests ).
- **Source 6**: 201611-Guidance-Notes-for-Commercial-Insurers-and-Groups-Statutory-Reporting-Regime-30-Nov-2016.pdf:

The specific scenarios are as follows:   
  
i. All  rates  decrease  annually  to  total  decrease  of  1.5%  in  tenth   year;   
unchanged thereafter. ii. All   rates   increase   annually   to   total   increase   of   1.5%   in   tenth   year;   
  
unchanged thereafter. iii. All  rates  decrease  annually  to  total  decrease  of  1.5%  in  fifth  year,  then   
  
back up again by tenth year. iv. All rates increase annually to total increase of 1.5% in fifth year, then back   
  
down again by tenth year. v. Decrease  with  positive  twist  to  the  following  net  change  after  ten  years   
  
(interpolate for other durations):    
  
-1.5%    
i. Year 1 spot rate   
ii. Year 10 spot rate   -1.0%    
iii. Year 30 spot rate   -0.5%   
  
vi. Decrease with negative twist to the following net change after for ten years   
  
(interpolate for other durations):    
  
-0.5%    
i. Year 1 spot rate   
ii. Year 10 spot rate   -1.0%    
iii. Year 30 spot rate   -1.5%   
  
45   
  
   
   
   
   
   
   
   
   
   
    
   
   
   
   
   
   
   
vii. Increase  with  positive  twist  to  the  following  net  change  after  ten  years   
(interpolate for other durations):    
  
i. Year 1 spot rate  +0.5%    
ii. Year 10 spot rate   +1.0%    
iii. Year 30 spot rate   +1.5%   
  
viii. Increase with negative twist to the following net change after for ten years   
  
(interpolate for other durations):     
  
 i. Year 1 spot rate  +1.5%    
ii. Year 10 spot rate   +1.0%    
iii. Year 30 spot rate   +0.5%   
  
257A. For purposes of calculating best estimate liabilities under the scenario-based method, the   
future yield curves under each scenario would be determined as follows:   
  
a. Convert initial spot rates to the corresponding forward rates.


# 5. References
- https://www.actuarialstandardsboard.org/standards-of-practice/
- https://python.langchain.com/docs/use_cases/question_answering/quickstart
- https://python.langchain.com/docs/use_cases/question_answering/sources
- https://python.langchain.com/docs/integrations/text_embedding/
- https://python.langchain.com/docs/integrations/vectorstores/chroma
- https://docs.gpt4all.io/gpt4all_python_embedding.html#gpt4all.gpt4all.Embed4All
- https://chat.langchain.com/
- https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.chroma.Chroma.html

# Management of the vector database

In [None]:
# Open the persisted Chroma database directly with the chromadb client (same directory as used above)
client = chromadb.PersistentClient(path=db_directory)

In [None]:
#client.delete_collection(name="CFT") # Delete a collection and all associated embeddings, documents, and metadata (cannot be undone)

In [None]:
# Look up the "CFT" collection, then query its size and a sample of its records
collection = client.get_collection(name="CFT") 
# NOTE(review): a notebook cell presumably echoes only its last expression, so the
# count() result below is likely not shown - confirm, or wrap it in print()
collection.count()
collection.peek()