# Summary chain
## 1. Set up environment
Load environment variables (such as API keys) from a local `.env` file. The required packages must already be installed in the kernel's environment.

In [1]:
# Setup environment variables
import dotenv
import os
import glob
from IPython.display import display, Markdown
import json

# Load variables defined in a local .env file into the process environment.
dotenv.load_dotenv()

# Re-export the Anthropic key only when it is actually defined. Assigning None to
# os.environ raises TypeError, which would break this cell for setups that only
# configure the OpenAI key used by the model selected further down.
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
if anthropic_api_key is not None:
    os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key

## 2. Import modules
Import the modules required for the summary chain.

In [2]:
# Import

from langchain_anthropic import ChatAnthropic
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PDFMinerLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain

# For semantic chunking - not used
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
# For load summarize chain - not used
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.text_splitter import TokenTextSplitter
from langchain.chains import MapReduceChain

## 3. Set up model
Set up the LLM and the prompt for the stuff chain (summary of a full document).

In [3]:
# Chat model shared by every chain in this notebook (temperature fixed at 0).
# Alternative kept for reference:
#llm = ChatAnthropic(model_name="claude-3-sonnet-20240229", temperature=0)
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# Summary instructions; the document text is injected through the {text} placeholder.
prompt_template = """Write a summary of the following document using the format provided. The summary includes the title, publisher and published date if available, purpose and scope, key points, conclusion and implications, and key words. The summary should be comprehensive yet brief, aiming for a reading time of no more than one minute. Avoid any translation or substitution of actuarial terms in the document. When starting a summary, begin the summary with "Title:" without saying "Here is a summary". Here is the summary format:

Title: [title of the document]\n\nPublisher and Published Date: [published date and publisher's name]\n\nPurpose and Scope: [note the purpose and scope]\n\nKey Points:\n- [indicate key points in bullet point format]\n\nConclusions and Implications: [describe conclusion and implications for regulatory and practical purposes in the actuarial field]\n\nKey words: [indicate top five key words from the document]

Here is the document to summarize:

"{text}"
"""
prompt = PromptTemplate.from_template(prompt_template)

# Pair the prompt with the model
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Stuff chain: the loaded documents are passed to the prompt as "text"
stuff_chain = StuffDocumentsChain(document_variable_name="text", llm_chain=llm_chain)

In [4]:
# Map-reduce summarization, intended to avoid the model's token limitations.
# Background reading:
# https://harikirankante.hashnode.dev/how-to-summarize-large-documents-using-langchain-and-openai-in-python#heading-map-reduce-chain

# NOTE: this rebinds `stuff_chain`. The prompt's only input variable is "text", so
# the document variable is named explicitly here (as it is for the map step below)
# instead of relying on it being inferred from the prompt.
stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="text",
)

# Reduce step: combines the per-chunk summaries with the stuff chain
reduce_chain = ReduceDocumentsChain(
    combine_documents_chain=stuff_chain,
)

# Map step summarizes each chunk with `llm_chain`; the reduce step merges the results
map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="text",
    reduce_documents_chain=reduce_chain,
)


### From md files

In [5]:
#########################################################################
# Markdown File summarizer
#########################################################################
# Collections (sub-directories of ../data/md) to summarize.
# Uncomment an entry to include it in the run.
collection_list = [
    #"legislation",
    #"ifrs17",
    #"solvencia_2",
    #"reg_delegado_v2",
    "reg_execucao"
]

# Start from the summaries saved by a previous run, if there are any, so that
# entries for files not processed in this run are preserved.
if os.path.exists("summary.json"):
    with open("summary.json", "r", encoding="utf-8") as f:
        results = json.load(f)
else:
    # No previous run: start with an empty dictionary keyed by file name
    results = {}

# The splitter does not depend on the file being processed, so build it once
splitter = TokenTextSplitter(chunk_size=2000)

try:
    for collection_name in collection_list:
        # Directory holding the markdown files of the current collection
        collection_dir = f"../data/md/{collection_name}"

        # sorted() makes the processing order reproducible across runs
        for file_name in sorted(os.listdir(collection_dir)):
            if not file_name.endswith(".md"):
                continue
            print(file_name)
            md_path = os.path.join(collection_dir, file_name)

            # Load the markdown file and split it into smaller chunks
            loader = UnstructuredMarkdownLoader(md_path)
            doc = loader.load()
            split_docs = splitter.split_documents(doc)

            # invoke() replaces the deprecated run(); the summary text is returned
            # under the "output_text" key
            output = map_reduce_chain.invoke({"input_documents": split_docs})["output_text"]

            record = {
                "collection_name": collection_name,
                "summary": output
            }
            # Records are keyed by the name of the matching PDF. Only the extension
            # is swapped; a ".md" elsewhere in the name is left untouched.
            file_name_pdf = os.path.splitext(file_name)[0] + ".pdf"
            results[file_name_pdf] = record
finally:
    # Written in a `finally` so that summaries completed before an interruption
    # (e.g. KeyboardInterrupt) or an API error are not lost.
    with open("summary.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

CELEX_32023R0894_EN_TXT219.md


  warn_deprecated(


CELEX_32023R0894_EN_TXT188.md


KeyboardInterrupt: 

In [21]:
# NOTE(review): disabled single-PDF example. Every statement below is commented out,
# so running this cell does nothing. The summary displayed under this cell presumably
# comes from an earlier run of this code (the execution count is out of sequence with
# the neighbouring cells) — confirm, then either restore the code or clear the output.
# pdf_path = "./data/pdf/VM21/202401 VM-21 pbr_data_valuation_manual_future_edition.pdf"
# loader = PDFMinerLoader(pdf_path, concatenate_pages=True)
# doc = loader.load()

# output = stuff_chain.invoke(doc)

# display(Markdown(output['output_text']))

Here is a summary of the key points from the actuarial document on valuation requirements for variable annuities:

Title: Valuation Manual - VM-21: Requirements for Principle-Based Reserves for Variable Annuities

Purpose and Scope: This section establishes the minimum reserve valuation standards for variable annuity contracts with guarantees. It constitutes the Commissioners Annuity Reserve Valuation Method (CARVM) for these contracts.

Key Points:
- Requires a stochastic reserve calculation on a combined group of contracts using prudent estimate assumptions and stochastic scenarios for equity returns and interest rates.
- The stochastic reserve is calculated as the Conditional Tail Expectation (CTE) 70 of the projected accumulated deficiencies under a set of capital market scenarios.
- Provides requirements for mapping funds to prescribed asset categories and fund data sources.
- Details methodologies for determining prudent estimate mortality, lapse, premium persistence, and other behavioral assumptions.
- Outlines requirements for the additional standard projection amount using either the Conditional Tail Expectation with Prescribed Assumptions (CTEPA) method or the Compnay-Specific Market Path (CSMP) method.
- Allows an alternative methodology for contracts with no guaranteed living benefits.
- Specifies guidance for modeling revenue sharing, hedging programs, and reinsurance.
- Requires an allocation of the aggregate reserve to individual contracts based on risk analysis.

Conclusions and implications: This prescribed reserve methodology aims to achieve a consistent valuation standard across companies for variable annuity guarantees by defining specific requirements around stochastic projections, prudent estimate assumptions, valuation scenarios, modeling techniques, and reporting.

### From pdf files

In [6]:

#########################################################################
# PDF File summarizer
#########################################################################
# Collections (sub-directories of ../data/pdf) to summarize.
# Uncomment an entry to include it in the run.
collection_list = [
    #"legislation",
    #"ifrs17",
    #"solvencia_2",
    #"reg_delegado_v2",
    "reg_execucao"
]

# Start from the summaries saved by a previous run, if there are any, so that
# entries for files not processed in this run are preserved.
if os.path.exists("summary.json"):
    with open("summary.json", "r", encoding="utf-8") as f:
        results = json.load(f)
else:
    # No previous run: start with an empty dictionary keyed by file name
    results = {}

# The splitter does not depend on the file being processed, so build it once
splitter = TokenTextSplitter(chunk_size=2000)

try:
    for collection_name in collection_list:
        # Directory holding the PDF files of the current collection
        collection_dir = f"../data/pdf/{collection_name}"

        # sorted() makes the processing order reproducible across runs
        for file_name in sorted(os.listdir(collection_dir)):
            if not file_name.endswith(".pdf"):
                continue
            # Only files that are actually summarized are reported
            print(file_name)
            pdf_path = os.path.join(collection_dir, file_name)

            # Load the PDF and split it into smaller chunks
            loader = PDFMinerLoader(pdf_path, concatenate_pages=True)
            doc = loader.load()
            split_docs = splitter.split_documents(doc)

            # invoke() replaces the deprecated run(); the summary text is returned
            # under the "output_text" key
            output = map_reduce_chain.invoke({"input_documents": split_docs})["output_text"]

            record = {
                "collection_name": collection_name,
                "summary": output
            }

            # Records are keyed by the PDF file name
            results[file_name] = record
finally:
    # Written in a `finally` so that summaries completed before an interruption
    # (e.g. KeyboardInterrupt) or an API error are not lost.
    with open("summary.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

CELEX_32023R0894_EN_TXT.pdf
CELEX_32023R0894_PT_TXT.pdf
