## Libraries & environment

In [5]:
import os
import glob
import json

from langchain_community.document_transformers import DoctranQATransformer
from langchain_core.documents import Document
from langchain.document_loaders import PyPDFLoader
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
from langchain_core.prompts import PromptTemplate
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

True

In [3]:
# Embedding model object passed to the vector-store cells further down.
embeddings = OpenAIEmbeddings()
# Alternative embedding model, kept for reference (not active):
# embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

## 获取 PDF 文档，分块并嵌入向量数据库

In [20]:
# Load every PDF under ./insurance and collect the split pages of ALL files.
# `pages` is initialised up front so it exists (as an empty list) even when no
# PDF matches, and each file's documents are appended instead of replacing the
# previous file's result (the earlier loop kept only the last PDF).
pdf_files = glob.glob('./insurance/*.pdf')
pages = []
for pdf in pdf_files:
    # Rebuild the path from the directory and the matched file's base name.
    pdf_path = os.path.join('./insurance', os.path.basename(pdf))
    loader = PyPDFLoader(pdf_path)
    # load_and_split() returns a list of documents for this file; accumulate them.
    pages.extend(loader.load_and_split())

In [6]:
# Extract the raw text of every page of every PDF under ./insurance and
# concatenate it, in file order then page order, into one string `text`.
file_path = "./insurance/*.pdf"
pdf_files = glob.glob(file_path)
page_texts = []
for pdf in pdf_files:
    print(f'Processing File: {pdf}')
    pdf_reader = PdfReader(pdf)
    # Gather this file's page texts; they are joined once after the loop.
    page_texts.extend(page.extract_text() for page in pdf_reader.pages)
text = "".join(page_texts)

Processing File: ./insurance/BeEasy_pb_en_202306.pdf
Processing File: ./insurance/vPrime_pb_en_202306.pdf
Processing File: ./insurance/意輕鬆意外保障計劃_pb_tc_202306.pdf
Processing File: ./insurance/尊衛您醫療計劃_pb_tc_202306.pdf
Processing File: ./insurance/揀易保癌症保障計劃_econ-plan-brochure-tc_combined.pdf
Processing File: ./insurance/CANsurance_econ-plan-brochure-en_Combined.pdf
Processing File: ./insurance/全自主百萬醫療計劃_tc_new2.pdf
Processing File: ./insurance/MyMillionMedical_en_new.pdf


In [27]:
# Splitter settings: chunks of at most 500 units with a 100-unit overlap,
# where length is measured with len(); separators are not treated as regexes.
splitter_options = {
    "chunk_size": 500,
    "chunk_overlap": 100,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
# Split the loaded PDF documents into chunks for embedding.
chunks = text_splitter.split_documents(pages)

In [30]:
# Embed the chunks and store them in a Chroma collection persisted under ./chroma_db.
# NOTE(review): the recorded run of this cell failed with
# "OperationalError: attempt to write a readonly database"; presumably the
# ./chroma_db directory was not writable or was held by another client — confirm.
# NOTE(review): the retrieval section loads a FAISS index from "faiss_index",
# but no visible cell builds or saves that index.
vectorstore = Chroma.from_documents(chunks, embeddings, persist_directory="./chroma_db")

OperationalError: attempt to write a readonly database

## 执行检索任务

In [6]:
# Load the FAISS index stored in the local "faiss_index" folder, passing the
# embeddings object defined earlier.
# NOTE(review): FAISS.load_local is understood to unpickle the stored docstore,
# so only load index folders you created yourself. Recent LangChain releases
# are reported to require allow_dangerous_deserialization=True here — confirm
# against the installed version.
db = FAISS.load_local("faiss_index", embeddings)
# Preview a few stored chunks instead of dumping the whole docstore into the
# cell output.
dict(list(db.docstore._dict.items())[:3])

{'be862881-8284-4afb-badb-be75d6a34174': Document(page_content='202306  \n \n \n \n \n \nThank  you for  your  interest  in the  insurance  product  \n多謝你對有關保險 產品的支持  \n \nFor more  information, please  feel free to contact  us \n如欲瞭解更多詳情，歡迎隨時與我們聯絡  \n \n \n \n \nCustomer  Service  Hotline  \n客戶服務熱線  \n8209  0098  \n(Monday to Friday  9:00 am - 6:00 pm, \nexcept  Public  Holidays  \n星期一至 五上午九時至晚上 六時，  \n公眾假期除外 )  \n \ncs.clubcare .hk@pccw.com  \n \n \n \n \nRemarks  \nHKT Financial  Services  (IA) Limited  (“HKTIA”)  is a wholly  owned  subsidiary  of HKT Limited  (HKT  Limited  is', metadata={}),
 'b513804f-ed59-4b51-a789-d00458a67746': Document(page_content='a company incorporated in the Cayman Islands with limited liability), arranging for a wide range of life  \ninsurance and general insurance products under the brand of Club Care. HKTIA is a licensed insurance  \nagency in Hong Kong and regulated by the Insurance Authority of Hong Kong (Licensed insurance Agency  \nLicense No. FA2

In [7]:
def get_conversation_chain(vectorstore, model='gpt-3.5-turbo', condense_model=None):
    """Build a conversational retrieval chain on top of ``vectorstore``.

    Args:
        vectorstore: Object exposing ``as_retriever()``; its retriever supplies
            the context documents for each question.
        model: Chat model name used to generate answers. Defaults to
            ``'gpt-3.5-turbo'``, the value that was previously hard-coded.
        condense_model: Optional chat model name used only for condensing the
            question. ``None`` (the default) reuses the answer LLM instance,
            which is what the function did before this parameter existed.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``,
        configured with buffer memory under the key ``'chat_history'`` and
        with source documents included in its results.
    """
    # LLM that generates the answer; pass a stronger model name if needed.
    llm = ChatOpenAI(temperature=0, model=model)
    # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})

    # LLM that condenses the question; a cheaper model can be used here.
    if condense_model is None:
        condense_llm = llm
    else:
        condense_llm = ChatOpenAI(temperature=0, model=condense_model)

    # output_key tells the memory which chain output to record, since the
    # chain also returns the source documents.
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key='answer')
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
        return_source_documents=True,
        condense_question_llm=condense_llm,
    )

In [8]:

# Build the conversational chain on top of the loaded FAISS index `db`.
conversation = get_conversation_chain(db)

In [9]:
# First question put to the chain.
user_question = "MyMillion Medical Plan, How many newborns are eligible to enjoy the designated medical plan?"
# Use invoke(): calling the chain object directly is deprecated in recent
# LangChain releases. The result is the same dict of inputs and outputs.
result = conversation.invoke({"question": user_question})

In [10]:
# Display the generated answer for the first question.
result['answer']

'There is no limitation on the number of newborns who may enjoy the designated medical plan coverage under the MyMillion Medical Plan. Each newborn is eligible for this benefit once.'

In [11]:
# Display the retrieved documents returned with the answer
# (the chain is built with return_source_documents=True).
result['source_documents']

[Document(page_content='Once the policy has been in force for 2 consecutive Policy Years, each of your \nnewborns can enjoy the coverage of a designated medical plan for 1 year at\nno extra cost. Each newborn is eligible to this bene/f_it once but there is\nno limitation on the number of newborns who may enjoy this bene/f_it.\nSeries of health assistance services provided by third parties\nFWD cares about your treatment and recovery journey by providing you with \nservices in addition to /f_inancial support:', metadata={}),
 Document(page_content='The Plan is a standalone medical insurance product. You can purchase this product without \nbundling with other insurance products.Add-On Feature\nProtection for your precious newborns7, 8  \nThe Plan’s coverage is so comprehensive, it even extends to the newest member of \nyour family. Your baby will be born into the protection of a designated medical \nplan, effective for two years at no extra cost, if your Policy has been in force for 2', 

In [12]:
# Follow-up question on the same chain, which carries the buffer memory
# attached when the chain was built.
follow_up_question = "How much is the compassionate death benefit?"
# Use invoke(): calling the chain object directly is deprecated in recent
# LangChain releases. The result is the same dict of inputs and outputs.
result_2 = conversation.invoke({"question": follow_up_question})

In [13]:
# Display the generated answer for the follow-up question.
result_2['answer']

'The amount of the compassionate death benefit is 1% of the Initial Sum Insured. For example, if the Initial Sum Insured is HK$1,000,000, then the compassionate death benefit would be HK$10,000 (1% of HK$1,000,000).'

In [14]:
# Display the retrieved documents returned with the follow-up answer.
result_2['source_documents']

[Document(page_content='VIII. Compassionate Death Benefit\nXI. PREMIER THE ONEcierge 10Service Program\nXIII. Second Medical Opinion \n Service 11 Service ProgramX. Special Benefit for InfantHKD 15,000\nThis product material is for reference only and is indicative of the key features of the product. For the exact \nterms, conditions, benefits and exclusions of the product, please refer to the policy provisions of the product.', metadata={}),
 Document(page_content='Table 1: Bene/f_it at-a-glance\nCompassionate Death Bene/f_it 1% of the Initial Sum Insured\n24-hour Worldwide Assistance Service 5 Service programBene/f_its\n10Table of annual premiums\nTable 2: Annual premiums of diﬀerent Occupational Class 7 at\n diﬀerent Initial Sum Insured (Hong Kong resident) \nInitial Sum Insured (HK$)\nOccupational Class 7\n1\n2\n3\n4\nInitial Sum Insured (HK$)\nOccupational Class 7\n1\n2\n3\n4200,000 1,000,000 1,500,000\n240 1,200 1,800\n300 1,500 2,250\n360 1,800 2,700\n600 3,000 4,500', metadata={