In [1]:
import os
import glob
import json

from langchain_community.document_transformers import DoctranQATransformer
from langchain_core.documents import Document
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
from langchain_core.prompts import PromptTemplate

In [2]:
load_dotenv()

True

In [3]:
embeddings = OpenAIEmbeddings()
# embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

## 获取pdf文档，分块并嵌入向量数据库

In [4]:
def get_qa_format(sample_text):
    documents = [Document(page_content=sample_text)]
    qa_transformer = DoctranQATransformer()
    return qa_transformer.transform_documents(documents)

In [5]:
def get_pdf_text(file_path):
    text = ""

    if os.path.isdir(file_path):
        file_path = os.path.join(file_path, '*.pdf')

    pdf_files = glob.glob(file_path)
    for pdf in pdf_files:
        print('处理文件：' + pdf)
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

In [42]:
sample_text = get_pdf_text("insurance/MyMillionMedical_en_new.pdf")

处理文件：insurance/MyMillionMedical_en_new.pdf


In [39]:
get_qa_format(sample_text=sample_text)

ValueError: Did not find openai_api_model, please add an environment variable `OPENAI_API_MODEL` which contains it, or pass `openai_api_model` as a named parameter.

In [6]:
def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_text(text)
    return chunks

In [7]:
def get_vectorstore(text_chunks):
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore

In [8]:
raw_text = get_pdf_text('insurance')
text_chunks = get_text_chunks(raw_text)
vectorstore = get_vectorstore(text_chunks)
vectorstore.save_local("faiss_index")

处理文件：insurance/BeEasy_pb_en_202306.pdf
处理文件：insurance/vPrime_pb_en_202306.pdf
处理文件：insurance/意輕鬆意外保障計劃_pb_tc_202306.pdf
处理文件：insurance/尊衛您醫療計劃_pb_tc_202306.pdf
处理文件：insurance/揀易保癌症保障計劃_econ-plan-brochure-tc_combined.pdf
处理文件：insurance/CANsurance_econ-plan-brochure-en_Combined.pdf
处理文件：insurance/全自主百萬醫療計劃_tc_new2.pdf
处理文件：insurance/MyMillionMedical_en_new.pdf


## 执行检索任务

In [9]:
def get_conversation_chain(vectorstore):
    llm = ChatOpenAI(temperature=0, model='gpt-3.5-turbo') # 用于生成答案，可以换成较好的模型
    # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})

    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key='answer')
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
        return_source_documents=True,
        condense_question_llm=llm, # 用于凝练问题，可以换成其他便宜的模型
    )
    return conversation_chain

In [10]:
db = FAISS.load_local("faiss_index", embeddings)
conversation = get_conversation_chain(db)

In [11]:
user_question = "MyMillion Medical Plan, How many newborns are eligible to enjoy the designated medical plan?"
result = conversation({"question": user_question})

In [12]:
follow_up_question = "How much is the compassionate death benefit?"
result_2 = conversation({"question": follow_up_question})

In [13]:
result_2

{'question': 'How much is the compassionate death benefit?',
 'chat_history': [HumanMessage(content='MyMillion Medical Plan, How many newborns are eligible to enjoy the designated medical plan?', additional_kwargs={}, example=False),
  AIMessage(content='Each newborn is eligible to enjoy the designated medical plan coverage for 2 years at no extra cost once, but there is no limit to the number of eligible newborns who can benefit from this coverage.', additional_kwargs={}, example=False),
  HumanMessage(content='How much is the compassionate death benefit?', additional_kwargs={}, example=False),
  AIMessage(content='The amount of the compassionate death benefit is 1% of the Initial Sum Insured, which in this case is HK$1,000,000. Therefore, the compassionate death benefit amount would be HK$10,000.', additional_kwargs={}, example=False)],
 'answer': 'The amount of the compassionate death benefit is 1% of the Initial Sum Insured, which in this case is HK$1,000,000. Therefore, the compas

In [14]:
result_2['source_documents'][0]

Document(page_content='if the Insured is \nunder age 19 at\nnext birthday\non the date of\nthe Accident)Bene/f_itsStandard SupremeMaximum Bene/f_it Amount\n9StandardMaximum Bene/f_it Amount\nSupremeWhat these plans cover\nTable 1: Bene/f_it at-a-glance\nCompassionate Death Bene/f_it 1% of the Initial Sum Insured\n24-hour Worldwide Assistance Service 5 Service programBene/f_its\n10Table of annual premiums\nTable 2: Annual premiums of diﬀerent Occupational Class 7 at\n diﬀerent Initial Sum Insured (Hong Kong resident) \nInitial Sum Insured (HK$)\nOccupational Class 7\n1\n2\n3\n4\nInitial Sum Insured (HK$)\nOccupational Class 7\n1\n2\n3\n4200,000 1,000,000 1,500,000\n240 1,200 1,800\n300 1,500 2,250\n360 1,800 2,700\n600 3,000 4,500\n500,000 1,000,000 1,500,000\n1,700 3,400 5,100\n2,125 4,250 6,375\n2,550 5,100 7,650\n4,250 8,500 12,750\nNote: \n• The actual premium amount may vary from the above amounts subject to underwriting decision.', metadata={})