In [46]:
# https://github.com/sudarshan-koirala/youtube-stuffs/blob/main/langchain/langchain_Semi_Structured_RAG.ipynb

In [302]:
import os
import re

from unstructured.partition.pdf import partition_pdf

from langchain.chat_models import AzureChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.chains.question_answering import load_qa_chain
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore, create_kv_docstore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document

# Set logging for the queries
import logging
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.memory import VectorStoreRetrieverMemory
from langchain.globals import set_debug
set_debug(True)
# set_debug(False)

from operator import itemgetter

from collections import defaultdict

from typing import Any, List
from pydantic import BaseModel, Field

# 前置參數

- 公司proxy設定

- 欲解析的pdf名稱

In [3]:
pdf_name = "202302_2409_AIA_20231105_140505.pdf"
# pdf_name = "react.pdf"
assert os.path.exists(pdf_name)

- unstructured要做partition，需要先在本機安裝poppler、Tesseract-OCR，並設置環境變數

參考網址: https://github.com/insightbuilder/python_de_learners_data/blob/main/code_script_notebooks/projects/langChain_exploration/Unstructured_Quick_Tour_withPyPdf.ipynb

In [4]:
# tesseract安裝路徑
tesseract_install_dir = "C:\\Users\\BenBLLee\\AppData\\Local\\Programs\\Tesseract-OCR"
assert os.path.exists(tesseract_install_dir)

# poppler的 work dir路徑
poppler_bin_dir = "poppler-23.10.0\\Library\\bin"
assert os.path.exists(poppler_bin_dir)

# 將poppler、tesseract位置加入環境變數
os.environ["PATH"] += os.pathsep + os.path.join(os.getcwd(), poppler_bin_dir) + os.pathsep + tesseract_install_dir

- 使用本機的tiktoken模型

參考網址: https://stackoverflow.com/questions/76106366/how-to-use-tiktoken-in-offline-mode-computer

In [5]:
# tiktoken本地模型的位置
tiktoken_cache_dir = os.path.join(os.getcwd(),"9b5ad71b2ce5302211f9c61530b329a4922fc6a4")
assert os.path.exists(tiktoken_cache_dir)

# tiktoken本地模型的位置加入環境變數
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir

- openai相關設定

openai若設置正確，但仍無法取得api回應，需先取得crt認證。參考網址: https://community.openai.com/t/ssl-certificate-verify-failed/32442/39?page=2

- 建立langchain的chatopenai model

In [7]:
# chat model
model = AzureChatOpenAI(
    openai_api_base=gpt_base_url,
    openai_api_version=gpt_version,
    deployment_name=gpt_deployment,
    openai_api_key=chatgpt_key,
    openai_api_type=openai_type,
    streaming=True, 
    callbacks=[StreamingStdOutCallbackHandler()],
    temperature=0,
)

# Retriever

https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector#summary

Use Multi Vector Retriever with summaries:

- InMemoryStore stores the raw text, tables
- vectorstore stores the embedded summaries

- 用hugging face上的 embedding model建立chroma vector db

In [188]:
# embedding model
embedd_model = "BAAI/bge-large-en-v1.5" # 英文
# embedd_model = "BAAI/bge-small-zh" # 中文

embeddings = HuggingFaceEmbeddings(model_name=embedd_model)

In [189]:
vector_store_dir = "./vector store"
collection_name=f'{pdf_name.replace(".pdf","")}.vectorstore'

# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
    persist_directory=vector_store_dir,
    collection_metadata={"hnsw:space": "cosine"}
)


- 將text的檔案建在本地資料夾中
MultiVectorRetriever要吃BaseStore類

In [190]:
# The storage layer for the parent documents
file_store_dir = f"./text store/{pdf_name.replace('.pdf','')}"
store = LocalFileStore(root_path=file_store_dir) # 讓mset的資料可存本地BaseStore interface that works on the local file system.
doc_store = create_kv_docstore(store=store) # mset的key-value的value變成可放入Document的資料

- 建立MultiVectorRetriever
向量搜尋相似vector，依vector的id_key返回該id_key的doc_store文檔

In [191]:
id_key = "page"

# The retriever (longer docs)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=doc_store,
    id_key=id_key,
    search_kwargs={'k':3, "score_threshold": 1}
)

# The retriever (shorter docs)
short_doc_retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=doc_store,
    id_key=id_key,
    search_kwargs={'k':1, "score_threshold": 1}
)




- 建立Multi Query Retriever，把user輸入的question,映射出其它類似的questions

In [192]:
from typing import List
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field


QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""Generate five different versions(in English) of the given user question. 
    These questions better use different wording to overcome limitations of the distance-based similarity search. 
    Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

m_q_retriever = MultiQueryRetriever.from_llm( retriever=retriever, llm=model, prompt=QUERY_PROMPT)

# 原始版本
# m_q_retriever = MultiQueryRetriever.from_llm(
#     retriever=retriever, llm=model
# )

- 建立ContextualCompressionRetriever，取得文檔時，先用user question對文檔壓縮，僅保留與question相關的資訊

In [193]:
compressor = LLMChainExtractor.from_llm(llm=model)
comp_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=m_q_retriever)
shorty_comp_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=short_doc_retriever)

# Unstructured PDF

用預訓練模型對pdf頁面的元素切塊，解析出text、table

參考資料:
1. https://www.youtube.com/watch?v=AYBMbIMG19M&t=34s
2. https://github.com/sudarshan-koirala/youtube-stuffs/blob/main/langchain/langchain_Semi_Structured_RAG.ipynb

- 解析pdf

In [15]:
raw_pdf_elements = partition_pdf(filename=pdf_name,
                                 # Unstructured first finds embedded image blocks
                                 extract_images_in_pdf=False,
                                 # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
                                 # Titles are any sub-section of the document
                                 infer_table_structure=True,
                                 # Post processing to aggregate text once we have the title
                                 chunking_strategy="by_title",
                                 # Chunking params to aggregate text blocks
                                 # Attempt to create a new chunk 3800 chars
                                 # Attempt to keep chunks > 2000 chars
                                 max_characters=4000,
                                 new_after_n_chars=3800,
                                 combine_text_under_n_chars=2000,
                                 #image_output_dir_path=path
                                 )

- 用pydantic整理Unstructure解析的pdf元素，先分開text、table元素

In [16]:
# 定義Element class
class Element(BaseModel):
    type: str = Field(..., description="元素的類型。表格型元素為'table'，文字型元素為'text'")
    text: str = Field(..., description="元素的內文。文字型元素為單純的text，表格元素則為html_text")
    page: int = Field(..., description="元素的所在頁數。")
    summary: str = Field("", description="元素內文的摘要。")
    hypo_questions: List[str] = Field([], description="3個假想可用元素的內文來回答的假設性問題。")

# Categorize element's type, page. Put text content into Element
categorized_elements = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=element.metadata.text_as_html, page=element.metadata.page_number))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=element.text, page=element.metadata.page_number))

# Table elements
table_elements = [e for e in categorized_elements if e.type == "table"]
print(len(table_elements))

# Text elements
text_elements = [e for e in categorized_elements if e.type == "text"]
print(len(text_elements))

126
136


# embedding
Smaller chunks: 目前pdf已經被unstructure拆成更小的元素

可以以下兩取向則一執行:

1. 取向一: 用small chunk的摘要做後續embedding
2. 取向二: 用small chunk的假設性問題做後續embedding

參考資料:

https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector#summary

https://python.langchain.com/docs/modules/chains/how_to/openai_functions


- 取向一: 用small chunk的摘要做後續embedding

In [21]:
# 將table、text的文字取出為串列
tables = [e.text for e in table_elements]
texts = [e.text for e in text_elements]

In [22]:
summary_prompt_text= \
"""You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {content} """

summary_prompt = ChatPromptTemplate.from_template(summary_prompt_text)

summarize_chain = (
    {"content":RunnablePassthrough()}    
    | summary_prompt 
    | model 
    | StrOutputParser()
    )

# tables元素做摘要
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

# texts元素做摘要
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "<table><thead><th rowspan=\"2\">Assets Current assets:</th><th></th><th>30, Amount</th><th rowspan=\"2\">%</th><th rowspan=\"2\">31, Amount</th><th rowspan=\"2\">%</th><th rowspan=\"2\">30, Amount</th><th rowspan=\"2\">%</th></thead><thead><th></th><th></th><th></th><th></th><th></th><th></th><th></th><th></th></thead><tr><td>Cash and cash equivalents (Note 6(1))</td><td>$</td><td>90,530,945</td><td>23</td><td>80,613,120</td><td>21</td><td>84,942,092</td><td>21</td></tr><tr><td>Financial assets at fair value through profit or loss — current (Note 6(2))</td><td></td><td>211,920</td><td>-</td><td>365,037</td><td>-</td><td>521,802</td><td>-</td></tr><tr><td colspan=\"8\">Financial assets at amortized cost — current (Note 6(4)) - - - - 10,000,000 3</td></tr><tr><td>Notes and accounts receivable, net (Note 6(5))</td><td></td><td>21,979,562</td><td>6</td><td>18,620,248</td><td>5</td



 Preparation of Financial Reports by Securities Issuers and the IAS 34, Interim Financial Reporting, as endorsed by the FSC. However, they do not include all disclosures The financial statements are intended to present the company's financial status as per the standards and practices accepted in the Republic of China.[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 5:llm:AzureChatOpenAI] [21.64s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "The review of the consolidated financial statements of AUO Corporation and its subsidiaries for the periods ending June 30, 2023 and 2022, found no issues that would suggest the statements do not accurately represent the company's financial position, performance, and cash flows. The review was conducted in accordance with the Regulations Governing the Preparation of Financial Reports by Securities Issuers and the International Accounting Standard 34, endorsed by the Financial Supervisory Commission o



, the dividend should be at least 20% of these earnings. If not, AUO may decide not to distribute a dividend. The cash portion of the dividend should beAUO's Employees202 are1 granted earnings restricted distribution stocks was without approved payment in and two meetings in 2022 can fully vest 400,000 units after two years of service. They can also vest 40% and 60% at least 10% of the year. On February 23, 2023, AUO's Board of Directors decided not to distribute dividends for 2022.[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 5:llm:AzureChatOpenAI] [25.51s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "AUO's company policy, in accordance with the ROC Company Act, allows for capital surplus to be used to offset deficits or be distributed as stock dividends or cash, based on shareholdings. This must be approved by AUO's Board of Directors and reported to shareholders. The total capital surplus capitalized per year cannot exceed 10% of 



The AUO Corporation and its subsidiaries recognized revenue for the first half of 2023 and 2022, which was previously included in the contract127 thousand. As of June 30, 2023, ADTHLD, a subsidiary of AUO, had a share-based payment rewards plan for employees of AUO and its subsidiaries.[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 5:llm:AzureChatOpenAI] [18.96s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "AUO used the Black-Scholes option pricing model to measure the fair value of share-based payments. The compensation costs recognized for these payments were $0 thousand for both three and six months ended June 30, 2022, and $58,931 thousand and $72,744 thousand for the same periods in 2023. The capital surplus from the difference between the subscription price and repurchase cost of treasury shares for the six months ended June 30, 2023 was $18,127 thousand. As of June 30, 2023, ADTHLD, a subsidiary of AUO, had a share-based paymen

- 取向二: 用small chunk的假設性問題做後續embedding

In [17]:
# 用pydantic取得假設性問題的摘要
functions = [
    {
        "name": "hypothetical_questions",
        "description": "Generate hypothetical questions",
        "parameters": {
            "type": "object",
            "properties": {
                "questions": {
                    "type": "array",
                    "items": {"type": "string"},
                },
            },
            "required": ["questions"],
        },
    }
]

# 獲取假設性問題的prompt
hpo_q_format_prompt =ChatPromptTemplate.from_template(
    "Generate a list of 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
)
    

get_hypo_q_chain = (
    RunnablePassthrough() | hpo_q_format_prompt
    | model.bind(functions=functions, function_call={"name": "hypothetical_questions"})
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

# tables元素做摘要
table_questions = get_hypo_q_chain.batch([ {"doc":e.text} for e in table_elements], {"max_concurrency": 5})
# texts元素做摘要
text_questions = get_hypo_q_chain.batch([ {"doc":e.text} for e in text_elements], {"max_concurrency": 5})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "doc": "<table><thead><th rowspan=\"2\">Assets Current assets:</th><th></th><th>30, Amount</th><th rowspan=\"2\">%</th><th rowspan=\"2\">31, Amount</th><th rowspan=\"2\">%</th><th rowspan=\"2\">30, Amount</th><th rowspan=\"2\">%</th></thead><thead><th></th><th></th><th></th><th></th><th></th><th></th><th></th><th></th></thead><tr><td>Cash and cash equivalents (Note 6(1))</td><td>$</td><td>90,530,945</td><td>23</td><td>80,613,120</td><td>21</td><td>84,942,092</td><td>21</td></tr><tr><td>Financial assets at fair value through profit or loss — current (Note 6(2))</td><td></td><td>211,920</td><td>-</td><td>365,037</td><td>-</td><td>521,802</td><td>-</td></tr><tr><td colspan=\"8\">Financial assets at amortized cost — current (Note 6(4)) - - - - 10,000,000 3</td></tr><tr><td>Notes and accounts receivable, net (Note 6(5))</td><td></td><td>21,979,562</td><td>6</td><td>18,620,248</td><td>5</td><



[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 4:llm:AzureChatOpenAI] [7.45s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "",
        "generation_info": {
          "finish_reason": "stop"
        },
        "type": "ChatGenerationChunk",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessageChunk"
          ],
          "kwargs": {
            "example": false,
            "content": "",
            "additional_kwargs": {
              "function_call": {
                "name": "hypothetical_questions",
                "arguments": "{\n  \"questions\": [\n    \"What was the value of prepayments for purchases on 30, 2023?\",\n    \"How much did the noncurrent financial assets at amortized cost increase from 31, 2022 to 30, 2023?\",\n    \"What was the difference between the refundable and overpaid tax on 31



[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 4:llm:AzureChatOpenAI] [7.25s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "",
        "generation_info": {
          "finish_reason": "stop"
        },
        "type": "ChatGenerationChunk",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessageChunk"
          ],
          "kwargs": {
            "example": false,
            "content": "",
            "additional_kwargs": {
              "function_call": {
                "name": "hypothetical_questions",
                "arguments": "{\n  \"questions\": [\n    \"What was the value of common stock on 30, 2023?\",\n    \"How much did the value of convertible bonds change between 31, 2022 and 30, 2023?\",\n    \"What was the total value of 'others' on 31, 2022?\"\n  ]\n}"
              }
            }
        



[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 4:llm:AzureChatOpenAI] [8.47s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "",
        "generation_info": {
          "finish_reason": "stop"
        },
        "type": "ChatGenerationChunk",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessageChunk"
          ],
          "kwargs": {
            "example": false,
            "content": "",
            "additional_kwargs": {
              "function_call": {
                "name": "hypothetical_questions",
                "arguments": "{\n  \"questions\": [\n    \"What was the total amount for the three months ended June 30, 2023?\",\n    \"How does the total for the three months ended June 30, 2023 compare to the same period in 2022?\",\n    \"What was the total for the six months ended June 30, 2023?\"\n  ]

- summary、hypo_questions合併回Elements

In [23]:
for e, s, q in list(zip(table_elements, table_summaries, table_questions)):
    e.summary = s
    e.hypo_questions = q

for e, s, q in list(zip(text_elements, text_summaries, text_questions)):
    e.summary = s
    e.hypo_questions = q

- 新增summary和hypoquestions的Document到vector store

In [214]:
table_elements[30].hypo_questions

['What is the total amount recognized in financial liabilities at FVTPL—current?',
 'What is the total value represented in the table?',
 'What is the difference between the total value and the consideration recognized in financial liabilities at FVTPL—current?']

In [215]:
from tqdm import tqdm

def save_elements_doc_to_vectorstore( retriever, elements):    
    
    for e in tqdm(elements):

        page = str( e.page)

        # 待新增的Document list
        docs = [] 

        # summary的Document
        e_summary_doc = Document(page_content=e.summary, metadata={id_key: page})
        docs.append(e_summary_doc)

        for q in e.hypo_questions:

            # 假設性問題的Document
            e_question_doc = Document(page_content=q, metadata={id_key: page})
            docs.append(e_question_doc)
        
        # 存入vectorstore
        retriever.vectorstore.add_documents(docs)

    # vectorstore save to local disk
    retriever.vectorstore.persist()

# 將tables、texts的元素存入vectorstore
save_elements_doc_to_vectorstore( retriever, text_elements+table_elements)
# save_elements_doc_to_vectorstore( short_doc_retriever, text_elements+table_elements)

100%|██████████| 262/262 [07:28<00:00,  1.71s/it]


- 新增每頁的原文(text+table)到filesotre

In [216]:
# Add texts and tables to docstore
def save_page_content_to_filestore(elements):
    page_doc_dic = defaultdict(lambda: Document(page_content="", metadata={id_key: ""}))
    for e in elements:
        page = str(e.page)
        page_doc_dic[page].page_content += "\n" + e.text
        page_doc_dic[page].metadata[id_key] = page

    retriever.docstore.mset( list(page_doc_dic.items()))

    return page_doc_dic

# 將tables、texts的元素存入filesotre
page_doc_dic = save_page_content_to_filestore(text_elements+table_elements)

# Memory
- VectorStoreRetrieverMemory

In [227]:
memory_key="history"
memory_input_key="question"

# 建立vector store
memory_vectorstore = Chroma(
    collection_name="conversation_memory",
    embedding_function=embeddings,
    collection_metadata={"hnsw:space": "cosine"}
)

# 實體化VectorStoreRetrieverMemory
memory = VectorStoreRetrieverMemory(retriever=memory_vectorstore.as_retriever(), memory_key=memory_key, input_key=memory_input_key)

# 提取相關對話歷史
def get_relavant_memory_history(question):
    return memory.load_memory_variables({memory_input_key:question})[memory_key]

# 儲存本次對話歷史
def save_to_memory_history(kwargs:{"question":str, "model_result":dict}):
    question = kwargs.get("question")
    model_result = kwargs.get("model_result")
    memory.save_context({"input":question}, {"output":model_result["output_text"]})
    return model_result


# RAG from LangChain Expression Language.


定義鏈: 將使用者問題延伸做相似搜尋，依回傳頁數的Document、使用者問題做refine摘要

- prompt

In [287]:
# 使用者提問結合對話歷史脈絡的prompt
condense_question_prompt = ChatPromptTemplate.from_messages([
        ("system", "Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question so that it can be understood without the context of the preceding conversation"),
        ("human", "Chat History:'{history}'\nFollow Up question:'{question}'\nStandalone question:"),
])

# refine最開始前，第一份文檔要先做此prompt
initial_abstract_prompt = ChatPromptTemplate.from_messages([
            ("system", "You are a chatbot. Use contents of the 'context' *AS IS* that is relevant to answer(in Tradition Chinese) the user's 'question'. If none of the 'context' is relevant return NO_OUTPUT."),
            ("human", "'context':\n'{context_str}'\n'question':'{condensed_question}'\n'yout answer(in Tradition Chinese)':"),
])

# refine的第二份文檔以後，用此prompt
refine_abstract_prompt = ChatPromptTemplate.from_messages([
            ("system", "You are a chatbot. Use contents of the 'context' *AS IS* that is relevant to answer the user's 'question' to finished 'existed answer'. If none of the 'context' is relevant return 'existed answer'."),
            ("human", "'existed answer':{existing_answer}\n'context':\n\n{context_str}\n'question':{condensed_question}\n'yout answer(in Tradition Chinese)':"),
])

- 定義摘要對話歷史及問題的chain

In [288]:
condense_question_chain = (
    {"question":RunnablePassthrough(),
    "condensed_question":{"question":RunnablePassthrough(), "history":RunnablePassthrough()| get_relavant_memory_history } 
                        | condense_question_prompt 
                        | model 
                        | StrOutputParser()
    }
)

- 定義獲取document的chain

In [289]:
unique_docs = lambda doc: list(set(doc))

# multiple query retriever -> docs
mq_retrieve_doc_chain = (
    RunnablePassthrough()
    |{"question":itemgetter("question"),
     "condensed_question":itemgetter("condensed_question"),
    "input_documents": itemgetter("question")| m_q_retriever
    }
) 

# multiple query retriever -> docs -> compressed docs
cp_retrieve_doc_chain = (
    RunnablePassthrough()
    |{"question":itemgetter("question"),
     "condensed_question":itemgetter("condensed_question"),
    "input_documents": itemgetter("condensed_question")| comp_retriever 
    }
) 

# multiple query retriever -> docs -> compressed docs
short_cp_retrieve_doc_chain = (
    RunnablePassthrough()
    |{"question":itemgetter("question"),
     "condensed_question":itemgetter("condensed_question"),
    "input_documents": itemgetter("condensed_question")
    | shorty_comp_retriever 
    }
) 

- 定義qa_chain

In [290]:
# langchain已有封裝好的load_qa_chain
qa_model = load_qa_chain(
        model, 
        chain_type="refine", 
        return_refine_steps=True, 
        question_prompt=initial_abstract_prompt, 
        refine_prompt=refine_abstract_prompt
    )

qa_chain = (
    RunnablePassthrough()
    |{"question":itemgetter("question"),
      "result":{"condensed_question":itemgetter("condensed_question"),"input_documents":itemgetter("input_documents")}|qa_model
    }
)

- 儲存對話歷史的chain

In [291]:
memory_save_chain = (
    RunnablePassthrough()
    |{"question":itemgetter("question"),"model_result":itemgetter("result")} 
    | save_to_memory_history
)

In [292]:

mq_chain = (
    condense_question_chain # 將對話歷史結合使用者提問
    | mq_retrieve_doc_chain # 使用者提問提取相關文件
    | qa_chain # 依據提問及提供的文件做摘要回答
    # | memory_save_chain # 將回答存入對話歷史
)

cp_chain = (
    condense_question_chain # 將對話歷史結合使用者提問
    | cp_retrieve_doc_chain # 使用者提問提取相關文件
    | qa_chain # 依據提問及提供的文件做摘要回答
    | memory_save_chain # 將回答存入對話歷史
)


short_cp_chain = (
    RunnablePassthrough()|
    condense_question_chain # 將對話歷史結合使用者提問
    | short_cp_retrieve_doc_chain # 使用者提問提取相關文件
    | qa_chain # 依據提問及提供的文件做摘要回答
    | memory_save_chain # 將回答存入對話歷史
)


In [293]:
# https://python.langchain.com/docs/modules/agents/how_to/custom_agent

In [None]:
memory_save_chain = (
    RunnablePassthrough()
    |{"question":itemgetter("question"),"model_result":itemgetter("result")} 
    | save_to_memory_history
)

# 提取相關對話歷史
def get_relavant_memory_history(question):
    return memory.load_memory_variables({memory_input_key:question})[memory_key]

# 儲存本次對話歷史
def save_to_memory_history(kwargs:{"question":str, "model_result":dict}):
    question = kwargs.get("question")
    model_result = kwargs.get("model_result")
    memory.save_context({"input":question}, {"output":model_result["output_text"]})
    return model_result

In [340]:
from langchain.agents import tool

@tool
def shallow_docs_answer(word: str) -> int:
    """由資料庫取得較少的關聯參考文本，加速回覆使用者。用在使用者的需求可能不太需要深度解析時。使用此工具時請將User的問題原封不動丟進word參數。"""
    result = cp_chain.invoke( word, config={"callbacks": [ConsoleCallbackHandler()]})
    print(result)
    return result

@tool
def deep_docs_answer(word: str) -> int:
    """由資料庫取得較多的參考文本，為使用者詳細解答。用在使用者的需求可能需要深度解析、或明確表達先前回應不甚清楚時。使用此工具時請將User的問題原封不動丟進word參數。"""
    result = mq_chain.invoke( word, config={"callbacks": [ConsoleCallbackHandler()]})
    print(result)
    return result

@tool
def look_history(word: str) -> int:
    """幫你回憶一筆你和使用者的對話歷史。用在你判斷可能僅需查看對話歷史，不需查看額外文檔，就可回答使用者問題。使用此工具時，你可以將你自己想到的問題，或User的問題原封不動丟進word參數，就會返回最相關的歷史問答。"""
    history = get_relavant_memory_history(word)
    print(history)
    return history

# 儲存本次對話歷史
@tool
def save_history(dic: dict) -> int:
    """若你僅有調用staight_answer，沒有到資料庫查找文檔，你可以用這個函式來儲存對話歷史。傳入的參數是一個字典，包含兩個key，'question'、'model_result'。請把'question'的value放使用者的問題，'model_result'放你要給使用者的回應"""
    save_to_memory_history(dic)
    return dic

tools = [shallow_docs_answer, deep_docs_answer, look_history, save_history]

In [341]:
from langchain_community.tools.convert_to_openai import format_tool_to_openai_function

llm_with_tools = model.bind(functions=[format_tool_to_openai_function(t) for t in tools])

In [342]:
# from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import ReduceDocumentsChain

reduce_prompt = ChatPromptTemplate.from_messages([
            ("system", "Take the following set of summaries and distill them into a final, consolidated summary of the main themes. "),
            ("human", "summaries:{docs}\n\nHelpful Answer:"),
])

# Run chain
reduce_chain = LLMChain(llm=model, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

In [343]:
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents.format_scratchpad.openai_functions import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser


agent_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are very powerful assistant, but don't know current events",
        ),
        (
            "user", 
            "{input}",
        ),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)


agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_to_openai_functions(x["intermediate_steps"]) 
        # | reduce_documents_chain
    }
    | agent_prompt
    | llm_with_tools
    # | RunnablePassthrough()
    | OpenAIFunctionsAgentOutputParser()
)

In [344]:
from langchain.agents import AgentExecutor

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

In [345]:
result = agent_executor.invoke({"input": "不是這個，你還有給我另一個清單，裡面還有持股比例等訊息"}, config={"callbacks": [ConsoleCallbackHandler()]})

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "不是這個，你還有給我另一個清單，裡面還有持股比例等訊息"
}


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "不是這個，你還有給我另一個清單，裡面還有持股比例等訊息",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableParallel] Entering Chain run with input:
[0m{
  "input": "不是這個，你還有給我另一個清單，裡面還有持股比例等訊息",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableParallel > 4:chain:<lambda>] Entering Chain run with input:
[0m{
  "input": "不是這個，你還有給我另一個清單，裡面還有持股比例等訊息",
  "intermediate_steps": []
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableParallel > 4:chain:<lambda>] [1ms] Exiting Chain run with

In [331]:
agent_executor.tools

[StructuredTool(name='shallow_docs_answer', description='shallow_docs_answer(word: str) -> int - 由資料庫取得較少的關聯參考文本，加速回覆使用者。用在使用者的需求可能不太需要深度解析時。使用此工具時請將User的問題原封不動丟進word參數。', args_schema=<class 'pydantic.main.shallow_docs_answerSchemaSchema'>, func=<function shallow_docs_answer at 0x0000020D0AB4A820>),
 StructuredTool(name='deep_docs_answer', description='deep_docs_answer(word: str) -> int - 由資料庫取得較多的參考文本，為使用者詳細解答。用在使用者的需求可能需要深度解析、或明確表達先前回應不甚清楚時。使用此工具時請將User的問題原封不動丟進word參數。', args_schema=<class 'pydantic.main.deep_docs_answerSchemaSchema'>, func=<function deep_docs_answer at 0x0000020D0A262F70>)]

In [299]:
result = agent_executor.invoke({"input": "幫我再詳細說明審查項目的部分"}, config={"callbacks": [ConsoleCallbackHandler()]})

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "幫我再詳細說明審查項目的部分"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "幫我再詳細說明審查項目的部分",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableParallel] Entering Chain run with input:
[0m{
  "input": "幫我再詳細說明審查項目的部分",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableParallel > 4:chain:<lambda>] Entering Chain run with input:
[0m{
  "input": "幫我再詳細說明審查項目的部分",
  "intermediate_steps": []
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableParallel > 4:chain:<lambda>] [0ms] Exiting Chain run with output:
[0m{
  "output": "幫我再詳細說明審查項目的部分"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecu

In [313]:
def qa_fn(query: str):
    result = agent_executor.invoke({"input": query})
    return result["output"]

In [314]:
import gradio as gr


input_query = gr.components.Textbox(label="提問")
output_summary = gr.components.Textbox(label="Summary")

interface = gr.Interface(
    fn=qa_fn,
    inputs=input_query,
    outputs=output_summary,
    title="PDF QA",
    description="與PDF資料進行問答",
)

IMPORTANT: You are using gradio version 3.10.1, however version 3.14.0 is available, please upgrade.
--------


In [365]:
gr.Error?

[1;31mInit signature:[0m [0mgr[0m[1;33m.[0m[0mError[0m[1;33m([0m[0mmessage[0m[1;33m:[0m [0mstr[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m      Common base class for all non-exit exceptions.
[1;31mFile:[0m           c:\users\benbllee\appdata\local\programs\python\python38\lib\site-packages\gradio\exceptions.py
[1;31mType:[0m           type
[1;31mSubclasses:[0m     


In [None]:
os.environ['no_proxy'] = "127.0.0.1, ::1"
interface.launch()

In [362]:
interface.close()

Closing server running on port: 7861


In [358]:
import numpy as np
embed_dic = vectorstore._collection.get(include=['embeddings'])
embeds = np.array(embed_dic['embeddings'])

In [359]:
import pandas as pd
pd.DataFrame(embed_dic)

Unnamed: 0,ids,embeddings,metadatas,documents
0,81afc15a-bf0f-11ee-a827-04d9f5b9883a,"[-0.04364822059869766, -0.009149854071438313, ...",,
1,81afc15b-bf0f-11ee-ab9f-04d9f5b9883a,"[-0.00037384851020760834, 0.054077427834272385...",,
2,81afc15c-bf0f-11ee-8c55-04d9f5b9883a,"[-0.018978575244545937, -0.018475523218512535,...",,
3,81afc15d-bf0f-11ee-a9b7-04d9f5b9883a,"[-0.03718911111354828, -0.012039348483085632, ...",,
4,89609031-bf0f-11ee-b833-04d9f5b9883a,"[-0.03548971936106682, -0.005625970661640167, ...",,
...,...,...,...,...
4187,5e269509-bf3d-11ee-86fe-04d9f5b9883a,"[-0.010211263783276081, 0.03371633589267731, -...",,
4188,5fbfe585-bf3d-11ee-b617-04d9f5b9883a,"[-0.008214849978685379, 0.004189640749245882, ...",,
4189,5fbfe586-bf3d-11ee-b16a-04d9f5b9883a,"[-0.03312841057777405, 0.03238153085112572, -0...",,
4190,5fbfe587-bf3d-11ee-ab57-04d9f5b9883a,"[0.026447821408510208, 0.013673314824700356, -...",,


In [366]:
import gradio as gr
import sys

class Logger:
    def __init__(self, filename):
        self.terminal = sys.stdout
        self.log = open(filename, "w")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
        
    def flush(self):
        self.terminal.flush()
        self.log.flush()
        
    def isatty(self):
        return False    

sys.stdout = Logger("output.log")

def test(x):
    print("This is a test")
    print(f"Your function is running with input {x}...")
    return x

def read_logs():
    sys.stdout.flush()
    with open("output.log", "r") as f:
        return f.read()

with gr.Blocks() as demo:
    with gr.Row():
        input = gr.Textbox()
        output = gr.Textbox()
    btn = gr.Button("Run")
    btn.click(test, input, output)
    
    logs = gr.Textbox()
    demo.load(read_logs, None, logs, every=1)
    
demo.queue().launch()

Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




In [368]:
demo.close()

ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='6lxncl7teng_1' coro=<Queue.process_events() done, defined at C:\Users\BenBLLee\AppData\Local\Programs\Python\Python38\lib\site-packages\gradio\queue.py:271> exception=ValueError('[<gradio.queue.Event object at 0x0000020D070C27F0>] is not in list')>
Traceback (most recent call last):
  File "C:\Users\BenBLLee\AppData\Local\Programs\Python\Python38\lib\site-packages\gradio\queue.py", line 347, in process_events
    self.active_jobs[self.active_jobs.index(events)] = None
ValueError: [<gradio.queue.Event object at 0x0000020D070C27F0>] is not in list


Closing server running on port: 7862
