In [5]:
# https://github.com/sudarshan-koirala/youtube-stuffs/blob/main/langchain/langchain_Semi_Structured_RAG.ipynb

In [2]:
import os
import re

from unstructured.partition.pdf import partition_pdf
from langchain.chat_models import AzureChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough


from pydantic import BaseModel
from typing import Any, Optional

In [3]:
pdf_name = "./202302_2409_AIA_20231105_140505.pdf"

# 放tiktoken模型的位置
tiktoken_cache_dir = os.getcwd()
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
assert os.path.exists(os.path.join(tiktoken_cache_dir,"9b5ad71b2ce5302211f9c61530b329a4922fc6a4"))

# openai認證
openai_crt_dir = "openaiCert/TMGCert.crt"
os.environ['REQUESTS_CA_BUNDLE'] = os.path.join(os.getcwd(), openai_crt_dir)

# Azure GPT url, api-key
chatgpt_url = "https://auoazchatgpt08.openai.azure.com/openai/deployments/GPT4/chat/completions?api-version=2023-07-01-preview"
chatgpt_key = "e498c3852b1145ddb27ff0b9cbc4f4ca"

# tesseract安裝路徑
tesseract_install_dir = "C:\\Users\\BenBLLee\\AppData\\Local\\Programs\\Tesseract-OCR"

# poppler的 work dir路徑
poppler_bin_dir = "poppler-23.10.0\\Library\\bin"


# proxy相關
proxy = "auhqproxy.cdn.corpnet.auo.com:8080"
auo_account = "benbllee"
auo_password = "Aa0937454850"


# 將poppler、tesseract加入環境變數
os.environ["PATH"] += os.pathsep + os.path.join(os.getcwd(), poppler_bin_dir) + os.pathsep + tesseract_install_dir
# proxy加入環境變數
http_proxy_url = f"http://{auo_account}:{auo_password}@{proxy}"
https_proxy_url = f"https://{auo_account}:{auo_password}@{proxy}"
os.environ["HTTP_PROXY"] = http_proxy_url 
os.environ["HTTPS_PROXY"] = https_proxy_url

In [4]:
http_proxy_url

'http://benbllee:Aa0937454850@auhqproxy.cdn.corpnet.auo.com:8080'

In [31]:
# 解析BASE和DEPLOYMENT_NAME，作為參數置入AzureChatOpenAI()
pattern = r"(https://.+\.openai\.azure\.com/)openai/deployments/(.+)/chat/"
BASE_URL = re.search(pattern, chatgpt_url).group(1)            # ex. "https://auoazchatgpt03.openai.azure.com/"
DEPLOYMENT_NAME = re.search(pattern, chatgpt_url).group(2)     # ex. "GPT4"

# LLM
model = AzureChatOpenAI(
    openai_api_base=BASE_URL,
    openai_api_version="2023-07-01-preview",
    deployment_name=DEPLOYMENT_NAME,
    openai_api_key=chatgpt_key,
    openai_api_type="azure",
    # tiktoken_model_name="cl100k_base",
    streaming=True, 
    # callbacks=[rewrite_callback()],
    callbacks=[StreamingStdOutCallbackHandler()],
    temperature=0,
)

In [19]:
raw_pdf_elements = partition_pdf(filename=pdf_name,
                                 # Unstructured first finds embedded image blocks
                                 extract_images_in_pdf=False,
                                 # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
                                 # Titles are any sub-section of the document
                                 infer_table_structure=True,
                                 # Post processing to aggregate text once we have the title
                                 chunking_strategy="by_title",
                                 # Chunking params to aggregate text blocks
                                 # Attempt to create a new chunk 3800 chars
                                 # Attempt to keep chunks > 2000 chars
                                 max_characters=4000,
                                 new_after_n_chars=3800,
                                 combine_text_under_n_chars=2000,
                                 #image_output_dir_path=path
                                 )

Downloading:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/115M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

In [213]:
from unstructured.staging.base import elements_to_json
json_ = elements_to_json(raw_pdf_elements)
print(json_)

[
    {
        "type": "CompositeElement",
        "element_id": "fe3024f65639494efa9a46f99e0352dc",
        "metadata": {
            "coordinates": {
                "points": [
                    [
                        1329.3,
                        201.5
                    ],
                    [
                        1329.3,
                        232.2
                    ],
                    [
                        1541.7,
                        232.2
                    ],
                    [
                        1541.7,
                        201.5
                    ]
                ],
                "system": "PixelSpace",
                "layout_width": 1654,
                "layout_height": 2339
            },
            "filename": "202302_2409_AIA_20231105_140505.pdf",
            "file_directory": ".",
            "last_modified": "2023-11-05T14:05:18",
            "filetype": "application/pdf",
            "page_number": 1,
            "detect

In [21]:
# Create a dictionary to store counts of each type
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Unique_categories will have unique elements
unique_categories = set(category_counts.keys())
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 136,
 "<class 'unstructured.documents.elements.Table'>": 126}

In [275]:
class Element(BaseModel):
    type: str
    text: Any
    html: Any
    page: int
    coords: list

# Categorize by type
categorized_elements = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element), html='', page=element.metadata.page_number, coords=element.metadata.coordinates.points))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=str(element), html=element.metadata.text_as_html, page=element.metadata.page_number, coords=element.metadata.coordinates.points))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]
print(len(table_elements))

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]
print(len(text_elements))

126
136


In [256]:
# Apply to tables
tables = [i.text for i in table_elements]
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

The text provides a detailed financial breakdown for the years 2022 and 2023. It includes various financial aspects such as interest expense,The table presents a detailed breakdown of assets for three different dates: June 30, 2023, December 31, 2022, and June 30, 2022. The assetsThe table provides a detailed account of the equity attributable to shareholders of AUO Corporation from January 1, 2022, to June 30, 2023. It includes various components such as capital stock, retained earnings, capital surplus, and other comprehensive income. The total equity balance at the start interest income, compensation costs of share-based payments, gains on disposals of property, plant and equipment, and unrealized foreign currency exchangeThe table presents a detailed financial report for the periods ending June 30, 2023 and 2022. It includes revenue, cost of sales, gross profit,The table presents a detailed breakdown of liabilities, equity, and other financial components for three different dates: 



 indicates whether there is an allowance for bad debt, with all companies listed as 'Yes'. The interest rate item value varies across companies, with the maximum endorsement of 15,734,393, with 8,552,800 actually drawn down, and a balance of 15,734,393 for the period. This guarantee is not collateralized by properties. Technologies USA, highest being for AUOSZ at 15,287,985. The table also includes notes for further clarification on certain points. Inc., Abakus Solar AG, T-powertek Optronics Co., Ltd., and others. The ownership percentages range from 1.01% to 19.89%. Some of the securitiesThe table provides information about the financial assets held by an entity as of June 30, 2023. The assets are in the form of stocks from variousThe table provides financial data for a specific display segment. The net revenue from external customers is $135,296,927. The segment profit,The table provides a detailed breakdown of transactions between AUO and its subsidiaries and associates. The transac

In [257]:
# Apply to texts
texts = [i.text for i in text_elements]
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

The text provides a list of financial liabilities and assets forThe document is a translated version of the consolidated financial statements of AUO Corporation and its subsidiaries. It specifically presents the AUO Corporation and its subsidiaries. These include short-term borrowings, financialThe review of the consolidated financial statements of AUO Corporation and its subsidiaries for the periods ending June 30The text appears to be a list of financial assets and liabilities, possibly from a balance sheet or financial statement. It includes cash and cashThe document is an independent auditors' review report for AUO Corporation and its subsidiaries for the six months ended June 30, 2023 and 2022. liabilities at fair value, accounts payable, equipment and construction payable, current tax liabilities, lease liabilities, and other current liabilities. It, 2023 and 2022, found no issues that would suggest the statements do not accurately represent the company's financial position, perf

# Add to vectorstore

https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector#summary

Use Multi Vector Retriever with summaries:

- InMemoryStore stores the raw text, tables
- vectorstore stores the embedded summaries

In [258]:
from langchain.embeddings import HuggingFaceEmbeddings
# embedding model
embedd_model = "BAAI/bge-large-en-v1.5" # 英文
# embedd_model = "BAAI/bge-small-zh" # 中文

embeddings = HuggingFaceEmbeddings(model_name=embedd_model)

In [260]:
from langchain.vectorstores import Chroma

# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="AUO_2023_S3",
    embedding_function=embeddings
)

In [261]:
from langchain.storage import InMemoryStore

# The storage layer for the parent documents
store = InMemoryStore()


In [262]:
from langchain.retrievers.multi_vector import MultiVectorRetriever

id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
    search_kwargs={'k':4}
)

In [None]:

# ********測試合併同page**************
import uuid
from langchain.schema.document import Document

# Add texts
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [Document(page_content=s,metadata={id_key: doc_ids[i]}) for i, s in enumerate(text_summaries)]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, texts)))

# Add tables
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [Document(page_content=s,metadata={id_key: table_ids[i]}) for i, s in enumerate(table_summaries)]
retriever.vectorstore.add_documents(summary_tables)
retriever.docstore.mset(list(zip(table_ids, tables)))

In [71]:
# import uuid
# from langchain.schema.document import Document

# # Add texts
# doc_ids = [str(uuid.uuid4()) for _ in texts]
# summary_texts = [Document(page_content=s,metadata={id_key: doc_ids[i]}) for i, s in enumerate(text_summaries)]
# retriever.vectorstore.add_documents(summary_texts)
# retriever.docstore.mset(list(zip(doc_ids, texts)))


# # Add tables
# table_ids = [str(uuid.uuid4()) for _ in tables]
# summary_tables = [Document(page_content=s,metadata={id_key: table_ids[i]}) for i, s in enumerate(table_summaries)]
# retriever.vectorstore.add_documents(summary_tables)
# retriever.docstore.mset(list(zip(table_ids, tables)))

# RAG from LangChain Expression Language.

In [179]:
from langchain.chains.summarize import load_summarize_chain
from operator import itemgetter

question_template = \
"""
將以下使用者的問題翻譯成英文，並請你發想出幾個搜尋關鍵字寫在翻譯之後的括號。依據以下回答範例的格式回答。
問題:

{user_question}

回答範例(設問題為"這家公司的營運狀況如何"):

這家公司的營運狀況如何?
How is the operational status of this company? (Revenue, Net income/profit, Gross margin, Operating expenses, Earnings before interest, taxes, depreciation, and amortization (EBITDA), Operating profit/loss, Cash flow from operations, Return on investment (ROI), Sales growth, Market share, Inventory turnover, Customer retention rate, Employee productivity, Asset utilization, Debt-to-equity ratio)

回答:
"""
question_prompt = ChatPromptTemplate.from_template(question_template)


answer_template = \
"""
基於以下文本脈絡用繁體中文來回答以下問題。若你認為文本脈絡無法呼應問題，就回答文本脈絡文字的詳細摘要。
文本脈絡:

{context}

問題:
{question}

繁體中文回答:
"""

answer_prompt = ChatPromptTemplate.from_template(answer_template)


chain = (
    {"user_question":RunnablePassthrough()}
    | question_prompt
    | model
    | StrOutputParser()
    |{"context": retriever , "question": RunnablePassthrough()} 
    | answer_prompt
    | model
    | StrOutputParser()
)

In [180]:
chain.invoke("告訴我AUO的子公司清單，並依照AUO持股數由大到小排序")

告訴我AUO的子公司清單，並依照AUO持股數由大到小排序
Tell me the list of AUO's subsidiaries, sorted by AUO's shareholding from largest to smallest. (AUO, Subsidiaries, Shareholding, List, Sorted)文本脈絡並未提供AUO的子公司清單，以及AUO對各子公司的持股數。

'文本脈絡並未提供AUO的子公司清單，以及AUO對各子公司的持股數。'

In [121]:
print(result)


對不起，該文本脈絡並未提供足夠的信息來分析AUO的基本面，包括收入、淨利潤、毛利率、營運費用、息稅折舊前利潤（EBITDA）、營業利潤/損失、營業現金流、投資回報率（ROI）、銷售增長、市場份額、存貨週轉率、負債權益比率等。該文本主要提供了AUO的財務報表說明，包括一些負債和資產的項目，以及一些營運部門的資訊。然而，這些資訊並不足以進行全面的財務分析。建議您直接查閱AUO的完整財務報表以獲得更詳細的資訊。


In [1]:
import pandas as pd
# 讀資料
data_csv = pd.read_csv("content_df.csv").dropna().sample(frac=1, random_state=42).reset_index(drop=True)


In [7]:
df = data_csv[["title","Click"]].dropna().sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
# 寫到pickle
df.to_markdown("content_df.md", index=False)