In [None]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from pprint import pprint


### Web文書の取得

In [None]:
# Parse only the elements whose class is one of the post's title, header or
# content classes; everything else on the page is skipped by the strainer.
post_sections_only = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)

# Load the single blog post listed in web_paths, handing the strainer to
# BeautifulSoup through bs_kwargs["parse_only"].
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_sections_only},
)
docs = loader.load()


In [None]:
# Check the loaded web documents: print how many were loaded, then
# pretty-print the text of the first one.
print('文書数:', len(docs))
first_doc = docs[0]
pprint(first_doc.page_content)

### 文書の分割

In [None]:
# Character-count-based splitting: chunk_size=1000 with chunk_overlap=200
# between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

In [None]:
# Check the split result: print the number of chunks produced from docs.
print('分割数:', len(splits))

### 埋め込みベクトルDB作成

In [None]:
# Embed every chunk with OpenAI embeddings and index the results in a Chroma
# vector store.
# NOTE(review): no API key is passed here — presumably it is read from the
# environment; confirm before running on a fresh machine.
# NOTE(review): re-running this cell in the same kernel may add the same
# chunks to the store again — confirm, and restart the kernel if so.
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

### リトリーバの作成

In [None]:
# Expose the vector store as a retriever, using as_retriever() with its
# default arguments; this is the first stage of the chain's "context" branch.
retriever = vectorstore.as_retriever()

### プロンプトテンプレートの作成

In [None]:
# Fetch the prompt identified as "rlm/rag-prompt" via hub.pull; it is later
# piped between the retrieval inputs and the LLM.
prompt = hub.pull("rlm/rag-prompt")

In [None]:
# Inspect the prompt template: pretty-print the template text of the first
# message in the pulled prompt.
first_message = prompt.messages[0]
pprint(first_message.prompt.template)

### LLMの用意

In [None]:
# Chat model that produces the answer at the end of the chain; created with
# model_name="gpt-3.5-turbo" and temperature=0.
# NOTE(review): no API key is passed here — presumably it is read from the
# environment; confirm before running on a fresh machine.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

### リトリーバによる文書取得後の処理

In [None]:
# Concatenate multiple documents into a single string
def format_docs(docs):
    """Join the ``page_content`` of every item in ``docs`` into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        One string in which consecutive ``page_content`` values are separated
        by two newline characters; an empty string when ``docs`` is empty.
    """
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)


### チェーンの作成

In [None]:
# "context" is produced by piping the retriever into format_docs.
context_branch = retriever | format_docs

# Mapping fed to the prompt: the retrieved context plus the question, which
# is supplied through a RunnablePassthrough().
prompt_inputs = {"context": context_branch, "question": RunnablePassthrough()}

# Full pipeline: prompt inputs -> prompt -> LLM -> string output parser.
rag_chain = prompt_inputs | prompt | llm | StrOutputParser()


### 質問実行

In [None]:
# Send a sample question through the chain and pretty-print what it returns.
question = "What is Task Decomposition?"
answer = rag_chain.invoke(question)
pprint(answer)