<a href="https://colab.research.google.com/github/sugarforever/LangChain-Tutorials/blob/main/LangChain_PDF_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

该Python notebook利用langchain的QA chain，结合Chroma来实现PDF文档Analysis-and-Comparison-between-Optimism-and-StarkNet.pdf的语义化搜索。

该PDF文档共61页。通过本notebook，我们演示该字数规模的文件的语义化索引的OpenAI API开销。

使用时，在本地创建`.env`，并如`.env.example`所示，设置有效的OpenAI API Key即可。

In [1]:
import os,sys
import openai
from dotenv import load_dotenv, find_dotenv
# sys.path.append("../..")

# 读取本地/项目的环境变量。

# find_dotenv()寻找并定位.env文件的路径
# load_dotenv()读取该.env文件，并将其中的环境变量加载到当前的运行环境中  
# 如果你设置的是全局的环境变量，这行代码则没有任何作用。
print(find_dotenv())
_ = load_dotenv(find_dotenv())
print(os.environ["OPENAI_API_KEY"])

from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)

from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate

from langchain.chains import LLMRequestsChain, LLMChain


llm = OpenAI(temperature=0, model_name='gpt-3.5-turbo')

d:\Anaconda3\envs\LLM\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
d:\Anaconda3\envs\LLM\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


c:\Users\lenovo\Desktop\LangChainPlayGround\DeeperTutorials\.env
sk-lANo2jIeCWQt94UCCf5d16B7C32744279bF98b06C822D519




### Load the pdf file 

In [9]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings


PDF_PATH='../Analysis-and-Comparison-between-Optimism-and-StarkNet.pdf'
embedding_model = OpenAIEmbeddings()

def load_pdf(path:str):
    docs = PyMuPDFLoader(path).load()
    print (f'You have {len(docs)} document(s) in your data')
    print (f'There are {len(docs[0].page_content)} characters in the first page of your document')
    
    total = 0
    for doc in docs:
        total += len(doc.page_content)
    print (f'There are {total} characters in your document')

    return docs

def load_and_split_doc():
    docs = load_pdf(PDF_PATH)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    split_docs = text_splitter.split_documents(docs)
    print (f'Now you have {len(split_docs)} documents')

    return split_docs

splitted_docs = split_doc()

You have 61 document(s) in your data
There are 284 characters in the first page of your document
There are 112630 characters in your document
Now you have 143 documents


## Load the vector database

In [12]:
from langchain.vectorstores import Chroma

persist_directory = 'starknet'
collection_name = 'starknet_index'

# Load the vectorstore from disk
vectordb = Chroma(collection_name=collection_name, 
                  persist_directory=persist_directory, 
                  embedding_function=embedding_model)

from langchain.callbacks import get_openai_callback
with get_openai_callback() as cb:
    vectorstore = Chroma.from_documents(splitted_docs,
                                        embedding_model, 
                                        collection_name=collection_name, 
                                        persist_directory=persist_directory)
    vectorstore.persist()
    print(cb)

Tokens Used: 0
	Prompt Tokens: 0
	Completion Tokens: 0
Successful Requests: 0
Total Cost (USD): $0.0


## search related docs 

In [21]:
query = "What is starknet?"
related_docs = vectorstore.similarity_search(query, 3, include_metadata=True)

for related_doc in related_docs:
    print(related_doc.metadata)
    print(related_doc.page_content)

{'author': '', 'creationDate': "D:20221031205028-04'00'", 'creator': 'LaTeX with hyperref', 'file_path': '../Analysis-and-Comparison-between-Optimism-and-StarkNet.pdf', 'format': 'PDF 1.4', 'keywords': '', 'modDate': "D:20221031205028-04'00'", 'page': 35, 'producer': 'dvips + GPL Ghostscript GIT PRERELEASE 9.22', 'source': '../Analysis-and-Comparison-between-Optimism-and-StarkNet.pdf', 'subject': '', 'title': '', 'total_pages': 61, 'trapped': ''}
StarkNet
3.2.1
Overview
StarkNet is a Validity Rollup developed by StarkWare that uses the STARK proof system
to validate its state on Ethereum. To facilitate the construction of validity proofs, a
35
{'author': '', 'creationDate': "D:20221031205028-04'00'", 'creator': 'LaTeX with hyperref', 'file_path': '../Analysis-and-Comparison-between-Optimism-and-StarkNet.pdf', 'format': 'PDF 1.4', 'keywords': '', 'modDate': "D:20221031205028-04'00'", 'page': 51, 'producer': 'dvips + GPL Ghostscript GIT PRERELEASE 9.22', 'source': '../Analysis-and-Compar

In [22]:
from langchain.chains.question_answering import load_qa_chain

chain = load_qa_chain(llm, chain_type="stuff")
print(chain.document_prompt)

print(chain.prompt_length(related_docs, 
                          question='What is starknet?'))
with get_openai_callback() as cb:
    print(chain.run(input_documents=related_docs, question=query))
    print(cb)

input_variables=['page_content'] output_parser=None partial_variables={} template='{page_content}' template_format='f-string' validate_template=True
630
StarkNet is a Validity Rollup developed by StarkWare that uses the STARK proof system to validate its state on Ethereum.
Tokens Used: 664
	Prompt Tokens: 637
	Completion Tokens: 27
Successful Requests: 1
Total Cost (USD): $0.0010095
