In [None]:
!pip install langchain
!pip install pypdf
!pip install InstructorEmbedding
!pip install sentence_transformers
!pip install faiss-gpu

In [None]:
import os
from getpass import getpass

from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

# OpenAIEmbeddings() and OpenAI() below are constructed without an explicit
# key, and the notebook only sets OPENAI_API_KEY in a later cell. Ask for the
# key here when it is missing so a fresh top-to-bottom run does not fail.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Load the local knowledge source (a single PDF) and split it into pieces.
loader = UnstructuredPDFLoader("三国演义第一回.pdf")
pages = loader.load_and_split()
embeddings = OpenAIEmbeddings()
# Index the pieces in Chroma and expose the store as a retriever.
docsearch = Chroma.from_documents(pages, embeddings).as_retriever()

# Fetch the pieces relevant to the question, then let a "stuff" QA chain
# answer from them.
query = "桃园三结义都有谁"
docs = docsearch.get_relevant_documents(query)
chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")
output = chain.run(input_documents=docs, question=query)
print(output)

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

root_dir = '/content/drive/MyDrive'
pdf_dir = f'{root_dir}/PDFs/'
# Load every PDF file found directly inside the PDFs folder.
loader = DirectoryLoader(pdf_dir, glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

# Fail fast with a clear message: an empty list would otherwise only surface
# later as an IndexError when the first chunk is accessed.
if not documents:
    raise FileNotFoundError(f"No PDF documents were loaded from {pdf_dir}")

# Show a summary and one sample instead of dumping the full text of every
# loaded document into the cell output.
print(f"Loaded {len(documents)} document(s) from {pdf_dir}")
print(documents[0])

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the long documents into chunks of up to 1000 characters, with a
# 200-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents)
# Print the first chunk as a quick sanity check.
print(texts[0])

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceInstructEmbeddings
# Choose the embedding model.
# NOTE(review): the device is hard-coded to "cuda", so this cell needs a GPU
# runtime.
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    model_kwargs={"device": "cuda"},
)
# Embed every chunk in `texts` and store the vectors in a FAISS index.
db_instructEmbedd = FAISS.from_documents(texts, instructor_embeddings)
# Expose the index as a retriever configured with k=3 (top 3 matches).
retriever = db_instructEmbedd.as_retriever(
    search_kwargs={"k": 3},
)
# Bare expression: the notebook displays the retriever's search type.
retriever.search_type

In [None]:
# Similarity search: fetch the stored chunks closest to the question.
docs = retriever.get_relevant_documents(
    "个金客户经理的工作职责是什么?"
)
# Report how many chunks came back, then display the first one.
print(len(docs))
docs[0]

In [None]:
!pip install openai

In [None]:
import os
from getpass import getpass

# Do not hard-code the API key in the notebook, and do not overwrite a key
# that is already configured: prompt for it only when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

## Print the LLM answer together with its sources
def process_llm_response(llm_response):
    """Print the answer text followed by the source of each supporting document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer) and,
            optionally, a 'source_documents' entry holding an iterable of
            objects that expose a ``metadata`` dict.

    A document whose metadata has no 'source' entry is listed as
    '<unknown source>', and a missing or empty 'source_documents' entry prints
    only the "Sources:" header instead of raising KeyError.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # `or []` also covers an explicit None value for 'source_documents'.
    for source in llm_response.get("source_documents") or []:
        print(source.metadata.get('source', '<unknown source>'))

# Build a "stuff" RetrievalQA chain on top of the FAISS retriever; it also
# returns the source documents so they can be listed next to the answer.
qa_chain_instrucEmbed = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0.2),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)


# NOTE(review): this question says 客户职责, while the retrieval check earlier
# in the notebook asks about 工作职责 - confirm which wording is intended.
query = '个金客户经理的客户职责是什么?'
print('-------------------Instructor Embeddings------------------\n')
# Run the chain on the question and print the answer plus its sources.
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)