In [1]:
from langchain.document_loaders import PyMuPDFLoader
import glob
# Discover the PDFs once and reuse the list. Sorting makes the document order
# (and therefore the chunk order downstream) reproducible across runs, since
# glob does not guarantee any particular ordering.
pdfs = sorted(glob.glob("pdf/*.pdf"))

# Accumulates the documents returned by the loader for every PDF.
PDF_data = []
for pdfFile in pdfs:
    loader = PyMuPDFLoader(pdfFile)
    PDF_data.extend(loader.load())

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Chunking configuration for the splitter, kept in one place so it is easy to tune.
splitter_settings = {"chunk_size": 256, "chunk_overlap": 5}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Break the loaded PDF documents into chunks that will be embedded later.
all_splits = text_splitter.split_documents(PDF_data)

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
import torch

# Alternative embedding model (Chinese-specific), kept for reference:
# model_name = "GanymedeNil/text2vec-base-chinese"
# NOTE(review): the prompts in this notebook are Chinese; confirm the model
# below retrieves well enough for the documents being indexed.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Use the first CUDA device when one is available, otherwise fall back to the
# CPU so this cell also runs on machines without a GPU.
model_kwargs = {'device': 'cuda:0' if torch.cuda.is_available() else 'cpu'}
embedding = HuggingFaceEmbeddings(model_name=model_name,
                                  model_kwargs=model_kwargs)

In [None]:
## Build the vector database from the chunks and store it on disk
# NOTE(review): re-running this cell against an existing 'db' directory
# presumably adds the same chunks again -- confirm before re-running.
from langchain.vectorstores import Chroma

persist_directory = 'db'
vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [4]:
## Load the previously created database from disk
from langchain.vectorstores import Chroma

persist_directory = 'db'
# The same embedding object is supplied so queries are embedded with the
# model configured earlier in this notebook.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

In [5]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.base import BaseCallbackHandler
from langchain_community.llms import LlamaCpp
# Path of the local GGUF model file that is handed to LlamaCpp below.
# The commented-out line is an alternative model that can be swapped in.
# model_path = "./model/Taiwan-LLM-7B-v2.1-chat-Q4_K_M.gguf"
model_path = "./model/chinese-alpaca-2-7b.Q4_K_M.gguf"
from queue import Queue, Empty
from typing import Any
from threading import Thread

# Shared queue: QueueCallback pushes each streamed token onto it and the
# Gradio `bot` generator pops tokens off it to update the chat window.
q = Queue()
# Unique sentinel that the worker thread enqueues once the chain call has
# returned; the consumer compares with `is` to know the answer is complete.
job_done = object()

class QueueCallback(BaseCallbackHandler):
    """Callback handler that streams LLM tokens into a queue.

    Args:
        q: Queue-like object exposing ``put``; every new token is pushed
            onto it in the order the LLM emits them.
    """

    def __init__(self, q):
        self.q = q

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Forward a freshly generated token to the queue."""
        self.q.put(token)

    def on_llm_end(self, *args, **kwargs: Any) -> None:
        """Do nothing when the LLM finishes.

        End-of-answer is signalled by the worker thread, which enqueues the
        ``job_done`` sentinel after the chain call returns, so nothing has
        to be pushed here. The previous body returned ``self.q.empty()``,
        which only *checks* the queue and contradicted the ``None`` return
        annotation.
        """
        return None
# Every token the model streams is routed into the shared queue `q`.
callbacks = [QueueCallback(q)]

# Constructor arguments for the llama.cpp model, gathered in one mapping.
# To stream to stdout instead of the queue, the previous alternative was:
#   callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
llm_options = {
    "model_path": model_path,
    "n_gpu_layers": 10,
    "n_batch": 512,
    "n_ctx": 2048,
    "f16_kv": True,
    "callbacks": callbacks,
    "verbose": True,
}
llm = LlamaCpp(**llm_options)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 
Model metadata: {'general.name': 'LLaMA v2', 'general.architecture': 'llama', 'llama.context_length': '4096', 'llama.rope.dimension_count': '128', 'llama.embedding_length': '4096', 'llama.block_count': '32', 'llama.feed_forward_length': '11008', 'llama.attention.head_count': '32', 'tokenizer.ggml.eos_token_id': '2', 'general.file_type': '15', 'llama.attention.head_count_kv': '32', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.unknown_token_id': '0'}


In [6]:
from langchain.chains import LLMChain
from langchain.chains.prompt_selector import ConditionalPromptSelector
from langchain.prompts import PromptTemplate

# Prompt used for llama.cpp models: wraps the instructions in the
# <<SYS>> / [INST] markers and asks for an answer in Chinese.
DEFAULT_LLAMA_SEARCH_PROMPT = PromptTemplate(
    template="""<<SYS>> \n 你是一名導覽員，負責回答大家的問題 \
 \n <</SYS>> \n\n [INST] 請使用中文回覆: \n\n {question} [/INST]""",
    input_variables=["question"],
)

# Fallback prompt for any other model type: same instructions, no markers.
DEFAULT_SEARCH_PROMPT = PromptTemplate(
    template="""你是一名導覽員，負責回答大家的問題 \
 請使用中文回覆: {question}""",
    input_variables=["question"],
)


def _is_llama_cpp(model):
    """Return True when `model` is a LlamaCpp instance."""
    return isinstance(model, LlamaCpp)


# Picks the llama-specific prompt for LlamaCpp models, the plain one otherwise.
QUESTION_PROMPT_SELECTOR = ConditionalPromptSelector(
    conditionals=[(_is_llama_cpp, DEFAULT_LLAMA_SEARCH_PROMPT)],
    default_prompt=DEFAULT_SEARCH_PROMPT,
)

# NOTE(review): `prompt` is only displayed here; the RetrievalQA chain built
# later in this notebook is not given it -- confirm whether that is intended.
prompt = QUESTION_PROMPT_SELECTOR.get_prompt(llm)
prompt

PromptTemplate(input_variables=['question'], template='<<SYS>> \n 你是國資圖的導覽員，負責回答大家的問題  \n <</SYS>> \n\n [INST] 請使用中文回覆: \n\n {question} [/INST]')

In [7]:
from langchain.chains.retrieval_qa.base import RetrievalQA

# Expose the vector store through the retriever interface (no extra options).
retriever = vectordb.as_retriever()

# Retrieval-augmented QA chain: the retriever supplies documents and the
# llama.cpp model produces the answer. No custom prompt is passed here.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
)

In [8]:
import gradio as gr

def answer(question):
    """Start answering `question` on a background thread.

    The QA chain runs in a worker thread so the caller can consume streamed
    tokens from the shared queue `q` while generation is still in progress.
    When the chain call ends, the `job_done` sentinel is pushed onto `q`.

    Args:
        question: The user's question, passed to `qa.invoke` unchanged.
    """
    def task():
        try:
            qa.invoke(question)
        finally:
            # Always signal completion, even when the chain raises; otherwise
            # the consumer polling `q` would wait for the sentinel forever.
            q.put(job_done)

    t = Thread(target=task)
    t.start()

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def res(user_message, history):
        """Clear the textbox and append the user's message to the chat.

        Returns a pair: the new textbox value (empty string) and the history
        with a new `[user_message, None]` entry whose answer is still pending.
        """
        # Tolerate a missing history so the first message cannot fail.
        return "", (history or []) + [[user_message, None]]

    def bot(history):
        """Stream the answer to the latest question into the chat history.

        Yields the updated history after every token read from the shared
        queue `q`, and stops when the `job_done` sentinel is received.
        """
        question = history[-1][0]
        print("Question: ", question)
        history[-1][1] = ""

        # Discard anything still sitting in the shared queue (for example
        # tokens from an earlier answer whose stream was abandoned) so it
        # cannot be mixed into, or prematurely end, this answer.
        while True:
            try:
                q.get_nowait()
            except Empty:
                break

        answer(question=question)
        while True:
            # Only the blocking read is guarded; a timeout just means the
            # model has not produced the next token yet.
            try:
                next_token = q.get(True, timeout=1)
            except Empty:
                continue
            if next_token is job_done:
                break
            history[-1][1] += next_token
            yield history

    # Record the user's message first, then stream the bot's reply.
    msg.submit(res, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()
demo.launch()


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Question:  你好


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
