In [1]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, DirectoryLoader

## fix chroma version error
__import__('pysqlite3')
import pysqlite3
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [2]:
## Preparing documents
doc_path = "doc_shuttle"
loader = DirectoryLoader(doc_path, glob='**/*.txt', loader_cls=TextLoader)
TXT_data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
all_splits = text_splitter.split_documents(TXT_data)

In [3]:
## Embed and store the texts
## Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
embedding = HuggingFaceEmbeddings(model_name=model_name,
                                  model_kwargs=model_kwargs)

vectordb = Chroma.from_documents(documents=all_splits, embedding=embedding, persist_directory=persist_directory)

In [4]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp

llm = LlamaCpp(
    model_path="llama-2-7b-chat.Q4_K_M.gguf",
    # model_path="tiny-llama-shuttle-xpc-cube-en10000_Q4_K_M.gguf",
    # model_path="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    # model_path="bailong-instruct-7b-v0.1-q5_k_m.gguf",
    n_gpu_layers=100,
    n_batch=512,
    n_ctx=2048,
    f16_kv=True,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 

In [5]:
from langchain.chains import LLMChain
from langchain.chains.prompt_selector import ConditionalPromptSelector
from langchain.prompts import PromptTemplate

DEFAULT_LLAMA_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""<|system|>
    You are a helpful assistant eager to assist with providing better Google search results. \ 
    The answer you provided must completely retrieve from context. \
    Provide an answer to the following question in 100 words. Ensure that the answer is informative, \
    relevant, and concise. It is forbidden to generate answers that are irrelevant to the question. It is forbidden to generate negative answer like "Unhelpful Answer" in your response:
            {question} </s>""",
)

DEFAULT_TINY_LLAMA_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""<<SYS>> 
    You are a helpful assistant eager to assist with providing better Google search results.
    <</SYS>> 
    
    [INST] Provide an answer to the following question in 150 words. Ensure that the answer is informative, \
            relevant, and concise:
            {question} 
    [/INST]""",
)

DEFAULT_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are a helpful assistant eager to assist with providing better Google search results. \
        Provide an answer to the following question in about 150 words. Ensure that the answer is informative, \
        relevant, and concise: \
        {question}""",
)

QUESTION_PROMPT_SELECTOR = ConditionalPromptSelector(
    default_prompt=DEFAULT_SEARCH_PROMPT,
    conditionals=[(lambda llm: isinstance(llm, LlamaCpp), DEFAULT_LLAMA_SEARCH_PROMPT)],
)

prompt = QUESTION_PROMPT_SELECTOR.get_prompt(llm)
prompt

PromptTemplate(input_variables=['question'], template='<|system|>\n    You are a helpful assistant eager to assist with providing better Google search results. \\ \n    The answer you provided must completely retrieve from context.     Provide an answer to the following question in 100 words. Ensure that the answer is informative,     relevant, and concise. It is forbidden to generate answers that are irrelevant to the question. It is forbidden to generate negative answer like "Unhelpful Answer" in your response:\n            {question} </s>')

In [6]:
retriever = vectordb.as_retriever()

qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

In [7]:
querys = [
    "Tell me about the processor of SH610R4.",
    "Tell me about the processor of SZ270R9.",
    "What is the chipset of SH570R8?",
    "What processor used in SH510R4?",
    "What is the dimensions of SW580R8?",
    "Tell me about the processor of SW580R8.",
    "What processor used in SH370R6?"
]

for query in querys:
    qa.invoke(query)



[1m> Entering new RetrievalQA chain...[0m
 The processor supported by SH610R4 is Intel 12th/13th/14th generation Core i3/i5/i7/i9, Pentium, and Celeron 125W LGA1700 CPU.


llama_print_timings:        load time =   22682.50 ms
llama_print_timings:      sample time =      20.24 ms /    59 runs   (    0.34 ms per token,  2914.44 tokens per second)
llama_print_timings: prompt eval time =   73170.44 ms /  1560 tokens (   46.90 ms per token,    21.32 tokens per second)
llama_print_timings:        eval time =   16386.49 ms /    58 runs   (  282.53 ms per token,     3.54 tokens per second)
llama_print_timings:       total time =   89756.31 ms /  1618 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m
 The processor of SZ270R9 is compatible with Kabylake/Skylake 95W Core i3/i5/i7 processors utilizing the LGA 1151 socket.


llama_print_timings:        load time =   22682.50 ms
llama_print_timings:      sample time =      17.29 ms /    50 runs   (    0.35 ms per token,  2891.84 tokens per second)
llama_print_timings: prompt eval time =   61468.85 ms /  1319 tokens (   46.60 ms per token,    21.46 tokens per second)
llama_print_timings:        eval time =   13358.25 ms /    49 runs   (  272.62 ms per token,     3.67 tokens per second)
llama_print_timings:       total time =   74990.04 ms /  1368 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m
 The SH570R8 compact desktop utilizes the Intel H570 chipset.


llama_print_timings:        load time =   22682.50 ms
llama_print_timings:      sample time =       7.55 ms /    22 runs   (    0.34 ms per token,  2913.14 tokens per second)
llama_print_timings: prompt eval time =   62049.64 ms /  1330 tokens (   46.65 ms per token,    21.43 tokens per second)
llama_print_timings:        eval time =    5775.70 ms /    21 runs   (  275.03 ms per token,     3.64 tokens per second)
llama_print_timings:       total time =   67891.03 ms /  1351 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m
 The processor used in SH510R4 is an Intel Core processor from the 10th or 11th generation for the LGA1200 socket.


llama_print_timings:        load time =   22682.50 ms
llama_print_timings:      sample time =      12.73 ms /    38 runs   (    0.34 ms per token,  2983.90 tokens per second)
llama_print_timings: prompt eval time =   75636.85 ms /  1592 tokens (   47.51 ms per token,    21.05 tokens per second)
llama_print_timings:        eval time =   10310.16 ms /    37 runs   (  278.65 ms per token,     3.59 tokens per second)
llama_print_timings:       total time =   86076.47 ms /  1629 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m
 The dimensions of the SW580R8 micro server are 13"(L) × 8.5"(W) × 7.5"(H).


llama_print_timings:        load time =   22682.50 ms
llama_print_timings:      sample time =      13.03 ms /    39 runs   (    0.33 ms per token,  2992.40 tokens per second)
llama_print_timings: prompt eval time =   45936.68 ms /   995 tokens (   46.17 ms per token,    21.66 tokens per second)
llama_print_timings:        eval time =   10114.42 ms /    38 runs   (  266.17 ms per token,     3.76 tokens per second)
llama_print_timings:       total time =   56168.61 ms /  1033 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m
 The SW580R8 micro server supports 10th/11th gen Intel Core processors and Xeon W-series processors. (Yes/No)


llama_print_timings:        load time =   22682.50 ms
llama_print_timings:      sample time =      13.06 ms /    39 runs   (    0.33 ms per token,  2986.45 tokens per second)
llama_print_timings: prompt eval time =   70867.76 ms /  1464 tokens (   48.41 ms per token,    20.66 tokens per second)
llama_print_timings:        eval time =   10702.99 ms /    38 runs   (  281.66 ms per token,     3.55 tokens per second)
llama_print_timings:       total time =   81698.51 ms /  1502 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m
 The processor used in SH370R6 is Intel 11th gen Core i3/i5/i7/i9, Pentium, or Celeron processor.


llama_print_timings:        load time =   22682.50 ms
llama_print_timings:      sample time =      14.14 ms /    40 runs   (    0.35 ms per token,  2829.45 tokens per second)
llama_print_timings: prompt eval time =   69126.49 ms /  1455 tokens (   47.51 ms per token,    21.05 tokens per second)
llama_print_timings:        eval time =   10972.25 ms /    39 runs   (  281.34 ms per token,     3.55 tokens per second)
llama_print_timings:       total time =   80226.19 ms /  1494 tokens



[1m> Finished chain.[0m
