In [3]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, DirectoryLoader

## Fix chroma version error:
## register the pysqlite3 package under the module name `sqlite3`, so that any
## later `import sqlite3` (e.g. when the Chroma store is created) receives
## pysqlite3 instead of the interpreter's bundled sqlite3 module.
## NOTE(review): presumably needed because Chroma rejects the bundled SQLite
## version -- confirm against the Chroma troubleshooting docs.
import sys
import pysqlite3

# Move the sys.modules entry: 'pysqlite3' is removed and re-registered as 'sqlite3'.
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [4]:
## Preparing documents
# Read every .txt file matched by the glob under `doc_path` and cut the
# resulting documents into overlapping chunks for embedding.
doc_path = "doc_shuttle"

loader = DirectoryLoader(path=doc_path, glob='**/*.txt', loader_cls=TextLoader)
# chunk_size / chunk_overlap are measured by the splitter's length function
# (characters by default, per the LangChain text-splitter docs).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)

TXT_data = loader.load()
all_splits = text_splitter.split_documents(TXT_data)
# Uncomment to inspect the chunks:
# print(all_splits)

In [8]:
## Embed and store the texts
## Supplying a persist_directory will store the embeddings on disk
import os

persist_directory = 'db'
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
embedding = HuggingFaceEmbeddings(model_name=model_name,
                                  model_kwargs=model_kwargs)

# Re-running Chroma.from_documents against an existing persist_directory adds
# the same chunks again, so the store (and the retriever's results) fill up
# with duplicates on every re-execution of this cell.  Reuse the persisted
# store when it is already there and only embed the documents on the first
# run.  Delete the `db` directory to force a rebuild after the source
# documents change.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embedding)
else:
    vectordb = Chroma.from_documents(documents=all_splits,
                                     embedding=embedding,
                                     persist_directory=persist_directory)

In [9]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp

# Callback manager whose only handler is the stdout streaming handler.
streaming_callbacks = CallbackManager([StreamingStdOutCallbackHandler()])

# Local GGUF model served through llama.cpp.
llm = LlamaCpp(
    model_path="llama-2-7b-chat.Q4_K_M.gguf",
    # Context window requested here; the loader output for this model reports
    # llama.context_length = 4096, so 2048 is below the model's maximum.
    n_ctx=2048,
    n_batch=512,
    # The loader output reports llama.block_count = 32, so 100 asks for more
    # layers than the model has.
    # NOTE(review): presumably meant as "offload everything" -- confirm the
    # llama.cpp build actually has GPU support.
    n_gpu_layers=100,
    f16_kv=True,
    callback_manager=streaming_callbacks,
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 

In [10]:
from langchain.chains import LLMChain
from langchain.chains.prompt_selector import ConditionalPromptSelector
from langchain.prompts import PromptTemplate

# Prompt for Llama-2 chat models (this notebook loads llama-2-7b-chat), using
# the <<SYS>> / [INST] markers.  The two Llama templates were previously
# swapped, so the LlamaCpp model received the <|system|> ... </s> format.
# The template is written as explicit string pieces because it contains
# significant trailing spaces that editors tend to strip from triple-quoted
# text.
DEFAULT_LLAMA_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template=(
        "<<SYS>> \n"
        "    You are a helpful assistant eager to assist with providing better Google search results.\n"
        "    <</SYS>> \n"
        "    \n"
        "    [INST] Provide an answer to the following question in 150 words. Ensure that the answer is informative, "
        "            relevant, and concise:\n"
        "            {question} \n"
        "    [/INST]"
    ),
)

# Prompt using <|system|> ... </s> markers.
# NOTE(review): presumably the Zephyr-style format used by TinyLlama chat
# models -- confirm before using it with one.  Not selected anywhere in this
# notebook.
DEFAULT_TINY_LLAMA_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""<|system|>
    You are a helpful assistant eager to assist with providing better Google search results.
    Provide an answer to the following question in 150 words. Ensure that the answer is informative, \
    relevant, and concise:
            {question} </s>""",
)

# Plain prompt without model-specific markers; the selector's fallback.
DEFAULT_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are a helpful assistant eager to assist with providing better Google search results. \
        Provide an answer to the following question in about 150 words. Ensure that the answer is informative, \
        relevant, and concise: \
        {question}""",
)


# Pick the Llama-2 prompt when the model is a LlamaCpp instance, otherwise
# fall back to the plain prompt.
QUESTION_PROMPT_SELECTOR = ConditionalPromptSelector(
    default_prompt=DEFAULT_SEARCH_PROMPT,
    conditionals=[(lambda llm: isinstance(llm, LlamaCpp), DEFAULT_LLAMA_SEARCH_PROMPT)],
)

prompt = QUESTION_PROMPT_SELECTOR.get_prompt(llm)
prompt

PromptTemplate(input_variables=['question'], template='<<SYS>> \n    You are a helpful assistant eager to assist with providing better Google search results.\n    <</SYS>> \n    \n    [INST] Provide an answer to the following question in 150 words. Ensure that the answer is informative,             relevant, and concise:\n            {question} \n    [/INST]')

In [11]:
# Plain question -> answer chain (no retrieval): fills `prompt` with the
# question and sends it to the local model.
llm_chain = LLMChain(llm=llm, prompt=prompt)
# Example call, left disabled:
# question = "What is Taiwan known for?"
# llm_chain.invoke({"question": question})

In [12]:
# Retrieval-augmented QA: chunks fetched from the vector store by `retriever`
# are handed to the model through a "stuff" chain.
# No custom prompt is passed here, so the `prompt` selected earlier is not
# used by this chain.
retriever = vectordb.as_retriever()

qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff", verbose=True)

In [13]:
# Questions to ask the retrieval chain.  A single question can be asked
# directly with, e.g., qa.invoke("Tell me about the processor of SH610R4").
querys = [
    "Tell me about the Shuttle XPC cube series and its models.",
    "Tell me about the processor of SH610R4.",
    "Tell me about the processor of SZ270R9.",
    "What is the chipset of SH570R8?",
    "What processor used in SH510R4?",
    "What is the dimensions of SW580R8?",
    "Tell me about the processor of SW580R8."
]

# Keep every answer instead of discarding the chain's return value, so the
# results can be inspected or compared in later cells rather than only being
# visible as streamed stdout.  Maps question -> answer text.
answers = {}
for query in querys:
    response = qa.invoke(query)
    answers[query] = response["result"]



[1m> Entering new RetrievalQA chain...[0m
 The Shuttle XPC cube series is a line of compact and powerful PCs designed for various applications. The series includes several models with different specifications and features. Here are some key details about the Shuttle XPC cube series and its models:
The Shuttle XPC cube series includes the following models:
* Shuttle XPC cube 100: This is the base model with an Intel 10th Gen processor, 8GB RAM, and a 256GB SSD. It has a 16x PCIe Gen4 slot and supports Linux and Windows 10 OS.
* Shuttle XPC cube 200: This model has a higher specification than the 100 series with Intel 11th Gen processor, 16GB RAM, and a 512GB SSD. It also has a 16x PCIe Gen4 slot and supports Linux and Windows 10 OS.
* Shuttle XPC cube 300: This model has a higher specification than the 200 series with Intel 11th Gen processor, 16GB RAM, and a 1TB SSD.


llama_print_timings:        load time =   23336.51 ms
llama_print_timings:      sample time =      90.38 ms /   256 runs   (    0.35 ms per token,  2832.61 tokens per second)
llama_print_timings: prompt eval time =   73652.67 ms /  1536 tokens (   47.95 ms per token,    20.85 tokens per second)
llama_print_timings:        eval time =   74619.53 ms /   256 runs   (  291.48 ms per token,     3.43 tokens per second)
llama_print_timings:       total time =  149113.63 ms /  1792 tokens



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


Llama.generate: prefix-match hit


 The processor of SH610R4 is supported by Intel 12th/13th generation Core i3/i5/i7/i9, Pentium, and Celeron 125W LGA1700 CPU.


llama_print_timings:        load time =   23336.51 ms
llama_print_timings:      sample time =      19.04 ms /    56 runs   (    0.34 ms per token,  2941.18 tokens per second)
llama_print_timings: prompt eval time =   73204.01 ms /  1525 tokens (   48.00 ms per token,    20.83 tokens per second)
llama_print_timings:        eval time =   15926.26 ms /    55 runs   (  289.57 ms per token,     3.45 tokens per second)
llama_print_timings:       total time =   89311.15 ms /  1580 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m
 The SZ270R9 supports Kabylake/Skylake 95W Core i3/i5/i7 LGA 1151 CPU processor.
Unhelpful Answer: The SZ270R9 has a processor that is made of chocolate and has rainbows coming out of it.


llama_print_timings:        load time =   23336.51 ms
llama_print_timings:      sample time =      28.07 ms /    76 runs   (    0.37 ms per token,  2707.71 tokens per second)
llama_print_timings: prompt eval time =   78954.16 ms /  1604 tokens (   49.22 ms per token,    20.32 tokens per second)
llama_print_timings:        eval time =   23381.10 ms /    75 runs   (  311.75 ms per token,     3.21 tokens per second)
llama_print_timings:       total time =  102597.58 ms /  1679 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m
 The chipset of SH570R8 is Intel H570 express chipset.


llama_print_timings:        load time =   23336.51 ms
llama_print_timings:      sample time =       8.61 ms /    23 runs   (    0.37 ms per token,  2672.86 tokens per second)
llama_print_timings: prompt eval time =   78909.04 ms /  1585 tokens (   49.78 ms per token,    20.09 tokens per second)
llama_print_timings:        eval time =    6555.47 ms /    22 runs   (  297.98 ms per token,     3.36 tokens per second)
llama_print_timings:       total time =   85553.16 ms /  1607 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m
 The processor used in SH510R4 is Intel Core processors of the 10th and 11th generations for LGA1200 sockets.


llama_print_timings:        load time =   23336.51 ms
llama_print_timings:      sample time =      13.36 ms /    39 runs   (    0.34 ms per token,  2919.82 tokens per second)
llama_print_timings: prompt eval time =   82642.02 ms /  1668 tokens (   49.55 ms per token,    20.18 tokens per second)
llama_print_timings:        eval time =   11728.53 ms /    38 runs   (  308.65 ms per token,     3.24 tokens per second)
llama_print_timings:       total time =   94516.11 ms /  1706 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m
 The dimensions of the SW580R8 micro server are 13 inches (L) × 8.5 inches (W) × 7.5 inches (H).


llama_print_timings:        load time =   23336.51 ms
llama_print_timings:      sample time =      12.91 ms /    39 runs   (    0.33 ms per token,  3021.38 tokens per second)
llama_print_timings: prompt eval time =   71885.61 ms /  1472 tokens (   48.84 ms per token,    20.48 tokens per second)
llama_print_timings:        eval time =   10743.22 ms /    38 runs   (  282.72 ms per token,     3.54 tokens per second)
llama_print_timings:       total time =   82754.56 ms /  1510 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m
 The processor supported by SW580R8 is Intel 10th/11th generation Core i3/i5/i7, Pentium, and Celeron 125W LGA1200 CPU.


llama_print_timings:        load time =   23336.51 ms
llama_print_timings:      sample time =      19.12 ms /    52 runs   (    0.37 ms per token,  2719.10 tokens per second)
llama_print_timings: prompt eval time =   22345.39 ms /   424 tokens (   52.70 ms per token,    18.97 tokens per second)
llama_print_timings:        eval time =   15648.42 ms /    51 runs   (  306.83 ms per token,     3.26 tokens per second)
llama_print_timings:       total time =   38193.62 ms /   475 tokens



[1m> Finished chain.[0m
