In [10]:
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA


In [12]:
# Name of the sentence-transformer model used to embed text for the vector store.
SENTENCE_EMBEDDING_MODEL = "all-MiniLM-L6-v2"

# Directory holding the persisted Chroma database (relative to this notebook).
VECTOR_DB_PATH = "../../data/vectorDB/disease_context_chromaDB"

# Quantised Llama-2 13B chat weights in GGUF format (relative to this notebook).
MODEL_PATH = "../../data/llm_models/llama-2-13b-chat.gguf.q4_1.bin"



In [13]:
# Embedding model handed to Chroma as its embedding function.
embedding_function = SentenceTransformerEmbeddings(
    model_name=SENTENCE_EMBEDDING_MODEL,
)

# Chroma vector store backed by the on-disk directory at VECTOR_DB_PATH.
vectorstore = Chroma(
    embedding_function=embedding_function,
    persist_directory=VECTOR_DB_PATH,
)


In [14]:
# Prompt used by the retrieval QA chain. `{context}` receives the retrieved
# document text and `{question}` the user query; the wording asks the model to
# report provenance and to admit when it does not know the answer.
template = """Use the following pieces of context to answer the question at the end and also to return the provenance. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.   
{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Token budget for the model. The context window is shared by the prompt
# (instructions + retrieved context + question) and the generated answer.
# LangChain's LlamaCpp wrapper defaults `n_ctx` to 512 tokens, which cannot
# hold the generation budget requested here and is easily exceeded once a
# retrieved document is inserted into the prompt, so it is set explicitly.
# 4096 is the context length Llama-2 models are trained with.
LLM_CONTEXT_TOKENS = 4096
# Cap on generated tokens; kept well below the window so the prompt still fits.
LLM_MAX_NEW_TOKENS = 1024

llm = LlamaCpp(
    model_path=MODEL_PATH,
    n_ctx=LLM_CONTEXT_TOKENS,
    temperature=0.01,  # near-greedy decoding for factual answers
    max_tokens=LLM_MAX_NEW_TOKENS,
    top_p=1,
    # Stream tokens to stdout as they are generated.
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True
)

# Retrieval QA chain: maximal-marginal-relevance search over the Chroma store
# (30 candidates fetched, `lambda_mult` 0.5, a single document kept), answered
# with the custom prompt above. Source documents are returned with the answer
# so the provenance can be inspected.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(search_type="mmr", search_kwargs={"fetch_k": 30, "lambda_mult":0.5, "k":1}),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True
)



llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ../../data/llm_models/llama-2-13b-chat.gguf.q4_1.bin (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_1     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q4_1     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_1     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_1     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_1     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_1     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:           blk.0.attn_norm.weight f32      [  51

In [15]:
%%time

question = "What compounds treat hereditary spastic paraplegia?"

result = qa_chain({"query": question})


KeyboardInterrupt: 

In [16]:
# Stand-alone model instance for free-form generation (higher temperature than
# the retrieval QA setup).
# NOTE: this rebinds the notebook-level name `llm` and loads the model weights
# again (the loader log is printed a second time). A chain that was already
# built from an earlier `llm` keeps using that earlier instance.
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=MODEL_PATH,
    # LangChain's LlamaCpp wrapper defaults `n_ctx` to 512 tokens, which cannot
    # hold the 2000-token generation budget below; set the window explicitly.
    n_ctx=4096,
    temperature=0.75,
    max_tokens=2000,
    top_p=1,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]), 
    verbose=True, # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ../../data/llm_models/llama-2-13b-chat.gguf.q4_1.bin (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_1     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q4_1     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_1     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_1     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_1     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_1     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:           blk.0.attn_norm.weight f32      [  51

In [17]:
# Free-form prompt passed directly to the model, without the retrieval chain.
prompt = "\nQuestion: A rap battle between Stephen Colbert and John Oliver\n"

# Bare last expression so the notebook displays the returned completion.
llm(prompt)


Prompt: Write a rap battle

KeyboardInterrupt: 