In [2]:
# One-time setup: uncomment and run these lines to install the notebook's dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install llama-index-embeddings-huggingface
# %pip install llama-index-llms-llama-cpp
# %pip install llama-index
# %pip install llama-cpp-python

In [3]:
import os

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

In [7]:
# Location of the quantized Llama-2 chat weights (a GGUF file).
# The default is the original machine-specific path; set the LLAMA_MODEL_PATH
# environment variable to run this notebook elsewhere without editing the cell.
# `or` (rather than a .get() default) also falls back when the variable is set but empty.
MODEL_PATH = (
    os.environ.get("LLAMA_MODEL_PATH")
    or "/tf/aravi/Documents/Projects/LLMs/llama-2-7b-chat.Q4_K_M.gguf"
)

llm = LlamaCPP(
    # No download here: the weights are read from MODEL_PATH.
    # Alternatively, pass the URL of a GGUF model to have it downloaded automatically.
    model_url=None,

    # path to the pre-downloaded model, used instead of model_url
    model_path=MODEL_PATH,

    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # verbose=True lets llama.cpp print its model-loading and timing logs
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /tf/aravi/Documents/Projects/LLMs/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attent

In [8]:
# Smoke-test the local model with a plain completion request (no retrieval involved).
poem_prompt = "Hello! Can you tell me a poem about cats and dogs?"
response = llm.complete(poem_prompt)
print(response.text)


llama_print_timings:        load time =   11344.40 ms
llama_print_timings:      sample time =      16.97 ms /    91 runs   (    0.19 ms per token,  5362.09 tokens per second)
llama_print_timings: prompt eval time =   11344.20 ms /    79 tokens (  143.60 ms per token,     6.96 tokens per second)
llama_print_timings:        eval time =   13849.03 ms /    90 runs   (  153.88 ms per token,     6.50 tokens per second)
llama_print_timings:       total time =   25425.15 ms /   169 tokens


  Of course! Here is a short poem about cats and dogs:
Cats and dogs, so furry and sweet,
Both bring joy to our hearts, they can't be beat.
With their playful pounces and wagging tails,
They brighten up our days with their playful tales.

I hope you enjoy this poem! Let me know if you have any other questions.


In [9]:
# Optional streaming variant (disabled): uncomment to print the completion
# incrementally, one response.delta at a time, as the iterator yields it.
# response_iter = llm.stream_complete("Can you write me a poem about fast cars?")
# for response in response_iter:
#     print(response.delta, end="", flush=True)

In [10]:
from llama_index.core import set_global_tokenizer
from transformers import AutoTokenizer

# Register the Llama-2 chat tokenizer's encode function as llama-index's global tokenizer.
llama2_tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")
set_global_tokenizer(llama2_tokenizer.encode)

In [11]:
# use Huggingface embeddings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model handed to the vector index when it is built further down.
embedding_model_name = "BAAI/bge-small-en-v1.5"
embed_model = HuggingFaceEmbedding(model_name=embedding_model_name)

In [12]:
# Read the contents of the ./doc directory (relative to the notebook's working directory).
doc_reader = SimpleDirectoryReader("./doc")
documents = doc_reader.load_data()

In [14]:
# Build the vector store index from the loaded documents, using the Hugging Face embedding model.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
)

In [15]:
# Expose the index as a query engine that is given the local LlamaCPP model as its LLM.
query_engine = index.as_query_engine(
    llm=llm,
)

In [None]:
# Ask a question against the indexed documents and show the engine's answer.
question = "What did the author do growing up?"
response = query_engine.query(question)
print(response)

Llama.generate: prefix-match hit
