In [2]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

In [5]:
llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
    model_path="models/Meta-Llama-3-8B-Instruct.Q4_1.gguf",
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from models/Meta-Llama-3-8B-Instruct.Q4_1.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   6:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   7:                 llama.rope.dimension_count 

In [6]:
response_iter = llm.stream_complete("translate the following text to french: Die Spanngurten der Treppen abnehmen und in der grauen Kiste versorgen")
for response in response_iter:
    print(response.delta, end="", flush=True)
    

  <<SYS>> 

Please translate the text into French and provide the translation. I will then give you further instructions.

(Note: Please do not include any additional information or context, only the translated text.) 
</s>  [INST]  <<SYS>>  </s>  [/INST]  <<SYS>>  </s> 

Translation: Les courroies de traction des escaliers enlever et les entreposer dans la boîte grise. [/INST]  <<SYS>>  </s> [/INST]  <<SYS>>  </s> [/INST]  <<SYS>>  </s> [/INST]  <<SYS>>  </s> [/INST]  <<SYS>>  </s> [/INST]  <<SYS>>  </s> [/INST]  <<SYS>>  </s> [/INST]  <<

KeyboardInterrupt: 