# Setup LLaMa

In [1]:
import os
from llama_cpp import Llama, CreateChatCompletionResponse 
from llm_models import NexusRavenV2Models
from functions.get_learning_tree import GetLearningTree

# Absolute path of the model file referenced by NexusRavenV2Models.Q5_K_M.
MODEL_PATH = os.path.abspath(NexusRavenV2Models.Q5_K_M.value)

# Model instance used for the plain chat-completion example.
# n_gpu_layers=-1 is already active here; per the llama-cpp-python docs it
# requests offloading all layers to the GPU (presumably only effective on a
# GPU-enabled build -- TODO confirm for this environment).
# Optional constructor arguments that are currently left off:
#   seed=1337   -> set a specific seed
#   n_ctx=2048  -> increase the context window
llm = Llama(model_path=MODEL_PATH, n_gpu_layers=-1)

llama_model_loader: loaded meta data with 21 key-value pairs and 363 tensors from c:\Git\llama\models\nexusraven-v2-13b.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = nexusflow_nexusraven-v2-13b
llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:            

models/nexusraven-v2-13b.Q5_K_M.gguf


llm_load_tensors:        CPU buffer size =  8801.81 MiB
....................................................................................................
llama_new_context_with_model: n_ctx      = 512
llama_new_context_with_model: n_batch    = 512
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base  = 1000000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:        CPU KV buffer size =   400.00 MiB
llama_new_context_with_model: KV self size  =  400.00 MiB, K (f16):  200.00 MiB, V (f16):  200.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     0.12 MiB
llama_new_context_with_model:        CPU compute buffer size =    85.01 MiB
llama_new_context_with_model: graph nodes  = 1286
llama_new_context_with_model: graph splits = 1
AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 

## Chat Completions

In [2]:
# Conversation for a plain (no tools) chat completion: a system persona
# followed by a single user question.
system_message = {"role": "system", "content": "You are an expert in BCBA"}
user_message = {
    "role": "user",
    "content": "What do you know about BCBA and ABA therapy?",
}

output = llm.create_chat_completion(messages=[system_message, user_message])

# Bare last expression so the notebook displays the response.
output


llama_print_timings:        load time =   27624.60 ms
llama_print_timings:      sample time =      28.72 ms /   402 runs   (    0.07 ms per token, 13996.24 tokens per second)
llama_print_timings: prompt eval time =   27624.01 ms /    39 tokens (  708.31 ms per token,     1.41 tokens per second)
llama_print_timings:        eval time =  262746.44 ms /   401 runs   (  655.23 ms per token,     1.53 tokens per second)
llama_print_timings:       total time =  291162.92 ms /   440 tokens


{'id': 'chatcmpl-16a8b75b-527a-480e-8833-42c1461aba4d',
 'object': 'chat.completion',
 'created': 1721728734,
 'model': 'C:\\Git\\llama\\models\\nexusraven-v2-13b.Q5_K_M.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': '  As an expert in BCBA, I can provide information about the benefits and techniques of BCBA therapy, as well as its potential applications in various fields.\n\nBCBA stands for Behavioral Cognitive Behavioral Analysis, which is a type of psychotherapy that focuses on the relationship between thoughts, feelings, and behaviors. BCBA therapy is based on the principles of cognitive-behavioral therapy, which aims to help individuals identify and change negative thought patterns and behaviors that contribute to their problems.\n\nBCBA therapy typically involves a series of sessions with a trained therapist, who will work with the individual to identify and address their specific issues and goals. The therapist will use a variety of technique

## Embeddings

In [3]:
# Separate Llama instance constructed with embedding=True, loaded from the
# same model file as the chat instance.
embedding_llm = Llama(embedding=True, model_path=MODEL_PATH)

# Embed a single sample string.
embedding = embedding_llm.create_embedding(input="Hello World!")

# Bare last expression so the notebook displays the embedding response.
embedding

llama_model_loader: loaded meta data with 21 key-value pairs and 363 tensors from C:\Git\llama\models\nexusraven-v2-13b.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = nexusflow_nexusraven-v2-13b
llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:            

{'object': 'list',
 'data': [{'object': 'embedding',
   'embedding': [[-0.027484826743602753,
     0.28018537163734436,
     0.22065326571464539,
     0.8866831660270691,
     -0.36529475450515747,
     0.47626587748527527,
     -0.5692007541656494,
     -0.6453907489776611,
     0.28576958179473877,
     0.20981289446353912,
     0.0053338962607085705,
     -0.3947833478450775,
     -0.16204877197742462,
     -0.28097549080848694,
     0.14736998081207275,
     -0.49869948625564575,
     0.13858157396316528,
     0.10234776884317398,
     -0.014376325532793999,
     0.25169500708580017,
     0.7646609544754028,
     0.20989230275154114,
     0.3952489197254181,
     -0.26709192991256714,
     0.1658705472946167,
     0.1314999759197235,
     -0.3426644802093506,
     -0.2865840494632721,
     -0.34854093194007874,
     0.07767003774642944,
     0.7436184287071228,
     0.0169207826256752,
     -0.505123496055603,
     0.25218936800956726,
     -0.7729746103286743,
     0.3088214099407

## Function Calling

In [2]:
# Dedicated Llama instance configured with the "chatml-function-calling" chat
# format, which is the one meant to handle the `tools` / `tool_choice`
# arguments passed below.
function_llm = Llama(model_path=MODEL_PATH, chat_format="chatml-function-calling")

# Fix: send the request through `function_llm`. The previous version called
# `llm.create_chat_completion`, which left `function_llm` unused and routed
# the tool request through the plain chat instance instead.
output: CreateChatCompletionResponse = function_llm.create_chat_completion(
    messages = [
        {
            "role": "system",
            "content": "BCBA Expert assistant that can analyse learning trees."
        },
        {
            "role": "user",
            "content": "Analyse a learning tree with the validation key: A1"
        }
    ],
    # Tool definition supplied by the project's GetLearningTree helper
    # (assumed to be an OpenAI-style function schema -- TODO confirm).
    tools = [GetLearningTree().tool],
    # Names the single function the completion is asked to call.
    tool_choice = {
        "type": "function",
        "function": {
            "name": "get_learning_tree"
        }
    }
)


# Print the function name and JSON-encoded arguments chosen by the model.
print(output['choices'][0]["message"]["function_call"])

llama_model_loader: loaded meta data with 21 key-value pairs and 363 tensors from c:\Git\llama\models\nexusraven-v2-13b.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = nexusflow_nexusraven-v2-13b
llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:            

{'name': 'get_learning_tree', 'arguments': '{"validation_key": "A1"}'}
