# Setup LLaMa

In [1]:
import os
from llama_cpp import Llama, CreateChatCompletionResponse
from llm_models import NexusRavenV2Models 
from functions.get_learning_tree import GetLearningTree

# Absolute path to the quantised NexusRaven-V2 (Q5_K_M) model file; every
# Llama instance in this notebook is loaded from this path.
MODEL_PATH = os.path.abspath(NexusRavenV2Models.Q5_K_M.value)

# Primary model instance used by the chat-completion cells.
llm = Llama(
    verbose=True,  # keep llama.cpp loader / timing logs visible in the cell output
    n_gpu_layers=-1,  # GPU offload is ENABLED here (-1 = all layers per llama-cpp-python docs); remove or set to 0 for CPU only
    model_path=MODEL_PATH,
    # Optional settings, currently disabled:
    # seed=1337, # Uncomment to set a specific seed
    # n_ctx=2048, # Uncomment to increase the context window
)

llama_model_loader: loaded meta data with 21 key-value pairs and 363 tensors from c:\Git\llama\app\llm\models\nexusraven-v2-13b.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = nexusflow_nexusraven-v2-13b
llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:    

## Chat Completions

In [3]:
# Ask the base model a simple arithmetic question: a system prompt sets the
# persona and a single user turn carries the question.
output = llm.create_chat_completion(
    messages=[
        {
            "role": "system",
            "content": "You are a math tutor",
        },
        {"role": "user", "content": "What is 11 times 11 plus 15?"},
    ],
)

# Show the raw completion response (id, choices, token usage) as the cell result.
output

Llama.generate: 11 prefix-match hit, remaining 28 prompt tokens to eval

llama_print_timings:        load time =   23039.91 ms
llama_print_timings:      sample time =       3.10 ms /   111 runs   (    0.03 ms per token, 35794.90 tokens per second)
llama_print_timings: prompt eval time =     732.51 ms /    28 tokens (   26.16 ms per token,    38.22 tokens per second)
llama_print_timings:        eval time =   37062.79 ms /   110 runs   (  336.93 ms per token,     2.97 tokens per second)
llama_print_timings:       total time =   37846.34 ms /   138 tokens


{'id': 'chatcmpl-19ff7481-b84e-4072-8767-05b351aa81ba',
 'object': 'chat.completion',
 'created': 1726562052,
 'model': 'c:\\Git\\llama\\app\\llm\\models\\nexusraven-v2-13b.Q5_K_M.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': '  To answer the question "What is 11 times 11 plus 15?", we need to follow these steps:\n\n1. Multiply 11 by 11: 11 x 11 = 121\n2. Add 15 to the result: 121 + 15 = 136\n\nTherefore, the answer to the question "What is 11 times 11 plus 15?" is 136.'},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 39, 'completion_tokens': 110, 'total_tokens': 149}}

## Embeddings

In [2]:
# Load a separate instance of the same model file with embedding mode switched on.
embedding_llm = Llama(
    embedding=True,
    model_path=MODEL_PATH,
)

# Embed a single short string.
embedding = embedding_llm.create_embedding(input="Hello World!")

# Display the full response dict (the vector itself is under data[0]["embedding"]).
embedding

llama_model_loader: loaded meta data with 21 key-value pairs and 363 tensors from c:\Git\llama\app\llm\models\nexusraven-v2-13b.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = nexusflow_nexusraven-v2-13b
llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:    

{'object': 'list',
 'data': [{'object': 'embedding',
   'embedding': [[-0.03206310048699379,
     0.2818916141986847,
     0.21949860453605652,
     0.8874706625938416,
     -0.36668986082077026,
     0.4786068797111511,
     -0.5725802779197693,
     -0.6475871801376343,
     0.2859399616718292,
     0.21231280267238617,
     0.008447348140180111,
     -0.39260438084602356,
     -0.15645398199558258,
     -0.2820327579975128,
     0.1457994282245636,
     -0.49535924196243286,
     0.1383007913827896,
     0.10314445942640305,
     -0.010698569938540459,
     0.24910438060760498,
     0.76813143491745,
     0.21038569509983063,
     0.39407259225845337,
     -0.26895248889923096,
     0.16150037944316864,
     0.1330651342868805,
     -0.34232842922210693,
     -0.2868497371673584,
     -0.3523007035255432,
     0.07485605031251907,
     0.7450352311134338,
     0.016849366948008537,
     -0.5075965523719788,
     0.25262734293937683,
     -0.7731997966766357,
     0.3107920289039612,

## Function Calling

In [3]:
# Dedicated model instance configured with the "chatml-function-calling" chat
# format, so tool definitions are handled by that format's handler.
function_llm = Llama(model_path=MODEL_PATH, chat_format="chatml-function-calling")

# Fix: send the request through `function_llm`. Previously this called
# `llm.create_chat_completion`, which left the function-calling instance loaded
# in memory but never used.
output: CreateChatCompletionResponse = function_llm.create_chat_completion(
    messages = [
        {
            "role": "system",
            "content": "BCBA Expert assistant that can analyse learning trees."
        },
        {
            "role": "user",
            "content": "Analyse a learning tree with the validation key: A1"
        }
    ],
    # Expose a single tool and force the model to call it by name.
    tools = [GetLearningTree().tool],
    tool_choice = {
        "type": "function",
        "function": {
            "name": "get_learning_tree"
        }
    }
)


# Print the function name and JSON-encoded arguments the model produced.
print(output['choices'][0]["message"]["function_call"])

llama_model_loader: loaded meta data with 21 key-value pairs and 363 tensors from c:\Git\llama\app\llm\models\nexusraven-v2-13b.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = nexusflow_nexusraven-v2-13b
llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:    

{'name': 'get_learning_tree', 'arguments': '{"validation_key": "A1"}'}


In [3]:
import torch

# Sanity check of the PyTorch CUDA runtime: report whether a CUDA device is
# visible and, if so, how many devices there are and which one is current.
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    print("CUDA is available!")
    print(f"CUDA Device Count: {torch.cuda.device_count()}")
    print(f"Current CUDA Device: {torch.cuda.current_device()}")
    print(f"CUDA Device Name: {torch.cuda.get_device_name(torch.cuda.current_device())}")

CUDA is available!
CUDA Device Count: 1
Current CUDA Device: 0
CUDA Device Name: NVIDIA GeForce RTX 4060 Laptop GPU
