# Background

This node will run some examples of using llama cpp python bind and provide samples

# Install python binding example

In [1]:
!CMAKE_ARGS="-DGGML_CUDA=on" 


In [2]:
pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.1.tar.gz (63.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25l- \ done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | 

# High level API

In [3]:
from llama_cpp import Llama

llm = Llama(
      model_path="/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf",
      # n_gpu_layers=-1, # Uncomment to use GPU acceleration
      # seed=1337, # Uncomment to set a specific seed
      # n_ctx=2048, # Uncomment to increase the context window
)
output = llm(
      "Q: Name the planets in the solar system? A: ", # Prompt
      max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
      stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
      echo=True # Echo the prompt back in the output
) # Generate a completion, can also call create_completion
print(output)

llama_model_loader: loaded meta data with 33 key-value pairs and 218 tensors from /kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Cosmo2 1.7B Webinst Sc2
llama_model_loader: - kv   3:                       general.organization str              = HuggingFaceTB
llama_model_loader: - kv   4:                           general.finetune str              = webinst-sc2
llama_model_loader: - kv   5:                           general.basename str              = cosmo2
llama_model_loader: - kv   6:                         general.size_label str              

{'id': 'cmpl-e9040afe-f13c-47b4-8fb2-a8b050d448c8', 'object': 'text_completion', 'created': 1727883013, 'model': '/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf', 'choices': [{'text': 'Q: Name the planets in the solar system? A: ', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 13, 'completion_tokens': 1, 'total_tokens': 14}}


# Pulling models from Hugging Face Hub

Llama cpp python could pull image from hugging face.  For example


In [4]:
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)

qwen2-0_5b-instruct-q8_0.gguf:   0%|          | 0.00/531M [00:00<?, ?B/s]

# Chat Completion

The high-level API also provides a simple interface for chat completion.

Chat completion requires that the model knows how to format the messages into a single prompt. The Llama class does this using pre-registered chat formats (ie. chatml, llama-2, gemma, etc) or by providing a custom chat handler object.

In [5]:
from llama_cpp import Llama
llm = Llama(
      model_path="/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf",
      chat_format="llama-2"
)
llm.create_chat_completion(
      messages = [
          {"role": "system", "content": "You are an assistant who perfectly describes images."},
          {
              "role": "user",
              "content": "Describe this image in detail please."
          }
      ]
)

llama_model_loader: loaded meta data with 33 key-value pairs and 218 tensors from /kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Cosmo2 1.7B Webinst Sc2
llama_model_loader: - kv   3:                       general.organization str              = HuggingFaceTB
llama_model_loader: - kv   4:                           general.finetune str              = webinst-sc2
llama_model_loader: - kv   5:                           general.basename str              = cosmo2
llama_model_loader: - kv   6:                         general.size_label str              

{'id': 'chatcmpl-ffdc6882-5d0e-439e-9bb5-59398fdad094',
 'object': 'chat.completion',
 'created': 1727883020,
 'model': '/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant', 'content': ''},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 34, 'completion_tokens': 0, 'total_tokens': 34}}

# JSON and JSON Schema Mode

To constrain chat responses to only valid JSON or a specific JSON Schema use the response_format argument in create_chat_completion.

## JSON Mode

The following example will constrain the response to valid JSON strings only.

In [6]:
from llama_cpp import Llama
llm = Llama(model_path="/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf", chat_format="chatml")
llm.create_chat_completion(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that outputs in JSON.",
        },
        {"role": "user", "content": "Who won the world series in 2020"},
    ],
    response_format={
        "type": "json_object",
    },
    temperature=0.7,
)

llama_model_loader: loaded meta data with 33 key-value pairs and 218 tensors from /kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Cosmo2 1.7B Webinst Sc2
llama_model_loader: - kv   3:                       general.organization str              = HuggingFaceTB
llama_model_loader: - kv   4:                           general.finetune str              = webinst-sc2
llama_model_loader: - kv   5:                           general.basename str              = cosmo2
llama_model_loader: - kv   6:                         general.size_label str              

{'id': 'chatcmpl-e71e225a-ffcf-40dc-8c9f-6b556080fbeb',
 'object': 'chat.completion',
 'created': 1727883022,
 'model': '/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant', 'content': '{}'},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 35, 'completion_tokens': 1, 'total_tokens': 36}}

## Json schema mode
To constrain the response further to a specific JSON Schema add the schema to the schema property of the response_format argument.

In [7]:
from llama_cpp import Llama
llm = Llama(model_path="/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf", chat_format="chatml")
llm.create_chat_completion(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that outputs in JSON.",
        },
        {"role": "user", "content": "Who won the world series in 2020"},
    ],
    response_format={
        "type": "json_object",
        "schema": {
            "type": "object",
            "properties": {"team_name": {"type": "string"}},
            "required": ["team_name"],
        },
    },
    temperature=0.7,
)

llama_model_loader: loaded meta data with 33 key-value pairs and 218 tensors from /kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Cosmo2 1.7B Webinst Sc2
llama_model_loader: - kv   3:                       general.organization str              = HuggingFaceTB
llama_model_loader: - kv   4:                           general.finetune str              = webinst-sc2
llama_model_loader: - kv   5:                           general.basename str              = cosmo2
llama_model_loader: - kv   6:                         general.size_label str              

{'id': 'chatcmpl-f7025a69-782f-4299-bd96-dc4f8a8d1693',
 'object': 'chat.completion',
 'created': 1727883024,
 'model': '/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': '{"team_name": "Los Angeles Dodgers (NL) vs. New York Mets (AL) 2020 World Series"}'},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 35, 'completion_tokens': 31, 'total_tokens': 66}}

# Function Calling

The high-level API supports OpenAI compatible function and tool calling. This is possible through the functionary pre-trained models chat format or through the generic chatml-function-calling chat format.

In [8]:
from llama_cpp import Llama
llm = Llama(model_path="/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf", chat_format="chatml-function-calling")
llm.create_chat_completion(
      messages = [
        {
          "role": "system",
          "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"

        },
        {
          "role": "user",
          "content": "Extract Jason is 25 years old"
        }
      ],
      tools=[{
        "type": "function",
        "function": {
          "name": "UserDetail",
          "parameters": {
            "type": "object",
            "title": "UserDetail",
            "properties": {
              "name": {
                "title": "Name",
                "type": "string"
              },
              "age": {
                "title": "Age",
                "type": "integer"
              }
            },
            "required": [ "name", "age" ]
          }
        }
      }],
      tool_choice={
        "type": "function",
        "function": {
          "name": "UserDetail"
        }
      }
)

llama_model_loader: loaded meta data with 33 key-value pairs and 218 tensors from /kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Cosmo2 1.7B Webinst Sc2
llama_model_loader: - kv   3:                       general.organization str              = HuggingFaceTB
llama_model_loader: - kv   4:                           general.finetune str              = webinst-sc2
llama_model_loader: - kv   5:                           general.basename str              = cosmo2
llama_model_loader: - kv   6:                         general.size_label str              

{'id': 'chatcmpl-5ca2e2b7-70c9-4c41-a9a7-3c6aaca331ce',
 'object': 'chat.completion',
 'created': 1727883032,
 'model': '/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': None,
    'function_call': {'name': 'UserDetail',
     'arguments': '{ "name": "Jason", "age": 25 }'},
    'tool_calls': [{'id': 'call__0_UserDetail_cmpl-5ca2e2b7-70c9-4c41-a9a7-3c6aaca331ce',
      'type': 'function',
      'function': {'name': 'UserDetail',
       'arguments': '{ "name": "Jason", "age": 25 }'}}]},
   'logprobs': None,
   'finish_reason': 'tool_calls'}],
 'usage': {'prompt_tokens': 273, 'completion_tokens': 15, 'total_tokens': 288}}

# Speculative Decoding

llama-cpp-python supports speculative decoding which allows the model to generate completions based on a draft model.

The fastest way to use speculative decoding is through the LlamaPromptLookupDecoding class.

Just pass this as a draft model to the Llama class during initialization.

In [9]:
from llama_cpp import Llama
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding

llama = Llama(
    model_path="/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf",
    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines.
)

llama_model_loader: loaded meta data with 33 key-value pairs and 218 tensors from /kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Cosmo2 1.7B Webinst Sc2
llama_model_loader: - kv   3:                       general.organization str              = HuggingFaceTB
llama_model_loader: - kv   4:                           general.finetune str              = webinst-sc2
llama_model_loader: - kv   5:                           general.basename str              = cosmo2
llama_model_loader: - kv   6:                         general.size_label str              

# Embeddings

To generate text embeddings use create_embedding or embed. Note that you must pass embedding=True to the constructor upon model creation for these to work properly.

In [10]:
import llama_cpp

llm = llama_cpp.Llama(model_path="/kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf", embedding=True)

embeddings = llm.create_embedding("Hello, world!")

# or create multiple embeddings at once

embeddings = llm.create_embedding(["Hello, world!", "Goodbye, world!"])

llama_model_loader: loaded meta data with 33 key-value pairs and 218 tensors from /kaggle/input/smollm-1.7b-instruct-gguf/gguf/fp16/1/SmolLM-1.7B-Instruct-f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Cosmo2 1.7B Webinst Sc2
llama_model_loader: - kv   3:                       general.organization str              = HuggingFaceTB
llama_model_loader: - kv   4:                           general.finetune str              = webinst-sc2
llama_model_loader: - kv   5:                           general.basename str              = cosmo2
llama_model_loader: - kv   6:                         general.size_label str              