In [7]:
from llama_cpp import Llama

llm = Llama(
      model_path="models/Llama-3.2-1B-Instruct-f16.gguf",
      n_gpu_layers=-1, # Uncomment to use GPU acceleration
      # seed=1337, # Uncomment to set a specific seed
      # n_ctx=2048, # Uncomment to increase the context window
)
output = llm(
      "Q: Name the planets in the solar system? A: ", # Prompt
      max_tokens=None, # Generate up to 32 tokens, set to None to generate up to the end of the context window
      stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
      echo=True # Echo the prompt back in the output
) # Generate a completion, can also call create_completion
print(output)

llama_model_loader: loaded meta data with 31 key-value pairs and 147 tensors from models/Llama-3.2-1B-Instruct-f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 1B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 1B
llama_model_loader: - kv   6:                            general.license str              = llama3.2
llama_model_loader: - kv   7:                 

{'id': 'cmpl-6fce1c0f-09a2-4251-ae73-0ab86725e120', 'object': 'text_completion', 'created': 1744090977, 'model': 'models/Llama-3.2-1B-Instruct-f16.gguf', 'choices': [{'text': 'Q: Name the planets in the solar system? A:  Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune. ', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 14, 'completion_tokens': 19, 'total_tokens': 33}}


In [8]:
print(output["choices"][0]["text"]) # Print the generated text

Q: Name the planets in the solar system? A:  Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune. 


In [9]:
def QuestionAnswering(prompt):
    output = llm(
        prompt,
        max_tokens=None,
        stop=["Q:", "\n"],
        echo=True
    )
    return output["choices"][0]["text"]

In [10]:
print(QuestionAnswering("Q: Can You tell me how to optimize cuda? A: ")) # Example usage of the function

Llama.generate: 3 prefix-match hit, remaining 12 prompt tokens to eval
llama_perf_context_print:        load time =     683.55 ms
llama_perf_context_print: prompt eval time =     210.34 ms /    12 tokens (   17.53 ms per token,    57.05 tokens per second)
llama_perf_context_print:        eval time =    2111.48 ms /    28 runs   (   75.41 ms per token,    13.26 tokens per second)
llama_perf_context_print:       total time =    2369.93 ms /    40 tokens


Q: Can You tell me how to optimize cuda? A:  You can optimize CUDA by optimizing the host code to make the most of the GPU's resources. Here are some general tips for optimizing CUDA code:


### JSON Structured

In [12]:
from llama_cpp import Llama
llm = Llama(model_path="models/Llama-3.2-1B-Instruct-f16.gguf", chat_format="chatml")
response = llm.create_chat_completion(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that outputs in JSON.",
        },
        {"role": "user", "content": "Who won the world series in 2020"},
    ],
    response_format={
        "type": "json_object",
    },
    temperature=0.7,
)

llama_model_loader: loaded meta data with 31 key-value pairs and 147 tensors from models/Llama-3.2-1B-Instruct-f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 1B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 1B
llama_model_loader: - kv   6:                            general.license str              = llama3.2
llama_model_loader: - kv   7:                 

In [15]:
# response['choices'][0]['message']['content'] # Print the generated text in JSON format
print(response['choices'][0]['message']['content']) # Print the generated text in JSON format


{"year": 2020, "world_series": "Los Angeles Dodgers"}



In [16]:
# Example usage of the function
def QuestionAnswering(prompt):
    output = llm.create_chat_completion(
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that outputs in JSON.",
            },
            {"role": "user", "content": prompt},
        ],
        response_format={
            "type": "json_object",
        },
        temperature=0.7,
    )
    return output['choices'][0]['message']['content']
print(QuestionAnswering("Q: Can You tell me how to optimize cuda? A: ")) # Example usage of the function

Llama.generate: 32 prefix-match hit, remaining 26 prompt tokens to eval
llama_perf_context_print:        load time =     847.83 ms
llama_perf_context_print: prompt eval time =     362.06 ms /    26 tokens (   13.93 ms per token,    71.81 tokens per second)
llama_perf_context_print:        eval time =    3292.01 ms /    44 runs   (   74.82 ms per token,    13.37 tokens per second)
llama_perf_context_print:       total time =    5448.82 ms /    70 tokens


{
  "status": "success",
  "data": {
    "optimization": "Optimizing CUDA can be a complex process, but here are some general tips to help you get started."
  }
}



### Function calling with Llama 3.2 1B

In [18]:
from llama_cpp import Llama
llm = Llama(model_path="models/Llama-3.2-1B-Instruct-f16.gguf", chat_format="chatml-function-calling")
response = llm.create_chat_completion(
      messages = [
        {
          "role": "system",
          "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"

        },
        {
          "role": "user",
          "content": "Extract Jason is 25 years old"
        }
      ],
      tools=[{
        "type": "function",
        "function": {
          "name": "UserDetail",
          "parameters": {
            "type": "object",
            "title": "UserDetail",
            "properties": {
              "name": {
                "title": "Name",
                "type": "string"
              },
              "age": {
                "title": "Age",
                "type": "integer"
              }
            },
            "required": [ "name", "age" ]
          }
        }
      }],
      tool_choice={
        "type": "function",
        "function": {
          "name": "UserDetail"
        }
      }
)
print(response['choices'][0]['message']['function_call']) # Print the function call in JSON format
print(response['choices'][0]['message']['function_call']['name']) # Print the function name
print(response['choices'][0]['message']['function_call']['arguments']) # Print the function arguments in JSON format

llama_model_loader: loaded meta data with 31 key-value pairs and 147 tensors from models/Llama-3.2-1B-Instruct-f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 1B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 1B
llama_model_loader: - kv   6:                            general.license str              = llama3.2
llama_model_loader: - kv   7:                 

{'name': 'UserDetail', 'arguments': '{"name": "Jason", "age": 25}'}
UserDetail
{"name": "Jason", "age": 25}


In [19]:
def FunctionCalling(prompt):
    output = llm.create_chat_completion(
        messages=[
            {
                "role": "system",
                "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        tools=[{
            "type": "function",
            "function": {
                "name": "UserDetail",
                "parameters": {
                    "type": "object",
                    "title": "UserDetail",
                    "properties": {
                        "name": {
                            "title": "Name",
                            "type": "string"
                        },
                        "age": {
                            "title": "Age",
                            "type": "integer"
                        }
                    },
                    "required": ["name", "age"]
                }
            }
        }],
        tool_choice={
            "type": "function",
            "function": {
                "name": "UserDetail"
            }
        }
    )
    return output['choices'][0]['message']['function_call']

In [20]:
print(FunctionCalling("Extract Jason is 25 years old")) # Example usage of the function
# Print the function call in JSON format

Llama.generate: 267 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =    3022.24 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    1069.05 ms /    13 runs   (   82.23 ms per token,    12.16 tokens per second)
llama_perf_context_print:       total time =    1410.29 ms /    14 tokens


{'name': 'UserDetail', 'arguments': '{"name": "Jason", "age": 25}'}


### Function-Calling calculator

In [51]:
def calculate_math_query(prompt):
    import json
    import re
    from sympy import sympify, SympifyError

    # Define a new tool schema for mathematical calculations with improved system prompt
    output = llm.create_chat_completion(
        messages=[
            {
                "role": "system",
                "content": "You are a mathematical assistant that extracts mathematical expressions from text queries. Extract ONLY the raw mathematical expression without any additional text or formatting. For example, from 'calculate 5*6', extract only '5*6'."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        tools=[{
            "type": "function",
            "function": {
                "name": "Calculate",
                "parameters": {
                    "type": "object",
                    "title": "Calculate",
                    "properties": {
                        "expression": {
                            "title": "Expression",
                            "type": "string",
                            "description": "The raw mathematical expression to evaluate (e.g., '5*6'). Include only numbers and operators."
                        }
                    },
                    "required": ["expression"]
                }
            }
        }],
        tool_choice={
            "type": "function",
            "function": {
                "name": "Calculate"
            }
        }
    )

    try:
        # Extract the expression from the function call
        function_call = output['choices'][0]['message']['function_call']
        arguments_str = function_call.get('arguments', '')

        # Debug: print the raw arguments string
        print("Raw arguments string:", arguments_str)

        try:
            # First attempt - normal JSON parsing
            arguments = json.loads(arguments_str)
            expression = arguments['expression']
        except json.JSONDecodeError:
            # Fallback: use regex with DOTALL flag to extract the expression value
            match = re.search(r'"expression"\s*:\s*"([^"]+)"', arguments_str, re.DOTALL)
            if match:
                expression = match.group(1)
            else:
                return "Failed to extract expression from model response"

        # Clean the expression - remove any non-math characters
        # Add '°' to the allowed list if you want to handle or remove it manually
        cleaned_expression = re.sub(r'[^0-9\+\-\*\/\(\)\.\s\^⋅°]', '', expression)

        # Optionally remove all degree symbols
        cleaned_expression = cleaned_expression.replace('°', '')

        # If nothing meaningful remains after cleaning, report an error
        if not re.search(r'[\d]', cleaned_expression):
            return "Could not extract a valid mathematical expression"

        # Safely evaluate the expression
        try:
            result = float(sympify(cleaned_expression))
            return result
        except (SympifyError, ValueError):
            # Fall back to eval with additional safeguards
            if any(keyword in cleaned_expression for keyword in ['__', 'import', 'eval', 'exec', 'compile']):
                return "Expression contains unsafe elements"
            result = eval(cleaned_expression)
            return result
    except Exception as e:
        return f"Unexpected error: {str(e)}"

In [52]:
# Example usage
# print(calculate_math_query("calculate 5*6"))  # Output: 30
print(calculate_math_query("what is 10+5"))   # Output: 15
# print(calculate_math_query("compute 20/4"))   # Output: 5.0

Llama.generate: 290 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =    3022.24 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   17533.85 ms /   221 runs   (   79.34 ms per token,    12.60 tokens per second)
llama_perf_context_print:       total time =   24038.83 ms /   222 tokens


Raw arguments string: {"expression": "10+5\u00b0\u00b2\u00b0\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00b2\u00b9\u00b0\u00
Failed to extract expression from model response
