## Completions

[OpenAI API Reference](https://platform.openai.com/docs/api-reference/completions)


In [None]:
# Completion
import openai

openai.api_base = "http://localhost:8091/v1"
openai.api_key = ""
prompt = "Tell me something funny about llamas."

response = openai.Completion.create(
    model="llamacpp",
    prompt=prompt,
    temperature=1.31,
    max_tokens=8192,
    top_p=1.0,
    frequency_penalty=0,
    presence_penalty=0,
    stream=False,
)
message = response.choices[0].text.strip()
print(message)

## Chat Completion

[OpenAI API Reference](https://platform.openai.com/docs/api-reference/chat)


In [None]:
import openai

openai.api_base = "http://localhost:8091/v1"
openai.api_key = ""
prompt = "What is the capital of Ohio?"
messages = [{"role": "system", "content": prompt}]

response = openai.ChatCompletion.create(
    model="Local-LLM",
    messages=messages,
    temperature=1.31,
    max_tokens=8192,
    top_p=1.0,
    n=1,
    stream=False,
)
print(response)


## Embeddings

[OpenAI API Reference](https://platform.openai.com/docs/api-reference/embeddings)

The embeddings endpoint it currently uses is an ONNX embedder with 256 max tokens.


In [None]:
import openai

openai.api_base = "http://localhost:8091/v1"
openai.api_key = ""
prompt = "Tell me something funny about llamas."

response = openai.Embedding.create(
    input=prompt,
    model="Local-LLM",
)

print(response)

In [8]:
from llama_cpp import Llama
from GetModel import get_model


class LlamaProvider:
    def __init__(
        self,
        stop: str = "<|im_end|>",
        max_tokens: int = 8192,
        temperature: float = 0.8,
        top_p: float = 0.95,
        stream: bool = False,
        presence_penalty: float = 0.0,
        frequency_penalty: float = 0.0,
        logit_bias: list = [],
        model: str = "TheBloke/Mistral-7B-OpenOrca-GGUF",
        **kwargs,
    ):
        self.params = {}
        self.model_path = get_model(model_url=model)

        self.max_tokens = max_tokens if max_tokens else 8192
        self.params["n_ctx"] = self.max_tokens
        if stop:
            self.params["stop"] = stop
        if temperature:
            self.params["temperature"] = temperature
        if top_p:
            self.params["top_p"] = top_p
        if stream:
            self.params["stream"] = stream
        if presence_penalty:
            self.params["presence_penalty"] = presence_penalty
        if frequency_penalty:
            self.params["frequency_penalty"] = frequency_penalty
        if logit_bias:
            self.params["logit_bias"] = logit_bias

    def instruct(self, prompt, tokens: int = 0):
        self.params["n_predict"] = int(self.max_tokens) - tokens
        llm = Llama(**self.params, model_path=self.model_path)
        prompt_template = """
  <|im_start|>user

  {prompt}<|im_end|>

  <|im_start|>assistant
"""
        data = llm(prompt=prompt_template.format(prompt=prompt), stream=False)
        messages = [{"role": "user", "content": prompt}]
        message = data["choices"][0]["text"]
        if message.startswith("\n "):
            message = message[3:]
        if message.endswith("\n\n  "):
            message = message[:-4]
        if self.params["stop"] in message:
            message = message.split(self.params["stop"])[0]
        messages.append({"role": "assistant", "content": message})
        data["messages"] = messages
        return data


provider = LlamaProvider(max_tokens=8192, top_p=1.0, temperature=1.31)
provider.instruct(prompt="What is the capital of Ohio?")


llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from models/mistral-7b-openorca.Q4_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32002,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q6_K     [ 14336,  4096,    

{'id': 'cmpl-ffaacda6-eb79-4809-92ab-3e72591aaf65',
 'object': 'text_completion',
 'created': 1696381408,
 'model': 'models/mistral-7b-openorca.Q4_K_M.gguf',
 'choices': [{'text': "\n  The capital of Ohio is Columbus.\n\n  <|im_end|>\n\n---\n\nThe assistant replied that the capital of Ohio is Columbus, which correctly answers the user's question.",
   'index': 0,
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 42, 'completion_tokens': 46, 'total_tokens': 88},
 'messages': [{'role': 'user', 'content': 'What is the capital of Ohio?'},
  {'role': 'assistant',
   'content': "The capital of Ohio is Columbus.\n\n  <|im_end|>\n\n---\n\nThe assistant replied that the capital of Ohio is Columbus, which correctly answers the user's question."}]}

In [None]:
from llama_cpp import Llama

llm = Llama(model_path="./models/mistral-7b-openorca.Q4_K_M.gguf")
prompt_template = """
  <|im_start|>user

  {prompt}<|im_end|>

  <|im_start|>assistant
"""
prompt = "What is the capital of Ohio?"

output = llm(
    stop="<|im_end|>", prompt=prompt_template.format(prompt=prompt), max_tokens=8192
)
messages = [{"role": "user", "content": prompt}]
message = output["choices"][0]["text"]
if message.startswith("\n "):
    message = message[3:]
if message.endswith("\n\n  "):
    message = message[:-4]
messages.append({"role": "assistant", "content": message})
print(message)
print(messages)
