## Completions

[OpenAI API Reference](https://platform.openai.com/docs/api-reference/completions)


In [None]:
# Completion
import openai

openai.api_base = "http://localhost:8091/v1"
openai.api_key = ""
prompt = "Tell me something funny about llamas."

response = openai.Completion.create(
    model="llamacpp",
    prompt=prompt,
    temperature=1.31,
    max_tokens=8192,
    top_p=1.0,
    frequency_penalty=0,
    presence_penalty=0,
    stream=False,
)
message = response.choices[0].text.strip()
print(message)

## Chat Completion

[OpenAI API Reference](https://platform.openai.com/docs/api-reference/chat)


In [None]:
import openai

openai.api_base = "http://localhost:8091/v1"
openai.api_key = ""
prompt = "What is the capital of Ohio?"
messages = [{"role": "system", "content": prompt}]

response = openai.ChatCompletion.create(
    model="Local-LLM",
    messages=messages,
    temperature=1.31,
    max_tokens=8192,
    top_p=1.0,
    n=1,
    stream=False,
)
print(response)


## Embeddings

[OpenAI API Reference](https://platform.openai.com/docs/api-reference/embeddings)

The embeddings endpoint it currently uses is an ONNX embedder with 256 max tokens.


In [None]:
import openai

openai.api_base = "http://localhost:8091/v1"
openai.api_key = ""
prompt = "Tell me something funny about llamas."

response = openai.Embedding.create(
    input=prompt,
    model="Local-LLM",
)

print(response)

In [3]:
import requests


# Instructions for setting up llama.cpp server:
# https://github.com/ggerganov/llama.cpp/tree/master/examples/server#llamacppexampleserver
class LlamaProvider:
    def __init__(
        self,
        STOP_SEQUENCE: str = "</s>",
        MAX_TOKENS: int = 8192,
        AI_TEMPERATURE: float = 0.8,
        AI_TOP_P: float = 0.95,
        AI_TOP_K: int = 40,
        TFS_Z: float = 1.0,
        TYPICAL_P: float = 1.0,
        REPEAT_PENALTY: float = 1.1,
        REPEAT_LAST_N: int = 64,
        PENALIZE_NL: bool = True,
        PRESENCE_PENALTY: float = 0.0,
        FREQUENCY_PENALTY: float = 0.0,
        MIROSTAT: int = 0,
        MIROSTAT_TAU: float = 5.0,
        MIROSTAT_ETA: float = 0.1,
        IGNORE_EOS: bool = False,
        LOGIT_BIAS: list = [],
        **kwargs,
    ):
        self.params = {}
        self.MAX_TOKENS = MAX_TOKENS if MAX_TOKENS else 8192
        if STOP_SEQUENCE:
            self.params["stop"] = STOP_SEQUENCE
        if AI_TEMPERATURE:
            self.params["temperature"] = AI_TEMPERATURE
        if AI_TOP_P:
            self.params["top_p"] = AI_TOP_P
        if AI_TOP_K:
            self.params["top_k"] = AI_TOP_K
        if TFS_Z:
            self.params["tfs_z"] = TFS_Z
        if TYPICAL_P:
            self.params["typical_p"] = TYPICAL_P
        if REPEAT_PENALTY:
            self.params["repeat_penalty"] = REPEAT_PENALTY
        if REPEAT_LAST_N:
            self.params["repeat_last_n"] = REPEAT_LAST_N
        if PENALIZE_NL:
            self.params["penalize_nl"] = PENALIZE_NL
        if PRESENCE_PENALTY:
            self.params["presence_penalty"] = PRESENCE_PENALTY
        if FREQUENCY_PENALTY:
            self.params["frequency_penalty"] = FREQUENCY_PENALTY
        if MIROSTAT:
            self.params["mirostat"] = MIROSTAT
        if MIROSTAT_TAU:
            self.params["mirostat_tau"] = MIROSTAT_TAU
        if MIROSTAT_ETA:
            self.params["mirostat_eta"] = MIROSTAT_ETA
        if IGNORE_EOS:
            self.params["ignore_eos"] = IGNORE_EOS
        if LOGIT_BIAS:
            self.params["logit_bias"] = LOGIT_BIAS

    def instruct(self, prompt, tokens: int = 0):
        self.params["prompt"] = prompt
        self.params["n_predict"] = int(self.MAX_TOKENS) - tokens
        self.params["stream"] = False
        print(self.params)
        response = requests.post(f"http://localhost:8080/completion", json=self.params)
        data = response.json()
        print(data)
        if "choices" in data:
            choices = data["choices"]
            if choices:
                return choices[0]["text"]
        if "content" in data:
            return data["content"]
        return data


provider = LlamaProvider(MAX_TOKENS=8192, TOP_P=1.0, TEMPERATURE=1.31)
provider.instruct(prompt="What is the capital of Ohio?")

{'stop': '</s>', 'temperature': 0.8, 'top_p': 0.95, 'top_k': 40, 'tfs_z': 1.0, 'typical_p': 1.0, 'repeat_penalty': 1.1, 'repeat_last_n': 64, 'penalize_nl': True, 'mirostat_tau': 5.0, 'mirostat_eta': 0.1, 'prompt': 'What is the capital of Ohio?', 'n_predict': 8192, 'stream': False}


ConnectionError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))