### Initialization

In [1]:
import asyncio
import json
import os
import time
from typing import List

import httpx
import openai
import requests
from dotenv import load_dotenv

load_dotenv(".env")

# Confirm the required settings are present (a missing one still raises
# KeyError here), but never echo the secret itself: cell output is saved with
# the notebook and is easily shared or committed.
print(f"CANONICAL_CACHE_API_KEY: set ({len(os.environ['CANONICAL_CACHE_API_KEY'])} characters, value hidden)")
print(os.environ["CANONICAL_CACHE_HOST"])

[REDACTED: API key removed from saved output]
https://cacheapp.canonical.chat/


### Client Configuration

In [2]:
# Sampling temperature passed with every completion request in this notebook.
TEMPERATURE = 0

# OpenAI-compatible client whose base URL is the Canonical cache host.
# The Canonical key travels in the X-Canonical-Api-Key header, so the SDK's
# own `api_key` argument is only a placeholder value.
CACHECLIENT = openai.OpenAI(
    api_key="canbeanything",
    base_url=os.environ["CANONICAL_CACHE_HOST"],
    http_client=httpx.Client(
        headers={"X-Canonical-Api-Key": os.environ["CANONICAL_CACHE_API_KEY"]},
    ),
)

# Plain OpenAI client (default base URL), authenticated with OPENAI_API_KEY.
OPENAICLIENT = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

### Function Definition

In [18]:
def complete(
    client: "openai.OpenAI",
    messages: List[dict[str, str]],
    stream: bool,
    temperature: float = 0,
    model: str = "gpt-3.5-turbo",
) -> "openai.types.chat.ChatCompletion | openai.Stream":
    """Request a chat completion from ``client``.

    Thin wrapper around ``client.chat.completions.create`` so the cache client
    and the OpenAI client are called with identical arguments.

    Args:
        client: The OpenAI-compatible client to send the request through.
        messages: Conversation so far, as ``{"role": ..., "content": ...}`` dicts.
        stream: Forwarded as ``stream=``; when true the result is an iterable
            of chunks rather than a finished completion.
        temperature: Sampling temperature forwarded to the API.
        model: Model name forwarded to the API.

    Returns:
        Whatever ``client.chat.completions.create`` returns for these arguments.

    Note:
        The client and return annotations are written as strings so they are
        documentation only and are not evaluated when the function is defined.
    """
    return client.chat.completions.create(
        model=model,
        stream=stream,
        temperature=temperature,
        messages=messages,
    )


def displaycompletion(
    completion: "openai.types.chat.ChatCompletion | openai.Stream",
    stream: bool,
) -> str:
    """Print a completion and return its text.

    Args:
        completion: The object returned by ``complete``. With ``stream=True``
            it is iterated chunk by chunk; otherwise the first choice's
            message is read.
        stream: Must match the ``stream`` flag used to create ``completion``.

    Returns:
        The full assistant text that was printed. Always a ``str``: a missing
        (``None``) message content is returned as ``""``.
    """
    if not stream:
        # `content` may be None; normalise so the declared `-> str` holds and
        # callers can store the value in the chat history unchanged.
        msg = completion.choices[0].message.content or ""
        print(msg)
        return msg

    pieces = []
    for chunk in completion:
        # Skip chunks that carry no choices instead of raising IndexError.
        if not chunk.choices:
            continue
        text = chunk.choices[0].delta.content
        if text is None:
            continue
        pieces.append(text)
        # Echo each fragment immediately so the reply appears as it streams.
        print(text, end="", flush=True)
    return "".join(pieces)


def updatecache(
    messages: List[dict[str, str]],
    temp: float = 0,
    model: str = "gpt-3.5-turbo",
    timeout: float = 10.0,
) -> None:
    """POST a conversation to the Canonical cache's ``api/v1/cache`` endpoint.

    Args:
        messages: Conversation to store, as ``{"role": ..., "content": ...}`` dicts.
        temp: Temperature recorded alongside the conversation.
        model: Model name recorded alongside the conversation.
        timeout: Seconds to wait for the cache service before giving up.

    Raises:
        KeyError: If ``CANONICAL_CACHE_HOST`` or ``CANONICAL_CACHE_API_KEY`` is unset.
        requests.RequestException: On connection failure or timeout.

    A non-2xx response does not raise; a warning is printed instead so a failed
    cache write is visible without aborting the conversation.
    """
    # Tolerate a host configured with or without a trailing slash.
    url = f"{os.environ['CANONICAL_CACHE_HOST'].rstrip('/')}/api/v1/cache"
    headers = {
        "Content-Type": "application/json",
        "X-Canonical-Api-Key": os.environ["CANONICAL_CACHE_API_KEY"],
    }
    payload = {
        "messages": messages,
        "temperature": temp,
        "model": model,
    }
    # Without an explicit timeout, requests would wait indefinitely.
    response = requests.request(
        method="POST",
        url=url,
        headers=headers,
        data=json.dumps(payload),
        timeout=timeout,
    )
    if not response.ok:
        print(f"\nWarning: cache update failed with HTTP {response.status_code}.")


async def ask_canonical(message: str) -> None:
    """Answer ``message`` from the Canonical cache, falling back to OpenAI on a miss.

    The user message is appended to the module-level ``chat_history`` list, and
    the assistant reply is appended once it has been printed.

    * Cache hit: the completion is fetched (non-streaming) through
      ``CACHECLIENT`` and printed.
    * Cache miss (``CACHECLIENT`` raises ``openai.NotFoundError``): the reply is
      streamed from ``OPENAICLIENT``, the updated history is sent to the cache
      with ``updatecache``, and the coroutine then pauses for 5 seconds.

    Args:
        message: The user's message for this turn.

    Raises:
        NameError: If ``chat_history`` has not been defined yet.
        Exception: Any client error other than the cache's
            ``openai.NotFoundError`` propagates to the caller.
    """
    global chat_history
    # Add user's message to the conversation history
    chat_history.append({"role": "user", "content": message})

    # --- Try the cache first (non-streaming) --------------------------------
    tic = time.perf_counter()
    try:
        response = complete(CACHECLIENT, chat_history, False, TEMPERATURE)
    except openai.NotFoundError:
        # NotFoundError from the cache endpoint is treated as a cache miss.
        # The fallback runs after this handler, so an error raised there is
        # not reported as happening "during handling" of the miss.
        response = None
    toc = time.perf_counter()

    if response is not None:
        print("\nCache Hit")
        print(f"\nRetrieved cached response in {toc - tic:0.2f} seconds. I'm much faster outside of this dang Notebook! You should see me on-prem!")
        print("\n")

        msg = displaycompletion(response, False)
        chat_history.append({"role": "assistant", "content": msg})
        return

    # --- Cache miss: stream the answer from OpenAI --------------------------
    tic = time.perf_counter()
    response = complete(OPENAICLIENT, chat_history, True, TEMPERATURE)
    # The timer stops here, before displaycompletion consumes the chunks.
    toc = time.perf_counter()
    print("\nCache Miss")
    print(f"\nRetrieved API response in {toc - tic:0.2f} seconds.")
    print("")

    msg = displaycompletion(response, True)

    chat_history.append({"role": "assistant", "content": msg})
    updatecache(chat_history, TEMPERATURE)

    print("\n")
    print("\nCache writes are unavailable for 5 seconds to avoid conflicting with pre-fetching in Voice AI applications.")
    print("\nEnforcing 5 second sleep...")
    # Non-blocking pause: time.sleep() here would stall the event loop that
    # is running this coroutine.
    await asyncio.sleep(5)
        


#### Define The System Prompt and Cache Scope

Each system prompt gets its own cache. To start a new cache, change the system prompt. The first time you use this demo, change the system prompt. Even a small change, like adding or replacing one word, will create a new cache.

In [19]:
# System prompt sent as the first message of every session.
# Adjacent literals are joined by the parser, so every sentence except the last
# ends with an explicit space to keep the sentences from running together.
# NOTE: editing this text changes the prompt, so outputs saved from earlier
# runs below were produced with the previous wording.
SYSTEM_PROMPT_CONTENT = (
    "You are a customer support assistant for ACME 3-D wind sensors company. "
    "Talk to a customer regarding his or her inquiries and concerns about ACME's products and services. "
    "ACME sells products through their online store."
)


#### User makes a request

Here the user is making the first ever request on the new cache. The cache is currently empty so we expect a cache miss.

In [20]:
global chat_history
chat_history = [
    {"role": "system", "content": SYSTEM_PROMPT_CONTENT},
    {"role": "assistant", "content": "Hello, how can I help you today?"}
]

# First request
message = "I'd like to ask for a refund."
await ask_canonical(message)


Cache Miss

Retrieved API response in 0.74 seconds.

I'm sorry to hear that you're not satisfied with your purchase. Can you please provide me with your order number so I can look into your request for a refund?


Cache writes are unavailable for 5 seconds to avoid conflicting with pre-fetching in Voice AI applications.

Enforcing 5 second sleep...


### New user starts a new session with the same system prompt and cache. New user makes a similar request.

Here a new user is chatting with the LLM. The first LLM response is the one and only response in the cache. If the user makes a request similar to the one that's in the cache, you'll get a cache hit.

In [21]:
global chat_history
chat_history = [
    {"role": "system", "content": SYSTEM_PROMPT_CONTENT},
    {"role": "assistant", "content": "Hello, how can I help you today?"}
]

message = "I'd like a refund."
await ask_canonical(message)


Cache Hit

Retrieved cached response in 0.80 seconds. I'm much faster outside of this dang Notebook! You should see me on-prem!


I'm sorry to hear that you're not satisfied with your purchase. Can you please provide me with your order number so I can look into your request for a refund?


#### Your Turn

Make a new request of the LLM that's similar to the previous requests. Do you get a cache hit?

Make a request that's worded similarly to the previous requests but has a different intention. Do you get a cache miss?

In [23]:
global chat_history
chat_history = [
    {"role": "system", "content": SYSTEM_PROMPT_CONTENT},
    {"role": "assistant", "content": "Hello, how can I help you today?"}
]

message = "[write your message here.]"
await ask_canonical(message)


Cache Hit

Retrieved cached response in 0.53 seconds. I'm much faster outside of this dang Notebook! You should see me on-prem!


I see that you have some inquiries or concerns about our products and services. Please feel free to share them with me so I can assist you accordingly.
