# Usage

> Lisette usage and cost monitoring 

In [None]:
#| default_exp usage

In [None]:
#| export
from litellm.integrations.custom_logger import CustomLogger
from fastcore.utils import *
import time
try: from fastlite import *
except ImportError: raise ImportError("Please install `fastlite` to use sqlite based lisette usage logging.")

In [None]:
import litellm, importlib, httpx
from lisette.core import Chat, AsyncChat, patch_litellm
from cachy import enable_cachy, disable_cachy
from fastcore.test import *

In [None]:
#| hide
enable_cachy()

## Lisette Usage Logger

In [None]:
_ = importlib.reload(litellm) # to re-run the notebook without kernel restart

In [None]:
# litellm._turn_on_debug()

In [None]:
patch_litellm()

In [None]:
#| export
class Usage:
    "One logged completion (token counts, web search requests and cost), stored as a row by `LisetteUsageLogger`"
    id:int; timestamp:float; model:str; user_id:str
    prompt_tokens:int; completion_tokens:int; total_tokens:int
    cached_tokens:int          # used by gemini (read tokens)
    cache_creation_tokens:int
    cache_read_tokens:int      # used by anthropic
    web_search_requests:int
    response_cost:float        # fractional value (e.g. 0.000207), so it must not be declared as `int`

Anthropic provides web search request counts directly via `usage.server_tool_use.web_search_requests`, billed at $10 per 1,000 searches ([pricing](https://docs.claude.com/en/docs/about-claude/pricing)). Gemini returns queries in `groundingMetadata.webSearchQueries`—each query counts as a separate billable use—with 5,000 free prompts per month, then $14 per 1,000 search queries (coming soon) ([pricing](https://ai.google.dev/gemini-api/docs/pricing), [grounding docs](https://ai.google.dev/gemini-api/docs/google-search)).

In [None]:
#| export
def search_count(r):
    "Number of web search requests reported on response `r`; 0 when none of the known locations holds a count"
    # Anthropic: the count is reported directly under `usage.server_tool_use`
    anthropic_cnt = nested_idx(r.usage, 'server_tool_use', 'web_search_requests')
    if anthropic_cnt: return anthropic_cnt
    # Gemini: each entry of `webSearchQueries` in the first grounding metadata item counts as one request
    grounding = getattr(r, 'vertex_ai_grounding_metadata', None)
    if grounding:
        queries = grounding[0].get('webSearchQueries')
        if queries: return len(queries)
    # streaming with `include_usage`: the count is found under `usage.prompt_tokens_details`
    streamed_cnt = nested_idx(r.usage, 'prompt_tokens_details', 'web_search_requests')
    return streamed_cnt if streamed_cnt else 0

The precomputed response cost is available in `kwargs['response_cost']`, according to the [litellm docs](https://docs.litellm.ai/docs/observability/custom_callback#whats-available-in-kwargs):

In [None]:
#| export
class LisetteUsageLogger(CustomLogger):
    "litellm `CustomLogger` that inserts one `Usage` row into a sqlite database for every successful call"
    def __init__(self, db_path):
        "Open the database at `db_path` and keep the result of `db.create(Usage)` in `self.usage`"
        self.db = Database(db_path)
        self.usage = self.db.create(Usage)

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        "Sync success hook: log `response_obj` together with `kwargs['response_cost']`"
        cost = kwargs['response_cost']
        self._log_usage(response_obj, cost, start_time, end_time)

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        "Async success hook: same logging as `log_success_event`"
        cost = kwargs['response_cost']
        self._log_usage(response_obj, cost, start_time, end_time)

    def _log_usage(self, response_obj, response_cost, start_time, end_time):
        "Build a `Usage` from `response_obj` and insert it; `start_time` and `end_time` are accepted but not used"
        usage = response_obj.usage
        details = usage.prompt_tokens_details
        row = Usage(
            timestamp=time.time(),
            model=response_obj.model,
            user_id=self.user_id_fn(),
            prompt_tokens=usage.prompt_tokens,
            completion_tokens=usage.completion_tokens,
            total_tokens=usage.total_tokens,
            # used by gemini (read tokens); falls back to 0 when there are no prompt token details
            cached_tokens=0 if not details else details.cached_tokens,
            cache_creation_tokens=nested_idx(usage, 'cache_creation_input_tokens'),
            # used by anthropic
            cache_read_tokens=nested_idx(usage, 'cache_read_input_tokens'),
            web_search_requests=search_count(response_obj),
            response_cost=response_cost,
        )
        self.usage.insert(row)

    def user_id_fn(self):
        "Hook supplying the `user_id` stored with each row; raises until the user provides an implementation"
        raise NotImplementedError('Please implement `LisetteUsageLogger.user_id_fn` before initializing, e.g using fastcore.patch.')

## Cost Utils

In [None]:
class PrefixDict(dict):
    "Dict whose item lookup falls back to the longest stored key that is a prefix of the requested key"
    def __getitem__(self, key):
        "Return the value for `key`, else for the longest stored string key prefixing `key`; raise `KeyError` otherwise"
        # An exact match always wins over any prefix match
        if key in self: return super().__getitem__(key)
        # Prefix matching only applies to strings; any other key is simply missing
        if isinstance(key, str):
            prefixes = [k for k in self if isinstance(k, str) and key.startswith(k)]
            # Prefer the most specific (longest) prefix so that e.g. 'claude-sonnet-4' cannot
            # shadow 'claude-sonnet-4-5' just because it was inserted first
            if prefixes: return super().__getitem__(max(prefixes, key=len))
        raise KeyError(key)

In [None]:
# Prices keyed by model name prefix: token prices are written per 1e6 tokens, web search per 1e3 requests
model_prices = PrefixDict({
    'claude-sonnet-4-5': dict(
        input_prc       = 3/1e6,
        cache_write_prc = 3.75/1e6,
        cache_read_prc  = 0.3/1e6,
        output_prc      = 15/1e6,
        web_search_prc  = 10/1e3,
    ),
})

Simplified cost utils to demonstrate total cost calculation (use `Usage.response_cost` in prod):

In [None]:
@patch(as_prop=True)
def inp_cost(self:Usage):
    "Prompt tokens minus cache-read tokens, charged at the model's `input_prc`"
    prices = model_prices[self.model]
    return prices['input_prc'] * (self.prompt_tokens - self.cache_read_tokens)

@patch(as_prop=True)
def cache_write_cost(self:Usage):
    "Cache creation tokens charged at the model's `cache_write_prc`"
    prices = model_prices[self.model]
    return prices['cache_write_prc'] * self.cache_creation_tokens

@patch(as_prop=True)
def cache_read_cost(self:Usage):
    "Cache read tokens charged at the model's `cache_read_prc`"
    prices = model_prices[self.model]
    return prices['cache_read_prc'] * self.cache_read_tokens

@patch(as_prop=True)
def out_cost(self:Usage):
    "Completion tokens charged at the model's `output_prc`"
    prices = model_prices[self.model]
    return prices['output_prc'] * self.completion_tokens

@patch(as_prop=True)
def web_cost(self:Usage):
    "Web search requests (a missing count is treated as 0) charged at the model's `web_search_prc`"
    prices = model_prices[self.model]
    return prices['web_search_prc'] * ifnone(self.web_search_requests, 0)

@patch(as_prop=True)
def cost(self:Usage):
    "Total of the input, cache write, cache read, output and web search costs"
    # Summation order is kept left-to-right so the floating point result stays the same
    token_cost = self.inp_cost + self.cache_write_cost + self.cache_read_cost + self.out_cost
    return token_cost + self.web_cost


A mapping of model pricing is also available in litellm, which is used to calculate the `response_cost`:

In [None]:
model_pricing = dict2obj(httpx.get(litellm.model_cost_map_url).json())

In [None]:
# model_pricing['claude-sonnet-4-5']

In [None]:
# model_pricing['gemini-3-pro-preview']

## Examples

In [None]:
# The temporary database file itself is created in the logger setup cell that follows
from tempfile import NamedTemporaryFile

In [None]:
# Attribute every logged row to a fixed demo user id
@patch
def user_id_fn(self:LisetteUsageLogger): return 'user-123'
# Back the logger with a temporary `.db` file (closed at the end of the notebook)
tf=NamedTemporaryFile(suffix='.db')
logger = LisetteUsageLogger(tf.name)
# Make the logger the only litellm callback so it is invoked for the calls made below
litellm.callbacks = [logger]

In [None]:
slc = ','.join('id model user_id prompt_tokens completion_tokens total_tokens cached_tokens cache_creation_tokens cache_read_tokens web_search_requests response_cost'.split())

In [None]:
# litellm.set_verbose = True

A simple example:

In [None]:
chat = Chat('claude-sonnet-4-5-20250929')
r = chat("What is 2+2?")

In [None]:
time.sleep(0.3) # wait for callback db write
u = logger.usage(select=slc)[-1]; u

Usage(id=1, timestamp=UNSET, model='claude-sonnet-4-5-20250929', user_id='user-123', prompt_tokens=14, completion_tokens=11, total_tokens=25, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=0, response_cost=0.000207)

Our calculated cost matches litellm's `response_cost`. In some cases it might be better to use the custom calculation, as we'll see in the remainder of this notebook:

In [None]:
test_eq(u.cost, u.response_cost)

Now, let's test with streaming:

In [None]:
chat = Chat('claude-sonnet-4-5')
res = chat("Count from 1 to 5", stream=True)
for o in res: pass

In [None]:
time.sleep(0.3)
u = logger.usage(select=slc)[-1]; u


Usage(id=2, timestamp=UNSET, model='claude-sonnet-4-5', user_id='user-123', prompt_tokens=15, completion_tokens=17, total_tokens=32, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=0, response_cost=0.00030000000000000003)

In [None]:
test_eq(u.cost, u.response_cost)

Streaming logged successfully. Let's also verify async chat calls are logged properly.

In [None]:
chat_async = AsyncChat('claude-sonnet-4-5-20250929')
await chat_async("What is 3+3?")

3 + 3 = 6

<details>

- id: `chatcmpl-xxx`
- model: `claude-sonnet-4-5-20250929`
- finish_reason: `stop`
- usage: `Usage(completion_tokens=13, prompt_tokens=14, total_tokens=27, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=None, cached_tokens=0, text_tokens=None, image_tokens=None, cache_creation_tokens=0, cache_creation_token_details=CacheCreationTokenDetails(ephemeral_5m_input_tokens=0, ephemeral_1h_input_tokens=0)), cache_creation_input_tokens=0, cache_read_input_tokens=0)`

</details>

In [None]:
time.sleep(0.3)
u = logger.usage(select=slc)[-1]; u

Usage(id=2, timestamp=UNSET, model='claude-sonnet-4-5', user_id='user-123', prompt_tokens=15, completion_tokens=17, total_tokens=32, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=0, response_cost=0.00030000000000000003)

In [None]:
test_eq(u.cost, u.response_cost)

Finally, let's test async streaming to ensure all API patterns are covered.

In [None]:
res = await chat_async("Count from 10 to 15", stream=True)
async for o in res: pass
print(o)

ModelResponse(id='chatcmpl-xxx', created=1000000000, model='claude-sonnet-4-5-20250929', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='10, 11, 12, 13, 14, 15', role='assistant', tool_calls=None, function_call=None, provider_specific_fields=None))], usage=Usage(completion_tokens=20, prompt_tokens=38, total_tokens=58, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=None, audio_tokens=None, reasoning_tokens=0, rejected_prediction_tokens=None, text_tokens=None, image_tokens=None), prompt_tokens_details=None))


In [None]:
time.sleep(0.3)
u = logger.usage(select=slc)[-1]; u

Usage(id=4, timestamp=UNSET, model='claude-sonnet-4-5-20250929', user_id='user-123', prompt_tokens=38, completion_tokens=20, total_tokens=58, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=0, response_cost=0.00041400000000000003)

In [None]:
test_eq(u.cost, u.response_cost)

### Search

Now let's run a prompt with web search:

In [None]:
chat = Chat('gemini/gemini-2.5-flash')
chat("What is the weather like in NYC? Search web.", search="m")

In New York City, as of Monday, December 15, 2025, it is mostly sunny with a temperature of 24°F (-4°C), feeling like 16°F (-9°C). The humidity is around 52%.

The forecast for today, Monday, December 15, includes light snow during the day and partly cloudy skies at night, with a 20% chance of snow throughout the day and night. Temperatures are expected to range between 20°F (-7°C) and 28°F (-2°C), with humidity around 57%.

<details>

- id: `chatcmpl-xxx`
- model: `gemini-2.5-flash`
- finish_reason: `stop`
- usage: `Usage(completion_tokens=295, prompt_tokens=12, total_tokens=395, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=None, audio_tokens=None, reasoning_tokens=148, rejected_prediction_tokens=None, text_tokens=147, image_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=None, cached_tokens=None, text_tokens=12, image_tokens=None))`

</details>

In [None]:
time.sleep(0.3)
u = logger.usage(select=slc)[-1]; u

Usage(id=5, timestamp=UNSET, model='gemini-2.5-flash', user_id='user-123', prompt_tokens=12, completion_tokens=295, total_tokens=395, cached_tokens=None, cache_creation_tokens=None, cache_read_tokens=None, web_search_requests=1, response_cost=0.0007411000000000001)

In [None]:
test_eq(u.web_search_requests,1)

In [None]:
chat = Chat('claude-sonnet-4-5-20250929')
r = chat("What is the weather like in NYC? Search web.", search="m")

In [None]:
time.sleep(0.3)
u = logger.usage(select=slc)[-1]; u

Usage(id=6, timestamp=UNSET, model='claude-sonnet-4-5-20250929', user_id='user-123', prompt_tokens=10532, completion_tokens=318, total_tokens=10850, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=1, response_cost=0.036365999999999996)

In [None]:
test_eq(u.web_search_requests,1)

::: {.callout-important}
Litellm's `response_cost` doesn't take web search request cost into account!
:::

Now, this is a case where using the custom calculations is better as it will also include the web search request cost:

In [None]:
test_eq(u.cost, u.response_cost + u.web_search_requests * model_prices[u.model]['web_search_prc'])

### Search with streaming

Web search with streaming:

::: {.callout-important}
Gemini web search requests are part of `prompt_tokens_details` which is only included with `stream_options={"include_usage": True}` when `stream=True`. 

There is currently a bug with gemini web search request counts, [Issue](https://github.com/BerriAI/litellm/issues/17919) and [PR](https://github.com/BerriAI/litellm/pull/17921). Waiting for litellm 1.80.11 pypi release.
:::

In [None]:
chat = Chat('gemini/gemini-2.5-flash')
res = chat("What is the weather like in NYC? Search web.", search="m", stream=True, stream_options={"include_usage": True})
for o in res: pass
# print(o)

In [None]:
time.sleep(0.3)
u = logger.usage(select=slc)[-1]; u

Usage(id=7, timestamp=UNSET, model='gemini-2.5-flash', user_id='user-123', prompt_tokens=12, completion_tokens=588, total_tokens=600, cached_tokens=None, cache_creation_tokens=None, cache_read_tokens=None, web_search_requests=1, response_cost=0.0364736)

::: {.callout-important}
Anthropic web search requests are available in `usage.server_tool_use`
:::

In [None]:
chat = Chat('claude-sonnet-4-5')
res = chat("What is the weather like in NYC now? Search web.", search="m", stream=True, stream_options={"include_usage": True})
for o in res: pass
# print(o)

In [None]:
time.sleep(0.3)
u = logger.usage(select=slc)[-1]; u

Usage(id=8, timestamp=UNSET, model='claude-sonnet-4-5', user_id='user-123', prompt_tokens=10477, completion_tokens=303, total_tokens=10780, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=1, response_cost=0.035976)

In [None]:
test_eq(u.cost, u.response_cost + u.web_search_requests * model_prices[u.model]['web_search_prc'])

In [None]:
test_eq(len(logger.usage()), 8)

In [None]:
#| export
@patch
def total_cost(self:Usage, sc=0.01):
    "`response_cost` plus `sc` for each web search request (a missing request count is treated as 0)"
    base_cost = self.response_cost
    n_searches = ifnone(self.web_search_requests, 0)
    return base_cost + sc * n_searches

In [None]:
L(logger.usage()).attrgot('response_cost').sum()

0.1107147

In [None]:
disable_cachy()

A simple Gemini example (requires a minimum number of prompt tokens and running the same prompt twice to see `cached_tokens`):

In [None]:
#| notest
chat = Chat('gemini/gemini-2.5-flash')
chat("What is 2+2?"* 500)
time.sleep(5)
chat("What is 2+2?"* 500)

2 + 2 = 4

<details>

- id: `chatcmpl-xxx`
- model: `gemini-2.5-flash`
- finish_reason: `stop`
- usage: `Usage(completion_tokens=41, prompt_tokens=7010, total_tokens=7051, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=None, audio_tokens=None, reasoning_tokens=34, rejected_prediction_tokens=None, text_tokens=7, image_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=None, cached_tokens=6117, text_tokens=893, image_tokens=None))`

</details>

In [None]:
#| notest
time.sleep(0.3) # wait for callback db write
u = logger.usage(select=slc)[-1];u

Usage(id=10, timestamp=UNSET, model='gemini-2.5-flash', user_id='user-123', prompt_tokens=7010, completion_tokens=41, total_tokens=7051, cached_tokens=6117, cache_creation_tokens=None, cache_read_tokens=None, web_search_requests=0, response_cost=0.00055391)

In [None]:
#| notest
test_eq(len(logger.usage()), 10)
test_eq(logger.usage()[-1].cached_tokens > 3000, True)

In [None]:
tf.close()

# Export -

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()