# Usage

> Lisette usage and cost monitoring 

In [None]:
#| default_exp usage

In [None]:
#| export
import time

# Exported code below uses `patch`, `ifnone` and `nested_idx`; import them explicitly so the generated module
# does not depend on the notebook-only `from fastcore.all import *` cell.
from fastcore.utils import ifnone, nested_idx, patch
from litellm.integrations.custom_logger import CustomLogger
try:
    from fastlite import *
    from fastlite.core import dataclass
except ImportError: raise ImportError("Please install `fastlite` to use sqlite based lisette usage logging.")

In [None]:
import litellm, importlib, httpx
from lisette.core import Chat, AsyncChat, patch_litellm
from fastcore.all import *
from cachy import enable_cachy

In [None]:
enable_cachy()

## Lisette Usage Logger

In [None]:
importlib.reload(litellm); # to re-run the notebook without kernel restart

In [None]:
patch_litellm()

In [None]:
#| export
class Usage:
    "One logged completion: model, user, token counters, web search requests and the cost reported by litellm."
    id:int; timestamp:float; model:str; user_id:str
    prompt_tokens:int; completion_tokens:int; total_tokens:int
    cached_tokens:int; cache_creation_tokens:int; cache_read_tokens:int
    web_search_requests:int
    response_cost:float  # fractional amount (e.g. 0.000207), so it must not be declared as `int`

The precomputed response cost is available in `kwargs['response_cost']` according to the [litellm docs](https://docs.litellm.ai/docs/observability/custom_callback#whats-available-in-kwargs):

In [None]:
#| export
class LisetteUsageLogger(CustomLogger):
    "litellm callback that inserts one `Usage` row per successful completion into a database at `db_path`."
    def __init__(self, db_path):
        "Open the database at `db_path` and set `self.usage` to the table returned by `db.create(Usage)`."
        self.db = Database(db_path)
        self.usage = self.db.create(Usage)

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        "Async success hook. `response_cost` is read with `.get`, so a missing key is logged as `None` instead of raising."
        self._log_usage(response_obj, kwargs.get('response_cost'), start_time, end_time)

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        "Sync success hook. `response_cost` is read with `.get`, so a missing key is logged as `None` instead of raising."
        self._log_usage(response_obj, kwargs.get('response_cost'), start_time, end_time)

    def _log_usage(self, response_obj, response_cost, start_time, end_time):
        "Insert a `Usage` row built from `response_obj.usage`. `start_time` and `end_time` are accepted but not used."
        usage = response_obj.usage
        ptd   = getattr(usage, 'prompt_tokens_details', None)
        # The cache counters are not present on every usage object and may be `None`; log 0 in both cases
        # so later arithmetic on the stored row does not hit a missing value.
        def _count(obj, attr): return getattr(obj, attr, None) or 0
        self.usage.insert(Usage(
            timestamp=time.time(), model=response_obj.model, user_id=self.user_id_fn(),
            prompt_tokens=usage.prompt_tokens, completion_tokens=usage.completion_tokens, total_tokens=usage.total_tokens,
            cached_tokens=_count(ptd, 'cached_tokens'),
            cache_creation_tokens=_count(usage, 'cache_creation_input_tokens'),
            cache_read_tokens=_count(usage, 'cache_read_input_tokens'),
            # Left as `None` when the response carries no `server_tool_use` information
            web_search_requests=nested_idx(usage, 'server_tool_use', 'web_search_requests'),
            response_cost=response_cost))

    def user_id_fn(self):
        "Return the user id stored with each row; must be overridden (the default raises `NotImplementedError`)."
        raise NotImplementedError('Please implement `LisetteUsageLogger.user_id_fn` before initializing, e.g using fastcore.patch.')

## Cost Utils

In [None]:
class PrefixDict(dict):
    "A dict whose `d[key]` falls back to the longest stored string key that is a prefix of `key`."
    def __getitem__(self, key):
        "Return the exact match if present, otherwise the value of the longest prefix key; raise `KeyError` if none."
        if key in self: return super().__getitem__(key)
        # Only strings can be prefix-matched; any other missing key is a plain KeyError
        if isinstance(key, str):
            prefixes = [k for k in self if isinstance(k, str) and key.startswith(k)]
            # Prefer the most specific entry so that e.g. 'model-4-5-x' resolves to 'model-4-5' rather than 'model-4'
            if prefixes: return super().__getitem__(max(prefixes, key=len))
        raise KeyError(key)

In [None]:
# Prices keyed by model-name prefix: token prices are written as "amount / 1e6",
# the web search price as "amount / 1e3".
model_prices = PrefixDict({
    'claude-sonnet-4-5': {
        'input_prc':       3/1e6,
        'cache_write_prc': 3.75/1e6,
        'cache_read_prc':  0.3/1e6,
        'output_prc':      15/1e6,
        'web_search_prc':  10/1e3,
    },
})

Simplified cost utils to demonstrate total cost calculation (use `Usage.response_cost` in prod):

In [None]:
@patch(as_prop=True)
def inp_cost(self:Usage):
    "Input price times the prompt tokens that were not read from cache; a missing cache-read count is treated as 0."
    # NOTE(review): if `prompt_tokens` also includes cache-creation tokens they are billed here and in
    # `cache_write_cost` — confirm against litellm's usage accounting.
    return model_prices[self.model]['input_prc'] * (self.prompt_tokens - ifnone(self.cache_read_tokens, 0))
@patch(as_prop=True)
def cache_write_cost(self:Usage):
    "Cache-write price times the cache-creation tokens; a missing count is treated as 0."
    return model_prices[self.model]['cache_write_prc'] * ifnone(self.cache_creation_tokens, 0)
@patch(as_prop=True)
def cache_read_cost(self:Usage):
    "Cache-read price times the cache-read tokens; a missing count is treated as 0."
    return model_prices[self.model]['cache_read_prc'] * ifnone(self.cache_read_tokens, 0)
@patch(as_prop=True)
def out_cost(self:Usage):
    "Output price times the completion tokens."
    return model_prices[self.model]['output_prc'] * self.completion_tokens
@patch(as_prop=True)
def web_cost(self:Usage):
    "Web search price times the number of web search requests; a missing count is treated as 0."
    return model_prices[self.model]['web_search_prc'] * ifnone(self.web_search_requests, 0)
@patch(as_prop=True)
def cost(self:Usage):
    "Sum of the input, cache-write, cache-read, output and web search costs."
    return self.inp_cost + self.cache_write_cost + self.cache_read_cost + self.out_cost + self.web_cost


A mapping of model pricing is also available in litellm, which is used to calculate the `response_cost`:

In [None]:
# Download litellm's model price map. Check the HTTP status first so a failed request surfaces as an
# HTTP error rather than as a confusing JSON/lookup failure in the cells below.
pricing_resp = httpx.get(litellm.model_cost_map_url)
pricing_resp.raise_for_status()
model_pricing = dict2obj(pricing_resp.json())

In [None]:
model_pricing['claude-sonnet-4-5']

```python
{ 'cache_creation_input_token_cost': 3.75e-06,
  'cache_creation_input_token_cost_above_200k_tokens': 7.5e-06,
  'cache_read_input_token_cost': 3e-07,
  'cache_read_input_token_cost_above_200k_tokens': 6e-07,
  'input_cost_per_token': 3e-06,
  'input_cost_per_token_above_200k_tokens': 6e-06,
  'litellm_provider': 'anthropic',
  'max_input_tokens': 200000,
  'max_output_tokens': 64000,
  'max_tokens': 64000,
  'mode': 'chat',
  'output_cost_per_token': 1.5e-05,
  'output_cost_per_token_above_200k_tokens': 2.25e-05,
  'search_context_cost_per_query': { 'search_context_size_high': 0.01,
                                     'search_context_size_low': 0.01,
                                     'search_context_size_medium': 0.01},
  'supports_assistant_prefill': True,
  'supports_computer_use': True,
  'supports_function_calling': True,
  'supports_pdf_input': True,
  'supports_prompt_caching': True,
  'supports_reasoning': True,
  'supports_response_schema': True,
  'supports_tool_choice': True,
  'supports_vision': True,
  'tool_use_system_prompt_tokens': 346}
```

In [None]:
model_pricing['gemini-3-pro-preview']

```python
{ 'cache_creation_input_token_cost_above_200k_tokens': 2.5e-07,
  'cache_read_input_token_cost': 2e-07,
  'cache_read_input_token_cost_above_200k_tokens': 4e-07,
  'input_cost_per_token': 2e-06,
  'input_cost_per_token_above_200k_tokens': 4e-06,
  'input_cost_per_token_batches': 1e-06,
  'litellm_provider': 'vertex_ai-language-models',
  'max_audio_length_hours': 8.4,
  'max_audio_per_prompt': 1,
  'max_images_per_prompt': 3000,
  'max_input_tokens': 1048576,
  'max_output_tokens': 65535,
  'max_pdf_size_mb': 30,
  'max_tokens': 65535,
  'max_video_length': 1,
  'max_videos_per_prompt': 10,
  'mode': 'chat',
  'output_cost_per_token': 1.2e-05,
  'output_cost_per_token_above_200k_tokens': 1.8e-05,
  'output_cost_per_token_batches': 6e-06,
  'source': 'https://cloud.google.com/vertex-ai/generative-ai/pricing',
  'supported_endpoints': ['/v1/chat/completions', '/v1/completions', '/v1/batch'],
  'supported_modalities': ['text', 'image', 'audio', 'video'],
  'supported_output_modalities': ['text'],
  'supports_audio_input': True,
  'supports_function_calling': True,
  'supports_pdf_input': True,
  'supports_prompt_caching': True,
  'supports_reasoning': True,
  'supports_response_schema': True,
  'supports_system_messages': True,
  'supports_tool_choice': True,
  'supports_video_input': True,
  'supports_vision': True,
  'supports_web_search': True}
```

## Examples

In [None]:
dbfp = Path('.lisette/litellm-usage.db')
# Start from an empty directory: rows left behind by an interrupted earlier run would break the exact
# row-count check near the end of this notebook (the directory is also removed there after a clean run).
if dbfp.parent.exists(): dbfp.parent.delete()
dbfp.parent.mkdir(exist_ok=True)

In [None]:
# Give every logged row a fixed user id; the unpatched `LisetteUsageLogger.user_id_fn` raises NotImplementedError
@patch
def user_id_fn(self:LisetteUsageLogger): return 'user-123'
logger = LisetteUsageLogger(dbfp)
# Assigning a new list replaces whatever callbacks were set on `litellm.callbacks` before
litellm.callbacks = [logger]

In [None]:
# Comma-separated column list passed as `select=` when reading rows back; `timestamp` is not selected
slc = ','.join([
    'id', 'model', 'user_id',
    'prompt_tokens', 'completion_tokens', 'total_tokens',
    'cached_tokens', 'cache_creation_tokens', 'cache_read_tokens',
    'web_search_requests', 'response_cost',
])

In [None]:
# litellm.set_verbose = True

A simple example:

In [None]:
chat = Chat('claude-sonnet-4-5-20250929')
chat("What is 2+2?")

2+2 = 4

<details>

- id: `chatcmpl-xxx`
- model: `claude-sonnet-4-5-20250929`
- finish_reason: `stop`
- usage: `Usage(completion_tokens=11, prompt_tokens=14, total_tokens=25, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=None, cached_tokens=0, text_tokens=None, image_tokens=None), cache_creation_input_tokens=0, cache_read_input_tokens=0)`

</details>

In [None]:
time.sleep(0.3) # wait for callback db write
u = logger.usage(select=slc)[-1]; u

Usage(id=1, timestamp=UNSET, model='claude-sonnet-4-5-20250929', user_id='user-123', prompt_tokens=14, completion_tokens=11, total_tokens=25, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=None, response_cost=0.000207)

Our calculated cost matches litellm's `response_cost`. In some cases it might be better to use the custom calculation, as we'll see in the remainder of this notebook:

In [None]:
test_eq(u.cost, u.response_cost)

Now, let's test with streaming:

In [None]:
chat = Chat('claude-sonnet-4-5')
res = chat("Count from 1 to 5", stream=True)
for o in res: pass

In [None]:
time.sleep(0.3) # wait for callback db write
u = logger.usage(select=slc)[-1]; u  # last row returned, expected to be the streaming call above


Usage(id=2, timestamp=UNSET, model='claude-sonnet-4-5', user_id='user-123', prompt_tokens=15, completion_tokens=17, total_tokens=32, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=None, response_cost=0.00030000000000000003)

In [None]:
test_eq(u.cost, u.response_cost)

Streaming logged successfully. Let's also verify async chat calls are logged properly.

In [None]:
chat_async = AsyncChat('claude-sonnet-4-5-20250929')
await chat_async("What is 3+3?")

3 + 3 = 6

<details>

- id: `chatcmpl-xxx`
- model: `claude-sonnet-4-5-20250929`
- finish_reason: `stop`
- usage: `Usage(completion_tokens=13, prompt_tokens=14, total_tokens=27, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=None, cached_tokens=0, text_tokens=None, image_tokens=None), cache_creation_input_tokens=0, cache_read_input_tokens=0)`

</details>

In [None]:
time.sleep(0.3)
u = logger.usage(select=slc)[-1]; u

Usage(id=2, timestamp=UNSET, model='claude-sonnet-4-5', user_id='user-123', prompt_tokens=15, completion_tokens=17, total_tokens=32, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=None, response_cost=0.00030000000000000003)

In [None]:
test_eq(u.cost, u.response_cost)

Finally, let's test async streaming to ensure all API patterns are covered.

In [None]:
res = await chat_async("Count from 10 to 15", stream=True)
async for o in res: pass
print(o)

ModelResponse(id='chatcmpl-xxx', created=1000000000, model='claude-sonnet-4-5-20250929', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='10, 11, 12, 13, 14, 15', role='assistant', tool_calls=None, function_call=None, provider_specific_fields=None))], usage=Usage(completion_tokens=20, prompt_tokens=38, total_tokens=58, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=None, audio_tokens=None, reasoning_tokens=0, rejected_prediction_tokens=None, text_tokens=None), prompt_tokens_details=None))


In [None]:
time.sleep(0.3)
u = logger.usage(select=slc)[-1]; u

Usage(id=4, timestamp=UNSET, model='claude-sonnet-4-5-20250929', user_id='user-123', prompt_tokens=38, completion_tokens=20, total_tokens=58, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=None, response_cost=0.00041400000000000003)

In [None]:
test_eq(u.cost, u.response_cost)

Now let's run a prompt with web search:

In [None]:
chat = Chat('claude-sonnet-4-5-20250929')
chat("What is the weather like in NYC? Search web.", search="m")

Based on the current weather information for New York City:

**Today (December 2, 2025):**
Rain, mainly after 7am, with a high near 43°F. Chance of precipitation is 100% with new precipitation amounts between three quarters and one inch possible. North wind 5 to 13 mph.

**Tonight:**
A 20 percent chance of rain before 10pm, then cloudy during the early evening with gradual clearing and a low around 32°F.

**Air Quality:**
The air has reached a high level of pollution and is unhealthy for sensitive groups.

The weather is rainy and cool today with temperatures in the low 40s, and it will clear up tonight with temperatures dropping to freezing.

<details>

- id: `chatcmpl-xxx`
- model: `claude-sonnet-4-5-20250929`
- finish_reason: `stop`
- usage: `Usage(completion_tokens=318, prompt_tokens=12012, total_tokens=12330, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=None, cached_tokens=0, text_tokens=None, image_tokens=None), server_tool_use=ServerToolUse(web_search_requests=1), cache_creation_input_tokens=0, cache_read_input_tokens=0)`

</details>

In [None]:
time.sleep(0.3) # wait for callback db write
u = logger.usage(select=slc)[-1]; u  # last row returned, expected to be the web search call above

Usage(id=5, timestamp=UNSET, model='claude-sonnet-4-5-20250929', user_id='user-123', prompt_tokens=12012, completion_tokens=318, total_tokens=12330, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=1, response_cost=0.040805999999999995)

::: {.callout-important}
Litellm's `response_cost` doesn't take search request cost into account!
:::

Now, this is a case where using the custom calculations is better as it will also include the web search request cost:

In [None]:
test_eq(u.cost, u.response_cost + model_prices[u.model]['web_search_prc'])

Web search with streaming:

In [None]:
chat = Chat('claude-sonnet-4-5')
res = chat("What is the weather like in NYC? Search web.", search="m", stream=True)
for o in res: pass
# print(o)

::: {.callout-important}
Web search requests are not included in usage when `stream=True`. Here is an open [Issue](https://github.com/BerriAI/litellm/issues/16631)
:::


In [None]:
time.sleep(0.3) # wait for callback db write
u = logger.usage(select=slc)[-1]; u  # last row returned, expected to be the streamed web search call above

Usage(id=6, timestamp=UNSET, model='claude-sonnet-4-5', user_id='user-123', prompt_tokens=12012, completion_tokens=318, total_tokens=12330, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=None, response_cost=None)

Once this [PR](https://github.com/BerriAI/litellm/pull/16826) is merged `web_search_requests` will be included with `stream=True`, and the following test should pass:

In [None]:
# test_eq(u.cost, u.response_cost + model_prices[u.model]['web_search_prc'])

In [None]:
# u.cost

In [None]:
test_eq(len(logger.usage()), 6)

Let's implement a utility to get the total cost including web search:

In [None]:
#| export
@patch
def total_cost(self:Usage, sc=0.01):
    "Logged `response_cost` plus `sc` per web search request; a missing cost or request count is treated as 0."
    # `response_cost` can be logged as `None`, which would otherwise make the addition raise a TypeError
    return ifnone(self.response_cost, 0) + sc * ifnone(self.web_search_requests, 0)

In [None]:
# test_close((L(logger.usage()).map(lambda o:o.total_cost(sc=0.01)).sum()), 0.086, 1e-3)

In [None]:
dbfp.parent.delete()

# Export -

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()