# Sanity Check
Ben gave me sample files that are representative of the requests that would be sent to DeepInfra. This section sanity-checks that I'm getting reasonable responses.

In [5]:
import os, json, requests, time
import pandas as pd
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed

# Read variables from a local .env file into the process environment,
# then pick up the DeepInfra API key from there.
load_dotenv()
API_KEY = os.environ.get("DEEPINFRA_API_KEY")

REQUESTS_DIR = "sample_requests"  # directory containing the sample request JSON files
FORCED_MODEL = "google/gemma-3-12b-it"  # model written into every payload before sending
MAX_WORKERS = 8  # size of the thread pool used to send requests

# HTTP headers shared by every request
headers = dict(
    [
        ("Authorization", "Bearer {}".format(API_KEY)),
        ("Content-Type", "application/json"),
    ]
)

def send_request(fname: str):
    """Send one sample request to DeepInfra and summarize the response.

    The file ``fname`` inside ``REQUESTS_DIR`` is loaded as the JSON payload,
    its ``model`` field is overwritten with ``FORCED_MODEL``, and the payload
    is POSTed to the OpenAI-compatible chat-completions endpoint.

    Args:
        fname: File name (relative to ``REQUESTS_DIR``) of a JSON request body.

    Returns:
        A dict with the keys ``file``, ``status``, ``prompt_tokens``,
        ``completion_tokens``, ``total_tokens``, ``estimated_cost``,
        ``latency_sec`` and ``response_preview``.

        On success ``status`` is the HTTP status code, the token/cost fields
        come from the response's ``usage`` object (``None`` when absent),
        ``latency_sec`` is the wall-clock duration of the POST rounded to
        milliseconds, and ``response_preview`` is the first 200 characters of
        the first choice's message content ("" when there is none).

        On any failure (unreadable or malformed request file, network error,
        non-2xx status, undecodable response) ``status`` is ``"error"``, the
        numeric fields are ``None`` and ``response_preview`` carries the
        exception message. This function never raises, so a single bad file
        cannot abort a batch run.
    """
    try:
        # Loading the payload is inside the try so that a bad sample file is
        # reported as an error record like every other failure.
        path = os.path.join(REQUESTS_DIR, fname)
        with open(path, "r", encoding="utf-8") as f:
            payload = json.load(f)

        payload["model"] = FORCED_MODEL  # override model

        start = time.perf_counter()
        resp = requests.post(
            "https://api.deepinfra.com/v1/openai/chat/completions",
            headers=headers,
            json=payload,
            timeout=120,
        )
        latency = time.perf_counter() - start
        resp.raise_for_status()
        data = resp.json()

        # "usage", "choices", "message" and "content" may each be missing or
        # JSON null; `or` falls back to an empty value in both cases so a
        # successful call is not misreported as an error.
        usage = data.get("usage") or {}
        choices = data.get("choices") or []
        message = (choices[0].get("message") or {}) if choices else {}
        content = (message.get("content") or "")[:200]

        return {
            "file": fname,
            "status": resp.status_code,
            "prompt_tokens": usage.get("prompt_tokens"),
            "completion_tokens": usage.get("completion_tokens"),
            "total_tokens": usage.get("total_tokens"),
            "estimated_cost": usage.get("estimated_cost"),
            "latency_sec": round(latency, 3),
            "response_preview": content,
        }
    except Exception as e:
        # Deliberately broad: this is a best-effort batch probe, and the
        # failure is surfaced in the returned record rather than swallowed.
        return {
            "file": fname,
            "status": "error",
            "prompt_tokens": None,
            "completion_tokens": None,
            "total_tokens": None,
            "estimated_cost": None,
            "latency_sec": None,
            "response_preview": f"❌ {e}",
        }

# Every *.json file in the samples directory is one request to replay
json_files = [name for name in os.listdir(REQUESTS_DIR) if name.endswith(".json")]

# Fan the requests out over a thread pool and report each one as it completes
records = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {}
    for fname in json_files:
        futures[executor.submit(send_request, fname)] = fname

    for future in as_completed(futures):
        result = future.result()
        print(f"Finished {result['file']} -> {result['status']} ({result['latency_sec']}s)")
        records.append(result)

# Tabulate the per-request results (one row per file, in completion order)
df = pd.DataFrame(records)
print("\n=== Cost / Token Usage Data ===", df, sep="\n")

Finished api-req3.json -> 200 (20.63s)
Finished api-req2.json -> 200 (25.95s)
Finished 5.json -> 200 (32.48s)
Finished api-req1.json -> 200 (42.134s)
Finished 2.json -> 200 (47.835s)

=== Cost / Token Usage Data ===
            file  status  prompt_tokens  completion_tokens  total_tokens  \
0  api-req3.json     200           4805                731          5536   
1  api-req2.json     200           4805                986          5791   
2         5.json     200           3164               1307          4471   
3  api-req1.json     200           5603               1694          7297   
4         2.json     200           5192               1949          7141   

   estimated_cost  latency_sec  \
0        0.000313       20.630   
1        0.000339       25.950   
2        0.000289       32.480   
3        0.000450       42.134   
4        0.000455       47.835   

                                    response_preview  
0  **1. 테이블 구조 분석:**\n\n*   **테이블 제목:** “(표 1) OI...  
1  **1. 테이블 

# Cost and throughput
Based on `parameter_analysis.ipynb`, a typical claim might have around 4,658 pages. At around 1.37 chunks per page, that's about 6,381 chunks. Each chunk takes 2-5 of these API calls to produce, plus some miscellaneous surrounding API calls. For our purposes, we'll say each chunk requires 4 API calls, which gives 6,381 × 4 = 25,524 VLM calls per claim.

The following cells estimate cost and required throughput based on how many claims might be processed simultaneously and how quickly each claim should be processed.

In [None]:
# configurable
PARALLEL_CLAIMS = 20  # number of claims to be processed at the same time
PROCESSING_TIME_SECONDS = 10 * 60  # target time to finish each claim: 10 minutes

# constants
VLM_CALLS_PER_CLAIM = 6_381 * 4  # 6,381 chunks per claim x 4 API calls per chunk = 25,524

In [14]:
# Mean per-call cost and latency over the sample requests, as plain Python floats
average_cost = float(df["estimated_cost"].mean())
average_latency = float(df["latency_sec"].mean())

In [15]:
# Demand: every call of every parallel claim must fit inside the time budget
calls_needed_per_second = (PARALLEL_CLAIMS * VLM_CALLS_PER_CLAIM) / PROCESSING_TIME_SECONDS

# Supply: a worker issuing calls back-to-back completes 1/latency calls per second
calls_per_worker_per_second = 1 / average_latency

# Number of concurrent workers at which supply meets demand
required_parallelism = calls_needed_per_second / calls_per_worker_per_second

# Spend for one claim, then for one batch of PARALLEL_CLAIMS claims
cost_per_claim = average_cost * VLM_CALLS_PER_CLAIM
total_cost = cost_per_claim * PARALLEL_CLAIMS

# Extrapolate assuming batches run back-to-back around the clock
cost_per_hour = total_cost * (3600 / PROCESSING_TIME_SECONDS)
cost_per_day = cost_per_hour * 24

# One print call, one report line per argument
print(
    "=== Throughput & Cost Estimates ===",
    f"Average cost per call: ${average_cost:.6f}",
    f"Average latency per call: {average_latency:.3f}s",
    f"Calls needed per second: {calls_needed_per_second:.2f}",
    f"Effective calls/sec per worker: {calls_per_worker_per_second:.2f}",
    f"Required parallel workers: {required_parallelism:.1f}",
    f"Cost per claim: ${cost_per_claim:,.2f}",
    f"Cost per {PROCESSING_TIME_SECONDS/60:.1f} minutes: ${total_cost:,.2f}",
    f"Cost per hour: ${cost_per_hour:,.2f}",
    f"Cost per day: ${cost_per_day:,.2f}",
    sep="\n",
)


=== Throughput & Cost Estimates ===
Average cost per call: $0.000369
Average latency per call: 33.806s
Calls needed per second: 850.80
Effective calls/sec per worker: 0.03
Required parallel workers: 28762.0
Cost per claim: $9.42
Cost per 10.0 minutes: $188.38
Cost per hour: $1,130.29
Cost per day: $27,127.07
