# Sanity Check
Ben gave me sample files that are representative of the requests that would be sent to DeepInfra. This is a sanity check that I'm getting reasonable responses.

In [1]:
import os, json, requests, time
import pandas as pd
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed

# Request configuration
REQUESTS_DIR = "sample_requests"  # directory scanned for *.json request payloads
FORCED_MODEL = "google/gemma-3-12b-it"  # model name written into every payload
MAX_WORKERS = 8  # thread-pool size used when sending the requests

# Credentials: load a local .env file first, then read the key from the environment
load_dotenv()
API_KEY = os.environ.get("DEEPINFRA_API_KEY")

# HTTP headers shared by every request
headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json",
}

def send_request(fname: str) -> dict:
    """Send one sample request to DeepInfra and summarize the response.

    Loads ``REQUESTS_DIR/fname`` as JSON, overrides its ``model`` field with
    ``FORCED_MODEL`` and POSTs it to the chat-completions endpoint.

    Args:
        fname: File name (not a full path) of a JSON payload in REQUESTS_DIR.

    Returns:
        A dict with the keys ``file``, ``status``, ``prompt_tokens``,
        ``completion_tokens``, ``total_tokens``, ``estimated_cost``,
        ``latency_sec`` and ``response_preview`` (first 200 characters of the
        first choice's content). If sending or decoding the response raises,
        ``status`` is ``"error"``, the numeric fields are ``None`` and
        ``response_preview`` carries the exception text.

    Raises:
        OSError, json.JSONDecodeError: If the payload file cannot be read or
            parsed; this happens before the request is attempted.
    """
    path = os.path.join(REQUESTS_DIR, fname)
    with open(path, "r", encoding="utf-8") as f:
        payload = json.load(f)

    payload["model"] = FORCED_MODEL  # override model

    start = time.perf_counter()
    try:
        resp = requests.post(
            "https://api.deepinfra.com/v1/openai/chat/completions",
            headers=headers,
            json=payload,
            timeout=120,
        )
        # Latency is measured before status checking / JSON decoding.
        latency = time.perf_counter() - start
        resp.raise_for_status()
        data = resp.json()

        # `or` fallbacks guard against keys that are present but null, so a
        # successful response is not misreported as an error.
        usage = data.get("usage") or {}
        choices = data.get("choices") or []
        message = (choices[0].get("message") or {}) if choices else {}
        content = (message.get("content") or "")[:200]

        return {
            "file": fname,
            "status": resp.status_code,
            "prompt_tokens": usage.get("prompt_tokens"),
            "completion_tokens": usage.get("completion_tokens"),
            "total_tokens": usage.get("total_tokens"),
            "estimated_cost": usage.get("estimated_cost"),
            "latency_sec": round(latency, 3),
            "response_preview": content,
        }
    except Exception as e:
        # Deliberately broad: one failed request should yield an error row,
        # not abort the whole batch.
        return {
            "file": fname,
            "status": "error",
            "prompt_tokens": None,
            "completion_tokens": None,
            "total_tokens": None,
            "estimated_cost": None,
            "latency_sec": None,
            "response_preview": f"❌ {e}",
        }

# Collect all JSON request files in the samples directory
json_files = [f for f in os.listdir(REQUESTS_DIR) if f.endswith(".json")]

# Fan the requests out over a thread pool and gather one record per file,
# in completion order.
records = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(send_request, fname): fname for fname in json_files}
    for future in as_completed(futures):
        try:
            result = future.result()
        except Exception as e:
            # send_request reads and parses the payload file outside its own
            # try block, so an unreadable or malformed file surfaces here.
            # Record it as an error row instead of aborting the whole run.
            result = {
                "file": futures[future],
                "status": "error",
                "prompt_tokens": None,
                "completion_tokens": None,
                "total_tokens": None,
                "estimated_cost": None,
                "latency_sec": None,
                "response_preview": f"❌ {e}",
            }
        records.append(result)
        print(f"Finished {result['file']} -> {result['status']} ({result['latency_sec']}s)")

# Build DataFrame
df = pd.DataFrame(records)
print("\n=== Cost / Token Usage Data ===")
print(df)

Finished 5.json -> 200 (18.58s)
Finished api-req3.json -> 200 (23.429s)
Finished api-req2.json -> 200 (23.895s)
Finished api-req1.json -> 200 (26.745s)
Finished 2.json -> 200 (39.793s)

=== Cost / Token Usage Data ===
            file  status  prompt_tokens  completion_tokens  total_tokens  \
0         5.json     200           3164                927          4091   
1  api-req3.json     200           4805               1224          6029   
2  api-req2.json     200           4805               1198          6003   
3  api-req1.json     200           5603               1352          6955   
4         2.json     200           5192               2340          7532   

   estimated_cost  latency_sec  \
0        0.000251       18.580   
1        0.000363       23.429   
2        0.000360       23.895   
3        0.000415       26.745   
4        0.000494       39.793   

                                    response_preview  
0  {"figure_title":"Characteristics of InternVL 1...  
1  **1. 테이

# Cost and throughput
Based on `parameter_analysis.ipynb`, a typical claim might have around 4,658 pages. At around 1.37 chunks per page, that's 6,381 chunks. Each chunk takes 2-5 of these API calls to produce, plus some miscellaneous surrounding API calls. For our purposes, we'll say each chunk requires 4 API calls, or 25,524 VLM calls per claim.

The following estimates the costs and bandwidth required, based on how many claims might be processed simultaneously and how quickly they should be processed.

In [2]:
# --- tunables ---
PARALLEL_CLAIMS = 20  # number of claims to process at the same time
PROCESSING_TIME_SECONDS = 10 * 60  # target time to finish each claim (10 minutes)

# --- fixed inputs ---
VLM_CALLS_PER_CLAIM = 25_524  # estimated VLM calls needed for one claim

In [3]:
# Per-call averages over the rows of `df`, converted to plain Python floats
average_cost = float(df["estimated_cost"].mean())
average_latency = float(df["latency_sec"].mean())

In [4]:
# Demand: every call of every parallel claim must fit inside the time budget
calls_needed_per_second = (PARALLEL_CLAIMS * VLM_CALLS_PER_CLAIM) / PROCESSING_TIME_SECONDS

# Supply: a single worker finishes one call every `average_latency` seconds
calls_per_worker_per_second = 1 / average_latency

# Workers needed so that supply meets demand
required_parallelism = calls_needed_per_second / calls_per_worker_per_second

# Spend for one claim, then for one full batch of parallel claims
cost_per_claim = average_cost * VLM_CALLS_PER_CLAIM
total_cost = cost_per_claim * PARALLEL_CLAIMS

# Extrapolate one batch window to an hour and a day of continuous processing
cost_per_hour = total_cost * (3600 / PROCESSING_TIME_SECONDS)
cost_per_day = cost_per_hour * 24

# Emit the whole report in one call, one item per line
print(
    "=== Throughput & Cost Estimates ===",
    f"Average cost per call: ${average_cost:.6f}",
    f"Average latency per call: {average_latency:.3f}s",
    f"Calls needed per second: {calls_needed_per_second:.2f}",
    f"Effective calls/sec per worker: {calls_per_worker_per_second:.2f}",
    f"Required parallel workers: {required_parallelism:.1f}",
    f"Cost per claim: ${cost_per_claim:,.2f}",
    f"Cost per {PROCESSING_TIME_SECONDS/60:.1f} minutes: ${total_cost:,.2f}",
    f"Cost per hour: ${cost_per_hour:,.2f}",
    f"Cost per day: ${cost_per_day:,.2f}",
    sep="\n",
)


=== Throughput & Cost Estimates ===
Average cost per call: $0.000377
Average latency per call: 26.488s
Calls needed per second: 850.80
Effective calls/sec per worker: 0.04
Required parallel workers: 22536.3
Cost per claim: $9.61
Cost per 10.0 minutes: $192.20
Cost per hour: $1,153.20
Cost per day: $27,676.92


# Comparing Requests
Curious how much of it (tokens, cost, latency) is system prompting.

In [11]:
import os, json, requests, time
import pandas as pd
from dotenv import load_dotenv

# -----------------------------------------
# Settings
# -----------------------------------------
# Load a local .env file, then read the API key from the environment
load_dotenv()
API_KEY = os.environ.get("DEEPINFRA_API_KEY")

# Endpoint and the model name written into the payload
DEEPINFRA_URL = "https://api.deepinfra.com/v1/openai/chat/completions"
FORCED_MODEL = "google/gemma-3-12b-it"

# Sample under test. To try another sample, un-comment its pair of lines
# and comment out the active pair.
# JSON_PATH = "sample_requests/api-req1.json"
# MESSAGE_INDEXES = [0, 2]

# JSON_PATH = "sample_requests/api-req2.json"
# MESSAGE_INDEXES = [0, 2]

JSON_PATH = "sample_requests/api-req3.json"
MESSAGE_INDEXES = [0, 2]  # positions in payload["messages"] kept for the subset request

# HTTP headers shared by every request
headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json",
}


# =========================================
# Helpers
# =========================================
def send_request(label: str, payload: dict, *, include_preview: bool = False) -> dict:
    """Send a single DeepInfra request and return usage/cost info.

    Args:
        label: Name stored in the result's ``label`` field to identify the run.
        payload: JSON-serializable request body; sent as-is.
        include_preview: When true, the result also carries a
            ``response_preview`` key: the first 200 characters of the first
            choice's content on success, or the exception text on failure.
            Defaults to False, which yields the same keys as before.

    Returns:
        A dict with the keys ``label``, ``status``, ``prompt_tokens``,
        ``completion_tokens``, ``total_tokens``, ``estimated_cost`` and
        ``latency_sec`` (plus ``response_preview`` if requested). If sending
        or decoding the response raises, ``status`` is ``"error"`` and the
        numeric fields are ``None``.
    """
    start = time.perf_counter()
    try:
        resp = requests.post(
            DEEPINFRA_URL, headers=headers, json=payload, timeout=120
        )
        # Latency is measured before status checking / JSON decoding.
        latency = round(time.perf_counter() - start, 3)
        resp.raise_for_status()
        data = resp.json()

        # `or` fallback guards against a "usage" key that is present but null.
        usage = data.get("usage") or {}
        result = {
            "label": label,
            "status": resp.status_code,
            "prompt_tokens": usage.get("prompt_tokens"),
            "completion_tokens": usage.get("completion_tokens"),
            "total_tokens": usage.get("total_tokens"),
            "estimated_cost": usage.get("estimated_cost"),
            "latency_sec": latency,
        }
        if include_preview:
            # Only computed on request, and tolerant of a null content or a
            # missing message, so the preview can never fail a good response.
            choices = data.get("choices") or []
            msg = (choices[0].get("message") or {}) if choices else {}
            result["response_preview"] = (msg.get("content") or "")[:200]
        return result

    except Exception as e:
        # Deliberately broad: a failed request yields an error row rather
        # than stopping the comparison.
        result = {
            "label": label,
            "status": "error",
            "prompt_tokens": None,
            "completion_tokens": None,
            "total_tokens": None,
            "estimated_cost": None,
            "latency_sec": None,
        }
        if include_preview:
            result["response_preview"] = f"❌ {e}"
        return result


# =========================================
# Main
# =========================================
if __name__ == "__main__":

    def _announce_and_send(description, label, payload):
        """Print a banner and description, send the request, print the outcome."""
        print("=" * 80)
        print(description)
        outcome = send_request(label, payload)
        print(f"✅ Done ({outcome['status']}) in {outcome['latency_sec']}s\n")
        return outcome

    # Read the sample payload from disk and pin the model
    with open(JSON_PATH, "r", encoding="utf-8") as f:
        base_payload = json.load(f)
    base_payload["model"] = FORCED_MODEL

    # Case 1: the payload as loaded (max_tokens left untouched)
    result_full_default = _announce_and_send(
        "Sending full request (default max_tokens)",
        "FULL_DEFAULT",
        base_payload,
    )

    # Case 2: same messages, output capped at a single token
    full_short = {**base_payload, "max_tokens": 1}
    result_full_1tok = _announce_and_send(
        "Sending full request (max_tokens=1)",
        "FULL_MAXTOK_1",
        full_short,
    )

    # Case 3: only the selected messages (out-of-range indexes are dropped),
    # output capped at a single token
    subset_payload = {
        **base_payload,
        "messages": [
            base_payload["messages"][idx]
            for idx in MESSAGE_INDEXES
            if 0 <= idx < len(base_payload["messages"])
        ],
        "max_tokens": 1,
    }
    result_subset_1tok = _announce_and_send(
        f"Sending subset request (messages={MESSAGE_INDEXES}, max_tokens=1)",
        f"SUBSET_{MESSAGE_INDEXES}_MAXTOK_1",
        subset_payload,
    )

    # One table with a row per case
    df = pd.DataFrame([result_full_default, result_full_1tok, result_subset_1tok])
    print("\n=== Cost / Token Usage Data ===")
    print(df.to_string(index=False))


Sending full request (default max_tokens)
✅ Done (200) in 26.876s

Sending full request (max_tokens=1)
✅ Done (200) in 2.659s

Sending subset request (messages=[0, 2], max_tokens=1)
✅ Done (200) in 0.507s


=== Cost / Token Usage Data ===
                 label  status  prompt_tokens  completion_tokens  total_tokens  estimated_cost  latency_sec
          FULL_DEFAULT     200           4805               1079          5884        0.000332       26.876
         FULL_MAXTOK_1     200           4805                  1          4806        0.000192        2.659
SUBSET_[0, 2]_MAXTOK_1     200           2948                  1          2949        0.000118        0.507
