#### Cell 1 — Install

In [1]:
!pip install -U google-genai python-dotenv pandas tabulate

Collecting google-genai
  Downloading google_genai-1.35.0-py3-none-any.whl.metadata (43 kB)
Downloading google_genai-1.35.0-py3-none-any.whl (244 kB)
Installing collected packages: google-genai
  Attempting uninstall: google-genai
    Found existing installation: google-genai 1.33.0
    Uninstalling google-genai-1.33.0:
      Successfully uninstalled google-genai-1.33.0
Successfully installed google-genai-1.35.0


#### Cell 2 — Load API key

In [2]:
from dotenv import load_dotenv
import os

# Environment file to read; pass ".env" instead if that is what yours is called
load_dotenv(dotenv_path="keys.env")

def masked(v, min_len=16):
    """Return a redacted form of a secret that is safe to print.

    Args:
        v: the secret string, or None/"" when it is not set.
        min_len: shortest length for which the first and last four characters
            are shown. With the default of 16 at most half of the value is
            ever revealed; shorter values are reported only as present.

    Returns:
        "False" when v is empty or None, "True" when v is set but shorter than
        min_len, otherwise "<first 4>...<last 4>".
    """
    if not v or len(v) < min_len:
        # Too short to redact meaningfully (or missing): only report presence
        return str(bool(v))
    return v[:4] + "..." + v[-4:]

# Print a redacted key, then stop here if the variable is missing or empty
print("GEMINI_API_KEY:", masked(os.environ.get("GEMINI_API_KEY")))
assert os.environ.get("GEMINI_API_KEY"), "Missing GEMINI_API_KEY in keys.env/.env"


GEMINI_API_KEY: AIza...2-E8


#### Cell 3 — Imports & global config

In [3]:
import time, math, statistics as stats, pandas as pd
from tabulate import tabulate
from typing import List, Tuple, Dict, Any
from google import genai
from google.genai import types

# Global defaults (you can tweak in experiments).
# run_once / bench_models fall back to N_RUNS, MAX_OUTPUT_TOKENS and TEMPERATURE
# when the matching argument is left as None; PROMPT is always passed explicitly.
PROMPT = "Explain transformers in AI in 3 short sentences."
N_RUNS = 5                 # measured repeats per model (warm-up calls are extra)
MAX_OUTPUT_TOKENS = 128    # default value sent as max_output_tokens
TEMPERATURE = 0.5          # default sampling temperature

# One shared client for every cell; no key is passed in code
client = genai.Client()    # uses GEMINI_API_KEY from env


#### Cell 4 — Model helpers (filters & rules)

In [4]:
def needs_thinking(model_id: str) -> bool:
    """Return True when the ID contains "gemini-2.5-pro".

    The notebook treats that family as requiring thinking mode
    (budget 128..32768).
    """
    marker = "gemini-2.5-pro"
    return model_id.find(marker) != -1

def disallow_thinking(model_id: str) -> bool:
    """Return True for Gemma models, which must be called without thinking_config.

    Accepts the ID with or without the "models/" prefix, so a bare
    "gemma-3-4b-it" is recognised the same way as "models/gemma-3-4b-it".
    """
    prefix = "models/"
    # Strip only the exact "models/" namespace; other prefixes are left alone
    bare_id = model_id[len(prefix):] if model_id.startswith(prefix) else model_id
    return bare_id.startswith("gemma-")

def is_text_model(model_id: str) -> bool:
    """Return False when the ID contains any non-text marker, True otherwise.

    Markers checked as plain substrings: imagen, veo, embedding, aqa, tts,
    live, image.
    """
    for marker in ("imagen", "veo", "embedding", "aqa", "tts", "live", "image"):
        if marker in model_id:
            return False
    return True

def normalize_ids(ids):
    """Return a list of the given IDs, each starting with "models/".

    IDs that already carry the prefix are passed through unchanged.
    """
    prefix = "models/"
    normalized = []
    for mid in ids:
        if mid.startswith(prefix):
            normalized.append(mid)
        else:
            normalized.append(prefix + mid)
    return normalized


#### Cell 5 — Core runner (streaming + conditional thinking + retries)

In [5]:
def run_once(model_id: str, prompt: str,
             max_output_tokens: int = None,
             temperature: float = None,
             candidate_count: int = 1,
             pro_thinking_budget: int = 256) -> Dict[str, Any]:
    """
    Stream one response from `model_id` and time it.

    Args:
        model_id: model name, passed to the client unchanged.
        prompt: text sent as the request contents.
        max_output_tokens: output cap; None falls back to MAX_OUTPUT_TOKENS.
        temperature: sampling temperature; None falls back to TEMPERATURE.
        candidate_count: forwarded to the generation config.
        pro_thinking_budget: thinking budget, used only when
            needs_thinking(model_id) is true (128..32768 for 2.5 Pro).

    Thinking config:
      - needs_thinking(model_id): ThinkingConfig(thinking_budget=pro_thinking_budget)
      - disallow_thinking(model_id): thinking_config omitted
      - anything else: ThinkingConfig(thinking_budget=0)

    Returns:
        dict with
          - model: the model_id that was called
          - ttfb_s: request start → first chunk carrying text (NaN if none did)
          - time_to_final_token_s: first → last chunk carrying text (NaN if none did)
          - total_s: request start → stream exhausted
          - text: all chunk texts concatenated

    Raises:
        The client's exception, re-raised once it is not transient or the two
        retries are used up. An error is transient when its integer `code`
        attribute is 429 or 503; errors without such an attribute fall back to
        matching RESOURCE_EXHAUSTED / UNAVAILABLE / 429 / 503 in the message.
    """
    retries = 2
    backoff = 1.5
    attempt = 0

    max_output_tokens = MAX_OUTPUT_TOKENS if max_output_tokens is None else max_output_tokens
    temperature = TEMPERATURE if temperature is None else temperature

    # The request config is the same for every attempt: build it once, and keep
    # that work out of the timed window.
    cfg = dict(
        temperature=temperature,
        max_output_tokens=max_output_tokens,
        candidate_count=candidate_count,
    )
    if needs_thinking(model_id):
        cfg["thinking_config"] = types.ThinkingConfig(thinking_budget=pro_thinking_budget)
    elif not disallow_thinking(model_id):
        cfg["thinking_config"] = types.ThinkingConfig(thinking_budget=0)
    # else: thinking_config is deliberately left out
    config = types.GenerateContentConfig(**cfg)

    while True:
        # Timers and buffers restart on every attempt so a retry is measured alone
        t0 = time.perf_counter()
        first = None
        last = None
        chunks = []
        try:
            stream = client.models.generate_content_stream(
                model=model_id,
                contents=prompt,
                config=config,
            )
            for chunk in stream:
                if chunk.text:
                    now = time.perf_counter()
                    if first is None:
                        first = now
                    last = now
                    chunks.append(chunk.text)
        except Exception as e:
            # Prefer the numeric status the SDK error carries; digits that merely
            # appear inside a message must not trigger a retry.
            code = getattr(e, "code", None)
            if isinstance(code, int):
                transient = code in (429, 503)
            else:
                msg = str(e)
                transient = any(tag in msg for tag in
                                ("RESOURCE_EXHAUSTED", "UNAVAILABLE", "429", "503"))
            if transient and attempt < retries:
                attempt += 1
                time.sleep(backoff * attempt)  # linear back-off: 1.5 s, then 3 s
                continue
            raise

        total_s = time.perf_counter() - t0
        got_text = first is not None  # `last` is always set together with `first`
        return {
            "model": model_id,
            "ttfb_s": (first - t0) if got_text else math.nan,
            "time_to_final_token_s": (last - first) if got_text else math.nan,
            "total_s": total_s,
            "text": "".join(chunks),
        }


#### Cell 6 — Benchmark function (table + CSV)

In [6]:
def bench_models(models: List[str], prompt: str,
                 max_output_tokens: int = None,
                 temperature: float = None,
                 candidate_count: int = 1,
                 pro_thinking_budget: int = 256,
                 runs: int = None,
                 csv_path: str = None,
                 warmup_runs: int = 2) -> pd.DataFrame:
    """
    Benchmark each model with run_once and return one summary row per model.

    Args:
        models: model IDs to benchmark, in order.
        prompt: prompt sent on every call.
        max_output_tokens, temperature, candidate_count, pro_thinking_budget:
            forwarded unchanged to run_once.
        runs: measured calls per model; None falls back to N_RUNS.
        csv_path: when given, the full table (error rows included) is written there.
        warmup_runs: untimed calls made before measuring; negative counts as 0.

    Returns:
        DataFrame with columns model, status, min/avg/max of ttfb, t2last and
        total, plus the settings used. `status` is "ok" or the first line of
        the error (max 200 chars). A model whose warm-up or measured call
        raises gets NaN metrics, so the metric columns stay float-typed even
        when every model fails. With runs=0 the row stays "ok" with NaN metrics.

    Side effects:
        Prints per-model progress and a table of the successful rows sorted by
        total_avg; optionally writes the CSV.
    """
    runs = N_RUNS if runs is None else runs
    nan3 = (math.nan, math.nan, math.nan)

    def summary(xs: List[float]) -> Tuple[float, float, float]:
        # (min, mean, max), or a NaN triple when there is nothing to summarise
        return (min(xs), sum(xs) / len(xs), max(xs)) if xs else nan3

    # Warm-up and measured calls use exactly the same settings
    call_kwargs = dict(
        max_output_tokens=max_output_tokens,
        temperature=temperature,
        candidate_count=candidate_count,
        pro_thinking_budget=pro_thinking_budget,
    )

    rows = []
    for mid in models:
        print(f"\n--- {mid} ---")
        status = "ok"
        try:
            # Warm-ups to reduce cold-start variance
            for _ in range(max(0, warmup_runs)):
                run_once(mid, prompt, **call_kwargs)

            # Measured runs
            times = [run_once(mid, prompt, **call_kwargs) for _ in range(runs)]

            # A run that produced no text reports NaN timings; leave those out
            ttfb = summary([t["ttfb_s"] for t in times if not math.isnan(t["ttfb_s"])])
            t2last = summary([t["time_to_final_token_s"] for t in times
                              if not math.isnan(t["time_to_final_token_s"])])
            total = summary([t["total_s"] for t in times])

            print(f"TTFB          min/avg/max: {ttfb}")
            print(f"Time-to-final min/avg/max: {t2last}")
            print(f"Total         min/avg/max: {total}")

        except Exception as e:
            # Best effort: record the failure and move on to the next model
            status = str(e).split("\n", 1)[0][:200]  # short error
            print(f"Error: {status}")
            ttfb = t2last = total = nan3

        rows.append({
            "model": mid,
            "status": status,
            "ttfb_min": ttfb[0], "ttfb_avg": ttfb[1], "ttfb_max": ttfb[2],
            "t2last_min": t2last[0], "t2last_avg": t2last[1], "t2last_max": t2last[2],
            "total_min": total[0], "total_avg": total[1], "total_max": total[2],
            "max_output_tokens": max_output_tokens if max_output_tokens is not None else MAX_OUTPUT_TOKENS,
            "temperature": temperature if temperature is not None else TEMPERATURE,
            "candidate_count": candidate_count,
            # The budget is only sent for models that need thinking; 0 otherwise
            "pro_thinking_budget": pro_thinking_budget if needs_thinking(mid) else 0,
        })

    df = pd.DataFrame(
        rows,
        columns=[
            "model", "status",
            "ttfb_min", "ttfb_avg", "ttfb_max",
            "t2last_min", "t2last_avg", "t2last_max",
            "total_min", "total_avg", "total_max",
            "max_output_tokens", "temperature", "candidate_count", "pro_thinking_budget",
        ],
    )

    # Save the full (including errors) table
    if csv_path:
        df.to_csv(csv_path, index=False)
        print(f"\nSaved full results (including errors): {csv_path}")

    # Show a clean table with only successful rows
    ok_df = df[df["status"] == "ok"].copy()
    if not ok_df.empty:
        ok_df = ok_df.sort_values("total_avg", na_position="last")
        print("\n=== Successful models (sorted by total_avg) ===")
        print(tabulate(ok_df, headers="keys", tablefmt="github", floatfmt=".3f"))
    else:
        print("\nNo successful rows to show.")

    return df


#### Cell 7 — Choose models to test (from your list)

In [7]:
# Representative set (edit as needed)
wanted = normalize_ids([
    # Gemini 1.5 (older; often free-tier)
    "gemini-1.5-flash",
    "gemini-1.5-pro",      # may be throttled on free tier

    # Gemini 2.0 (legacy/experimental speed tiers)
    "gemini-2.0-flash",
    "gemini-2.0-flash-lite",

    # Gemini 2.5 (current mainline; best for production)
    "gemini-2.5-flash",
    "gemini-2.5-flash-lite",
    "gemini-2.5-pro",      # requires thinking mode (handled automatically)

    # Optional: open-weight via API (omit if not needed)
    "gemma-3-4b-it",
])

# Keep only IDs the API lists for this key that also pass the text-model filter
available = {m.name for m in client.models.list()}
to_test = [m for m in wanted if (m in available and is_text_model(m))]

# Report what was dropped so a typo or a missing permission does not go unnoticed
skipped = [m for m in wanted if m not in to_test]
if skipped:
    print("Skipping (not listed by the API or not a text model):", skipped)

print("Testing these models:", to_test)
assert len(to_test) > 0, "After filtering, no models remain. Check model IDs or permissions."


Testing these models: ['models/gemini-1.5-flash', 'models/gemini-1.5-pro', 'models/gemini-2.0-flash', 'models/gemini-2.0-flash-lite', 'models/gemini-2.5-flash', 'models/gemini-2.5-flash-lite', 'models/gemini-2.5-pro', 'models/gemma-3-4b-it']


#### Cell 8 — Baseline benchmark + CSV

In [8]:
# Baseline over every selected model: 2 untimed warm-up calls, then 3 timed runs each
df_base = bench_models(
    models=to_test,
    prompt=PROMPT,
    runs=3,
    warmup_runs=2,                            # untimed calls made before measuring
    max_output_tokens=128,
    temperature=0.5,
    candidate_count=1,
    pro_thinking_budget=256,
    csv_path="gemini_latency_baseline.csv",  # full table, error rows included
)



--- models/gemini-1.5-flash ---
TTFB          min/avg/max: (2.4288187000202015, 3.545865833371257, 4.148669500020333)
Time-to-final min/avg/max: (0.42520169995259494, 0.45863549997253966, 0.48275339999236166)
Total         min/avg/max: (2.901229300070554, 4.007619966675217, 4.631613799952902)

--- models/gemini-1.5-pro ---
Error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.

--- models/gemini-2.0-flash ---
TTFB          min/avg/max: (0.7008652999065816, 0.7529738333153849, 0.8268949999473989)
Time-to-final min/avg/max: (0.3411140999523923, 0.43472943334685016, 0.6056403000839055)
Total         min/avg/max: (1.0630029999883845, 1.190409666664588, 1.432850499986671)

--- models/gemini-2.0-flash-lite ---
TTFB          min/avg/max: (0.4786669999593869, 0.5195707000093535, 0.5643330999882892)
Time-to-final min/avg/max: (0.42550220002885

### Experiments

#### A) Output length — max_output_tokens sweep

In [9]:
TEST_MODELS_A = ["models/gemini-2.5-flash", "models/gemini-2.5-flash-lite"]
TOKENS = [32, 64, 128, 256]

# One benchmark frame per token cap. assign() returns a labelled copy, so the
# frame returned by bench_models is not mutated and no generic `df` name is
# left behind in the notebook namespace.
frames = [
    bench_models(TEST_MODELS_A, PROMPT, max_output_tokens=tok, temperature=0.5,
                 candidate_count=1, pro_thinking_budget=256, runs=3)
    .assign(sweep=f"max_tokens={tok}")
    for tok in TOKENS
]

df_tokens = pd.concat(frames, ignore_index=True)
df_tokens.to_csv("gemini_sweep_tokens.csv", index=False)
print("\nSaved: gemini_sweep_tokens.csv")
df_tokens.head()



--- models/gemini-2.5-flash ---
TTFB          min/avg/max: (0.5129429999506101, 0.5990121666885292, 0.6883639000589028)
Time-to-final min/avg/max: (0.15281190001405776, 0.22903590001321086, 0.2991009000688791)
Total         min/avg/max: (0.7512671999866143, 0.8302920333032185, 0.9254178999690339)

--- models/gemini-2.5-flash-lite ---
TTFB          min/avg/max: (0.5537831999827176, 0.5921766666773086, 0.622090799966827)
Time-to-final min/avg/max: (0.14221620000898838, 0.3474687999890496, 0.7561650000279769)
Total         min/avg/max: (0.7461225000442937, 0.9412760333313296, 1.3101708999602124)

=== Successful models (sorted by total_avg) ===
|    | model                        | status   |   ttfb_min |   ttfb_avg |   ttfb_max |   t2last_min |   t2last_avg |   t2last_max |   total_min |   total_avg |   total_max |   max_output_tokens |   temperature |   candidate_count |   pro_thinking_budget |
|----|------------------------------|----------|------------|------------|------------|------

Unnamed: 0,model,status,ttfb_min,ttfb_avg,ttfb_max,t2last_min,t2last_avg,t2last_max,total_min,total_avg,total_max,max_output_tokens,temperature,candidate_count,pro_thinking_budget,sweep
0,models/gemini-2.5-flash,ok,0.512943,0.599012,0.688364,0.152812,0.229036,0.299101,0.751267,0.830292,0.925418,32,0.5,1,0,max_tokens=32
1,models/gemini-2.5-flash-lite,ok,0.553783,0.592177,0.622091,0.142216,0.347469,0.756165,0.746123,0.941276,1.310171,32,0.5,1,0,max_tokens=32
2,models/gemini-2.5-flash,ok,0.496832,0.58226,0.64407,0.375979,0.402184,0.433461,0.872973,0.984961,1.078712,64,0.5,1,0,max_tokens=64
3,models/gemini-2.5-flash-lite,ok,0.469919,0.552592,0.599782,0.210905,0.236034,0.255746,0.725959,0.79067,0.832152,64,0.5,1,0,max_tokens=64
4,models/gemini-2.5-flash,ok,0.537131,0.563906,0.597281,0.394624,0.429178,0.451912,0.980735,0.994835,1.009382,128,0.5,1,0,max_tokens=128


#### B) Prompt size — small vs large input

In [10]:
BASE_PROMPT = "Explain transformers in AI in 3 short sentences."
FILLER = " The quick brown fox jumps over the lazy dog."  # padding unit, 45 characters

def make_prompt(repeats):
    """Return BASE_PROMPT followed by `repeats` copies of FILLER."""
    padding = "".join(FILLER for _ in range(repeats))
    return f"{BASE_PROMPT}{padding}"

REPEATS = [0, 20, 50, 100]
MODEL_B = "models/gemini-2.5-flash"

# One benchmark frame per prompt size. assign() returns a labelled copy, so the
# frame returned by bench_models is not mutated and no generic `df` name is
# left behind in the notebook namespace.
frames = [
    bench_models([MODEL_B], make_prompt(r), max_output_tokens=128, temperature=0.5,
                 candidate_count=1, pro_thinking_budget=256, runs=3)
    .assign(sweep=f"repeats={r}")
    for r in REPEATS
]

df_prompt = pd.concat(frames, ignore_index=True)
df_prompt.to_csv("gemini_sweep_prompt_size.csv", index=False)
print("\nSaved: gemini_sweep_prompt_size.csv")
df_prompt.head()



--- models/gemini-2.5-flash ---
Error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.

No successful rows to show.

--- models/gemini-2.5-flash ---
Error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.

No successful rows to show.

--- models/gemini-2.5-flash ---
Error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.

No successful rows to show.

--- models/gemini-2.5-flash ---
Error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more info

Unnamed: 0,model,status,ttfb_min,ttfb_avg,ttfb_max,t2last_min,t2last_avg,t2last_max,total_min,total_avg,total_max,max_output_tokens,temperature,candidate_count,pro_thinking_budget,sweep
0,models/gemini-2.5-flash,429 RESOURCE_EXHAUSTED. {'error': {'code': 429...,,,,,,,,,,128,0.5,1,0,repeats=0
1,models/gemini-2.5-flash,429 RESOURCE_EXHAUSTED. {'error': {'code': 429...,,,,,,,,,,128,0.5,1,0,repeats=20
2,models/gemini-2.5-flash,429 RESOURCE_EXHAUSTED. {'error': {'code': 429...,,,,,,,,,,128,0.5,1,0,repeats=50
3,models/gemini-2.5-flash,429 RESOURCE_EXHAUSTED. {'error': {'code': 429...,,,,,,,,,,128,0.5,1,0,repeats=100


#### C) Thinking budget (2.5 Pro only)

In [11]:
MODEL_C = "models/gemini-2.5-pro"
BUDGETS = [128, 256, 512]

# One benchmark frame per thinking budget. assign() returns a labelled copy, so
# the frame returned by bench_models is not mutated and no generic `df` name is
# left behind in the notebook namespace.
frames = [
    bench_models([MODEL_C], PROMPT, max_output_tokens=128, temperature=0.5,
                 candidate_count=1, pro_thinking_budget=b, runs=3)
    .assign(sweep=f"budget={b}")
    for b in BUDGETS
]

df_budget = pd.concat(frames, ignore_index=True)
df_budget.to_csv("gemini_sweep_thinking_budget.csv", index=False)
print("\nSaved: gemini_sweep_thinking_budget.csv")
df_budget.head()



--- models/gemini-2.5-pro ---
TTFB          min/avg/max: (3.3087566000176594, 7.001742099993862, 11.6514549999265)
Time-to-final min/avg/max: (0.20669559994712472, 0.23959606668601433, 0.2755213000345975)
Total         min/avg/max: (3.5844426000257954, 7.2461331333421795, 11.897535700001754)

=== Successful models (sorted by total_avg) ===
|    | model                 | status   |   ttfb_min |   ttfb_avg |   ttfb_max |   t2last_min |   t2last_avg |   t2last_max |   total_min |   total_avg |   total_max |   max_output_tokens |   temperature |   candidate_count |   pro_thinking_budget |
|----|-----------------------|----------|------------|------------|------------|--------------|--------------|--------------|-------------|-------------|-------------|---------------------|---------------|-------------------|-----------------------|
|  0 | models/gemini-2.5-pro | ok       |      3.309 |      7.002 |     11.651 |        0.207 |        0.240 |        0.276 |       3.584 |       7.246 |    

  df_budget = pd.concat(frames, ignore_index=True)


Unnamed: 0,model,status,ttfb_min,ttfb_avg,ttfb_max,t2last_min,t2last_avg,t2last_max,total_min,total_avg,total_max,max_output_tokens,temperature,candidate_count,pro_thinking_budget,sweep
0,models/gemini-2.5-pro,ok,3.308757,7.001742,11.651455,0.206696,0.239596,0.275521,3.584443,7.246133,11.897536,128,0.5,1,128,budget=128
1,models/gemini-2.5-pro,429 RESOURCE_EXHAUSTED. {'error': {'code': 429...,,,,,,,,,,128,0.5,1,256,budget=256
2,models/gemini-2.5-pro,429 RESOURCE_EXHAUSTED. {'error': {'code': 429...,,,,,,,,,,128,0.5,1,512,budget=512


#### D) Candidate count

In [12]:
MODEL_D = "models/gemini-2.5-flash"
CANDS = [1, 2, 3]

# One benchmark frame per candidate count. assign() returns a labelled copy, so
# the frame returned by bench_models is not mutated and no generic `df` name is
# left behind in the notebook namespace.
# NOTE(review): the recorded run shows 400 INVALID_ARGUMENT for candidate counts
# 2 and 3 - confirm this model accepts more than one candidate.
frames = [
    bench_models([MODEL_D], PROMPT, max_output_tokens=128, temperature=0.5,
                 candidate_count=c, pro_thinking_budget=256, runs=3)
    .assign(sweep=f"candidates={c}")
    for c in CANDS
]

df_cands = pd.concat(frames, ignore_index=True)
df_cands.to_csv("gemini_sweep_candidates.csv", index=False)
print("\nSaved: gemini_sweep_candidates.csv")
df_cands.head()



--- models/gemini-2.5-flash ---
TTFB          min/avg/max: (0.5611943999538198, 0.6692118332721293, 0.7514743999345228)
Time-to-final min/avg/max: (0.4083305000094697, 0.42332870000973344, 0.43731800001114607)
Total         min/avg/max: (0.9716687999898568, 1.0969886999810115, 1.1850864000152797)

=== Successful models (sorted by total_avg) ===
|    | model                   | status   |   ttfb_min |   ttfb_avg |   ttfb_max |   t2last_min |   t2last_avg |   t2last_max |   total_min |   total_avg |   total_max |   max_output_tokens |   temperature |   candidate_count |   pro_thinking_budget |
|----|-------------------------|----------|------------|------------|------------|--------------|--------------|--------------|-------------|-------------|-------------|---------------------|---------------|-------------------|-----------------------|
|  0 | models/gemini-2.5-flash | ok       |      0.561 |      0.669 |      0.751 |        0.408 |        0.423 |        0.437 |       0.972 |       

  df_cands = pd.concat(frames, ignore_index=True)


Unnamed: 0,model,status,ttfb_min,ttfb_avg,ttfb_max,t2last_min,t2last_avg,t2last_max,total_min,total_avg,total_max,max_output_tokens,temperature,candidate_count,pro_thinking_budget,sweep
0,models/gemini-2.5-flash,ok,0.561194,0.669212,0.751474,0.408331,0.423329,0.437318,0.971669,1.096989,1.185086,128,0.5,1,0,candidates=1
1,models/gemini-2.5-flash,"400 INVALID_ARGUMENT. {'error': {'code': 400, ...",,,,,,,,,,128,0.5,2,0,candidates=2
2,models/gemini-2.5-flash,"400 INVALID_ARGUMENT. {'error': {'code': 400, ...",,,,,,,,,,128,0.5,3,0,candidates=3


#### E) Temperature

In [13]:
MODEL_E = "models/gemini-2.5-flash"
TEMPS = [0.0, 0.2, 0.5, 0.9]

# One benchmark frame per temperature. assign() returns a labelled copy, so the
# frame returned by bench_models is not mutated and no generic `df` name is
# left behind in the notebook namespace.
frames = [
    bench_models([MODEL_E], PROMPT, max_output_tokens=128, temperature=t,
                 candidate_count=1, pro_thinking_budget=256, runs=3)
    .assign(sweep=f"temperature={t}")
    for t in TEMPS
]

df_temp = pd.concat(frames, ignore_index=True)
df_temp.to_csv("gemini_sweep_temperature.csv", index=False)
print("\nSaved: gemini_sweep_temperature.csv")
df_temp.head()



--- models/gemini-2.5-flash ---
TTFB          min/avg/max: (0.48513009992893785, 0.593769499954457, 0.6797960000112653)
Time-to-final min/avg/max: (0.3512427000096068, 0.4996522666964059, 0.6030079000629485)
Total         min/avg/max: (1.030003499938175, 1.0958349666325375, 1.2231318999547511)

=== Successful models (sorted by total_avg) ===
|    | model                   | status   |   ttfb_min |   ttfb_avg |   ttfb_max |   t2last_min |   t2last_avg |   t2last_max |   total_min |   total_avg |   total_max |   max_output_tokens |   temperature |   candidate_count |   pro_thinking_budget |
|----|-------------------------|----------|------------|------------|------------|--------------|--------------|--------------|-------------|-------------|-------------|---------------------|---------------|-------------------|-----------------------|
|  0 | models/gemini-2.5-flash | ok       |      0.485 |      0.594 |      0.680 |        0.351 |        0.500 |        0.603 |       1.030 |       1.0

  df_temp = pd.concat(frames, ignore_index=True)


Unnamed: 0,model,status,ttfb_min,ttfb_avg,ttfb_max,t2last_min,t2last_avg,t2last_max,total_min,total_avg,total_max,max_output_tokens,temperature,candidate_count,pro_thinking_budget,sweep
0,models/gemini-2.5-flash,ok,0.48513,0.593769,0.679796,0.351243,0.499652,0.603008,1.030003,1.095835,1.223132,128,0.0,1,0,temperature=0.0
1,models/gemini-2.5-flash,429 RESOURCE_EXHAUSTED. {'error': {'code': 429...,,,,,,,,,,128,0.2,1,0,temperature=0.2
2,models/gemini-2.5-flash,429 RESOURCE_EXHAUSTED. {'error': {'code': 429...,,,,,,,,,,128,0.5,1,0,temperature=0.5
3,models/gemini-2.5-flash,429 RESOURCE_EXHAUSTED. {'error': {'code': 429...,,,,,,,,,,128,0.9,1,0,temperature=0.9


#### F) Streaming vs non-streaming

In [15]:
def run_once_nostream(model_id: str, prompt: str,
                      max_output_tokens: int = None,
                      temperature: float = None,
                      candidate_count: int = 1,
                      pro_thinking_budget: int = 256):
    """
    Make one non-streaming request and time it end to end.

    Args:
        model_id: model name, passed to the client unchanged.
        prompt: text sent as the request contents.
        max_output_tokens: output cap; None falls back to MAX_OUTPUT_TOKENS.
        temperature: sampling temperature; None falls back to TEMPERATURE.
        candidate_count: forwarded to the generation config.
        pro_thinking_budget: thinking budget, used only when
            needs_thinking(model_id) is true.

    The thinking config follows the same three-way rule as the streaming
    runner: budget for needs_thinking models, omitted for disallow_thinking
    models, 0 otherwise.

    Returns:
        dict with the same keys as the streaming runner's result:
          - model: the model_id that was called
          - ttfb_s, time_to_final_token_s: always NaN (nothing is streamed)
          - total_s: seconds spent inside the generate_content call
          - text: the response text ("" when the response has none)

    Raises:
        Whatever the client raises; this function does not retry.
    """
    max_output_tokens = MAX_OUTPUT_TOKENS if max_output_tokens is None else max_output_tokens
    temperature = TEMPERATURE if temperature is None else temperature

    cfg = dict(
        temperature=temperature,
        max_output_tokens=max_output_tokens,
        candidate_count=candidate_count,
    )
    if needs_thinking(model_id):
        cfg["thinking_config"] = types.ThinkingConfig(thinking_budget=pro_thinking_budget)
    elif not disallow_thinking(model_id):
        cfg["thinking_config"] = types.ThinkingConfig(thinking_budget=0)
    # else: thinking_config is deliberately left out
    config = types.GenerateContentConfig(**cfg)

    # Time only the request itself, not the local config construction
    t0 = time.perf_counter()
    resp = client.models.generate_content(
        model=model_id,
        contents=prompt,
        config=config,
    )
    total = time.perf_counter() - t0

    return {
        "model": model_id,
        "ttfb_s": math.nan,
        "time_to_final_token_s": math.nan,
        "total_s": total,
        "text": resp.text or "",
    }

def compare_stream_vs_nostream(model="models/gemini-2.5-flash", runs=3):
    """Print average timings for streaming and non-streaming calls to `model`.

    Makes one untimed call in each mode, then `runs` streaming calls followed
    by `runs` non-streaming calls, all with PROMPT and default settings.
    NaN timings are left out of each average; an average with no usable
    values is reported as NaN. Returns None.
    """
    print(f"\n--- {model} | streaming vs non-streaming ---")

    # One untimed call per mode before measuring
    run_once(model, PROMPT)
    run_once_nostream(model, PROMPT)

    # Timed calls: the whole streaming batch first, then the non-streaming batch
    streamed = []
    for _ in range(runs):
        streamed.append(run_once(model, PROMPT))
    unstreamed = []
    for _ in range(runs):
        unstreamed.append(run_once_nostream(model, PROMPT))

    def mean_of(results, key):
        kept = []
        for res in results:
            if not math.isnan(res[key]):
                kept.append(res[key])
        if not kept:
            return math.nan
        return sum(kept) / len(kept)

    print("Streaming:    TTFB avg =", mean_of(streamed, "ttfb_s"),
          "Time-to-final avg =", mean_of(streamed, "time_to_final_token_s"),
          "Total avg =", mean_of(streamed, "total_s"))
    print("Non-streaming:           Total avg =", mean_of(unstreamed, "total_s"))

# Run the comparison with the function's default model and run count spelled out
compare_stream_vs_nostream(model="models/gemini-2.5-flash", runs=3)



--- models/gemini-2.5-flash | streaming vs non-streaming ---
Streaming:    TTFB avg = 2.4155947666149586 Time-to-final avg = 0.42524026668009657 Total avg = 2.8452505999788023
Non-streaming:           Total avg = 1.0309919667197391
