## Final plots (GPT-2, A100)

This notebook generates the plots used in the final `PROJECT.md` writeup:
- Baseline full-cache metrics at total lengths 256 and 512
- Baseline comparisons (sliding window, quantization)
- CMS eviction runs (targeted 12-config grid)

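Outputs are saved under `results/figures/`, resolved relative to the parent of the notebook's working directory (`../results/figures/`); the directory is created if it does not exist.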


In [None]:
import json
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Shared seaborn look for every figure produced below.
sns.set_theme(style="whitegrid")

# All paths hang off the parent of the current working directory.
ROOT = Path("..")
RESULTS = ROOT.joinpath("results")
FIG_DIR = RESULTS.joinpath("figures")
# Create the figure directory up front so the later savefig calls have a target.
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Summary files read by the cells below.
BASELINE_SUMMARY = RESULTS.joinpath("baseline", "baseline_summary.json")
BASELINES_COMPARISON = RESULTS.joinpath("baselines", "comparison_summary.json")
SKETCH_TARGETED = RESULTS.joinpath("sketch", "targeted_summary.json")

# Report which inputs are present before anything tries to load them.
for label, summary_path in (
    ("baseline:", BASELINE_SUMMARY),
    ("baselines comparison:", BASELINES_COMPARISON),
    ("sketch targeted:", SKETCH_TARGETED),
):
    print(label, summary_path.exists())


In [None]:
def read_json(path: Path):
    """Load and return the parsed contents of the JSON file at *path*.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so the same summary file parses identically on every
    machine.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The deserialized JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

# Parse the two summary files located by the path constants defined earlier.
baseline = read_json(BASELINE_SUMMARY)
comparison = read_json(BASELINES_COMPARISON)

# The baseline summary is read through flat, per-length keys; reshape those
# into one row per total length (256 and 512).
baseline_row_256 = {
    "total_length": 256,
    "memory_mb": baseline["memory_mb_seq256_mean"],
    "latency_ms_per_token": baseline["latency_ms_seq256_mean"],
    "tokens_per_sec": baseline["tokens_per_sec_seq256_mean"],
}
baseline_row_512 = {
    "total_length": 512,
    "memory_mb": baseline["memory_mb_seq512_mean"],
    "latency_ms_per_token": baseline["latency_ms_seq512_mean"],
    "tokens_per_sec": baseline["tokens_per_sec_seq512_mean"],
}
baseline_df = pd.DataFrame([baseline_row_256, baseline_row_512])

# comparison["results"] is walked as {bucket: {method: stats}}; flatten it to
# one row per (bucket, method) pair.
rows = []
for bucket, data in comparison["results"].items():
    # The numeric length is whatever remains of the bucket name once "len"
    # is removed.
    total_length = int(bucket.replace("len", ""))
    rows.extend(
        {
            "total_length": total_length,
            "method": method,
            # .get leaves None in the row when a stat is absent.
            "memory_mb": stats.get("avg_memory_mb"),
            "latency_s": stats.get("avg_latency_s"),
        }
        for method, stats in data.items()
    )
comparison_df = pd.DataFrame(rows)

# Cell output: the baseline table and a preview of the comparison table.
baseline_df, comparison_df.head()


In [None]:
# Baselines-comparison latency (total wall time), one bar group per total
# length with one bar per method.
# Cast latency to float on a new frame so comparison_df itself is untouched.
plot_df = comparison_df.assign(latency_s=comparison_df["latency_s"].astype(float))

fig, ax = plt.subplots(figsize=(7, 4))
sns.barplot(data=plot_df, x="total_length", y="latency_s", hue="method", ax=ax)
ax.set_title("Baseline comparisons: wall-time latency")
ax.set_ylabel("seconds (avg per sample)")
ax.set_xlabel("total length")
fig.tight_layout()
fig.savefig(FIG_DIR / "baselines_comparison_latency.png", dpi=200)
plt.show()

# Full-cache baseline throughput, one bar per total length.
fig, ax = plt.subplots(figsize=(7, 4))
sns.barplot(data=baseline_df, x="total_length", y="tokens_per_sec", ax=ax)
ax.set_title("Full-cache baseline throughput")
ax.set_ylabel("tokens/sec")
ax.set_xlabel("total length")
fig.tight_layout()
fig.savefig(FIG_DIR / "baseline_throughput.png", dpi=200)
plt.show()


In [None]:
# Sketch targeted summary (generated by experiments/sketch_experiments.py --grid targeted)
sketch = read_json(SKETCH_TARGETED)

# Fields copied from each result's nested "config" dict, in column order.
config_fields = ("sketch_width", "sketch_depth", "max_cache_size")
# Fields copied from the result record itself, in column order.
metric_fields = (
    "avg_memory_mb",
    "cache_memory_mb",
    "sketch_memory_mb",
    "avg_latency_s",
    "avg_throughput",
)

# Flatten every result into a single-level row: total length first, then the
# config values, then the measured metrics.
sk_rows = []
for result in sketch["results"]:
    config = result["config"]
    row = {"total_length": result["total_length"]}
    row.update((field, config[field]) for field in config_fields)
    row.update((field, result[field]) for field in metric_fields)
    sk_rows.append(row)

sketch_df = pd.DataFrame(sk_rows)
# Cell output: preview of the flattened table.
sketch_df.head()


In [None]:
# Both figures plot a sketch_df metric against max_cache_size with one line
# per total length; only the metric, labels and output file differ.
# Each entry: (metric column, figure title, y-axis label, output file name).
sketch_plots = (
    (
        "cache_memory_mb",
        "Sketch eviction: cache MB vs max_cache_size",
        "cache MB (all layers)",
        "sketch_cache_mb_vs_max_cache_size.png",
    ),
    (
        "avg_throughput",
        "Sketch eviction: throughput vs max_cache_size",
        "tokens/sec",
        "sketch_throughput_vs_max_cache_size.png",
    ),
)

# NOTE(review): rows that share a max_cache_size and total_length (e.g. runs
# with different sketch widths/depths) are presumably combined by seaborn's
# default lineplot aggregation — confirm that is the intended presentation.
for metric, title, ylabel, filename in sketch_plots:
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.lineplot(
        data=sketch_df,
        x="max_cache_size",
        y=metric,
        hue="total_length",
        marker="o",
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel("max_cache_size (tokens)")
    fig.tight_layout()
    fig.savefig(FIG_DIR / filename, dpi=200)
    plt.show()
