# KV-Cache Sketch Compression Analysis

This notebook analyzes the results from sketch-based KV-cache compression experiments.


In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pathlib import Path
import os

# Global plot appearance: seaborn 'whitegrid' theme, then default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})


## Load Results

Load experimental results from JSON files for analysis.


In [None]:
# Locations of the experiment outputs (relative to the current working directory)
results_dir = Path('../results')
baseline_dir = results_dir.joinpath('baseline')
sketch_dir = results_dir.joinpath('sketch')

# Load results
def load_json(filepath):
    """Load and parse a JSON file, returning None when the file is missing.

    Args:
        filepath: Location of the JSON file (string or path-like object).

    Returns:
        The decoded JSON content, or None if no file exists at ``filepath``.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    try:
        # Explicit UTF-8 so decoding does not depend on the platform's default
        # encoding; opening directly (instead of checking existence first)
        # avoids a gap between the check and the read.
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # A missing results file is an expected situation; callers test for None.
        return None

# Load both summary files; load_json returns None for a file that does not exist.
baseline_path = baseline_dir / 'baseline_summary.json'
sketch_path = sketch_dir / 'experiment_summary.json'
baseline_summary = load_json(baseline_path)
sketch_summary = load_json(sketch_path)

# Only claim success when both files were actually found; otherwise name the
# missing file(s) so that empty output in later cells is explained.
if baseline_summary is not None and sketch_summary is not None:
    print("Results loaded successfully!")
else:
    if baseline_summary is None:
        print(f"Warning: baseline summary not found at {baseline_path}")
    if sketch_summary is None:
        print(f"Warning: sketch summary not found at {sketch_path}")

if sketch_summary:
    print(f"Number of sketch configurations: {sketch_summary.get('num_configs', 0)}")


## Visualization: Memory Comparison

Compare the memory usage of each sketch configuration against the full KV-cache baseline.


In [None]:
# Scatter the average memory of every sketch configuration and overlay the
# full-cache baseline as a horizontal reference line.
if sketch_summary and baseline_summary:
    sketch_results = sketch_summary.get('results', [])

    if sketch_results:
        memories = [r['avg_memory_mb'] for r in sketch_results]
        strategies = [r['config']['strategy'] for r in sketch_results]

        # Create visualization
        plt.figure(figsize=(10, 6))
        # One scatter call per colour group so the legend explains the colour
        # coding: blue marks the 'topk' strategy, red marks every other strategy.
        for color, label, is_topk in (('blue', 'topk', True),
                                      ('red', 'other strategies', False)):
            idx = [i for i, s in enumerate(strategies) if (s == 'topk') == is_topk]
            if idx:
                plt.scatter(idx, [memories[i] for i in idx], c=color, s=100,
                            alpha=0.6, label=label)

        # Add baseline; `or 0` also covers a key that is present but null.
        baseline_memory = baseline_summary.get('memory_mb_seq256_mean', 0) or 0
        has_baseline = baseline_memory > 0
        if has_baseline:
            plt.axhline(baseline_memory, color='gold', linestyle='--',
                        linewidth=2, label=f'Full Cache: {baseline_memory:.2f} MB')

        plt.xlabel('Configuration Index')
        plt.ylabel('Memory (MB)')
        plt.title('Memory Usage: Sketch Configurations vs Full Cache')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig('../results/memory_comparison.png', dpi=300)
        plt.show()

        # The savings ratio is only meaningful (and only safe to divide) when a
        # positive baseline measurement is available.
        if has_baseline:
            print(f"Memory savings: {(1 - np.mean(memories)/baseline_memory)*100:.1f}% on average")
        else:
            print("Memory savings: n/a (baseline memory measurement unavailable)")


In [None]:
# Text summary of the baseline measurement and the best sketch configurations.
print("="*60)
print("EXPERIMENTAL RESULTS SUMMARY")
print("="*60)

# Initialise up front: the sketch section below reads baseline_memory even when
# no baseline summary was loaded, which would otherwise raise NameError (or
# silently reuse a stale value left behind by an earlier cell).
baseline_memory = 0

if baseline_summary:
    print("\n1. BASELINE (Full KV-Cache):")
    # `or 0` also covers a key that is present but null in the JSON.
    baseline_memory = baseline_summary.get('memory_mb_seq256_mean', 0) or 0
    print(f"   Memory: {baseline_memory:.2f} MB")

if sketch_summary:
    sketch_results = sketch_summary.get('results', [])
    if sketch_results:
        print("\n2. SKETCH COMPRESSION:")
        # Configuration with the lowest average memory.
        best_memory = min(sketch_results, key=lambda x: x['avg_memory_mb'])
        print("\n   Best Memory Configuration:")
        print(f"     Memory: {best_memory['avg_memory_mb']:.2f} MB")
        # Savings are relative to the baseline, so skip them without one.
        if baseline_memory > 0:
            savings = (1 - best_memory['avg_memory_mb']/baseline_memory) * 100
            print(f"     Savings: {savings:.1f}%")

        # Configuration with the highest average throughput.
        best_throughput = max(sketch_results, key=lambda x: x['avg_throughput'])
        print("\n   Best Throughput Configuration:")
        print(f"     Throughput: {best_throughput['avg_throughput']:.2f} tokens/s")

print("\n" + "="*60)
