# Day 12 — Cache Benchmark
**Internship Task:** Compare caching patterns: Cache-Aside, Write-Through, Write-Behind  
**Tools:** `redis-py`, `pymongo`, `time`, `matplotlib`

> Ensure MongoDB and Redis are running before executing cells.

In [None]:
import redis
import time
import json
import random
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from pymongo import MongoClient

# Connections: Redis logical database 1 (responses decoded to str) and local MongoDB
r = redis.Redis(host='localhost', port=6379, db=1, decode_responses=True)
client = MongoClient('mongodb://localhost:27017')
db = client['benchmark_db']
col = db['products']

# Reset the collection, then seed it with 100 products whose _id runs 1..100
col.drop()
products = [
    {
        '_id': product_id,
        'name': f'Product_{product_id}',
        'price': round(random.uniform(10, 5000), 2),
        'stock': random.randint(0, 500),
    }
    for product_id in range(1, 101)
]
col.insert_many(products)

seeded_count = col.count_documents({})
print(f'✅ Seeded {seeded_count} products into MongoDB')
redis_alive = r.ping()
print(f'✅ Redis connected: {redis_alive}')

## Pattern 1 — Cache-Aside (Lazy Loading)
**How it works:**
1. App checks cache first
2. If **MISS** → fetch from DB → store in cache → return
3. If **HIT** → return from cache directly

**Best for:** Read-heavy workloads where not all data is needed

In [None]:
# Expiry passed to every r.setex() call that caches a product document.
CACHE_TTL = 60   # seconds

def cache_aside_get(product_id: int) -> tuple:
    """Read one product using the cache-aside (lazy loading) pattern.

    Redis is consulted first. On a miss the document is loaded from MongoDB
    (with its ``_id`` field projected out), stored in Redis as JSON with a
    ``CACHE_TTL``-second expiry, and returned.

    Args:
        product_id: ``_id`` of the product to look up.

    Returns:
        A ``(document, status)`` pair. ``status`` is ``'HIT'`` when the value
        came from Redis and ``'MISS'`` when it was loaded from MongoDB.
        ``document`` is ``None`` when no product has that ``_id``; that
        ``None`` is cached as JSON ``null`` as well, so later lookups of the
        same unknown ID are reported as hits until the entry expires.
    """
    key = f'cache_aside:product:{product_id}'
    cached = r.get(key)
    if cached:
        return json.loads(cached), 'HIT'

    # Cache MISS — read from the database and populate the cache for next time
    doc = col.find_one({'_id': product_id}, {'_id': 0})
    r.setex(key, CACHE_TTL, json.dumps(doc))
    return doc, 'MISS'

# Remove leftover cache-aside entries so the run starts with a cold cache
for stale_key in r.keys('cache_aside:*'):
    r.delete(stale_key)

# Benchmark: 200 reads drawn from IDs 1-20, so repeated IDs produce hits
ids = [random.randint(1, 20) for _ in range(200)]
times_ca = []
hits_ca = 0
misses_ca = 0

for pid in ids:
    t0 = time.perf_counter()
    _, status = cache_aside_get(pid)
    elapsed_ms = (time.perf_counter() - t0) * 1000  # seconds -> ms
    times_ca.append(elapsed_ms)
    if status != 'HIT':
        misses_ca += 1
    else:
        hits_ca += 1

hit_rate_ca = hits_ca / len(ids) * 100
avg_ca = sum(times_ca) / len(times_ca)
print(f'Cache-Aside: hits={hits_ca}, misses={misses_ca}, hit_rate={hit_rate_ca:.1f}%')
print(f'Avg response time = {avg_ca:.3f} ms')

## Pattern 2 — Write-Through
**How it works:**
- Every **write** updates **both** cache AND database synchronously
- Cache is always in sync with DB

**Best for:** Read-heavy workloads where data consistency is critical

In [None]:
def write_through_update(product_id: int, new_price: float):
    """Update a product's price in MongoDB and refresh its Redis entry.

    The database is written first; the stored document is then read back
    (without ``_id``) and cached as JSON with a ``CACHE_TTL``-second expiry.

    Args:
        product_id: ``_id`` of the product to update.
        new_price: Value to store in the document's ``price`` field.

    Returns:
        The document as read back from MongoDB after the update.
    """
    selector = {'_id': product_id}
    col.update_one(selector, {'$set': {'price': new_price}})

    # Read the stored document back so the cache holds exactly what the DB has
    fresh = col.find_one(selector, {'_id': 0})
    r.setex(f'write_through:product:{product_id}', CACHE_TTL, json.dumps(fresh))
    return fresh

def write_through_get(product_id: int):
    """Read a product from the write-through cache, falling back to MongoDB.

    Args:
        product_id: ``_id`` of the product to look up.

    Returns:
        A ``(document, status)`` pair; ``status`` is ``'HIT'`` when the value
        came from Redis and ``'MISS'`` when it was loaded from MongoDB and
        cached with a ``CACHE_TTL``-second expiry.
    """
    key = f'write_through:product:{product_id}'
    payload = r.get(key)
    if not payload:
        # Nothing cached: load from the database and fill the cache
        loaded = col.find_one({'_id': product_id}, {'_id': 0})
        r.setex(key, CACHE_TTL, json.dumps(loaded))
        return loaded, 'MISS'
    return json.loads(payload), 'HIT'

# Remove leftover write-through entries before measuring
for stale_key in r.keys('write_through:*'):
    r.delete(stale_key)

# Benchmark: one timed write per product (IDs 1-100), then 100 timed random reads
write_times_wt = []
read_times_wt = []

for product_id in range(1, 101):
    started = time.perf_counter()
    write_through_update(product_id, round(random.uniform(10, 5000), 2))
    write_times_wt.append((time.perf_counter() - started) * 1000)

read_ids = [random.randint(1, 100) for _ in range(100)]
for product_id in read_ids:
    started = time.perf_counter()
    write_through_get(product_id)
    read_times_wt.append((time.perf_counter() - started) * 1000)

avg_write_wt = sum(write_times_wt) / len(write_times_wt)
avg_read_wt = sum(read_times_wt) / len(read_times_wt)
print(f'Write-Through: avg write = {avg_write_wt:.3f} ms | avg read = {avg_read_wt:.3f} ms')

## Pattern 3 — Write-Behind (Write-Back)
**How it works:**
- Write goes to **cache immediately** (fast response)
- DB write happens **later** in batch (async)

**Best for:** Write-heavy workloads where you can tolerate a short data-loss window

In [None]:
# Name of the Redis set that collects product IDs written to the cache
# but not yet synced to MongoDB.
DIRTY_SET = 'write_behind:dirty_keys'

def write_behind_update(product_id: int, new_price: float):
    """Record a price change in Redis only and mark the product as dirty.

    MongoDB is not touched here; the product ID is added to ``DIRTY_SET`` so
    a later flush can persist the change.

    Args:
        product_id: ``_id`` of the product being updated.
        new_price: New price to cache.

    Returns:
        The dict that was cached (``price`` and ``product_id``).
    """
    pending = dict(price=new_price, product_id=product_id)
    r.setex(f'write_behind:product:{product_id}', CACHE_TTL, json.dumps(pending))
    # Remember that this product still needs a database write
    r.sadd(DIRTY_SET, product_id)
    return pending

def flush_dirty_to_db():
    """Sync every product listed in ``DIRTY_SET`` from Redis to MongoDB.

    For each dirty ID the cached JSON entry is read and its ``price`` is
    written to the matching MongoDB document. The ID is then removed from the
    dirty set.

    If the cached entry no longer exists (for example because it expired
    before this flush ran), the pending write cannot be recovered. The ID is
    still removed from the dirty set so that stale markers do not accumulate
    and get re-scanned on every later flush.

    Returns:
        Number of products whose price was written to MongoDB.
    """
    flushed = 0
    for pid in r.smembers(DIRTY_SET):
        product_id = int(pid)
        cached = r.get(f'write_behind:product:{product_id}')
        if cached:
            price = json.loads(cached)['price']
            col.update_one({'_id': product_id}, {'$set': {'price': price}})
            flushed += 1
        # Clear the marker whether or not there was anything left to persist
        r.srem(DIRTY_SET, pid)
    return flushed

# Remove leftover write-behind entries; the pattern also matches DIRTY_SET itself
for k in r.keys('write_behind:*'):
    r.delete(k)

# Benchmark: 100 cache-only writes, one per product ID
write_times_wb = []
for i in range(1, 101):
    t0 = time.perf_counter()
    write_behind_update(i, round(random.uniform(10, 5000), 2))
    write_times_wb.append((time.perf_counter() - t0) * 1000)

avg_write_wb = sum(write_times_wb) / len(write_times_wb)
print(f'Write-Behind: avg write = {avg_write_wb:.3f} ms (cache only, no DB yet)')

# Time a single flush of everything marked dirty above
t0 = time.perf_counter()
flushed = flush_dirty_to_db()
flush_time = (time.perf_counter() - t0) * 1000
if flushed:
    print(f'Flush {flushed} records to DB = {flush_time:.1f} ms total ({flush_time/flushed:.2f} ms each)')
else:
    # Nothing was flushed, so a per-record figure would divide by zero
    print(f'Flush {flushed} records to DB = {flush_time:.1f} ms total')

## TTL & Eviction Policy Demo

In [None]:
print('── TTL Demonstration ───────────────────────')
# Three demo keys: a 3-second expiry, a 60-second expiry, and no expiry
r.set('demo:ttl_short', 'expires in 3s', ex=3)
r.set('demo:ttl_long',  'expires in 60s', ex=60)
r.set('demo:no_ttl',    'never expires')

ttl_short = r.ttl('demo:ttl_short')
print(f"TTL short  : {ttl_short} sec")
ttl_long = r.ttl('demo:ttl_long')
print(f"TTL long   : {ttl_long} sec")
ttl_none = r.ttl('demo:no_ttl')
print(f"TTL no_ttl : {ttl_none} (-1 = no expiry)")

# Wait past the 3-second expiry, then check which keys are still present
time.sleep(3.5)
print("\nAfter 3.5 sec wait:")
short_alive = r.exists('demo:ttl_short') == 1
print(f"  short_lived exists? {short_alive}")
long_alive = r.exists('demo:ttl_long') == 1
print(f"  long key exists?    {long_alive}")

print('\n── Eviction Policy ─────────────────────────')
policy = r.config_get('maxmemory-policy')
print(f"Current: {policy}")
print("\nPolicy options:")
policies = [
    ('noeviction',      'Returns error when memory is full (default)'),
    ('allkeys-lru',     'Evict least recently used keys'),
    ('volatile-lru',    'Evict LRU keys with TTL only'),
    ('allkeys-lfu',     'Evict least frequently used keys'),
    ('volatile-ttl',    'Evict keys with shortest TTL first'),
    ('allkeys-random',  'Evict random keys'),
]
for entry in policies:
    label, description = entry
    print(f"  {label:20s} → {description}")

## Benchmark Comparison Chart

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
fig.suptitle('Day 12 — Caching Pattern Benchmark', fontsize=15, fontweight='bold')

# ── Plot 1: Response time distribution ──────
# Split the recorded timings by outcome. The benchmark ran against a cleared
# cache, so the first request for each ID was the miss and every repeat was a
# hit (assuming the run finished before any entry reached its CACHE_TTL).
# Misses are scattered through the run, so a positional slice would mislabel them.
seen_ids = set()
hit_times_ca, miss_times_ca = [], []
for pid, elapsed_ms in zip(ids, times_ca):
    if pid in seen_ids:
        hit_times_ca.append(elapsed_ms)
    else:
        seen_ids.add(pid)
        miss_times_ca.append(elapsed_ms)

axes[0].boxplot([miss_times_ca, hit_times_ca], labels=['Cache MISS\n(DB query)', 'Cache HIT\n(Redis only)'])
axes[0].set_title('Cache-Aside\nHit vs Miss Latency')
axes[0].set_ylabel('Response Time (ms)')
axes[0].set_facecolor('#f8f9fa')

# ── Plot 2: Write latency comparison ─────────
patterns = ['Write-Through\n(DB+Cache)', 'Write-Behind\n(Cache only)']
avgs     = [avg_write_wt, avg_write_wb]
colors   = ['#e74c3c', '#2ecc71']
bars = axes[1].bar(patterns, avgs, color=colors, edgecolor='white', linewidth=1.5, width=0.5)
# Label each bar with its value, just above the top edge
for bar, val in zip(bars, avgs):
    axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                 f'{val:.3f}ms', ha='center', va='bottom', fontsize=10, fontweight='bold')
axes[1].set_title('Write Latency Comparison')
axes[1].set_ylabel('Avg Write Time (ms)')
axes[1].set_facecolor('#f8f9fa')

# ── Plot 3: Cache-Aside hit rate pie ─────────
axes[2].pie(
    [hits_ca, misses_ca],
    labels=[f'HIT ({hits_ca})', f'MISS ({misses_ca})'],
    colors=['#2ecc71', '#e74c3c'],
    autopct='%1.1f%%',
    startangle=90,
    explode=(0.05, 0)
)
axes[2].set_title(f'Cache-Aside Hit Rate\n{hit_rate_ca:.1f}% over 200 reads')

plt.tight_layout()
plt.savefig('cache_benchmark_results.png', dpi=150, bbox_inches='tight')
plt.show()
print('Chart saved: cache_benchmark_results.png')

## Summary Table

In [None]:
# Print a fixed-width summary of the averages measured in the cells above
rule = '=' * 70
print(rule)
print(f'{"CACHING PATTERN BENCHMARK SUMMARY":^70}')
print(rule)

print(f'\n{"Pattern":<20} {"Avg Latency":<18} {"Consistency":<20} {"Best For"}')
print('-' * 70)
rows = [
    ('Cache-Aside',    f'{avg_ca:.3f} ms (read)',   'Eventual',   'Read-heavy, sparse access'),
    ('Write-Through',  f'{avg_write_wt:.3f} ms (write)', 'Strong', 'Read-heavy, always fresh'),
    ('Write-Behind',   f'{avg_write_wb:.3f} ms (write)', 'Eventual', 'Write-heavy, tolerate lag'),
]
for row in rows:
    print(f'{row[0]:<20} {row[1]:<18} {row[2]:<20} {row[3]}')

print(rule)
print(f'\nCache-Aside Hit Rate : {hit_rate_ca:.1f}%')
# Ratio of the average write-through write time to the average write-behind write time
speedup = avg_write_wt / avg_write_wb
print(f'Write-Behind speedup : {speedup:.1f}x faster than Write-Through')
print('\n✅ cache_benchmark.ipynb completed!')