# AluminatAI GPU Agent - A100 Test Suite

**Requirements:** Google Colab with A100 GPU runtime

This notebook tests:
1. GPU collector on real A100 hardware
2. Metrics data integrity and validation
3. API key authentication and ingest endpoint
4. End-to-end flow: collect → upload → verify
5. Energy calculation accuracy
6. Stress test under GPU load

---

## Setup

Go to **Runtime > Change runtime type** and select **A100 GPU** before running.

In [None]:
# Cell 1: Verify GPU & Install Dependencies
# Print the full nvidia-smi table so the attached GPU can be confirmed by eye.
!nvidia-smi
print("\n" + "="*70)

import subprocess
# Query only name, UUID, total memory and power limit, as header-less CSV.
# NOTE(review): the return code is not checked, so a failed query prints an
# empty "GPU Info:" line rather than an error.
result = subprocess.run(['nvidia-smi', '--query-gpu=name,uuid,memory.total,power.limit',
                         '--format=csv,noheader'], capture_output=True, text=True)
print(f"GPU Info: {result.stdout.strip()}")
print("="*70)

# Install the third-party packages used by later cells.
# NOTE(review): nvidia-ml-py3 is presumably what provides the `pynvml` module
# imported in Cell 3 - confirm. `rich` is not imported by any cell visible here.
!pip install -q nvidia-ml-py3 requests rich
print("\nDependencies installed.")

In [None]:
# Cell 2: Configuration
# ------------------------------------------
# Paste the API key from your dashboard below.
# Leaving it empty skips every upload test.
# ------------------------------------------
API_KEY = ""  # e.g. "alum_AbCdEf12345..."
API_ENDPOINT = "https://aluminatiai-landing.vercel.app/api/metrics/ingest"

# Sampling parameters shared by the collection tests.
SAMPLE_INTERVAL = 2.0  # seconds to wait between consecutive samples
NUM_SAMPLES = 10       # how many samples the end-to-end test collects

# Only the last four characters of the key are ever echoed back.
if API_KEY:
    masked_key = 'alum_****' + API_KEY[-4:]
else:
    masked_key = 'NOT SET'

print(f"API Endpoint: {API_ENDPOINT}")
print(f"API Key: {masked_key}")
print(f"Samples: {NUM_SAMPLES} at {SAMPLE_INTERVAL}s intervals")

if not API_KEY:
    print("\n⚠️  WARNING: No API key set. Upload tests will be skipped.")
    print("   Sign up at https://aluminatiai-landing.vercel.app to get your key.")

---
## Test 1: GPU Collector (NVML)

Test that we can initialize the NVML collector and read GPU metrics from the A100.

In [None]:
# Cell 3: Test NVML Collector
import pynvml
import time
from datetime import datetime, timezone
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional

# Raw NVML smoke tests. Each check runs in its own try/except so that one
# failure is reported and counted without preventing the remaining checks.
passed = 0
failed = 0
# Tracks whether nvmlInit() succeeded, so the final shutdown is only attempted
# when NVML was actually initialised (shutting down otherwise raises).
nvml_ready = False

# --- Test 1.1: NVML Initialization ---
try:
    pynvml.nvmlInit()
    nvml_ready = True
    gpu_count = pynvml.nvmlDeviceGetCount()
    assert gpu_count >= 1, "Expected at least 1 GPU"
    print(f"✅ Test 1.1: NVML initialized - {gpu_count} GPU(s) found")
    passed += 1
except Exception as e:
    print(f"❌ Test 1.1: NVML init failed - {e}")
    failed += 1

# --- Test 1.2: GPU Info Retrieval ---
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    gpu_name = pynvml.nvmlDeviceGetName(handle)
    gpu_uuid = pynvml.nvmlDeviceGetUUID(handle)
    # Depending on the binding version these come back as bytes or str.
    if isinstance(gpu_name, bytes): gpu_name = gpu_name.decode('utf-8')
    if isinstance(gpu_uuid, bytes): gpu_uuid = gpu_uuid.decode('utf-8')

    assert 'A100' in gpu_name or 'GPU' in gpu_name, f"Unexpected GPU: {gpu_name}"
    assert len(gpu_uuid) > 10, "UUID too short"
    print(f"✅ Test 1.2: GPU info - {gpu_name} ({gpu_uuid[:20]}...)")
    passed += 1
except Exception as e:
    print(f"❌ Test 1.2: GPU info failed - {e}")
    failed += 1

# --- Test 1.3: Power Reading ---
try:
    # NVML reports power in milliwatts; convert to watts.
    power_mw = pynvml.nvmlDeviceGetPowerUsage(handle)
    power_w = power_mw / 1000.0
    power_limit = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0

    assert 0 <= power_w <= 1500, f"Power out of range: {power_w}W"
    assert power_limit > 0, f"Power limit invalid: {power_limit}W"
    print(f"✅ Test 1.3: Power reading - {power_w:.1f}W / {power_limit:.0f}W limit")
    passed += 1
except Exception as e:
    print(f"❌ Test 1.3: Power reading failed - {e}")
    failed += 1

# --- Test 1.4: Utilization ---
try:
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
    assert 0 <= util.gpu <= 100, f"GPU util out of range: {util.gpu}%"
    assert 0 <= util.memory <= 100, f"Mem util out of range: {util.memory}%"
    print(f"✅ Test 1.4: Utilization - GPU: {util.gpu}%, Memory: {util.memory}%")
    passed += 1
except Exception as e:
    print(f"❌ Test 1.4: Utilization failed - {e}")
    failed += 1

# --- Test 1.5: Temperature ---
try:
    temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
    assert 0 <= temp <= 120, f"Temperature out of range: {temp}C"
    print(f"✅ Test 1.5: Temperature - {temp}°C")
    passed += 1
except Exception as e:
    print(f"❌ Test 1.5: Temperature failed - {e}")
    failed += 1

# --- Test 1.6: Memory ---
try:
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    # Byte counts converted to MiB.
    mem_used_mb = mem.used / 1024 / 1024
    mem_total_mb = mem.total / 1024 / 1024
    assert mem_total_mb > 1000, f"Total memory too low: {mem_total_mb}MB"
    assert mem_used_mb >= 0, f"Used memory negative: {mem_used_mb}MB"
    print(f"✅ Test 1.6: Memory - {mem_used_mb:.0f}MB / {mem_total_mb:.0f}MB")
    passed += 1
except Exception as e:
    print(f"❌ Test 1.6: Memory failed - {e}")
    failed += 1

# Release NVML only if it was initialised; otherwise the results summary
# below would never be printed because the shutdown call itself would raise.
if nvml_ready:
    pynvml.nvmlShutdown()

print(f"\n{'='*50}")
print(f"Test 1 Results: {passed} passed, {failed} failed")
print(f"{'='*50}")

---
## Test 2: Full Collector Class

Test the GPUCollector class with energy delta calculations.

In [None]:
# Cell 4: Define GPUCollector inline (same as agent/collector.py)
import pynvml
import time
from datetime import datetime, timezone
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional


@dataclass
class GPUMetrics:
    """One point-in-time reading for a single GPU.

    The first six fields are required; everything after them has a default
    so a snapshot can be built from the identification and power readings
    alone.
    """

    # --- identification ---
    timestamp: str
    gpu_index: int
    gpu_uuid: str
    gpu_name: str

    # --- power ---
    power_draw_w: float
    power_limit_w: float
    energy_delta_j: Optional[float] = None

    # --- load / thermals ---
    utilization_gpu_pct: int = 0
    utilization_memory_pct: int = 0
    temperature_c: int = 0
    fan_speed_pct: int = 0

    # --- clocks ---
    sm_clock_mhz: Optional[int] = None
    memory_clock_mhz: Optional[int] = None

    # --- memory ---
    memory_used_mb: float = 0
    memory_total_mb: float = 0

    def to_dict(self) -> Dict:
        """Return the snapshot as a plain dict keyed by field name."""
        snapshot = asdict(self)
        return snapshot


class GPUCollector:
    """Poll NVML for per-GPU metrics and derive energy used between polls.

    Construction initialises NVML and caches a handle plus UUID/name for
    every visible GPU. Each call to :meth:`collect` returns one
    ``GPUMetrics`` per GPU. From the second call onward, ``energy_delta_j``
    holds the trapezoidal estimate (mean of the previous and current power
    draw multiplied by the elapsed time) for that GPU.

    The instance can be used as a context manager; leaving the ``with``
    block shuts NVML down.

    Args:
        collect_clocks: When True, SM and memory clock speeds are also read
            on every sample. When False (the default) both clock fields
            stay ``None``.
    """

    def __init__(self, collect_clocks: bool = False):
        self.collect_clocks = collect_clocks
        self.initialized = False
        self.gpu_count = 0
        self.gpu_handles = []
        self.gpu_info = []
        # Per-GPU state from the previous sample, keyed by GPU index.
        self.last_sample_time = {}
        self.last_power_draw = {}
        self._initialize()

    @staticmethod
    def _to_text(value):
        """Decode NVML strings that some binding versions return as bytes."""
        return value.decode('utf-8') if isinstance(value, bytes) else value

    def _initialize(self):
        """Initialise NVML and cache handle, UUID and name for each GPU."""
        pynvml.nvmlInit()
        try:
            self.gpu_count = pynvml.nvmlDeviceGetCount()
            for i in range(self.gpu_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                self.gpu_handles.append(handle)
                self.gpu_info.append({
                    'index': i,
                    'uuid': self._to_text(pynvml.nvmlDeviceGetUUID(handle)),
                    'name': self._to_text(pynvml.nvmlDeviceGetName(handle)),
                })
        except Exception:
            # Do not leave NVML initialised behind a half-built collector.
            pynvml.nvmlShutdown()
            raise
        self.initialized = True

    def collect(self) -> "List[GPUMetrics]":
        """Take one sample from every GPU.

        Power draw and power limit are mandatory: an NVML error while
        reading them propagates to the caller. Utilisation, temperature, fan
        speed, memory and clocks are best-effort and fall back to 0 (or
        ``None`` for clocks) when NVML reports an error for that query.

        Returns:
            One ``GPUMetrics`` per GPU, in GPU-index order.
        """
        metrics = []
        timestamp = datetime.now(timezone.utc).isoformat()
        # Monotonic clock: a wall-clock adjustment between samples must not
        # produce a negative or inflated interval.
        current_time = time.monotonic()
        for i, handle in enumerate(self.gpu_handles):
            # NVML reports power in milliwatts.
            power_draw = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
            power_limit = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0

            energy_delta = None
            if i in self.last_sample_time:
                dt = current_time - self.last_sample_time[i]
                avg_power = (power_draw + self.last_power_draw[i]) / 2.0
                energy_delta = avg_power * dt
            self.last_sample_time[i] = current_time
            self.last_power_draw[i] = power_draw

            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                util_gpu, util_mem = util.gpu, util.memory
            except pynvml.NVMLError:
                util_gpu, util_mem = 0, 0

            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            except pynvml.NVMLError:
                temp = 0

            try:
                fan = pynvml.nvmlDeviceGetFanSpeed(handle)
            except pynvml.NVMLError:
                fan = 0

            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                mem_used, mem_total = mem.used / 1024 / 1024, mem.total / 1024 / 1024
            except pynvml.NVMLError:
                mem_used, mem_total = 0, 0

            sm_clock = mem_clock = None
            if self.collect_clocks:
                try:
                    sm_clock = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
                    mem_clock = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM)
                except pynvml.NVMLError:
                    sm_clock = mem_clock = None

            metrics.append(GPUMetrics(
                timestamp=timestamp, gpu_index=i,
                gpu_uuid=self.gpu_info[i]['uuid'],
                gpu_name=self.gpu_info[i]['name'],
                power_draw_w=power_draw, power_limit_w=power_limit,
                energy_delta_j=energy_delta,
                utilization_gpu_pct=util_gpu, utilization_memory_pct=util_mem,
                temperature_c=temp, fan_speed_pct=fan,
                sm_clock_mhz=sm_clock, memory_clock_mhz=mem_clock,
                memory_used_mb=mem_used, memory_total_mb=mem_total,
            ))
        return metrics

    def get_gpu_count(self):
        """Return the number of GPUs found at initialisation."""
        return self.gpu_count

    def get_gpu_info(self):
        """Return the cached list of ``{'index', 'uuid', 'name'}`` dicts."""
        return self.gpu_info

    def shutdown(self):
        """Shut NVML down once; further calls are no-ops."""
        if self.initialized:
            pynvml.nvmlShutdown()
            self.initialized = False

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.shutdown()

print("GPUCollector class defined.")

In [None]:
# Cell 5: Test Collector - Metrics Collection & Energy Calculation
# Each check runs in its own try/except so one failure is reported and
# counted without stopping the remaining checks.
passed = 0
failed = 0
# Reset these so a failed step cannot silently reuse objects left over from
# an earlier run of this cell, and so the final shutdown can be guarded.
collector = None
metrics2 = None

# --- Test 2.1: Initialization ---
try:
    collector = GPUCollector()
    assert collector.initialized
    assert collector.get_gpu_count() >= 1
    print(f"✅ Test 2.1: Collector initialized with {collector.get_gpu_count()} GPU(s)")
    passed += 1
except Exception as e:
    print(f"❌ Test 2.1: Collector init failed - {e}")
    failed += 1

# --- Test 2.2: First sample (no energy delta) ---
try:
    metrics1 = collector.collect()
    assert len(metrics1) == collector.get_gpu_count()
    m = metrics1[0]
    # There is no previous reading to integrate against on the first call.
    assert m.energy_delta_j is None, "First sample should have no energy delta"
    assert m.power_draw_w >= 0
    assert 0 <= m.utilization_gpu_pct <= 100
    assert 0 <= m.temperature_c <= 120
    assert m.memory_total_mb > 0
    print(f"✅ Test 2.2: First sample - {m.power_draw_w:.1f}W, {m.utilization_gpu_pct}% util, {m.temperature_c}°C")
    passed += 1
except Exception as e:
    print(f"❌ Test 2.2: First sample failed - {e}")
    failed += 1

# --- Test 2.3: Second sample (with energy delta) ---
try:
    time.sleep(SAMPLE_INTERVAL)
    metrics2 = collector.collect()
    m = metrics2[0]
    assert m.energy_delta_j is not None, "Second sample must have energy delta"
    assert m.energy_delta_j > 0, "Energy delta must be positive (GPU is drawing power)"
    # Sanity: E = P * t, so 2s at a 400W ceiling is 800J; the 3000J bound
    # below is deliberately loose to tolerate a longer-than-requested sleep.
    assert m.energy_delta_j < 3000, f"Energy delta too high: {m.energy_delta_j}J"
    print(f"✅ Test 2.3: Energy delta - {m.energy_delta_j:.2f}J ({m.power_draw_w:.1f}W * ~{SAMPLE_INTERVAL}s)")
    passed += 1
except Exception as e:
    print(f"❌ Test 2.3: Energy delta failed - {e}")
    failed += 1

# --- Test 2.4: Serialization ---
try:
    d = metrics2[0].to_dict()
    required_fields = ['timestamp', 'gpu_index', 'gpu_uuid', 'gpu_name',
                       'power_draw_w', 'power_limit_w', 'energy_delta_j',
                       'utilization_gpu_pct', 'utilization_memory_pct',
                       'temperature_c', 'memory_used_mb', 'memory_total_mb']
    for field in required_fields:
        assert field in d, f"Missing field: {field}"
    print(f"✅ Test 2.4: Serialization - all {len(required_fields)} required fields present")
    passed += 1
except Exception as e:
    print(f"❌ Test 2.4: Serialization failed - {e}")
    failed += 1

# --- Test 2.5: GPU Info ---
try:
    info = collector.get_gpu_info()
    assert len(info) >= 1
    assert 'uuid' in info[0]
    assert 'name' in info[0]
    print(f"✅ Test 2.5: GPU info - {info[0]['name']}")
    passed += 1
except Exception as e:
    print(f"❌ Test 2.5: GPU info failed - {e}")
    failed += 1

# Only shut down a collector that was actually created; otherwise the
# results summary below would be lost to an uncaught error.
if collector is not None:
    collector.shutdown()

print(f"\n{'='*50}")
print(f"Test 2 Results: {passed} passed, {failed} failed")
print(f"{'='*50}")

---
## Test 3: API Endpoint Validation

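Test the API authentication and payload validation of the ingest endpoint. (Rate-limit headers are not exercised here; they are only reported after the upload in Test 4.)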

In [None]:
# Cell 6: Test API Endpoint
import requests
import json

passed = 0
failed = 0


def _post_metrics(payload, api_key=None):
    """POST ``payload`` as JSON to the ingest endpoint.

    The X-API-Key header is attached only when ``api_key`` is supplied.
    """
    auth_headers = None if api_key is None else {'X-API-Key': api_key}
    return requests.post(API_ENDPOINT, json=payload, headers=auth_headers, timeout=10)


def _sample_metric(**overrides):
    """Build an otherwise-valid metric record, replacing the given fields."""
    record = {
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'gpu_index': 0,
        'gpu_uuid': 'test-uuid',
        'gpu_name': 'Test GPU',
        'power_draw_w': 100,
        'utilization_gpu_pct': 50,
        'utilization_memory_pct': 50,
        'temperature_c': 40,
        'memory_used_mb': 1000,
    }
    record.update(overrides)
    return record


# --- Test 3.1: Health check ---
# A plain GET on the ingest URL is expected to answer 200 with status "ok".
try:
    r = requests.get(API_ENDPOINT, timeout=10)
    assert r.status_code == 200
    data = r.json()
    assert data['status'] == 'ok'
    print(f"✅ Test 3.1: Health check passed - {data}")
    passed += 1
except Exception as e:
    print(f"❌ Test 3.1: Health check failed - {e}")
    failed += 1

# --- Test 3.2: Missing API key ---
try:
    r = _post_metrics([{"test": True}])
    assert r.status_code == 401, f"Expected 401, got {r.status_code}"
    assert 'Missing API key' in r.json()['error']
    print(f"✅ Test 3.2: Missing key returns 401")
    passed += 1
except Exception as e:
    print(f"❌ Test 3.2: Missing key test failed - {e}")
    failed += 1

# --- Test 3.3: Invalid API key format ---
try:
    r = _post_metrics([{"test": True}], api_key='bad_key_format')
    assert r.status_code == 401, f"Expected 401, got {r.status_code}"
    assert 'format' in r.json()['error'].lower()
    print(f"✅ Test 3.3: Invalid key format returns 401")
    passed += 1
except Exception as e:
    print(f"❌ Test 3.3: Invalid format test failed - {e}")
    failed += 1

# --- Test 3.4: Wrong API key (valid format) ---
# Well-formed (prefix + 59 characters) but not a key the server issued.
try:
    r = _post_metrics([{"test": True}], api_key='alum_' + 'A' * 59)
    assert r.status_code == 401, f"Expected 401, got {r.status_code}"
    print(f"✅ Test 3.4: Wrong key returns 401")
    passed += 1
except Exception as e:
    print(f"❌ Test 3.4: Wrong key test failed - {e}")
    failed += 1

# The remaining checks need a real key to get past authentication.
if not API_KEY:
    print("⏭️  Tests 3.5-3.8 skipped (no API key)")
else:
    # --- Test 3.5: Empty payload (with valid key) ---
    try:
        r = _post_metrics([], api_key=API_KEY)
        assert r.status_code == 400, f"Expected 400, got {r.status_code}"
        print(f"✅ Test 3.5: Empty payload returns 400")
        passed += 1
    except Exception as e:
        print(f"❌ Test 3.5: Empty payload test failed - {e}")
        failed += 1

    # --- Test 3.6: Missing required fields ---
    try:
        r = _post_metrics([{'gpu_index': 0}], api_key=API_KEY)  # nearly every field absent
        assert r.status_code == 400, f"Expected 400, got {r.status_code}"
        assert 'missing' in r.json()['error'].lower()
        print(f"✅ Test 3.6: Missing fields returns 400 - {r.json()['error']}")
        passed += 1
    except Exception as e:
        print(f"❌ Test 3.6: Missing fields test failed - {e}")
        failed += 1

    # --- Test 3.7: Invalid value ranges ---
    try:
        r = _post_metrics([_sample_metric(power_draw_w=-100)], api_key=API_KEY)  # negative power
        assert r.status_code == 400, f"Expected 400 for negative power, got {r.status_code}"
        print(f"✅ Test 3.7: Negative power rejected - {r.json()['error']}")
        passed += 1
    except Exception as e:
        print(f"❌ Test 3.7: Value range test failed - {e}")
        failed += 1

    # --- Test 3.8: Utilization out of range ---
    try:
        r = _post_metrics([_sample_metric(utilization_gpu_pct=150)], api_key=API_KEY)  # above 100%
        assert r.status_code == 400, f"Expected 400 for util>100, got {r.status_code}"
        print(f"✅ Test 3.8: Utilization >100 rejected")
        passed += 1
    except Exception as e:
        print(f"❌ Test 3.8: Utilization range test failed - {e}")
        failed += 1

print(f"\n{'='*50}")
print(f"Test 3 Results: {passed} passed, {failed} failed")
print(f"{'='*50}")

---
## Test 4: End-to-End - Collect & Upload Real Metrics

Collect real A100 metrics and upload them to the AluminatAI API.

In [None]:
# Cell 7: End-to-End Collection & Upload
# Collects NUM_SAMPLES readings, uploads them in one batch, then checks that
# the summed energy deltas are consistent with average power * elapsed time.
if not API_KEY:
    print("⏭️  Test 4 skipped (no API key set in Cell 2)")
else:
    passed = 0
    failed = 0
    all_metrics = []

    collector = GPUCollector()
    # Captured before shutdown; used to scale the expected energy because
    # all_metrics holds one record per GPU per sample.
    num_gpus = collector.get_gpu_count()
    try:
        print(f"Collecting {NUM_SAMPLES} samples from {collector.get_gpu_info()[0]['name']}...")
        print()

        for sample_num in range(NUM_SAMPLES):
            metrics = collector.collect()
            all_metrics.extend(m.to_dict() for m in metrics)

            m = metrics[0]
            # Compare against None so a genuine 0.00J delta is still shown.
            energy_str = f"{m.energy_delta_j:.2f}J" if m.energy_delta_j is not None else "N/A"
            print(f"  Sample {sample_num+1}/{NUM_SAMPLES}: "
                  f"{m.power_draw_w:.1f}W | "
                  f"{m.utilization_gpu_pct}% util | "
                  f"{m.temperature_c}°C | "
                  f"E={energy_str}")

            # No need to wait after the final sample.
            if sample_num < NUM_SAMPLES - 1:
                time.sleep(SAMPLE_INTERVAL)
    finally:
        # Release NVML even if a collection call raised.
        collector.shutdown()

    print(f"\nCollected {len(all_metrics)} metric records.")

    # --- Test 4.1: Upload batch ---
    try:
        with requests.Session() as session:
            session.headers.update({
                'Content-Type': 'application/json',
                'X-API-Key': API_KEY,
            })
            r = session.post(API_ENDPOINT, json=all_metrics, timeout=30)

        assert r.status_code == 200, f"Upload failed: {r.status_code} - {r.text}"
        data = r.json()
        assert data['success'] == True
        assert data['inserted'] == len(all_metrics)
        print(f"\n✅ Test 4.1: Uploaded {data['inserted']} metrics successfully")

        # Rate-limit headers are informational only; they are printed when present.
        rl_limit = r.headers.get('X-RateLimit-Limit')
        rl_remaining = r.headers.get('X-RateLimit-Remaining')
        if rl_limit:
            print(f"   Rate limit: {rl_remaining}/{rl_limit} remaining")
        passed += 1
    except Exception as e:
        print(f"\n❌ Test 4.1: Upload failed - {e}")
        failed += 1

    # --- Test 4.2: Verify data integrity ---
    try:
        assert all_metrics, "No metrics were collected"
        total_energy_j = sum(m['energy_delta_j'] for m in all_metrics
                             if m['energy_delta_j'] is not None)
        total_energy_kwh = total_energy_j / 3_600_000  # 1 kWh = 3.6e6 J
        avg_power = sum(m['power_draw_w'] for m in all_metrics) / len(all_metrics)
        # Approximate: (NUM_SAMPLES - 1) intervals per GPU at the mean power.
        expected_energy = avg_power * SAMPLE_INTERVAL * (NUM_SAMPLES - 1) * num_gpus

        # Loose sanity band: measured energy must be within 0.3x-3.0x of the estimate.
        assert total_energy_j > 0, "Total energy should be positive"
        ratio = total_energy_j / expected_energy if expected_energy > 0 else 1
        assert 0.3 < ratio < 3.0, f"Energy calculation seems off: ratio={ratio:.2f}"

        print(f"✅ Test 4.2: Data integrity check")
        print(f"   Total energy: {total_energy_j:.2f}J ({total_energy_kwh:.6f} kWh)")
        print(f"   Avg power: {avg_power:.1f}W")
        print(f"   Expected ~{expected_energy:.1f}J, got {total_energy_j:.1f}J (ratio: {ratio:.2f})")
        passed += 1
    except Exception as e:
        print(f"❌ Test 4.2: Data integrity check failed - {e}")
        failed += 1

    print(f"\n{'='*50}")
    print(f"Test 4 Results: {passed} passed, {failed} failed")
    print(f"{'='*50}")

---
## Test 5: Stress Test Under GPU Load

Create real GPU load with PyTorch, then collect metrics to verify the agent captures actual workload.

In [None]:
# Cell 8: Stress Test - GPU Load + Metrics Collection
import torch
import threading

# Stress test: take an idle baseline, run a matrix-multiply loop on a
# background thread, sample while it runs, then compare load against idle.
if not torch.cuda.is_available():
    print("❌ CUDA not available - skipping stress test")
else:
    passed = 0
    failed = 0

    device = torch.device('cuda')
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The device-properties attribute is `total_memory` (bytes).
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"CUDA Memory: {total_gb:.1f} GB")

    # --- Collect idle baseline ---
    collector = GPUCollector()
    _ = collector.collect()  # prime the energy calculator
    time.sleep(1)
    idle_metrics = collector.collect()[0]
    idle_power = idle_metrics.power_draw_w
    idle_util = idle_metrics.utilization_gpu_pct
    print(f"\nIdle baseline: {idle_power:.1f}W, {idle_util}% util")

    # --- Create GPU load ---
    print("\nStarting GPU load (large matrix multiply)...")
    stop_event = threading.Event()

    def gpu_stress():
        """Multiply two 8192x8192 matrices repeatedly until told to stop."""
        a = torch.randn(8192, 8192, device=device)
        b = torch.randn(8192, 8192, device=device)
        while not stop_event.is_set():
            torch.matmul(a, b)
            # Wait for the kernel so the stop flag is re-checked promptly.
            torch.cuda.synchronize()

    stress_thread = threading.Thread(target=gpu_stress)
    stress_thread.start()

    load_samples = []
    try:
        # Wait for GPU to ramp up
        time.sleep(3)

        # --- Collect under load ---
        for i in range(5):
            metrics = collector.collect()
            m = metrics[0]
            load_samples.append(m)
            # Format the energy separately so a missing delta replaces only
            # the E= field, not the whole line.
            energy_str = f"{m.energy_delta_j:.2f}J" if m.energy_delta_j is not None else "N/A"
            print(f"  Load sample {i+1}: {m.power_draw_w:.1f}W, "
                  f"{m.utilization_gpu_pct}% util, "
                  f"{m.temperature_c}°C, "
                  f"E={energy_str}")
            time.sleep(SAMPLE_INTERVAL)
    finally:
        # Always stop the load thread, even if sampling raised; otherwise it
        # would keep the GPU busy for the rest of the session.
        stop_event.set()
        stress_thread.join()
        torch.cuda.empty_cache()

    # --- Test 5.1: Power increased under load ---
    try:
        avg_load_power = sum(s.power_draw_w for s in load_samples) / len(load_samples)
        assert avg_load_power > idle_power * 1.1, (
            f"Power should increase under load: idle={idle_power:.1f}W, load={avg_load_power:.1f}W")
        print(f"\n✅ Test 5.1: Power increased - idle: {idle_power:.1f}W -> load: {avg_load_power:.1f}W")
        passed += 1
    except Exception as e:
        print(f"\n❌ Test 5.1: Power increase test failed - {e}")
        failed += 1

    # --- Test 5.2: Utilization detected ---
    try:
        avg_util = sum(s.utilization_gpu_pct for s in load_samples) / len(load_samples)
        assert avg_util > 50, f"Expected >50% utilization under load, got {avg_util:.1f}%"
        print(f"✅ Test 5.2: Utilization under load - {avg_util:.1f}%")
        passed += 1
    except Exception as e:
        print(f"❌ Test 5.2: Utilization test failed - {e}")
        failed += 1

    # --- Test 5.3: Energy calculation accurate ---
    try:
        total_energy = sum(s.energy_delta_j for s in load_samples
                           if s.energy_delta_j is not None)
        assert total_energy > 0, "Energy should be positive"
        # Rough check: total energy should be close to avg_power * total_time.
        # The first delta also spans the ramp-up wait, hence the wide band.
        total_time = SAMPLE_INTERVAL * len(load_samples)
        expected = avg_load_power * total_time
        ratio = total_energy / expected
        assert 0.3 < ratio < 3.0, f"Energy/expected ratio off: {ratio:.2f}"
        print(f"✅ Test 5.3: Energy under load - {total_energy:.1f}J (expected ~{expected:.1f}J)")
        passed += 1
    except Exception as e:
        print(f"❌ Test 5.3: Energy calculation test failed - {e}")
        failed += 1

    # --- Test 5.4: Upload load metrics ---
    if API_KEY:
        try:
            load_dicts = [s.to_dict() for s in load_samples]
            r = requests.post(API_ENDPOINT,
                              json=load_dicts,
                              headers={'X-API-Key': API_KEY},
                              timeout=30)
            assert r.status_code == 200, f"Upload failed: {r.status_code}"
            print(f"✅ Test 5.4: Uploaded {len(load_dicts)} stress test metrics")
            passed += 1
        except Exception as e:
            print(f"❌ Test 5.4: Upload stress metrics failed - {e}")
            failed += 1
    else:
        print("⏭️  Test 5.4 skipped (no API key)")

    collector.shutdown()

    print(f"\n{'='*50}")
    print(f"Test 5 Results: {passed} passed, {failed} failed")
    print(f"{'='*50}")

---
## Test 6: API Key Uniqueness & Security

Verify API key format and generation security.

In [None]:
# Cell 9: API Key Format & Security Tests
import secrets
import string
import re

passed = 0
failed = 0

# Alphabet for the random part of a key: letters and digits with the
# look-alike characters I, O, i, l, o, 0 and 1 left out.
CHARSET = 'ABCDEFGHJKLMNPQRSTUVWXYZabcdefghjkmnpqrstuvwxyz23456789'

def generate_api_key():
    """Return a fresh key: the ``alum_`` prefix plus 59 random CHARSET characters.

    Intended to mirror the server-side generation logic. Characters are
    drawn with ``secrets.choice``, so the result is suitable for use as a
    credential. The returned string is always 64 characters long.
    """
    return 'alum_' + ''.join(secrets.choice(CHARSET) for _ in range(59))

# --- Test 6.1: Key format ---
# Prefix, total length, and every random character drawn from CHARSET.
try:
    key = generate_api_key()
    prefix, random_part = key[:5], key[5:]
    assert prefix == 'alum_', "Key must start with 'alum_'"
    assert len(key) == 64, f"Key must be 64 chars, got {len(key)}"
    # Report the first character that falls outside the allowed alphabet.
    bad_char = next((ch for ch in random_part if ch not in CHARSET), None)
    assert bad_char is None, f"Invalid char in key: {bad_char}"
    print(f"✅ Test 6.1: Key format valid - alum_{'*'*10}...{key[-4:]} (len={len(key)})")
    passed += 1
except Exception as e:
    print(f"❌ Test 6.1: Key format test failed - {e}")
    failed += 1

# --- Test 6.2: Key uniqueness ---
# A set drops duplicates, so any shortfall in size is the collision count.
try:
    num_keys = 10000
    keys = {generate_api_key() for _ in range(num_keys)}
    assert len(keys) == num_keys, f"Collision detected: {num_keys - len(keys)} duplicates"
    print(f"✅ Test 6.2: Generated {num_keys} unique keys (0 collisions)")
    passed += 1
except Exception as e:
    print(f"❌ Test 6.2: Uniqueness test failed - {e}")
    failed += 1

# --- Test 6.3: Entropy check ---
# Entropy of the random part = length * log2(alphabet size).
try:
    import math
    key_length = 59  # random portion (everything after the prefix)
    charset_size = len(CHARSET)  # 55 chars
    entropy_bits = key_length * math.log2(charset_size)
    assert entropy_bits > 256, f"Entropy too low: {entropy_bits:.1f} bits"
    print(f"✅ Test 6.3: Entropy - {entropy_bits:.1f} bits ({charset_size}^{key_length}) - exceeds 256-bit threshold")
    passed += 1
except Exception as e:
    print(f"❌ Test 6.3: Entropy test failed - {e}")
    failed += 1

# --- Test 6.4: No ambiguous characters ---
try:
    overlap = set(CHARSET).intersection('0OIl1')
    assert not overlap, f"Ambiguous chars found in charset: {overlap}"
    print(f"✅ Test 6.4: No ambiguous characters (0, O, I, l, 1 excluded)")
    passed += 1
except Exception as e:
    print(f"❌ Test 6.4: Ambiguous chars test failed - {e}")
    failed += 1

# --- Test 6.5: Validate current API key format (if provided) ---
if not API_KEY:
    print("⏭️  Test 6.5 skipped (no API key)")
else:
    try:
        assert API_KEY[:5] == 'alum_', "Your API key doesn't start with 'alum_'"
        assert len(API_KEY) == 64, f"Your API key should be 64 chars, got {len(API_KEY)}"
        print(f"✅ Test 6.5: Your API key format is valid")
        passed += 1
    except Exception as e:
        print(f"❌ Test 6.5: Your API key format invalid - {e}")
        failed += 1

print(f"\n{'='*50}")
print(f"Test 6 Results: {passed} passed, {failed} failed")
print(f"{'='*50}")

---
## Test 7: Continuous Monitoring Demo

Run a short continuous monitoring session (like the real agent would) and display a summary.

In [None]:
# Cell 10: Continuous Monitoring - 60 Second Demo Run
# Samples the first GPU every DEMO_INTERVAL seconds for up to DEMO_DURATION
# seconds, prints one table row per sample, then a summary, then uploads.
DEMO_DURATION = 60  # seconds
DEMO_INTERVAL = 5   # seconds

print(f"Running continuous monitoring for {DEMO_DURATION}s (interval: {DEMO_INTERVAL}s)")
print(f"{'='*70}")
print(f"{'#':>4} | {'Time':>8} | {'Power':>8} | {'Util':>5} | {'Temp':>5} | {'Mem MB':>8} | {'Energy J':>10}")
print(f"{'-'*4}-+-{'-'*8}-+-{'-'*8}-+-{'-'*5}-+-{'-'*5}-+-{'-'*8}-+-{'-'*10}")

collector = GPUCollector()
all_samples = []
total_energy = 0.0
start_time = time.time()
sample_num = 0

try:
    while time.time() - start_time < DEMO_DURATION:
        loop_start = time.time()
        metrics = collector.collect()
        sample_num += 1
        m = metrics[0]
        all_samples.append(m)

        # Compare against None so a genuine 0.00J delta is counted and shown.
        if m.energy_delta_j is not None:
            total_energy += m.energy_delta_j
            energy_str = f"{m.energy_delta_j:.2f}"
        else:
            energy_str = "N/A"

        elapsed = time.time() - start_time
        print(f"{sample_num:>4} | {elapsed:>7.1f}s | {m.power_draw_w:>7.1f}W | {m.utilization_gpu_pct:>4}% | {m.temperature_c:>4}C | {m.memory_used_mb:>7.0f} | {energy_str:>10}")

        # Sleep out the rest of this interval. If the next sample would land
        # at or past the end of the run, stop now: continuing without a
        # sleep would spin through the final interval, sampling as fast as
        # possible and flooding both the table and the upload.
        sleep_time = max(0, DEMO_INTERVAL - (time.time() - loop_start))
        remaining = DEMO_DURATION - (time.time() - start_time)
        if sleep_time >= remaining:
            break
        time.sleep(sleep_time)
finally:
    # Release NVML even if sampling was interrupted or raised.
    collector.shutdown()

# Summary
runtime = time.time() - start_time
total_kwh = total_energy / 3_600_000  # 1 kWh = 3.6e6 J
avg_power = sum(s.power_draw_w for s in all_samples) / len(all_samples)
max_power = max(s.power_draw_w for s in all_samples)
avg_temp = sum(s.temperature_c for s in all_samples) / len(all_samples)
max_temp = max(s.temperature_c for s in all_samples)
avg_util = sum(s.utilization_gpu_pct for s in all_samples) / len(all_samples)

print(f"\n{'='*70}")
print(f"MONITORING SUMMARY")
print(f"{'='*70}")
print(f"Runtime:         {runtime:.1f}s")
print(f"Samples:         {len(all_samples)}")
print(f"GPU:             {all_samples[0].gpu_name}")
print(f"Avg Power:       {avg_power:.1f}W (max: {max_power:.1f}W)")
print(f"Avg Utilization: {avg_util:.1f}%")
print(f"Avg Temperature: {avg_temp:.1f}°C (max: {max_temp}°C)")
print(f"Total Energy:    {total_energy:.2f}J ({total_kwh:.6f} kWh)")
print(f"Est. Cost:       ${total_kwh * 0.12:.6f} (@ $0.12/kWh)")
print(f"{'='*70}")

# Upload if API key available
if API_KEY:
    metrics_dicts = [s.to_dict() for s in all_samples]
    try:
        r = requests.post(API_ENDPOINT,
                          json=metrics_dicts,
                          headers={'X-API-Key': API_KEY},
                          timeout=30)
    except requests.RequestException as e:
        # Network-level failure (timeout, DNS, connection reset): report it
        # instead of ending the demo with a traceback.
        print(f"\n❌ Upload failed: {e}")
    else:
        if r.status_code == 200:
            print(f"\n✅ Uploaded {len(metrics_dicts)} samples to AluminatAI dashboard")
        else:
            print(f"\n❌ Upload failed: {r.status_code} - {r.text}")

---
## Summary

All tests completed. Check your [AluminatAI Dashboard](https://aluminatiai-landing.vercel.app/dashboard) to see the uploaded metrics.

### What was tested:
- **Test 1**: Raw NVML access (power, util, temp, memory)
- **Test 2**: GPUCollector class + energy delta calculation
- **Test 3**: API auth validation (missing key, bad format, wrong key, bad payloads)
- **Test 4**: End-to-end collect + upload + data integrity
- **Test 5**: Stress test under real GPU load (matrix multiply)
- **Test 6**: API key format, uniqueness, and entropy
- **Test 7**: Continuous 60-second monitoring demo