In [None]:
"""
GPU Power Monitoring - With GPU State Metadata Tracking
For Google Colab - monitors power with detailed state information

NEW FEATURES:
- Training state tracking: initiation, forward_pass, backward_pass, communication
- Inference state tracking: processing_batch_N, waiting_for_queries
- Millisecond-scale sampling (10ms-1000ms)
- Gamma distribution for realistic query arrivals
- Detailed CSV output with state metadata

IMPORTANT: Run this in a FRESH Colab session or restart the kernel first!
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models
from torch.utils.data import DataLoader, Dataset
import time
import pandas as pd
import matplotlib.pyplot as plt
import subprocess
import threading
from collections import defaultdict, deque
import numpy as np
import gc
import os
import warnings

# ============================================================================
# MEMORY CLEANUP
# ============================================================================

def cleanup_memory():
    """Free unreferenced Python objects, then release cached CUDA memory.

    The garbage collector always runs. The CUDA cache is emptied and the
    device synchronized only when CUDA is available.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# ============================================================================
# GPU MONITORING CLASS WITH STATE TRACKING
# ============================================================================

class GPUMonitorWithStates:
    """Monitor GPU metrics with state metadata tracking"""

    def __init__(self, gpu_id=0, sampling_interval_ms=100):
        """
        Initialize GPU monitor with state tracking.
        
        Args:
            gpu_id: GPU device ID to monitor
            sampling_interval_ms: Sampling interval in milliseconds
        """
        self.gpu_id = gpu_id
        self.sampling_interval_ms = max(10, min(1000, sampling_interval_ms))
        self.monitoring = False
        self.metrics = defaultdict(list)
        self.timestamps = []
        self.gpu_states = []  # NEW: Track GPU state at each sample
        self.monitor_thread = None
        self.error_count = 0
        self.max_errors = 10
        self.sample_count = 0
        self.dropped_samples = 0
        
        # State tracking
        self.current_state = "idle"
        self.state_lock = threading.Lock()
        
        # Performance tracking
        self.query_times = deque(maxlen=100)
        self.adaptive_sampling = True
        
        print(f"\n🔧 GPU Monitor initialized with {sampling_interval_ms}ms sampling interval")
        print(f"   Expected samples per second: {1000/sampling_interval_ms:.1f}")
        print(f"   State tracking: ENABLED")

    def set_state(self, state):
        """Thread-safe state setter"""
        with self.state_lock:
            self.current_state = state

    def get_state(self):
        """Thread-safe state getter"""
        with self.state_lock:
            return self.current_state

    def get_gpu_info(self):
        """Get GPU brand and model information"""
        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=name,driver_version', '--format=csv,noheader'],
                capture_output=True, text=True, check=True, timeout=2
            )
            gpu_name, driver_version = result.stdout.strip().split(', ')
            return gpu_name, driver_version
        except Exception as e:
            print(f"⚠️  Error getting GPU info: {e}")
            return "Unknown", "Unknown"

    def get_gpu_metrics(self):
        """Query GPU metrics - optimized for millisecond sampling"""
        query_start = time.perf_counter()
        
        try:
            query = [
                'nvidia-smi',
                '--query-gpu=power.draw,temperature.gpu,utilization.gpu,memory.used,memory.total',
                '--format=csv,noheader,nounits',
                '-i', str(self.gpu_id)
            ]

            timeout = max(0.05, self.sampling_interval_ms / 1000 * 0.5)
            result = subprocess.run(query, capture_output=True, text=True, check=True, timeout=timeout)
            metrics_str = result.stdout.strip()

            parts = [p.strip() for p in metrics_str.split(',')]

            metrics = {
                'power_draw_w': float(parts[0]) if parts[0] not in ['[N/A]', 'N/A'] else None,
                'temperature_c': float(parts[1]) if parts[1] not in ['[N/A]', 'N/A'] else None,
                'utilization_pct': float(parts[2]) if parts[2] not in ['[N/A]', 'N/A'] else None,
                'memory_used_mb': float(parts[3]) if parts[3] not in ['[N/A]', 'N/A'] else None,
                'memory_total_mb': float(parts[4]) if parts[4] not in ['[N/A]', 'N/A'] else None,
            }

            # Try to get voltage and current
            try:
                voltage_query = ['nvidia-smi', '--query-gpu=voltage,current', '--format=csv,noheader,nounits', '-i', str(self.gpu_id)]
                voltage_result = subprocess.run(voltage_query, capture_output=True, text=True, timeout=timeout)
                if voltage_result.returncode == 0:
                    voltage_parts = [p.strip() for p in voltage_result.stdout.strip().split(',')]
                    metrics['voltage_v'] = float(voltage_parts[0]) if voltage_parts[0] not in ['[N/A]', 'N/A', '[Not Supported]'] else None
                    metrics['current_a'] = float(voltage_parts[1]) if len(voltage_parts) > 1 and voltage_parts[1] not in ['[N/A]', 'N/A', '[Not Supported]'] else None
            except:
                pass

            query_time = (time.perf_counter() - query_start) * 1000
            self.query_times.append(query_time)
            
            self.error_count = 0
            return metrics

        except subprocess.TimeoutExpired:
            self.error_count += 1
            self.dropped_samples += 1
            if self.error_count <= 3:
                print(f"⚠️  nvidia-smi timeout (attempt {self.error_count})")
            return None
        except Exception as e:
            self.error_count += 1
            self.dropped_samples += 1
            if self.error_count <= 3:
                print(f"⚠️  Error querying GPU metrics: {e}")
            return None

    def _monitor_loop(self):
        """Background monitoring loop with state tracking"""
        interval_seconds = self.sampling_interval_ms / 1000.0
        next_sample_time = time.perf_counter()
        
        while self.monitoring and self.error_count < self.max_errors:
            current_time = time.perf_counter()
            
            if current_time >= next_sample_time:
                metrics = self.get_gpu_metrics()
                if metrics:
                    self.timestamps.append(time.time())
                    for key, value in metrics.items():
                        self.metrics[key].append(value)
                    
                    # Capture current GPU state
                    current_state = self.get_state()
                    self.gpu_states.append(current_state)
                    
                    self.sample_count += 1
                
                next_sample_time += interval_seconds
                
                if self.adaptive_sampling and len(self.query_times) >= 10:
                    avg_query_time = np.mean(list(self.query_times))
                    if avg_query_time > (self.sampling_interval_ms * 0.8):
                        if self.error_count == 0:
                            print(f"⚠️  Query time ({avg_query_time:.1f}ms) approaching sampling interval")
                        self.error_count = 1
            else:
                sleep_time = min(0.001, (next_sample_time - current_time) / 2)
                if sleep_time > 0:
                    time.sleep(sleep_time)

    def start_monitoring(self):
        """Start monitoring in background thread"""
        if self.monitoring:
            print("⚠️  Monitoring already active")
            return
            
        self.monitoring = True
        self.error_count = 0
        self.sample_count = 0
        self.dropped_samples = 0
        
        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.monitor_thread.start()
        print(f"✅ Monitoring started at {self.sampling_interval_ms}ms intervals")

    def stop_monitoring(self):
        """Stop monitoring and return collected data"""
        if not self.monitoring:
            return None
            
        self.monitoring = False
        if self.monitor_thread:
            self.monitor_thread.join(timeout=2)

        if self.sample_count > 0:
            duration = self.timestamps[-1] - self.timestamps[0] if len(self.timestamps) > 1 else 0
            actual_rate = self.sample_count / duration if duration > 0 else 0
            expected_rate = 1000 / self.sampling_interval_ms
            
            print(f"\n📊 Monitoring Statistics:")
            print(f"   Samples collected: {self.sample_count}")
            print(f"   Dropped samples: {self.dropped_samples}")
            print(f"   Duration: {duration:.2f}s")
            print(f"   Actual rate: {actual_rate:.1f} samples/s")
            print(f"   Efficiency: {(actual_rate/expected_rate*100):.1f}%")

        return self.get_dataframe()

    def get_dataframe(self):
        """Convert collected metrics to DataFrame with state information"""
        if not self.timestamps:
            return pd.DataFrame()

        df = pd.DataFrame(self.metrics)
        df['timestamp'] = self.timestamps
        df['time_ms'] = (df['timestamp'] - df['timestamp'].iloc[0]) * 1000
        df['gpu_state'] = self.gpu_states  # Add GPU state column
        
        return df

    def reset(self):
        """Reset all collected data"""
        self.metrics = defaultdict(list)
        self.timestamps = []
        self.gpu_states = []
        self.sample_count = 0
        self.dropped_samples = 0
        self.error_count = 0
        self.query_times.clear()
        self.current_state = "idle"

# ============================================================================
# MODEL TRAINER WITH STATE TRACKING
# ============================================================================

class ModelTrainerWithStates:
    """Train models with detailed GPU state tracking"""

    def __init__(self, model_name='resnet', device='cuda', sampling_interval_ms=100):
        """Create the GPU monitor, the model, the loss, and the optimizer.

        Args:
            model_name: 'resnet' or 'vgg' (see ``_create_model``).
            device: Torch device the model is moved to.
            sampling_interval_ms: Requested monitor sampling interval in
                milliseconds. The monitor may clamp it; the value stored on
                this object is the one the monitor actually uses.
        """
        self.device = device
        self.model_name = model_name

        # Initialize GPU monitor with state tracking
        self.monitor = GPUMonitorWithStates(gpu_id=0, sampling_interval_ms=sampling_interval_ms)
        # Keep the effective interval so the banner below and the plot title
        # agree with the real sampling rate, and so a raw value of 0 cannot
        # cause a division by zero here.
        self.sampling_interval_ms = self.monitor.sampling_interval_ms

        # Get GPU info
        gpu_name, driver_version = self.monitor.get_gpu_info()
        print(f"\n{'='*70}")
        print(f"  🎮 GPU: {gpu_name}")
        print(f"  🔧 Driver: {driver_version}")
        print(f"  ⏱️  Sampling: {self.sampling_interval_ms}ms ({1000/self.sampling_interval_ms:.1f} samples/sec)")
        print(f"  📊 State tracking: ENABLED")
        print(f"{'='*70}\n")

        # Setup model
        self.model = self._create_model()
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        # Storage for metrics
        self.training_data = None
        self.inference_data = None

    def _create_model(self):
        """Create and prepare model"""
        print(f"📦 Loading {self.model_name} model...")
        
        if self.model_name == 'resnet':
            model = models.resnet50(pretrained=False)
        elif self.model_name == 'vgg':
            model = models.vgg16(pretrained=False)
        else:
            raise ValueError(f"Unknown model: {self.model_name}")

        model = model.to(self.device)
        print(f"✅ Model loaded on {self.device}\n")
        return model

    def train_epoch(self, dataloader, epoch):
        """Train for one epoch with state tracking"""
        self.model.train()
        total_loss = 0
        
        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.to(self.device), target.to(self.device)

            # FORWARD PASS
            self.monitor.set_state(f"epoch_{epoch}_batch_{batch_idx+1}_forward_pass")
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.criterion(output, target)
            
            # BACKWARD PASS
            self.monitor.set_state(f"epoch_{epoch}_batch_{batch_idx+1}_backward_pass")
            loss.backward()
            self.optimizer.step()

            # COMMUNICATION (simulated - in distributed training this would be gradient sync)
            self.monitor.set_state(f"epoch_{epoch}_batch_{batch_idx+1}_communication")
            torch.cuda.synchronize()  # Ensure all operations complete
            time.sleep(0.01)  # Simulate brief communication delay

            total_loss += loss.item()

            if (batch_idx + 1) % 5 == 0:
                print(f"  Epoch {epoch} [{batch_idx+1}/{len(dataloader)}] Loss: {loss.item():.4f}")

        return total_loss / len(dataloader)

    def run_training(self, num_epochs=2, batch_size=48, num_batches=14):
        """Run the monitored training phase on synthetic data.

        Trains ``self.model`` on random 3x224x224 inputs while the monitor
        samples GPU metrics, then stores the samples in ``self.training_data``.

        Args:
            num_epochs: Number of passes over the synthetic dataset.
            batch_size: Samples per batch.
            num_batches: Batches per epoch.
        """
        print(f"\n{'='*70}")
        print(f"  🏋️  TRAINING PHASE WITH STATE TRACKING")
        print(f"{'='*70}")
        print(f"  Epochs: {num_epochs}")
        print(f"  Batch size: {batch_size}")
        print(f"  Batches per epoch: {num_batches}")
        print(f"  States tracked: initiation, forward_pass, backward_pass, communication")
        print(f"{'='*70}\n")

        # Create dummy dataset
        class DummyDataset(Dataset):
            """Synthetic dataset yielding random images and random labels."""

            def __init__(self, size, num_classes=1000):
                self.size = size
                self.num_classes = num_classes

            def __len__(self):
                return self.size

            def __getitem__(self, idx):
                return torch.randn(3, 224, 224), torch.randint(0, self.num_classes, (1,)).item()

        dataset = DummyDataset(batch_size * num_batches)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Start from a clean monitor, as run_inference does, so samples from an
        # earlier phase or an earlier call are not mixed into this recording.
        self.monitor.reset()

        # INITIATION STATE
        self.monitor.set_state("training_initiation")
        self.monitor.start_monitoring()
        try:
            time.sleep(0.5)  # Warmup period

            # Training loop
            for epoch in range(1, num_epochs + 1):
                avg_loss = self.train_epoch(dataloader, epoch)
                print(f"\n  ✅ Epoch {epoch} complete - Avg Loss: {avg_loss:.4f}\n")

            self.monitor.set_state("training_complete")
            time.sleep(0.3)
        finally:
            # Always stop the sampling thread, even when training raised, and
            # keep whatever samples were collected up to that point.
            self.training_data = self.monitor.stop_monitoring()

        if self.training_data is not None and len(self.training_data) > 0:
            state_counts = self.training_data['gpu_state'].value_counts()
            print(f"\n✅ Training complete - {len(self.training_data)} samples collected")
            print(f"\n📊 State distribution:")
            for state, count in state_counts.head(10).items():
                print(f"   {state}: {count} samples")
        else:
            print("\n⚠️  Warning: No training data collected")

        cleanup_memory()

    def run_inference(self, batch_size=64, num_batches=20, gamma_shape=2.0, gamma_scale=0.1):
        """
        Run inference with state tracking.
        States: processing_batch_N or waiting_for_queries

        Queries are separated by waits drawn from a Gamma distribution with a
        fixed seed, so the arrival pattern is reproducible. The collected
        samples are stored in ``self.inference_data``.

        Args:
            batch_size: Images per query.
            num_batches: Number of queries to run.
            gamma_shape: Shape parameter of the inter-arrival distribution.
            gamma_scale: Scale parameter (seconds) of that distribution.
        """
        print(f"\n{'='*70}")
        print(f"  🔍 INFERENCE PHASE WITH STATE TRACKING")
        print(f"{'='*70}")
        print(f"  Number of user queries: {num_batches}")
        print(f"  Batch size per query: {batch_size} images")
        print(f"  Total images: {batch_size * num_batches}")
        print(f"  Arrival pattern: Gamma distribution (shape={gamma_shape}, scale={gamma_scale}s)")
        print(f"  States tracked: processing_batch_N, waiting_for_queries")
        print(f"{'='*70}\n")

        self.model.eval()

        # Generate inter-arrival times from a private generator: same values
        # as seeding the global NumPy RNG with 42, without disturbing random
        # state used elsewhere. Fewer than two queries means no waits at all.
        num_waits = max(num_batches - 1, 0)
        rng = np.random.RandomState(42)
        inter_arrival_times = rng.gamma(gamma_shape, gamma_scale, num_waits)

        print(f"📊 Query arrival statistics:")
        if num_waits > 0:
            print(f"   Mean inter-arrival time: {np.mean(inter_arrival_times)*1000:.1f}ms")
            print(f"   Std inter-arrival time: {np.std(inter_arrival_times)*1000:.1f}ms")
        else:
            # mean/std of an empty array would be NaN plus a RuntimeWarning
            print(f"   No inter-arrival waits (fewer than two queries)")
        print(f"   Expected total duration: {np.sum(inter_arrival_times):.2f}s\n")

        # torch.cuda.synchronize() is only valid when the model runs on CUDA.
        on_cuda = torch.device(self.device).type == 'cuda'

        # Reset and start monitoring
        self.monitor.reset()
        self.monitor.set_state("inference_initiation")
        self.monitor.start_monitoring()
        try:
            time.sleep(0.2)

            # Run inference with state tracking
            with torch.no_grad():
                for i in range(num_batches):
                    # PROCESSING STATE
                    self.monitor.set_state(f"processing_batch_{i+1}")

                    query_start = time.time()

                    dummy_input = torch.randn(batch_size, 3, 224, 224).to(self.device)
                    self.model(dummy_input)
                    if on_cuda:
                        # Wait for the kernels so the measured duration and
                        # the state label cover the real GPU work.
                        torch.cuda.synchronize()

                    query_duration = (time.time() - query_start) * 1000
                    print(f"  Query {i+1}/{num_batches} complete (processing: {query_duration:.2f}ms)")

                    # WAITING STATE (if not last query)
                    if i < num_batches - 1:
                        self.monitor.set_state("waiting_for_queries")
                        time.sleep(inter_arrival_times[i])

            self.monitor.set_state("inference_complete")
            time.sleep(0.2)
        finally:
            # Always stop the sampling thread, even when inference raised.
            self.inference_data = self.monitor.stop_monitoring()

        if self.inference_data is not None and len(self.inference_data) > 0:
            states = self.inference_data['gpu_state']
            state_counts = states.value_counts()
            print(f"\n✅ Inference complete - {len(self.inference_data)} power samples collected")
            print(f"\n📊 State distribution:")
            for state, count in state_counts.items():
                print(f"   {state}: {count} samples")

            # Calculate time in each state (share of samples per state family)
            processing_samples = sum(1 for s in states if 'processing' in s)
            waiting_samples = sum(1 for s in states if 'waiting' in s)
            total_samples = len(self.inference_data)

            print(f"\n⏱️  Time allocation:")
            print(f"   Processing: {processing_samples/total_samples*100:.1f}% ({processing_samples} samples)")
            print(f"   Waiting: {waiting_samples/total_samples*100:.1f}% ({waiting_samples} samples)")
        else:
            print("\n⚠️  Warning: No inference data collected")

        cleanup_memory()

    def save_results(self, output_dir='/content'):
        """Save monitoring results with state metadata to CSV files"""
        print(f"\n{'='*70}")
        print(f"  💾 SAVING RESULTS WITH STATE METADATA")
        print(f"{'='*70}\n")

        saved_files = []

        if self.training_data is not None and len(self.training_data) > 0:
            train_file = f'{output_dir}/training_metrics_with_states.csv'
            self.training_data.to_csv(train_file, index=False)
            print(f"  ✅ Training metrics: {train_file}")
            print(f"     Samples: {len(self.training_data)}")
            print(f"     Columns: {', '.join(self.training_data.columns)}")
            saved_files.append(train_file)

        if self.inference_data is not None and len(self.inference_data) > 0:
            infer_file = f'{output_dir}/inference_metrics_with_states.csv'
            self.inference_data.to_csv(infer_file, index=False)
            print(f"\n  ✅ Inference metrics: {infer_file}")
            print(f"     Samples: {len(self.inference_data)}")
            print(f"     Columns: {', '.join(self.inference_data.columns)}")
            saved_files.append(infer_file)

        print(f"\n{'='*70}\n")
        return saved_files

    def plot_results(self, output_dir='/content'):
        """Create visualization plots with state information"""
        if self.training_data is None and self.inference_data is None:
            print("⚠️  No data to plot")
            return

        fig, axes = plt.subplots(2, 2, figsize=(16, 10))
        fig.suptitle(f'GPU Metrics with State Tracking - {self.model_name.upper()} ({self.sampling_interval_ms}ms sampling)', 
                     fontsize=16, fontweight='bold')

        # Power Draw
        ax = axes[0, 0]
        if self.training_data is not None and 'power_draw_w' in self.training_data:
            ax.plot(self.training_data['time_ms']/1000, self.training_data['power_draw_w'], 
                   label='Training', alpha=0.7, linewidth=1)
        if self.inference_data is not None and 'power_draw_w' in self.inference_data:
            ax.plot(self.inference_data['time_ms']/1000, self.inference_data['power_draw_w'], 
                   label='Inference', alpha=0.7, linewidth=1)
        ax.set_xlabel('Time (seconds)')
        ax.set_ylabel('Power Draw (W)')
        ax.set_title('Power Consumption')
        ax.legend()
        ax.grid(True, alpha=0.3)

        # Temperature
        ax = axes[0, 1]
        if self.training_data is not None and 'temperature_c' in self.training_data:
            ax.plot(self.training_data['time_ms']/1000, self.training_data['temperature_c'], 
                   label='Training', alpha=0.7, linewidth=1)
        if self.inference_data is not None and 'temperature_c' in self.inference_data:
            ax.plot(self.inference_data['time_ms']/1000, self.inference_data['temperature_c'], 
                   label='Inference', alpha=0.7, linewidth=1)
        ax.set_xlabel('Time (seconds)')
        ax.set_ylabel('Temperature (°C)')
        ax.set_title('GPU Temperature')
        ax.legend()
        ax.grid(True, alpha=0.3)

        # GPU Utilization
        ax = axes[1, 0]
        if self.training_data is not None and 'utilization_pct' in self.training_data:
            ax.plot(self.training_data['time_ms']/1000, self.training_data['utilization_pct'], 
                   label='Training', alpha=0.7, linewidth=1)
        if self.inference_data is not None and 'utilization_pct' in self.inference_data:
            ax.plot(self.inference_data['time_ms']/1000, self.inference_data['utilization_pct'], 
                   label='Inference', alpha=0.7, linewidth=1)
        ax.set_xlabel('Time (seconds)')
        ax.set_ylabel('Utilization (%)')
        ax.set_title('GPU Utilization')
        ax.legend()
        ax.grid(True, alpha=0.3)

        # Memory Usage
        ax = axes[1, 1]
        if self.training_data is not None and 'memory_used_mb' in self.training_data:
            ax.plot(self.training_data['time_ms']/1000, self.training_data['memory_used_mb']/1024, 
                   label='Training', alpha=0.7, linewidth=1)
        if self.inference_data is not None and 'memory_used_mb' in self.inference_data:
            ax.plot(self.inference_data['time_ms']/1000, self.inference_data['memory_used_mb']/1024, 
                   label='Inference', alpha=0.7, linewidth=1)
        ax.set_xlabel('Time (seconds)')
        ax.set_ylabel('Memory Used (GB)')
        ax.set_title('GPU Memory Usage')
        ax.legend()
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        
        plot_file = f'{output_dir}/gpu_metrics_with_states.png'
        plt.savefig(plot_file, dpi=150, bbox_inches='tight')
        print(f"\n📊 Plot saved: {plot_file}")
        plt.show()

# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    print(f"\n{'=' * 70}")
    print("  GPU POWER MONITORING WITH STATE TRACKING")
    print('=' * 70)

    # --- Run configuration -------------------------------------------------
    # Monitor: one sample every 50 ms, i.e. 20 samples per second
    SAMPLING_INTERVAL_MS = 50

    # Training phase
    MODEL_NAME = 'resnet'
    NUM_EPOCHS = 2
    TRAIN_BATCH_SIZE = 48
    TRAIN_BATCHES = 14

    # Inference phase: INFER_BATCHES is the number of user queries; the Gamma
    # parameters below give a mean inter-arrival time of 0.2 s
    INFER_BATCH_SIZE = 64
    INFER_BATCHES = 20
    GAMMA_SHAPE = 2.0
    GAMMA_SCALE = 0.1

    try:
        # The whole run depends on a CUDA device being present.
        if not torch.cuda.is_available():
            raise RuntimeError("No GPU available! Enable GPU runtime in Colab.")

        trainer = ModelTrainerWithStates(
            model_name=MODEL_NAME,
            device='cuda',
            sampling_interval_ms=SAMPLING_INTERVAL_MS,
        )

        # Phase 1: monitored training
        trainer.run_training(
            num_epochs=NUM_EPOCHS,
            batch_size=TRAIN_BATCH_SIZE,
            num_batches=TRAIN_BATCHES,
        )

        # Phase 2: monitored inference
        trainer.run_inference(
            batch_size=INFER_BATCH_SIZE,
            num_batches=INFER_BATCHES,
            gamma_shape=GAMMA_SHAPE,
            gamma_scale=GAMMA_SCALE,
        )

        # Persist the CSV files, then render the figure
        saved_files = trainer.save_results()
        trainer.plot_results()

        # Final summary
        print(f"\n{'=' * 70}")
        print("  ✅ EXECUTION COMPLETED SUCCESSFULLY!")
        print('=' * 70)
        print(f"  Model              : {MODEL_NAME}")
        print(f"  Sampling rate      : {SAMPLING_INTERVAL_MS}ms")
        print("  State tracking     : ENABLED")
        print("  Training states    : initiation, forward_pass, backward_pass, communication")
        print("  Inference states   : processing_batch_N, waiting_for_queries")
        if trainer.training_data is not None:
            print(f"  Training samples   : {len(trainer.training_data)}")
        if trainer.inference_data is not None:
            print(f"  Inference samples  : {len(trainer.inference_data)}")
        print(f"{'=' * 70}\n")

    except Exception as e:
        # Top-level boundary: report the failure with its traceback rather
        # than letting the exception propagate out of the script.
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        cleanup_memory()


In [None]:
# Download results
# Notebook-only cell: the leading "!" is IPython shell-escape syntax, so this
# line is not valid in a plain Python script.
# Bundles the CSV and PNG files found directly under /content into one archive
# in the current working directory.
!zip -r results_with_states.zip /content/*.csv /content/*.png

# google.colab is imported here, in the cell that needs it, rather than at the
# top of the script. NOTE(review): presumably only importable inside a Colab
# runtime - confirm before running elsewhere.
from google.colab import files
# Trigger a download of the archive through Colab's files helper.
files.download('results_with_states.zip')