# Runtime Data Analysis

This notebook provides a flexible, generic data analysis framework for processing runtime metrics and performance data.

## Features
- Load data from CSV, JSON, or JSONL files individually or in bulk
- Flexible label extraction for organizing datasets
- Comprehensive statistical analysis (mean, std, percentiles, etc.)
- Multiple visualization options (histograms, KDE, box plots, bar charts)
- Easy to extend for new metrics or data types
- Export summaries to CSV

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import List, Dict, Union, Optional
import json
import re

# Plot styling: seaborn's "whitegrid" theme plus a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

class DataAnalyzer:
    """
    Generic data analyzer for runtime metrics and other performance data.

    Every loaded file becomes a pandas DataFrame stored in ``self.data`` under
    a label; ``self.metadata`` keeps the source path, sample count and column
    names under the same label. Methods report problems (missing files,
    datasets or columns) by printing a warning and skipping the offending
    item rather than raising.
    """

    def __init__(self):
        self.data = {}      # label -> DataFrame with the loaded samples
        self.metadata = {}  # label -> {'file_path', 'num_samples', 'columns'}

    @staticmethod
    def _has_value(event, key) -> bool:
        """Return True when `event` is a dict holding a non-missing value for `key`."""
        # Records rebuilt from a DataFrame contain every column, with NaN/NaT
        # standing in for fields the original event did not have.
        return event is not None and key in event and bool(pd.notna(event[key]))

    def load_files(self,
                   file_paths: Union[str, Path, List[Union[str, Path]]],
                   dataset_name: Optional[str] = None,
                   label_extractor: Optional[callable] = None) -> None:
        """
        Load data from one or more files.

        Supported extensions (case-insensitive): .csv, .json, .jsonl.
        Missing files and unsupported extensions are reported and skipped.

        Args:
            file_paths: Single file path (str or Path) or list of file paths
            dataset_name: Name for this dataset (optional, will use filename stem if not provided).
                When several files are given they all map to this one label, so later
                files replace earlier ones.
            label_extractor: Optional function called with the file's pathlib.Path that
                returns the label; takes precedence over dataset_name
        """
        # A lone Path must be wrapped too; iterating it would raise TypeError.
        if isinstance(file_paths, (str, Path)):
            file_paths = [file_paths]

        for file_path in file_paths:
            path = Path(file_path)

            if not path.exists():
                print(f"Warning: {file_path} not found, skipping...")
                continue

            # Pick the label: extractor > explicit name > filename stem
            if label_extractor:
                label = label_extractor(path)
            elif dataset_name:
                label = dataset_name
            else:
                label = path.stem

            # Load data based on file extension
            suffix = path.suffix.lower()
            if suffix == '.csv':
                df = pd.read_csv(path)
            elif suffix == '.json':
                with open(path, 'r') as f:
                    json_data = json.load(f)
                df = pd.DataFrame(json_data)
            elif suffix == '.jsonl':
                # Read JSONL (one JSON object per line)
                df = pd.read_json(path, lines=True)
            else:
                print(f"Warning: Unsupported file type {path.suffix}")
                continue

            # Label collisions silently dropped data before; make them visible.
            if label in self.data:
                print(f"Warning: Dataset '{label}' already loaded, overwriting...")

            self.data[label] = df
            self.metadata[label] = {
                'file_path': str(file_path),
                'num_samples': len(df),
                'columns': list(df.columns)
            }

            print(f"Loaded {len(df)} samples from {path.name} with columns: {list(df.columns)}")

    def load_pattern(self,
                     pattern: str,
                     label_extractor: Optional[callable] = None) -> None:
        """
        Load all files matching a glob pattern.

        Relative patterns are resolved against the current working directory.
        Absolute patterns are supported as well. Matches are loaded in sorted
        order so repeated runs behave the same.

        Args:
            pattern: Glob pattern (e.g., 'experiments/logs/*.csv')
            label_extractor: Optional function to extract label from filename
        """
        # Path.glob() rejects absolute patterns, so glob relative to the
        # pattern's own anchor (e.g. '/') when one is present.
        pattern_path = Path(pattern)
        if pattern_path.is_absolute():
            base = Path(pattern_path.anchor)
            relative_pattern = str(pattern_path.relative_to(base))
        else:
            base = Path('.')
            relative_pattern = pattern

        files = sorted(base.glob(relative_pattern))
        if not files:
            print(f"No files found matching pattern: {pattern}")
            return

        file_paths = [str(f) for f in files]
        self.load_files(file_paths, label_extractor=label_extractor)

    def add_throughput_column(self,
                              latency_column='latency_ms',
                              batch_size=8,
                              seq_len=128,
                              throughput_column='throughput',
                              datasets=None):
        """
        Add a throughput column to the dataframes (modified in place).

        Throughput = (batch_size * seq_len) / (latency_ms / 1000)

        Unknown datasets and datasets without the latency column are reported
        and skipped.

        Args:
            latency_column: Column name containing latency data in milliseconds
            batch_size: Batch size used in experiment
            seq_len: Sequence length (number of tokens per sequence)
            throughput_column: Name for the new throughput column
            datasets: List of datasets to process (None = all)
        """
        if datasets is None:
            datasets = list(self.data.keys())

        tokens_per_batch = batch_size * seq_len

        for dataset_name in datasets:
            df = self.data.get(dataset_name)

            if df is None:
                print(f"Warning: Dataset '{dataset_name}' not found, skipping...")
                continue

            if latency_column not in df.columns:
                print(f"Warning: Column '{latency_column}' not found in dataset '{dataset_name}', skipping...")
                continue

            # Convert latency from ms to seconds and compute throughput
            latency_seconds = df[latency_column] / 1000.0
            df[throughput_column] = tokens_per_batch / latency_seconds

            # Keep metadata in sync with the new column (datasets injected
            # directly into self.data may not have a metadata entry yet)
            self.metadata.setdefault(dataset_name, {})['columns'] = list(df.columns)

            print(f"Added '{throughput_column}' column to dataset '{dataset_name}'")

        print(f"\nThroughput computed with: batch_size={batch_size}, seq_len={seq_len}, tokens_per_batch={tokens_per_batch}")

    def get_gpu_cpu_runtime(self, dataset_name: Optional[str] = None,
                           events_file: Optional[str] = None) -> Optional[Dict[str, float]]:
        """
        Calculate total GPU and CPU runtime from events data.

        The GPU time is the sum of 'latency_ms' over all events of type 'iteration'.
        The wall time comes from the 'sec' field of the 'time_elapsed' event, or,
        failing that, from the 'timestamp' difference of 'run_end' and 'run_start';
        if neither is available the GPU time is used as the wall time.
        The CPU time is the wall time minus GPU time, clamped at zero.

        Args:
            dataset_name: Name of dataset to analyze (uses loaded data)
            events_file: Path to events.jsonl file (alternative to using loaded data);
                takes precedence when both are given

        Returns:
            Dictionary with timing statistics, or None when no events source is
            available or no iteration events are found
        """
        # Load events from file or use loaded dataset
        if events_file:
            with open(events_file, 'r') as f:
                # Blank lines (e.g. a trailing newline) are not valid JSON
                events = [json.loads(line) for line in f if line.strip()]
        elif dataset_name and dataset_name in self.data:
            # Convert dataframe back to list of dicts
            events = self.data[dataset_name].to_dict('records')
        elif dataset_name:
            print(f"Error: Dataset '{dataset_name}' not found")
            return None
        else:
            print(f"Error: Must provide either dataset_name or events_file")
            return None

        # Parse events
        iterations = [e for e in events if e.get('type') == 'iteration']
        time_elapsed_event = next((e for e in events if e.get('type') == 'time_elapsed'), None)
        run_start = next((e for e in events if e.get('type') == 'run_start'), None)
        run_end = next((e for e in events if e.get('type') == 'run_end'), None)

        if not iterations:
            print("Error: No iteration events found")
            return None

        # Calculate GPU time (sum of all iteration latencies)
        total_gpu_time_ms = sum(it['latency_ms'] for it in iterations)

        # Get wall time
        if self._has_value(time_elapsed_event, 'sec'):
            total_wall_time_ms = time_elapsed_event['sec'] * 1000
        elif self._has_value(run_start, 'timestamp') and self._has_value(run_end, 'timestamp'):
            # Fallback: calculate from timestamps
            elapsed = run_end['timestamp'] - run_start['timestamp']
            # pandas may have parsed the 'timestamp' column into datetimes,
            # in which case the difference is a Timedelta, not seconds.
            if hasattr(elapsed, 'total_seconds'):
                elapsed = elapsed.total_seconds()
            total_wall_time_ms = elapsed * 1000
        else:
            print("Warning: Could not determine wall time")
            total_wall_time_ms = total_gpu_time_ms  # Fallback

        # Calculate CPU overhead time
        total_cpu_time_ms = max(0, total_wall_time_ms - total_gpu_time_ms)

        # Calculate GPU utilization
        gpu_utilization_pct = (total_gpu_time_ms / total_wall_time_ms * 100) if total_wall_time_ms > 0 else 0

        num_iterations = len(iterations)  # non-zero: checked above
        return {
            'total_gpu_time_ms': total_gpu_time_ms,
            'total_cpu_time_ms': total_cpu_time_ms,
            'total_wall_time_ms': total_wall_time_ms,
            'num_iterations': num_iterations,
            'gpu_utilization_pct': gpu_utilization_pct,
            'avg_gpu_time_per_iter_ms': total_gpu_time_ms / num_iterations,
            'avg_cpu_overhead_per_iter_ms': total_cpu_time_ms / num_iterations
        }

    def compute_statistics(self, column: str, datasets: Optional[List[str]] = None, percentiles: Optional[List[float]] = None) -> pd.DataFrame:
        """
        Compute comprehensive statistics for all loaded datasets.

        Missing values are ignored by every statistic, including 'count'
        (which therefore is the number of non-missing samples). Unknown
        datasets and datasets without the column are reported and skipped.

        Args:
            column: Column name to analyze
            datasets: List of dataset names to include (None = all)
            percentiles: List of percentiles to compute (None = [50, 90, 95, 99])

        Returns:
            DataFrame with one row of statistics per dataset
        """
        if datasets is None:
            datasets = list(self.data.keys())
        if percentiles is None:
            percentiles = [50, 90, 95, 99]

        stats_list = []

        for label in datasets:
            df = self.data.get(label)

            if df is None:
                print(f"Warning: Dataset '{label}' not found, skipping...")
                continue

            if column not in df.columns:
                print(f"Warning: Column '{column}' not found in dataset '{label}', skipping...")
                continue

            values = df[column]

            stats = {
                'dataset': label,
                'column': column,
                # count() skips NaN, matching what mean/std/quantile operate on
                'count': int(values.count()),
                'mean': values.mean(),
                'std': values.std(),
                'min': values.min(),
                'max': values.max(),
            }

            # Add percentiles
            for p in percentiles:
                stats[f'p{p}'] = values.quantile(p / 100)

            stats_list.append(stats)

        return pd.DataFrame(stats_list)

    def plot_distributions(self,
                          column: str,
                          datasets: Optional[List[str]] = None,
                          plot_type: str = 'hist',
                          bins: int = 50,
                          figsize: tuple = (14, 6)) -> None:
        """
        Plot distributions of metrics with equal-width bins for consistent visualization.

        The histogram goes on the left axis and the KDE on the right one.
        Missing values are dropped before plotting. No figure is created when
        there is nothing to plot.

        Args:
            column: Column name to analyze
            datasets: List of dataset labels to plot (None = all)
            plot_type: 'hist', 'kde', or 'both'
            bins: Number of bins for histogram (or bin edges array)
            figsize: Figure size
        """
        if datasets is None:
            datasets = list(self.data.keys())

        # Gather the plottable series first so bin edges can be shared and an
        # empty figure is never opened.
        series_by_label = {}
        for label in datasets:
            df = self.data.get(label)

            if df is None:
                print(f"Warning: Dataset '{label}' not found, skipping...")
                continue

            if column not in df.columns:
                print(f"Warning: Column '{column}' not found in dataset '{label}', skipping...")
                continue

            series_by_label[label] = df[column].dropna()

        non_empty = [s.to_numpy() for s in series_by_label.values() if len(s)]
        if not non_empty:
            print(f"Warning: No data found for column '{column}'")
            return

        # Create equal-width bin edges based on ALL data
        if isinstance(bins, (int, np.integer)):
            all_values = np.concatenate(non_empty)
            bin_edges = np.linspace(all_values.min(), all_values.max(), bins + 1)
        else:
            # If bins is already an array, use it as-is
            bin_edges = bins

        fig, axes = plt.subplots(1, 2, figsize=figsize)

        for label, values in series_by_label.items():
            # Histogram with equal-width bins
            if plot_type in ('hist', 'both'):
                axes[0].hist(values, bins=bin_edges, alpha=0.6, label=label)

            # KDE (needs at least two distinct values to estimate a density)
            if plot_type in ('kde', 'both'):
                if values.nunique() > 1:
                    values.plot.kde(ax=axes[1], label=label)
                else:
                    print(f"Warning: Not enough distinct values for KDE in dataset '{label}', skipping...")

        axes[0].set_xlabel(f'{column}')
        axes[0].set_ylabel('Frequency')
        axes[0].set_title('Distribution (Histogram)')
        axes[0].grid(True, alpha=0.3)

        axes[1].set_xlabel(f'{column}')
        axes[1].set_ylabel('Density')
        axes[1].set_title('Distribution (KDE)')
        axes[1].grid(True, alpha=0.3)

        # Only add a legend where something was drawn (avoids the
        # "no artists with labels" warning on the unused axis)
        for ax in axes:
            handles, _ = ax.get_legend_handles_labels()
            if handles:
                ax.legend()

        plt.tight_layout()
        plt.show()

    def plot_comparison(self,
                       dataset: str,
                       column: str,
                       metrics: Optional[List[str]] = None,
                       figsize: tuple = (10, 6)) -> None:
        """
        Bar plot comparing multiple metrics for a single dataset and column.

        Args:
            dataset: Dataset label to analyze
            column: Column name to analyze
            metrics: List of metrics to compare: 'mean', 'median', 'min', 'max',
                'std' or 'p<percentile>' such as 'p95' or 'p99.9'
                (None = ['mean', 'median', 'p95', 'p99']). Unknown metrics are
                reported and skipped.
            figsize: Figure size
        """
        if metrics is None:
            metrics = ['mean', 'median', 'p95', 'p99']

        if dataset not in self.data:
            print(f"Error: Dataset '{dataset}' not found")
            print(f"Available datasets: {self.list_datasets()}")
            return

        df = self.data[dataset]

        if column not in df.columns:
            print(f"Error: Column '{column}' not found in dataset '{dataset}'")
            print(f"Available columns: {list(df.columns)}")
            return

        data = df[column]

        values = []
        labels = []

        for metric in metrics:
            if metric in ('mean', 'median', 'min', 'max', 'std'):
                val = getattr(data, metric)()
            elif metric.startswith('p'):
                # Anything after 'p' that is not a number in [0, 100] is not a
                # percentile (e.g. 'percent'); treat it as unknown.
                try:
                    percentile = float(metric[1:])
                except ValueError:
                    percentile = None
                if percentile is None or not 0 <= percentile <= 100:
                    print(f"Warning: Unknown metric '{metric}', skipping")
                    continue
                val = data.quantile(percentile / 100)
            else:
                print(f"Warning: Unknown metric '{metric}', skipping")
                continue

            values.append(val)
            labels.append(metric.upper())

        if not values:
            print("Warning: No valid metrics to plot")
            return

        plt.figure(figsize=figsize)
        plt.bar(labels, values)
        plt.xlabel('Metric')
        plt.ylabel(f'{column}')
        plt.title(f'Metric Comparison for {dataset} - {column}')
        plt.xticks(rotation=45, ha='right')
        plt.grid(True, alpha=0.3, axis='y')
        plt.tight_layout()
        plt.show()

    def plot_boxplot(self,
                     column: str,
                     datasets: Optional[List[str]] = None,
                     figsize: tuple = (12, 6)) -> None:
        """
        Create box plot for comparing distributions.

        Missing values are dropped before plotting. No figure is created when
        no dataset provides the column.

        Args:
            column: Column name to analyze
            datasets: List of dataset labels to plot (None = all)
            figsize: Figure size
        """
        if datasets is None:
            datasets = list(self.data.keys())

        plot_data = []
        plot_labels = []

        for label in datasets:
            df = self.data.get(label)

            if df is None:
                print(f"Warning: Dataset '{label}' not found, skipping...")
                continue

            if column not in df.columns:
                print(f"Warning: Column '{column}' not found in dataset '{label}', skipping...")
                continue

            # NaN would turn the whole box into NaN quartiles
            plot_data.append(df[column].dropna().values)
            plot_labels.append(label)

        if not plot_data:
            print(f"Warning: No data found for column '{column}'")
            return

        plt.figure(figsize=figsize)
        plt.boxplot(plot_data)
        plt.xlabel('Dataset')
        plt.ylabel(f'{column}')
        plt.title(f'Distribution Comparison (Box Plot) - {column}')
        # Boxes sit at positions 1..N; labelling via xticks avoids boxplot's
        # `labels` keyword, which newer matplotlib releases deprecate.
        plt.xticks(list(range(1, len(plot_labels) + 1)), plot_labels, rotation=45, ha='right')
        plt.grid(True, alpha=0.3, axis='y')
        plt.tight_layout()
        plt.show()

    def export_summary(self, column: str, output_file: str, datasets: Optional[List[str]] = None, percentiles: Optional[List[float]] = None) -> None:
        """
        Export summary statistics to CSV.

        Args:
            column: Column name to analyze
            output_file: Path to output CSV file
            datasets: List of dataset names to include (None = all)
            percentiles: List of percentiles to include (None = [50, 90, 95, 99])
        """
        stats_df = self.compute_statistics(column, datasets, percentiles)
        stats_df.to_csv(output_file, index=False)
        print(f"Summary exported to {output_file}")

    def get_data(self, label: str) -> Optional[pd.DataFrame]:
        """Get raw data for a specific dataset (None if the label is unknown)."""
        return self.data.get(label)

    def list_datasets(self) -> List[str]:
        """List all loaded datasets."""
        return list(self.data.keys())

    def get_columns(self, dataset: str) -> List[str]:
        """Get list of columns recorded in metadata for a dataset ([] if unknown)."""
        if dataset in self.metadata:
            return self.metadata[dataset]['columns']
        return []

    def clear(self) -> None:
        """Clear all loaded data."""
        self.data.clear()
        self.metadata.clear()

# Helper functions for common label extraction patterns
def extract_batch_size(filename: Union[str, Path]) -> str:
    """
    Extract batch size from filename like 'model_b8_L128_latencies_ms.csv'.

    Args:
        filename: File name or path, as a str or pathlib.Path (DataAnalyzer.load_files
            hands label extractors a Path)

    Returns:
        'batch_<N>' for the first '_b<N>_' token found, otherwise the input as a string
    """
    # re.search() only accepts str/bytes, so normalise Path inputs first
    name = str(filename)
    match = re.search(r'_b(\d+)_', name)
    return f"batch_{match.group(1)}" if match else name

def extract_model_and_batch(filename: Union[str, Path]) -> str:
    """
    Extract model and batch size from filename.

    Recognised model names are 'distilgpt2' and 'Mistral'; the batch size is
    the number in the first '_b<N>_' token. Either part falls back to
    'unknown' when it is not found.

    Args:
        filename: File name or path, as a str or pathlib.Path (DataAnalyzer.load_files
            hands label extractors a Path)

    Returns:
        Label of the form '<model>_b<batch>'
    """
    # re.search() only accepts str/bytes, so normalise Path inputs first
    name = str(filename)
    model_match = re.search(r'(distilgpt2|Mistral)', name)
    batch_match = re.search(r'_b(\d+)_', name)

    model = model_match.group(1) if model_match else "unknown"
    batch = batch_match.group(1) if batch_match else "unknown"

    return f"{model}_b{batch}"

# Confirm the cell ran and show a short usage cheat-sheet (one line per entry).
print("\n".join((
    "DataAnalyzer class loaded successfully!",
    "\nQuick start examples:",
    "  analyzer = DataAnalyzer()",
    "  analyzer.load_files('path/to/file.csv', dataset_name='experiment1')",
    "  analyzer.load_pattern('experiments/logs/*.csv', label_extractor=extract_batch_size)",
    "  stats = analyzer.compute_statistics(column='latency_ms')",
    "  analyzer.plot_distributions(column='latency_ms')",
    "  analyzer.plot_comparison(dataset='experiment1', column='latency_ms', metrics=['mean', 'p95', 'p99'])",
    "\n  # Get GPU/CPU runtime breakdown:",
    "  gpu_cpu_stats = analyzer.get_gpu_cpu_runtime(dataset_name='experiment1')",
    "  # Or from a file:",
    "  gpu_cpu_stats = analyzer.get_gpu_cpu_runtime(events_file='path/to/events.jsonl')",
)))

In [None]:
# Standalone helper functions (they take a DataAnalyzer instance) for computing total throughput
def compute_total_throughput(analyzer,
                              latency_column='latency_ms',
                              batch_size=8,
                              seq_len=128,
                              datasets=None):
    """
    Compute total throughput for multi-tenant experiments.

    For multi-tenant scenarios, the total throughput is the sum of individual
    model throughputs. This allows comparison between:
    - Single model on 1 GPU
    - Multiple models sharing 1 GPU

    Per-sample throughput is (batch_size * seq_len) / (latency_ms / 1000).
    Samples are aligned across datasets by their 'iter' column when present,
    otherwise by the DataFrame index. Rows with a missing latency or
    iteration id are ignored; unknown datasets and datasets without the
    latency column are reported and skipped.

    Args:
        analyzer: DataAnalyzer instance (only list_datasets() and get_data() are used)
        latency_column: Column name containing latency data in milliseconds
        batch_size: Batch size used in experiment
        seq_len: Sequence length (number of tokens per sequence)
        datasets: List of datasets to include (None = all)

    Returns:
        DataFrame with columns 'iter', 'total_throughput' and 'num_models'
        (how many samples were summed for that iteration), sorted by 'iter'.
        Empty (but with those columns) when no dataset contributed data.
    """
    if datasets is None:
        datasets = analyzer.list_datasets()

    tokens_per_batch = batch_size * seq_len
    result_columns = ['iter', 'total_throughput', 'num_models']

    # One (iter, throughput) frame per dataset that actually has data
    per_model_frames = []

    for dataset_name in datasets:
        df = analyzer.get_data(dataset_name)

        if df is None:
            print(f"Warning: Dataset '{dataset_name}' not found, skipping...")
            continue

        if latency_column not in df.columns:
            print(f"Warning: Column '{latency_column}' not found in dataset '{dataset_name}', skipping...")
            continue

        # Vectorised throughput; plain arrays sidestep index alignment
        iter_ids = df['iter'].to_numpy() if 'iter' in df.columns else df.index.to_numpy()
        throughput = (tokens_per_batch / (df[latency_column] / 1000.0)).to_numpy()
        per_model = pd.DataFrame({'iter': iter_ids, 'throughput': throughput})

        # A NaN latency would otherwise turn the whole iteration's sum into NaN
        per_model = per_model.dropna(subset=['iter', 'throughput'])
        if not per_model.empty:
            per_model_frames.append(per_model)

    if not per_model_frames:
        print("Warning: No throughput data available for the requested datasets")
        return pd.DataFrame(columns=result_columns)

    # Sum throughputs for each iteration (groupby sorts by 'iter')
    combined = pd.concat(per_model_frames, ignore_index=True)
    result_df = (
        combined.groupby('iter')['throughput']
        .agg(total_throughput='sum', num_models='count')
        .reset_index()
    )

    # Print summary statistics
    totals = result_df['total_throughput']
    print(f"\n{'='*60}")
    print(f"TOTAL THROUGHPUT ANALYSIS")
    print(f"{'='*60}")
    # Count only the datasets that contributed samples
    print(f"Configuration: {len(per_model_frames)} models on 1 GPU")
    print(f"Batch size: {batch_size}, Seq length: {seq_len}")
    print(f"Tokens per batch: {tokens_per_batch}")
    print(f"\nTotal Throughput Statistics (tokens/second):")
    print(f"  Mean:   {totals.mean():,.2f}")
    print(f"  Std:    {totals.std():,.2f}")
    print(f"  Min:    {totals.min():,.2f}")
    print(f"  Max:    {totals.max():,.2f}")
    print(f"  Median: {totals.median():,.2f}")
    print(f"  P95:    {totals.quantile(0.95):,.2f}")
    print(f"  P99:    {totals.quantile(0.99):,.2f}")
    print(f"{'='*60}\n")

    return result_df


def compare_multi_vs_single_tenant(multi_tenant_analyzer,
                                    single_tenant_analyzer,
                                    batch_size=8,
                                    seq_len=128,
                                    figsize=(14, 6),
                                    latency_column='latency_ms'):
    """
    Compare total throughput between multi-tenant and single-tenant configurations.

    The multi-tenant side is the per-iteration sum produced by
    compute_total_throughput(); the single-tenant side is the per-sample
    throughput of the FIRST dataset in single_tenant_analyzer. Prints a
    summary and shows a histogram plus a bar chart of the two means.

    Args:
        multi_tenant_analyzer: DataAnalyzer with multiple model datasets loaded
        single_tenant_analyzer: DataAnalyzer with single model dataset loaded
        batch_size: Batch size used
        seq_len: Sequence length
        figsize: Figure size for plots
        latency_column: Column name containing latency data in milliseconds

    Returns:
        Dictionary with comparison statistics ('multi_tenant_mean',
        'single_tenant_mean', 'improvement_percent', 'multi_df',
        'single_throughput'), or None when either side has no usable data
    """
    # Compute total throughput for multi-tenant
    print("Computing multi-tenant total throughput...")
    multi_df = compute_total_throughput(multi_tenant_analyzer,
                                       latency_column=latency_column,
                                       batch_size=batch_size,
                                       seq_len=seq_len)
    if multi_df is None or multi_df.empty:
        print("Error: No throughput data in multi_tenant_analyzer")
        return None

    # Compute throughput for single tenant
    print("\nComputing single-tenant throughput...")
    single_datasets = single_tenant_analyzer.list_datasets()
    if len(single_datasets) == 0:
        print("Error: No datasets in single_tenant_analyzer")
        return None

    single_df = single_tenant_analyzer.get_data(single_datasets[0])
    if single_df is None or latency_column not in single_df.columns:
        print(f"Error: Column '{latency_column}' not found in dataset '{single_datasets[0]}'")
        return None

    tokens_per_batch = batch_size * seq_len
    # NaN samples carry no information and break histogram auto-binning
    single_throughput = (tokens_per_batch / (single_df[latency_column] / 1000.0)).dropna()
    multi_throughput = multi_df['total_throughput'].dropna()
    if single_throughput.empty or multi_throughput.empty:
        print("Error: No valid throughput samples to compare")
        return None

    # Largest number of models that contributed to any one iteration
    num_models = int(multi_df['num_models'].max())

    # Comparison statistics
    multi_mean = multi_throughput.mean()
    single_mean = single_throughput.mean()
    improvement = ((multi_mean - single_mean) / single_mean) * 100

    print(f"\n{'='*60}")
    print(f"MULTI-TENANT vs SINGLE-TENANT COMPARISON")
    print(f"{'='*60}")
    print(f"Multi-tenant ({num_models} models):")
    print(f"  Mean throughput: {multi_mean:,.2f} tokens/s")
    print(f"\nSingle-tenant (1 model):")
    print(f"  Mean throughput: {single_mean:,.2f} tokens/s")
    print(f"\nDifference: {improvement:+.2f}%")
    if improvement > 0:
        print(f"Multi-tenant provides {improvement:.2f}% MORE total throughput")
    else:
        print(f"Multi-tenant provides {abs(improvement):.2f}% LESS total throughput")
    print(f"{'='*60}\n")

    # Plot comparison
    fig, axes = plt.subplots(1, 2, figsize=figsize)

    # Distribution comparison
    axes[0].hist(multi_throughput, bins=30, alpha=0.6, label=f'Multi-tenant ({num_models} models)', color='blue')
    axes[0].hist(single_throughput, bins=30, alpha=0.6, label='Single-tenant (1 model)', color='orange')
    axes[0].set_xlabel('Throughput (tokens/s)')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Throughput Distribution Comparison')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Bar chart of means
    means = [multi_mean, single_mean]
    labels = [f'Multi-tenant\n({num_models} models)', 'Single-tenant\n(1 model)']
    colors = ['blue', 'orange']

    bars = axes[1].bar(labels, means, color=colors, alpha=0.6)
    axes[1].set_ylabel('Mean Throughput (tokens/s)')
    axes[1].set_title('Mean Throughput Comparison')
    axes[1].grid(True, alpha=0.3, axis='y')

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        axes[1].text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:,.0f}',
                    ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    return {
        'multi_tenant_mean': multi_mean,
        'single_tenant_mean': single_mean,
        'improvement_percent': improvement,
        'multi_df': multi_df,
        'single_throughput': single_throughput
    }


# Confirm the cell ran and show how to call the two throughput helpers.
print("\n".join((
    "Total throughput analysis functions loaded!",
    "\nUsage examples:",
    "  # Compute total throughput for multi-tenant setup",
    "  total_throughput_df = compute_total_throughput(analyzer, batch_size=8, seq_len=128)",
    "",
    "  # Compare multi-tenant vs single-tenant",
    "  comparison = compare_multi_vs_single_tenant(multi_analyzer, single_analyzer, batch_size=8, seq_len=128)",
)))


## Statistical Fairness & Significance Analysis

This section provides tools to analyze:
1. **Fairness**: Whether GPU resources are shared fairly among models
2. **Statistical Significance**: Whether performance differences are statistically significant

In [None]:
from scipy import stats
from typing import Tuple

def compute_fairness_metrics(analyzer, column='latency_ms', datasets=None, metric='mean', quantiles=(0.25, 0.5, 0.75)):
    """
    Compute and print fairness metrics for resource sharing among models.

    Every dataset is reduced to one summary value (selected by ``metric``) and
    the spread of those values across datasets is measured. A full report is
    printed to stdout and the same numbers are returned.

    Fairness Metrics:
    - Coefficient of Variation (CV): std/mean of the per-model values,
      lower is fairer (0 = perfectly fair)
    - Gini Coefficient: 0 (perfect equality) to 1 (perfect inequality)
    - Max/Min Ratio: largest per-model value divided by the smallest
    - Range: difference between max and min
    - Mann-Whitney U Tests: two-sided pairwise tests on the raw samples,
      judged against a Bonferroni-corrected alpha of 0.05
    - Quantile fairness: CV, max/min ratio and range of each requested
      quantile across models

    Datasets that are missing, lack ``column`` or hold no non-NaN values are
    skipped; every result (including the pairwise tests) refers only to the
    datasets that were actually used, which are listed under ``'datasets'``.

    Args:
        analyzer: DataAnalyzer instance (needs ``list_datasets()`` and ``get_data(name)``)
        column: Column to analyze (default 'latency_ms')
        datasets: List of datasets (None = all)
        metric: 'mean', 'median', or a percentile such as 'p50', 'p95', 'p99.9'
        quantiles: Iterable of quantiles in [0, 1] for the quantile fairness analysis

    Returns:
        Dictionary with fairness metrics, or None if no dataset had usable data

    Raises:
        ValueError: if ``metric`` is not recognised
    """
    if datasets is None:
        datasets = analyzer.list_datasets()
    quantiles = list(quantiles)

    def _summarize(values):
        """Reduce one model's samples to the value selected by ``metric``."""
        if metric == 'mean':
            return np.mean(values)
        if metric in ('median', 'p50'):
            return np.median(values)
        if metric.startswith('p'):
            try:
                percentile = float(metric[1:])
            except ValueError:
                raise ValueError(f"Unknown metric: {metric}") from None
            return np.percentile(values, percentile)
        raise ValueError(f"Unknown metric: {metric}")

    # Collect per-model data. The names are tracked alongside the samples so
    # that skipped datasets can never shift labels or indices later on.
    valid_datasets = []
    data_by_model = []  # Full samples, needed for the Mann-Whitney tests
    metric_values = []
    medians = []

    for dataset in datasets:
        df = analyzer.get_data(dataset)
        if df is None or column not in df.columns:
            continue
        data = df[column].dropna().values
        if len(data) == 0:
            print(f"Warning: '{dataset}' has no valid '{column}' values, skipping...")
            continue
        valid_datasets.append(dataset)
        data_by_model.append(data)
        medians.append(np.median(data))
        metric_values.append(_summarize(data))

    if len(metric_values) == 0:
        print("Error: No valid data found")
        return None

    metric_values = np.array(metric_values)
    medians = np.array(medians)
    n_models = len(valid_datasets)

    # ========== Traditional Fairness Metrics ==========
    mean_val = np.mean(metric_values)

    # Coefficient of Variation (lower is better, 0 = perfectly fair)
    cv = np.std(metric_values) / mean_val

    # Gini Coefficient (closed form over the sorted values)
    sorted_values = np.sort(metric_values)
    n = len(sorted_values)
    gini = (2 * np.sum((np.arange(1, n+1)) * sorted_values)) / (n * np.sum(sorted_values)) - (n + 1) / n

    # Max/Min ratio
    max_min_ratio = np.max(metric_values) / np.min(metric_values)

    # Range
    range_val = np.max(metric_values) - np.min(metric_values)

    # Largest percent difference from the mean of the per-model values
    max_deviation_pct = np.max(np.abs(metric_values - mean_val)) / mean_val * 100

    # ========== Report header ==========
    print(f"\n{'='*80}")
    print(f"FAIRNESS ANALYSIS: {column} (using {metric})")
    print(f"{'='*80}")
    print(f"Number of models: {n_models}")
    print(f"\nIndividual {metric}s:")
    for dataset, val in zip(valid_datasets, metric_values):
        deviation = ((val - mean_val) / mean_val) * 100
        print(f"  {dataset:20s}: {val:8.2f} ({deviation:+.2f}% from avg)")

    # ========== Mann-Whitney U Tests (Pairwise Median Comparisons) ==========
    print(f"\n{'─'*80}")
    print(f"MANN-WHITNEY U TESTS (Pairwise Median Comparisons)")
    print(f"{'─'*80}")
    print(f"Tests whether the medians of two groups are significantly different.")
    print(f"More robust to outliers than t-tests.\n")

    mann_whitney_results = []
    n_comparisons = n_models * (n_models - 1) // 2
    # With a single model there is nothing to correct for; keep the nominal
    # alpha instead of dividing by zero.
    bonferroni_alpha = 0.05 / n_comparisons if n_comparisons else 0.05

    for i in range(n_models):
        for j in range(i+1, n_models):
            u_stat, p_val = stats.mannwhitneyu(data_by_model[i], data_by_model[j],
                                               alternative='two-sided')

            median_i = medians[i]
            median_j = medians[j]
            diff = median_i - median_j
            diff_pct = (diff / median_j) * 100

            # Effect size (rank-biserial correlation derived from U)
            n1, n2 = len(data_by_model[i]), len(data_by_model[j])
            r = 1 - (2*u_stat) / (n1 * n2)

            mann_whitney_results.append({
                'model_1': valid_datasets[i],
                'model_2': valid_datasets[j],
                'median_1': median_i,
                'median_2': median_j,
                'difference': diff,
                'diff_percent': diff_pct,
                'u_statistic': u_stat,
                'p_value': p_val,
                'effect_size_r': r,
                'significant': p_val < bonferroni_alpha
            })

    # Print Mann-Whitney results
    significant_mw = [r for r in mann_whitney_results if r['significant']]
    print(f"Bonferroni-corrected alpha: {bonferroni_alpha:.6f} ({n_comparisons} comparisons)")
    print(f"\nPairwise Results:")
    for result in mann_whitney_results:
        sig_marker = "***" if result['significant'] else "   "
        print(f"{sig_marker} {result['model_1']:15s} vs {result['model_2']:15s}: " +
              f"Δmedian={result['difference']:7.2f} ({result['diff_percent']:+6.2f}%), " +
              f"p={result['p_value']:.6f}, r={result['effect_size_r']:+.3f}")

    print(f"\nEffect Size Interpretation (rank-biserial r):")
    print(f"  < 0.1: negligible, 0.1-0.3: small, 0.3-0.5: medium, > 0.5: large")

    if significant_mw:
        print(f"\n✓ {len(significant_mw)} significant median difference(s) found:")
        for r in significant_mw:
            print(f"  • {r['model_1']} vs {r['model_2']}: {r['diff_percent']:+.2f}% (p={r['p_value']:.6f})")
    else:
        print(f"\n✓ No statistically significant median differences found.")
        print(f"  → Resource sharing appears fair from a statistical perspective.")

    # ========== Quantile Fairness Analysis ==========
    print(f"\n{'─'*80}")
    print(f"QUANTILE FAIRNESS ANALYSIS")
    print(f"{'─'*80}")
    print(f"Analyzes fairness across different percentiles of the distribution.")
    print(f"Helps identify if unfairness is concentrated in tail latencies.\n")

    quantile_fairness = {}
    for q in quantiles:
        q_values = np.array([np.quantile(data, q) for data in data_by_model])
        q_cv = np.std(q_values) / np.mean(q_values)
        q_max_min = np.max(q_values) / np.min(q_values)
        q_range = np.max(q_values) - np.min(q_values)

        quantile_fairness[q] = {
            'values': q_values,
            'cv': q_cv,
            'max_min_ratio': q_max_min,
            'range': q_range,
            'mean': np.mean(q_values)
        }

        print(f"Q{int(q*100):02d} (p{int(q*100)}): CV={q_cv:.4f}, Max/Min={q_max_min:.3f}x, " +
              f"Range={q_range:.2f}")

    print(f"\nQuantile Fairness Interpretation:")
    if not quantiles:
        print(f"  (no quantiles requested)")
    elif all(quantile_fairness[q]['cv'] < 0.1 for q in quantiles):
        print(f"  ✓ EXCELLENT - Fair across all percentiles")
    elif all(quantile_fairness[q]['cv'] < 0.2 for q in quantiles):
        print(f"  ✓ GOOD - Reasonably fair across distribution")
    else:
        # Check if the tail is worse than the centre of the distribution.
        # The centre is the requested quantile closest to 0.5, so callers are
        # not forced to include the median in ``quantiles``.
        tail_q = max(quantiles)
        central_q = min(quantiles, key=lambda q: abs(q - 0.5))
        tail_cv = quantile_fairness[tail_q]['cv']
        median_cv = quantile_fairness[central_q]['cv']
        if tail_cv > median_cv * 1.5:
            print(f"  ⚠ TAIL UNFAIRNESS - Higher unfairness in tail latencies (p{int(tail_q*100)})")
        else:
            print(f"  ~ MODERATE - Some unfairness across distribution")

    # ========== Traditional Fairness Metrics Summary ==========
    print(f"\n{'─'*80}")
    print(f"TRADITIONAL FAIRNESS METRICS (based on {metric})")
    print(f"{'─'*80}")
    print(f"  Coefficient of Variation: {cv:.4f}")
    print(f"    → Lower is fairer. <0.1 is good, <0.05 is excellent")
    print(f"  Gini Coefficient: {gini:.4f}")
    print(f"    → 0 = perfect equality, 1 = perfect inequality")
    print(f"  Max/Min Ratio: {max_min_ratio:.3f}x")
    print(f"    → Slowest model is {max_min_ratio:.3f}x slower than fastest")
    print(f"  Max Deviation: {max_deviation_pct:.2f}%")
    print(f"    → Worst-case deviation from average")

    # Fairness assessment
    print(f"\nOverall Fairness Assessment:")
    if cv < 0.05:
        print(f"  ✓ EXCELLENT - Resources are shared very fairly")
    elif cv < 0.1:
        print(f"  ✓ GOOD - Resources are shared fairly")
    elif cv < 0.2:
        print(f"  ~ MODERATE - Some unfairness in resource sharing")
    else:
        print(f"  ✗ POOR - Significant unfairness in resource sharing")

    print(f"{'='*80}\n")

    return {
        'num_models': n_models,
        'datasets': valid_datasets,
        'metric': metric,
        'mean_of_metrics': mean_val,
        'std_of_metrics': np.std(metric_values),
        'coefficient_of_variation': cv,
        'gini_coefficient': gini,
        'max_min_ratio': max_min_ratio,
        'range': range_val,
        'max_deviation_percent': max_deviation_pct,
        'individual_values': {dataset: val for dataset, val in zip(valid_datasets, metric_values)},
        'mann_whitney_tests': mann_whitney_results,
        'mann_whitney_significant': significant_mw,
        'quantile_fairness': quantile_fairness,
        'bonferroni_alpha': bonferroni_alpha
    }


def test_statistical_significance(analyzer, column='latency_ms', datasets=None, alpha=0.05):
    """
    Test whether performance differences between models are statistically significant.

    A report is printed to stdout and the same numbers are returned.

    Tests:
    1. Levene's test: Tests if variances are equal (homoscedasticity)
    2. Omnibus test: one-way ANOVA when Levene's test does not reject equal
       variances, otherwise Kruskal-Wallis
    3. Pairwise t-tests for every pair of models (Welch's variant when the
       variances are unequal), judged against a Bonferroni-corrected alpha.
       These are always run, whatever the omnibus result.

    NaN values are dropped before testing, and datasets that are missing, lack
    ``column`` or have no non-NaN values are skipped.

    Args:
        analyzer: DataAnalyzer instance (needs ``list_datasets()`` and ``get_data(name)``)
        column: Column to analyze
        datasets: List of datasets (None = all)
        alpha: Significance level (default 0.05)

    Returns:
        Dictionary with test results, or None if fewer than 2 datasets are usable
    """
    if datasets is None:
        datasets = analyzer.list_datasets()

    # Get data for each model. NaNs are removed up front: a single NaN would
    # otherwise turn every statistic below into NaN and silently report
    # "not significant".
    data_by_model = []
    valid_datasets = []

    for dataset in datasets:
        df = analyzer.get_data(dataset)
        if df is None or column not in df.columns:
            continue
        values = df[column].dropna().values
        if len(values) == 0:
            print(f"Warning: '{dataset}' has no valid '{column}' values, skipping...")
            continue
        data_by_model.append(values)
        valid_datasets.append(dataset)

    if len(data_by_model) < 2:
        print("Error: Need at least 2 datasets for significance testing")
        return None

    print(f"\n{'='*60}")
    print(f"STATISTICAL SIGNIFICANCE ANALYSIS: {column}")
    print(f"{'='*60}")
    print(f"Significance level (alpha): {alpha}")
    print(f"Number of models: {len(valid_datasets)}")

    # 1. Levene's test for equal variances
    print(f"\n1. Levene's Test (Equal Variances):")
    levene_stat, levene_p = stats.levene(*data_by_model)
    print(f"   Statistic: {levene_stat:.4f}, p-value: {levene_p:.4f}")
    equal_var = bool(levene_p > alpha)
    if equal_var:
        print(f"   → Variances are equal (homoscedastic) ✓")
    else:
        print(f"   → Variances are NOT equal (heteroscedastic) ✗")

    # 2. One-way ANOVA (or Kruskal-Wallis if variances unequal)
    if equal_var:
        print(f"\n2. One-way ANOVA:")
        omnibus_stat, anova_p = stats.f_oneway(*data_by_model)
        print(f"   F-statistic: {omnibus_stat:.4f}, p-value: {anova_p:.6f}")
        test_name = "ANOVA"
    else:
        print(f"\n2. Kruskal-Wallis Test (non-parametric alternative):")
        omnibus_stat, anova_p = stats.kruskal(*data_by_model)
        print(f"   H-statistic: {omnibus_stat:.4f}, p-value: {anova_p:.6f}")
        test_name = "Kruskal-Wallis"

    if anova_p < alpha:
        print(f"   → At least one model is significantly different ✓")
        print(f"   → p < {alpha}, reject null hypothesis")
    else:
        print(f"   → No significant differences detected")
        print(f"   → p >= {alpha}, fail to reject null hypothesis")

    # 3. Pairwise comparisons (run for every pair regardless of the omnibus result)
    print(f"\n3. Pairwise Comparisons (t-tests with Bonferroni correction):")
    n_models = len(valid_datasets)
    n_comparisons = n_models * (n_models - 1) // 2  # >= 1 because n_models >= 2
    bonferroni_alpha = alpha / n_comparisons
    print(f"   Bonferroni-corrected alpha: {bonferroni_alpha:.6f} ({n_comparisons} comparisons)")

    pairwise_results = []
    significant_pairs = []

    for i in range(n_models):
        for j in range(i+1, n_models):
            # equal_var=False selects Welch's t-test
            t_stat, p_val = stats.ttest_ind(data_by_model[i], data_by_model[j],
                                           equal_var=equal_var)

            mean_i = np.mean(data_by_model[i])
            mean_j = np.mean(data_by_model[j])
            diff = mean_i - mean_j
            diff_pct = (diff / mean_j) * 100

            # Cohen's d (effect size); np.var uses its default ddof=0 here
            pooled_std = np.sqrt((np.var(data_by_model[i]) + np.var(data_by_model[j])) / 2)
            cohens_d = diff / pooled_std if pooled_std > 0 else 0

            is_significant = p_val < bonferroni_alpha

            pairwise_results.append({
                'model_1': valid_datasets[i],
                'model_2': valid_datasets[j],
                'mean_1': mean_i,
                'mean_2': mean_j,
                'difference': diff,
                'diff_percent': diff_pct,
                't_statistic': t_stat,
                'p_value': p_val,
                'cohens_d': cohens_d,
                'significant': is_significant
            })

            if is_significant:
                significant_pairs.append((valid_datasets[i], valid_datasets[j], p_val, diff_pct))

    # Print pairwise results
    print(f"\n   Pairwise Test Results:")
    for result in pairwise_results:
        sig_marker = "***" if result['significant'] else "   "
        print(f"   {sig_marker} {result['model_1']:15s} vs {result['model_2']:15s}: " +
              f"diff={result['difference']:7.2f} ({result['diff_percent']:+6.2f}%), " +
              f"p={result['p_value']:.6f}, d={result['cohens_d']:.3f}")

    print(f"\n   Effect Size Interpretation (Cohen's d):")
    print(f"     < 0.2: negligible, 0.2-0.5: small, 0.5-0.8: medium, > 0.8: large")

    if significant_pairs:
        print(f"\n   {len(significant_pairs)} significant difference(s) found:")
        for m1, m2, p, diff_pct in significant_pairs:
            print(f"     • {m1} vs {m2}: {diff_pct:+.2f}% difference (p={p:.6f})")
    else:
        print(f"\n   No statistically significant pairwise differences found.")

    print(f"{'='*60}\n")

    return {
        'datasets': valid_datasets,
        'levene_test': {'statistic': levene_stat, 'p_value': levene_p, 'equal_var': equal_var},
        'omnibus_test': {'test': test_name, 'statistic': omnibus_stat,
                        'p_value': anova_p, 'significant': anova_p < alpha},
        'pairwise_tests': pairwise_results,
        'significant_pairs': significant_pairs,
        'alpha': alpha,
        'bonferroni_alpha': bonferroni_alpha
    }


def visualize_fairness_and_significance(analyzer, column='latency_ms', datasets=None, 
                                       fairness_metrics=None, sig_results=None, 
                                       figsize=(16, 10)):
    """
    Create comprehensive visualization of fairness and significance analysis.

    Draws a 2x3 figure:
      1. mean ± std dev per model, with the mean of the means as a dashed line
      2. box plot of the raw samples
      3. within-model coefficient of variation
      4. text summary of the fairness metrics
      5. heatmap of pairwise -log10(p-value); pairs below the
         Bonferroni-corrected alpha are marked with '*'
      6. text summary of the significance tests

    Only datasets that exist, contain ``column`` and have at least one non-NaN
    value are plotted. If either set of results is unavailable (the helper
    returned None) or nothing is plottable, a message is printed and no figure
    is created.

    Args:
        analyzer: DataAnalyzer instance
        column: Column to analyze
        datasets: List of datasets (None = all)
        fairness_metrics: Pre-computed fairness metrics (optional)
        sig_results: Pre-computed significance test results (optional)
        figsize: Figure size
    """
    if datasets is None:
        datasets = analyzer.list_datasets()

    # Compute metrics if not provided
    if fairness_metrics is None:
        fairness_metrics = compute_fairness_metrics(analyzer, column, datasets)

    if sig_results is None:
        sig_results = test_statistical_significance(analyzer, column, datasets)

    # Both helpers return None when there is not enough data; bail out
    # instead of failing later on a None subscript.
    if fairness_metrics is None or sig_results is None:
        print("Error: Fairness/significance results unavailable, nothing to visualize")
        return

    # Collect the plottable samples once so every panel shares the same
    # datasets, in the same order, with matching lengths.
    plot_names = []
    samples = []
    for dataset in datasets:
        df = analyzer.get_data(dataset)
        if df is not None and column in df.columns:
            values = df[column].dropna().values
            if len(values) > 0:
                plot_names.append(dataset)
                samples.append(values)

    if not samples:
        print("Error: No valid data found")
        return

    n_datasets = len(plot_names)
    x_pos = np.arange(n_datasets)

    fig, axes = plt.subplots(2, 3, figsize=figsize)
    fig.suptitle(f'Fairness & Significance Analysis: {column}', fontsize=16, fontweight='bold')

    # 1. Mean comparison with error bars (sample std, ddof=1)
    means = [np.mean(values) for values in samples]
    stds = [np.std(values, ddof=1) if len(values) > 1 else 0.0 for values in samples]

    axes[0, 0].bar(x_pos, means, yerr=stds, capsize=5, alpha=0.7, color='steelblue')
    axes[0, 0].set_xticks(x_pos)
    axes[0, 0].set_xticklabels(plot_names, rotation=45, ha='right')
    axes[0, 0].set_ylabel(column)
    axes[0, 0].set_title('Mean ± Std Dev')
    axes[0, 0].grid(True, alpha=0.3, axis='y')

    # Add horizontal line at average
    overall_mean = np.mean(means)
    axes[0, 0].axhline(overall_mean, color='red', linestyle='--', alpha=0.5, label='Overall Mean')
    axes[0, 0].legend()

    # 2. Box plot. Tick labels are set explicitly afterwards, which avoids
    # relying on the boxplot labelling keyword.
    bp = axes[0, 1].boxplot(samples, patch_artist=True)
    for patch in bp['boxes']:
        patch.set_facecolor('lightblue')
    axes[0, 1].set_xticklabels(plot_names, rotation=45, ha='right')
    axes[0, 1].set_ylabel(column)
    axes[0, 1].set_title('Distribution Comparison')
    axes[0, 1].grid(True, alpha=0.3, axis='y')

    # 3. Coefficient of Variation comparison
    cv_values = [np.std(values) / np.mean(values) for values in samples]
    axes[0, 2].bar(x_pos, cv_values, alpha=0.7, color='coral')
    axes[0, 2].set_xticks(x_pos)
    axes[0, 2].set_xticklabels(plot_names, rotation=45, ha='right')
    axes[0, 2].set_ylabel('Coefficient of Variation')
    axes[0, 2].set_title('Within-Model Variability')
    axes[0, 2].grid(True, alpha=0.3, axis='y')
    axes[0, 2].axhline(0.1, color='orange', linestyle='--', alpha=0.5, label='Good threshold')
    axes[0, 2].axhline(0.05, color='green', linestyle='--', alpha=0.5, label='Excellent threshold')
    axes[0, 2].legend(fontsize=8)

    # 4. Fairness metrics summary
    overall_cv = fairness_metrics['coefficient_of_variation']
    if overall_cv < 0.05:
        assessment = "EXCELLENT"
    elif overall_cv < 0.1:
        assessment = "GOOD"
    elif overall_cv < 0.2:
        assessment = "MODERATE"
    else:
        assessment = "POOR"

    axes[1, 0].axis('off')
    fairness_text = f"""
    FAIRNESS METRICS
    ─────────────────────────
    Coefficient of Variation: {overall_cv:.4f}
    Gini Coefficient: {fairness_metrics['gini_coefficient']:.4f}
    Max/Min Ratio: {fairness_metrics['max_min_ratio']:.3f}x
    Max Deviation: {fairness_metrics['max_deviation_percent']:.2f}%

    Assessment: {assessment}
    """
    axes[1, 0].text(0.1, 0.5, fairness_text, fontsize=10, family='monospace',
                   verticalalignment='center')

    # 5. Pairwise p-values heatmap
    position = {name: idx for idx, name in enumerate(plot_names)}
    p_value_matrix = np.ones((n_datasets, n_datasets))

    for result in sig_results['pairwise_tests']:
        i = position.get(result['model_1'])
        j = position.get(result['model_2'])
        if i is None or j is None:
            # Pair refers to a dataset that is not being plotted
            continue
        p_value_matrix[i, j] = result['p_value']
        p_value_matrix[j, i] = result['p_value']

    # Use log scale for better visualization. p-values that underflow to 0
    # are floored at the smallest positive float so they show up as the MOST
    # significant cells rather than becoming inf.
    p_floor = np.finfo(float).tiny
    log_p_matrix = -np.log10(np.clip(p_value_matrix, p_floor, 1.0))

    im = axes[1, 1].imshow(log_p_matrix, cmap='RdYlGn_r', aspect='auto')
    axes[1, 1].set_xticks(np.arange(n_datasets))
    axes[1, 1].set_yticks(np.arange(n_datasets))
    axes[1, 1].set_xticklabels(plot_names, rotation=45, ha='right')
    axes[1, 1].set_yticklabels(plot_names)
    axes[1, 1].set_title('Pairwise Significance\n(-log10 p-value, * = below Bonferroni α)')

    # Add colorbar
    cbar = plt.colorbar(im, ax=axes[1, 1])
    cbar.set_label('-log10(p-value)', rotation=270, labelpad=15)

    # Mark the pairs that clear the Bonferroni-corrected threshold
    bonferroni_threshold = -np.log10(sig_results['bonferroni_alpha'])
    for i in range(n_datasets):
        for j in range(n_datasets):
            if i != j and log_p_matrix[i, j] > bonferroni_threshold:
                axes[1, 1].text(j, i, '*', ha='center', va='center',
                               color='black', fontsize=14, fontweight='bold')

    # 6. Statistical test summary
    omnibus = sig_results['omnibus_test']
    omnibus_verdict = "SIGNIFICANT" if omnibus['significant'] else "NOT SIGNIFICANT"

    axes[1, 2].axis('off')
    sig_text = f"""
    SIGNIFICANCE TESTS
    ─────────────────────────
    Test: {omnibus['test']}
    p-value: {omnibus['p_value']:.6f}
    Result: {omnibus_verdict}

    Bonferroni α: {sig_results['bonferroni_alpha']:.6f}

    Significant pairs: {len(sig_results['significant_pairs'])}
    Total comparisons: {len(sig_results['pairwise_tests'])}
    """
    axes[1, 2].text(0.1, 0.5, sig_text, fontsize=10, family='monospace',
                   verticalalignment='center')

    plt.tight_layout()
    plt.show()


# Cell banner: confirm the statistical helpers are defined and show typical calls.
# A single print joined with newlines emits the same text as one print per line.
print(
    "Enhanced statistical fairness and significance analysis functions loaded!",
    "\nNew Features:",
    "  • Mann-Whitney U tests for robust median comparisons",
    "  • Quantile regression to analyze fairness across percentiles",
    "  • Support for mean, median, or p50-based metrics",
    "\nUsage:",
    "  # Analyze fairness using mean (default)",
    "  fairness = compute_fairness_metrics(analyzer, column='latency_ms', metric='mean')",
    "",
    "  # Analyze fairness using median/p50 (more robust to outliers)",
    "  fairness = compute_fairness_metrics(analyzer, column='latency_ms', metric='median')",
    "",
    "  # Test statistical significance",
    "  sig_results = test_statistical_significance(analyzer, column='latency_ms')",
    "",
    "  # Visualize everything",
    "  visualize_fairness_and_significance(analyzer, column='latency_ms')",
    sep="\n",
)

## Resource Hogging Detection

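This section provides tools to detect whether one model is monopolizing GPU resources and starving the others, by comparing each model's multi-tenant slowdown against its own solo (baseline) performance.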

In [None]:
def _extract_model_name(dataset_name):
    """
    Map a dataset name onto the model family it belongs to.

    Known families are found by case-insensitive substring search, checked in
    a fixed order ('roberta' is tried before 'bert' so that RoBERTa datasets
    are not reported as 'bert'). Names that match no known family fall back to
    the lowercased name with every digit and underscore removed.

    Examples:
        'distilgpt2_1' -> 'gpt2'
        'solo_gpt2' -> 'gpt2'
        'distilbert_3' -> 'bert'
        'solo_bert' -> 'bert'
        'llama_1' -> 'llama'
        'solo_llama' -> 'llama'
        'mistral7b_2' -> 'mistral'
        'solo_mistral' -> 'mistral'
    """
    lowered = dataset_name.lower()

    # (family, substrings that identify it) -- order is significant
    known_families = (
        ('gpt2', ('gpt2', 'gpt-2')),
        ('roberta', ('roberta',)),
        ('bert', ('bert',)),
        ('llama', ('llama',)),
        ('mistral', ('mistral',)),
    )
    for family, needles in known_families:
        if any(needle in lowered for needle in needles):
            return family

    # Fallback: strip digits and underscores from the lowercased name
    return re.sub(r'[_\d]+', '', lowered)


def detect_resource_hogging_normalized(analyzer,
                                        baseline_analyzers,
                                        column='latency_ms',
                                        datasets=None,
                                        hogging_threshold=0.15):
    """
    Detect resource hogging by comparing relative slowdown from baseline performance.

    This version accounts for different model architectures having different baseline speeds.
    Instead of comparing absolute latency, it compares how much each model slowed down
    relative to its solo performance.

    Args:
        analyzer: DataAnalyzer with multi-tenant data
        baseline_analyzers: DataAnalyzer or list of DataAnalyzers with solo/baseline data for each model
                           If a list is provided, will search all analyzers for matching baseline data
        column: Column to analyze (default 'latency_ms')
        datasets: List of datasets (None = all)
        hogging_threshold: Threshold for detecting hogging (default 0.15 = 15% difference in slowdown)

    Returns:
        Dictionary with normalized hogging detection results
    """
    # Normalize baseline_analyzers to always be a list
    if not isinstance(baseline_analyzers, list):
        baseline_analyzers = [baseline_analyzers]
    
    if datasets is None:
        datasets = analyzer.list_datasets()
    
    print(f"\n{'='*80}")
    print(f"NORMALIZED RESOURCE HOGGING DETECTION (Baseline-Aware)")
    print(f"{'='*80}")
    print(f"This analysis accounts for different baseline performance of different models.")
    print(f"Column: {column}")
    print(f"Hogging threshold: {hogging_threshold*100:.1f}% difference in relative slowdown")
    print(f"Baseline analyzers: {len(baseline_analyzers)}")
    
    # Get baseline and multi-tenant stats
    model_stats = {}
    baseline_stats = {}
    
    for dataset in datasets:
        # Multi-tenant data
        df = analyzer.get_data(dataset)
        if df is not None and column in df.columns:
            model_stats[dataset] = {
                'mean': df[column].mean(),
                'std': df[column].std(),
            }
        
        # Baseline data (try to find matching baseline from any analyzer)
        baseline_df = None
        matched_analyzer_idx = None
        matched_baseline_name = None
        
        # Try exact match first across all analyzers
        for idx, baseline_analyzer in enumerate(baseline_analyzers):
            baseline_df = baseline_analyzer.get_data(dataset)
            if baseline_df is not None:
                matched_analyzer_idx = idx
                matched_baseline_name = dataset
                break
        
        # If no exact match, try substring match
        if baseline_df is None:
            for idx, baseline_analyzer in enumerate(baseline_analyzers):
                baseline_datasets = baseline_analyzer.list_datasets()
                for bd in baseline_datasets:
                    if dataset in bd or bd in dataset:
                        baseline_df = baseline_analyzer.get_data(bd)
                        matched_analyzer_idx = idx
                        matched_baseline_name = bd
                        break
                if baseline_df is not None:
                    break
        
        # If still no match, try model name extraction
        if baseline_df is None:
            dataset_model = _extract_model_name(dataset)
            for idx, baseline_analyzer in enumerate(baseline_analyzers):
                baseline_datasets = baseline_analyzer.list_datasets()
                for bd in baseline_datasets:
                    bd_model = _extract_model_name(bd)
                    if dataset_model == bd_model:
                        baseline_df = baseline_analyzer.get_data(bd)
                        matched_analyzer_idx = idx
                        matched_baseline_name = bd
                        print(f"   Matched '{dataset}' -> '{bd}' via model name '{dataset_model}'")
                        break
                if baseline_df is not None:
                    break
        
        if baseline_df is not None and column in baseline_df.columns:
            baseline_stats[dataset] = {
                'mean': baseline_df[column].mean(),
                'std': baseline_df[column].std(),
                'analyzer_idx': matched_analyzer_idx,
                'baseline_name': matched_baseline_name
            }
    
    # Check if we have baseline data
    missing_baseline = [d for d in datasets if d not in baseline_stats]
    if missing_baseline:
        print(f"\n  WARNING: Missing baseline data for: {', '.join(missing_baseline)}")
        all_baseline_datasets = []
        for ba in baseline_analyzers:
            all_baseline_datasets.extend(ba.list_datasets())
        print(f"   Available baseline datasets: {all_baseline_datasets}")
        print(f"   Falling back to absolute comparison for models without baseline.\n")
    
    # Calculate relative slowdown for each model
    print(f"\n{'Model':<20s} {'Baseline':>10s} {'Multi':>10s} {'Slowdown':>10s} {'Status':<25s}")
    print(f"{'-'*80}")
    
    slowdowns = {}
    hogging_models = []
    starved_models = []
    fair_models = []
    
    for dataset in datasets:
        if dataset not in model_stats:
            continue
            
        multi_mean = model_stats[dataset]['mean']
        
        if dataset in baseline_stats:
            baseline_mean = baseline_stats[dataset]['mean']
            
            if column == 'latency_ms':
                # Slowdown = (multi_latency - baseline_latency) / baseline_latency
                # Positive = slower (worse), Negative = faster (better, suspicious!)
                slowdown = (multi_mean - baseline_mean) / baseline_mean
            elif column == 'throughput':
                # Slowdown = (baseline_throughput - multi_throughput) / baseline_throughput
                # Positive = worse throughput, Negative = better (suspicious!)
                slowdown = (baseline_mean - multi_mean) / baseline_mean
            
            slowdowns[dataset] = slowdown
        else:
            # No baseline - can't calculate relative slowdown
            slowdowns[dataset] = None
            baseline_mean = None
    
    # Calculate average slowdown (only for models with baseline)
    valid_slowdowns = [s for s in slowdowns.values() if s is not None]
    if len(valid_slowdowns) == 0:
        print("ERROR: No baseline data available for any models!")
        return None
    
    avg_slowdown = np.mean(valid_slowdowns)
    
    # Detect hogging based on relative slowdown differences
    for dataset in datasets:
        if dataset not in model_stats:
            continue
        
        multi_mean = model_stats[dataset]['mean']
        baseline_mean = baseline_stats.get(dataset, {}).get('mean', None)
        slowdown = slowdowns[dataset]
        
        if slowdown is None:
            status = "No baseline data"
            print(f"{dataset:<20s} {'N/A':>10s} {multi_mean:10.2f} {'N/A':>10s} {status:<25s}")
            continue
        
        # Compare this model's slowdown to the average slowdown
        slowdown_diff = slowdown - avg_slowdown
        
        # Determine status
        if abs(slowdown_diff) > hogging_threshold:
            if slowdown_diff < 0:
                # This model slowed down LESS than average = getting priority
                status = f"HOGGING ({slowdown_diff*100:+.1f}%)"
                hogging_models.append((dataset, slowdown_diff * 100, slowdown * 100))
            else:
                # This model slowed down MORE than average = being starved
                status = f"STARVED ({slowdown_diff*100:+.1f}%)"
                starved_models.append((dataset, slowdown_diff * 100, slowdown * 100))
        else:
            status = f"✓ Fair ({slowdown_diff*100:+.1f}%)"
            fair_models.append(dataset)
        
        print(f"{dataset:<20s} {baseline_mean:10.2f} {multi_mean:10.2f} {slowdown*100:+9.1f}% {status:<25s}")
    
    print(f"\n{'='*80}")
    print(f"ANALYSIS:")
    print(f"  Average slowdown: {avg_slowdown*100:+.1f}%")
    print(f"  (This is expected overhead from multi-tenancy)")
    print(f"\n  Models with FAIR resource allocation: {len(fair_models)}")
    
    if hogging_models:
        print(f"\n  RESOURCE HOGGING DETECTED: {len(hogging_models)} model(s)")
        for model, diff, total_slowdown in hogging_models:
            print(f"     • {model}:")
            print(f"       - Slowdown: {total_slowdown:+.1f}% (vs avg {avg_slowdown*100:+.1f}%)")
            print(f"       - Difference: {diff:+.1f}% LESS slowdown than others")
            print(f"       → Getting PRIORITY access to GPU resources")
    
    if starved_models:
        print(f"\n RESOURCE STARVATION DETECTED: {len(starved_models)} model(s)")
        for model, diff, total_slowdown in starved_models:
            print(f"     • {model}:")
            print(f"       - Slowdown: {total_slowdown:+.1f}% (vs avg {avg_slowdown*100:+.1f}%)")
            print(f"       - Difference: {diff:+.1f}% MORE slowdown than others")
            print(f"       → Being STARVED of GPU resources")
    
    if not hogging_models and not starved_models:
        print(f"\n  NO HOGGING DETECTED - All models experience similar slowdown!")
        print(f"    Resources are shared fairly across different model types.")
    
    print(f"{'='*80}\n")
    
    # Visualization
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    
    # 1. Baseline vs Multi-tenant comparison
    model_names = [d for d in datasets if d in model_stats and d in baseline_stats]
    baseline_means = [baseline_stats[m]['mean'] for m in model_names]
    multi_means = [model_stats[m]['mean'] for m in model_names]
    
    x = np.arange(len(model_names))
    width = 0.35
    
    axes[0].bar(x - width/2, baseline_means, width, label='Baseline (solo)', alpha=0.7, color='green')
    axes[0].bar(x + width/2, multi_means, width, label='Multi-tenant', alpha=0.7, color='blue')
    axes[0].set_xticks(x)
    axes[0].set_xticklabels(model_names, rotation=45, ha='right')
    axes[0].set_ylabel(column)
    axes[0].set_title('Baseline vs Multi-tenant Performance')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3, axis='y')
    
    # 2. Slowdown comparison
    slowdown_values = [slowdowns[m] * 100 for m in model_names]
    colors = []
    for m in model_names:
        slowdown_diff = slowdowns[m] - avg_slowdown
        if abs(slowdown_diff) > hogging_threshold:
            colors.append('red' if slowdown_diff < 0 else 'orange')
        else:
            colors.append('green')
    
    axes[1].bar(range(len(model_names)), slowdown_values, color=colors, alpha=0.7)
    axes[1].axhline(avg_slowdown * 100, color='blue', linestyle='--', linewidth=2, label=f'Avg slowdown ({avg_slowdown*100:.1f}%)')
    axes[1].set_xticks(range(len(model_names)))
    axes[1].set_xticklabels(model_names, rotation=45, ha='right')
    axes[1].set_ylabel('Slowdown from Baseline (%)')
    axes[1].set_title('Relative Slowdown per Model')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3, axis='y')
    
    # 3. Deviation from average slowdown
    deviations = [(slowdowns[m] - avg_slowdown) * 100 for m in model_names]
    axes[2].bar(range(len(model_names)), deviations, color=colors, alpha=0.7)
    axes[2].axhline(0, color='blue', linestyle='-', linewidth=1)
    axes[2].axhline(hogging_threshold * 100, color='red', linestyle='--', alpha=0.5, label='Hogging threshold')
    axes[2].axhline(-hogging_threshold * 100, color='red', linestyle='--', alpha=0.5)
    axes[2].set_xticks(range(len(model_names)))
    axes[2].set_xticklabels(model_names, rotation=45, ha='right')
    axes[2].set_ylabel('Deviation from Avg Slowdown (%)')
    axes[2].set_title('Hogging Detection (Normalized)')
    axes[2].legend()
    axes[2].grid(True, alpha=0.3, axis='y')
    
    plt.tight_layout()
    plt.show()
    
    return {
        'hogging_models': hogging_models,
        'starved_models': starved_models,
        'fair_models': fair_models,
        'average_slowdown': avg_slowdown,
        'slowdowns': slowdowns,
        'model_stats': model_stats,
        'baseline_stats': baseline_stats
    }


# Announce that the cell finished defining the detector and show two example
# invocations. Every entry below is emitted on its own output line, in order
# (sep="\n" between entries, plus print's default trailing newline).
print(
    "Normalized (baseline-aware) resource hogging detection function loaded!",
    "\nUsage:",
    # Example 1: a single baseline analyzer.
    "  # Compare multi-tenant to baseline performance (single baseline analyzer)",
    "  hogging = detect_resource_hogging_normalized(",
    "      analyzer=triple_analyzer,           # Multi-tenant data",
    "      baseline_analyzers=single_analyzer, # Single baseline analyzer",
    "      column='latency_ms',",
    "      hogging_threshold=0.15",
    "  )",
    # Blank separator line between the two examples.
    "",
    # Example 2: a list of baseline analyzers.
    "  # Compare multi-tenant to multiple baseline analyzers",
    "  hogging = detect_resource_hogging_normalized(",
    "      analyzer=triple_analyzer,",
    "      baseline_analyzers=[baseline1, baseline2, baseline3],  # List of baseline analyzers",
    "      column='latency_ms',",
    "      hogging_threshold=0.15",
    "  )",
    sep="\n",
)