# ChunkedDecomp Exploration and Analysis

This notebook provides interactive exploration of the ChunkedDecomp compression system.

## Contents
1. Setup and Imports
2. Basic SVD Compression Exploration
3. Chunked Compression Analysis
4. Model Compression Examples
5. Memory Usage Analysis
6. Performance Evaluation
7. Visualization and Results Summary

## 1. Setup and Imports

In [None]:
import sys
import os
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

from models.chunked_decomp import ChunkedDecomp
from models.kv_cache import ChunkedKVCache
from models.compressed_model import CompressedModelWrapper
from utils.svd_utils import SVDCompressor
from utils.memory_utils import MemoryTracker
from utils.data_utils import DatasetManager
from evaluation.performance_evaluator import PerformanceEvaluator
from evaluation.memory_profiler import MemoryProfiler

# Reproducibility: seed the RNGs this notebook draws from so that
# Restart Kernel -> Run All regenerates the same matrices, weights and samples.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Setup plotting
# The 'seaborn-v0_8' style name only exists in matplotlib >= 3.6; older releases
# ship the same style as 'seaborn'. Fall back instead of failing in the first cell.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

# Device setup
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

if device == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

## 2. Basic SVD Compression Exploration

In [None]:
# SVD compressor bound to the active device
compressor = SVDCompressor(device=device)

# Random test matrices, keyed by a size label. Shapes are listed in the order
# the tensors are drawn so the RNG is consumed in the same sequence.
matrix_shapes = {
    'small': (64, 128),
    'medium': (256, 512),
    'large': (512, 1024),
    'square': (512, 512),
}
matrices = {
    label: torch.randn(*shape, device=device)
    for label, shape in matrix_shapes.items()
}

print("Matrix shapes:")
print("\n".join(f"{label}: {tensor.shape}" for label, tensor in matrices.items()))

In [None]:
# Sweep the compression ratio on one matrix and collect, per setting, the
# rank, the relative error and the statistics reported by the compressor.
compression_ratios = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
matrix_name = 'medium'
test_matrix = matrices[matrix_name]

results = []

for target_ratio in tqdm(compression_ratios, desc="Testing compression ratios"):
    packed = compressor.compress_matrix(
        test_matrix, compression_ratio=target_ratio, collect_stats=True
    )

    # Round-trip through decompression; the reconstruction itself is not used
    # below, the error figures come from analyze_compression_error().
    decompressed_matrix = compressor.decompress_matrix(packed)

    error_report = compressor.analyze_compression_error(test_matrix, packed)
    stats = packed['compression_stats']

    row = {
        'compression_ratio': target_ratio,
        'rank': packed['rank'],
        'relative_error': error_report['relative_error'],
        'memory_reduction_mb': stats['memory_reduction_mb'],
        'compression_time_ms': stats['compression_time_ms'],
    }
    results.append(row)

# One row per tested ratio
results_df = pd.DataFrame(results)
print("Compression analysis complete!")
results_df.head()

In [None]:
# Visualize compression trade-offs on a 2x2 grid, one panel per relationship
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle(f'SVD Compression Analysis - {matrix_name} matrix ({test_matrix.shape})', fontsize=16)

# (grid position, x column, y column, x label, y label, title, line colour)
# A colour of None leaves the line on matplotlib's default colour cycle.
panel_specs = [
    ((0, 0), 'compression_ratio', 'relative_error',
     'Compression Ratio', 'Relative Error', 'Compression vs Accuracy Trade-off', None),
    ((0, 1), 'compression_ratio', 'memory_reduction_mb',
     'Compression Ratio', 'Memory Reduction (MB)', 'Memory Savings', 'green'),
    ((1, 0), 'rank', 'relative_error',
     'Rank', 'Relative Error', 'Rank vs Accuracy', 'red'),
    ((1, 1), 'compression_ratio', 'compression_time_ms',
     'Compression Ratio', 'Compression Time (ms)', 'Compression Speed', 'purple'),
]

for position, x_col, y_col, x_label, y_label, panel_title, line_color in panel_specs:
    panel = axes[position]
    line_style = {'linewidth': 2, 'markersize': 6}
    if line_color is not None:
        line_style['color'] = line_color
    panel.plot(results_df[x_col], results_df[y_col], 'o-', **line_style)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Chunked Compression Analysis

In [None]:
# Compare chunked vs non-chunked compression
large_matrix = torch.randn(1024, 2048, device=device)
chunk_sizes = [32, 64, 128, 256, 512]
compression_ratio = 0.5


def _timed_compress(matrix, **compress_kwargs):
    """Compress `matrix` and return `(compressed_data, elapsed_ms)`.

    On CUDA the elapsed time is measured with a pair of CUDA events around the
    `compress_matrix` call; on CPU the compressor's own
    `compression_stats['compression_time_ms']` value is used instead.
    """
    on_gpu = device == 'cuda'
    if on_gpu:
        start_evt = torch.cuda.Event(enable_timing=True)
        end_evt = torch.cuda.Event(enable_timing=True)
        start_evt.record()

    data = compressor.compress_matrix(matrix, collect_stats=True, **compress_kwargs)

    if on_gpu:
        end_evt.record()
        # elapsed_time() is only valid once both events have completed
        torch.cuda.synchronize()
        return data, start_evt.elapsed_time(end_evt)
    return data, data['compression_stats']['compression_time_ms']


def _relative_error(reference, approximation):
    """Return ||reference - approximation|| / ||reference|| as a Python float.

    Converting with `.item()` keeps downstream consumers (DataFrame columns,
    matplotlib) free of 0-dim tensors, which cannot be plotted while on CUDA.
    """
    return (torch.norm(reference - approximation) / torch.norm(reference)).item()


chunked_results = []

# Non-chunked compression (baseline)
print("Testing non-chunked compression...")
non_chunked_data, non_chunked_time = _timed_compress(
    large_matrix, compression_ratio=compression_ratio
)
non_chunked_decompressed = compressor.decompress_matrix(non_chunked_data)
non_chunked_error = _relative_error(large_matrix, non_chunked_decompressed)

print(f"Non-chunked: {non_chunked_time:.2f}ms, error: {non_chunked_error:.4f}")

# Chunked compression, one run per chunk size
for chunk_size in tqdm(chunk_sizes, desc="Testing chunk sizes"):
    chunked_data, chunked_time = _timed_compress(
        large_matrix, compression_ratio=compression_ratio, chunk_size=chunk_size
    )
    chunked_decompressed = compressor.decompress_matrix(chunked_data)

    chunked_results.append({
        'chunk_size': chunk_size,
        'compression_time_ms': chunked_time,
        'relative_error': _relative_error(large_matrix, chunked_decompressed),
        # Falls back to 1 when the compressor does not report a chunk count
        'num_chunks': chunked_data.get('num_chunks', 1)
    })

chunked_df = pd.DataFrame(chunked_results)
print("\nChunked compression results:")
print(chunked_df)

In [None]:
# Visualize chunked vs non-chunked comparison
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# The baselines may arrive as 0-dim torch tensors (possibly on the GPU), which
# matplotlib cannot convert to numpy. Coerce to plain floats before plotting.
baseline_time_ms = float(non_chunked_time)
baseline_error = float(non_chunked_error)

# Compression time comparison
axes[0].axhline(y=baseline_time_ms, color='red', linestyle='--', label=f'Non-chunked ({baseline_time_ms:.1f}ms)')
axes[0].plot(chunked_df['chunk_size'], chunked_df['compression_time_ms'], 'o-', linewidth=2, markersize=8, label='Chunked')
axes[0].set_xlabel('Chunk Size')
axes[0].set_ylabel('Compression Time (ms)')
axes[0].set_title('Compression Speed Comparison')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Error comparison
axes[1].axhline(y=baseline_error, color='red', linestyle='--', label=f'Non-chunked ({baseline_error:.4f})')
axes[1].plot(chunked_df['chunk_size'], chunked_df['relative_error'], 'o-', linewidth=2, markersize=8, label='Chunked')
axes[1].set_xlabel('Chunk Size')
axes[1].set_ylabel('Relative Error')
axes[1].set_title('Accuracy Comparison')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# Number of chunks
axes[2].plot(chunked_df['chunk_size'], chunked_df['num_chunks'], 'o-', linewidth=2, markersize=8, color='green')
axes[2].set_xlabel('Chunk Size')
axes[2].set_ylabel('Number of Chunks')
axes[2].set_title('Chunking Strategy')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Model Compression Examples

In [None]:
# Create a simple transformer-like model
class SimpleTransformer(nn.Module):
    """Small encoder-only transformer used as a compression test subject.

    Token and learned position embeddings are summed, passed through a stack
    of `nn.TransformerEncoderLayer` blocks (batch-first) and projected to
    vocabulary logits.

    Args:
        vocab_size: Number of token ids; also the size of the output logits.
        hidden_size: Embedding / model width.
        num_layers: Number of encoder layers.
        num_heads: Attention heads per encoder layer.
        max_seq_len: Number of learned positions, i.e. the longest sequence
            the model accepts. Defaults to 512.
    """

    def __init__(self, vocab_size=1000, hidden_size=256, num_layers=4, num_heads=8,
                 max_seq_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.pos_embedding = nn.Embedding(max_seq_len, hidden_size)

        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=hidden_size,
                nhead=num_heads,
                dim_feedforward=hidden_size * 4,
                batch_first=True
            ) for _ in range(num_layers)
        ])

        self.output_projection = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, seq_len, vocab_size).

        Args:
            x: Integer token ids of shape (batch, seq_len).

        Raises:
            IndexError: If seq_len exceeds the number of learned positions.
        """
        seq_len = x.size(1)
        max_positions = self.pos_embedding.num_embeddings
        # Fail early with a readable message: an out-of-range position id would
        # otherwise surface as an opaque embedding index error.
        if seq_len > max_positions:
            raise IndexError(
                f"Sequence length {seq_len} exceeds the maximum of "
                f"{max_positions} learned positions"
            )

        pos_ids = torch.arange(seq_len, device=x.device).unsqueeze(0).expand_as(x)

        x = self.embedding(x) + self.pos_embedding(pos_ids)

        for layer in self.layers:
            x = layer(x)

        return self.output_projection(x)

# Create model
model = SimpleTransformer().to(device)
# Switch to inference mode before taking the reference output: the encoder
# layers apply dropout while in training mode, which would make
# `original_output` random and inflate every later "output difference"
# measured against compressed models that are run in eval mode.
model.eval()
print(f"Model created with {sum(p.numel() for p in model.parameters()):,} parameters")

# Test input: a fixed batch of 2 sequences of 32 random token ids
test_input = torch.randint(0, 1000, (2, 32), device=device)
with torch.no_grad():
    original_output = model(test_input)

print(f"Original output shape: {original_output.shape}")

In [None]:
# Apply several compression configurations to copies of the reference model
# and record how much memory each saves and how far its output drifts.
compression_configs = [
    {'compression_ratio': 0.3, 'chunk_size': 32, 'adaptive_rank': False},
    {'compression_ratio': 0.5, 'chunk_size': 64, 'adaptive_rank': False},
    {'compression_ratio': 0.7, 'chunk_size': 64, 'adaptive_rank': False},
    {'compression_ratio': 0.5, 'chunk_size': 64, 'adaptive_rank': True, 'error_threshold': 0.1}
]

compression_results = []

for cfg_idx, cfg in enumerate(compression_configs):
    print(f"\nTesting configuration {cfg_idx + 1}: {cfg}")

    # Fresh model carrying the reference weights
    model_copy = SimpleTransformer().to(device)
    model_copy.load_state_dict(model.state_dict())

    # Full ChunkedDecomp configuration: per-run compression settings plus
    # fixed KV-cache settings
    full_config = {
        'compression': cfg,
        'kv_cache': {
            'max_cache_size': 1000000,
            'compression_threshold': 0.8
        }
    }

    # Compress while tracking memory
    with MemoryTracker(device=device) as tracker:
        chunked_decomp = ChunkedDecomp(model=model_copy, config=full_config)
        compression_stats = chunked_decomp.apply_compression()

    memory_stats = tracker.get_stats()

    # Run the compressed copy on the same probe batch as the reference model
    model_copy.eval()
    with torch.no_grad():
        compressed_output = model_copy(test_input)

    # Relative drift of the logits with respect to the reference output
    drift = torch.norm(original_output - compressed_output)
    output_diff = drift / torch.norm(original_output)

    layers_done = compression_stats['layers_compressed']
    saved_mb = compression_stats['memory_reduction_mb']

    compression_results.append({
        'config_index': cfg_idx,
        'compression_ratio': cfg['compression_ratio'],
        'chunk_size': cfg['chunk_size'],
        'adaptive_rank': cfg.get('adaptive_rank', False),
        'layers_compressed': layers_done,
        'memory_reduction_mb': saved_mb,
        'compression_time_s': compression_stats['compression_time_seconds'],
        'output_difference': output_diff.item(),
        'peak_memory_mb': memory_stats['peak_memory_mb']
    })

    print(f"  Layers compressed: {layers_done}")
    print(f"  Memory reduction: {saved_mb:.2f} MB")
    print(f"  Output difference: {output_diff:.4f}")

compression_df = pd.DataFrame(compression_results)
print("\nCompression comparison:")
summary_columns = ['compression_ratio', 'chunk_size', 'adaptive_rank',
                   'memory_reduction_mb', 'output_difference', 'compression_time_s']
print(compression_df[summary_columns])

In [None]:
# Visualize model compression results on a 2x2 grid
from matplotlib.patches import Patch

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Model Compression Analysis', fontsize=16)

ax_memory, ax_accuracy = axes[0]
ax_speed, ax_tradeoff = axes[1]

ratio_col = compression_df['compression_ratio']
saved_mb_col = compression_df['memory_reduction_mb']
diff_col = compression_df['output_difference']

# Top-left: memory reduction vs compression ratio, coloured by chunk size
memory_points = ax_memory.scatter(ratio_col, saved_mb_col,
                                  c=compression_df['chunk_size'], cmap='viridis', s=100, alpha=0.7)
ax_memory.set_xlabel('Compression Ratio')
ax_memory.set_ylabel('Memory Reduction (MB)')
ax_memory.set_title('Memory Savings vs Compression Ratio')
plt.colorbar(memory_points, ax=ax_memory, label='Chunk Size')

# Top-right: output difference vs compression ratio; red marks adaptive-rank
# runs and blue marks fixed-rank runs
point_colors = []
for is_adaptive in compression_df['adaptive_rank']:
    point_colors.append('red' if is_adaptive else 'blue')
ax_accuracy.scatter(ratio_col, diff_col, c=point_colors, s=100, alpha=0.7)
ax_accuracy.set_xlabel('Compression Ratio')
ax_accuracy.set_ylabel('Output Difference')
ax_accuracy.set_title('Accuracy vs Compression Ratio')
ax_accuracy.legend(handles=[
    Patch(facecolor='blue', label='Fixed Rank'),
    Patch(facecolor='red', label='Adaptive Rank'),
])

# Bottom-left: compression time vs memory reduction
ax_speed.scatter(saved_mb_col, compression_df['compression_time_s'],
                 s=100, alpha=0.7, color='green')
ax_speed.set_xlabel('Memory Reduction (MB)')
ax_speed.set_ylabel('Compression Time (s)')
ax_speed.set_title('Compression Speed vs Memory Savings')

# Bottom-right: memory reduction vs output difference, coloured by ratio
tradeoff_points = ax_tradeoff.scatter(saved_mb_col, diff_col,
                                      c=ratio_col, cmap='plasma', s=100, alpha=0.7)
ax_tradeoff.set_xlabel('Memory Reduction (MB)')
ax_tradeoff.set_ylabel('Output Difference')
ax_tradeoff.set_title('Accuracy vs Memory Trade-off')
plt.colorbar(tradeoff_points, ax=ax_tradeoff, label='Compression Ratio')

plt.tight_layout()
plt.show()

## 5. Memory Usage Analysis

In [None]:
# Analyze memory usage with different batch sizes and sequence lengths
if device == 'cuda':
    batch_sizes = [1, 2, 4, 8]
    sequence_lengths = [32, 64, 128, 256]

    memory_analysis = []

    # Uncompressed reference model
    original_model = SimpleTransformer().to(device)

    # Compressed model: reuse the configuration with the largest memory
    # reduction from the model compression sweep
    best_config_idx = compression_df['memory_reduction_mb'].idxmax()
    best_config = compression_configs[best_config_idx]

    compressed_model = SimpleTransformer().to(device)
    compressed_model.load_state_dict(original_model.state_dict())

    full_config = {
        'compression': best_config,
        'kv_cache': {'max_cache_size': 1000000, 'compression_threshold': 0.8}
    }

    # NOTE(review): apply_compression() presumably modifies `compressed_model`
    # in place, since the model is used below without reassignment - confirm.
    chunked_decomp = ChunkedDecomp(model=compressed_model, config=full_config)
    chunked_decomp.apply_compression()

    print(f"Using best compression config: {best_config}")

    # Inference mode is loop-invariant, so set it once up front
    original_model.eval()
    compressed_model.eval()

    def _peak_inference_memory_mb(net, inputs):
        """Run one no-grad forward pass and return the tracker's peak memory in MB."""
        torch.cuda.empty_cache()
        with MemoryTracker(device=device) as tracker:
            with torch.no_grad():
                _ = net(inputs)
        return tracker.get_stats()['peak_memory_mb']

    for batch_size in tqdm(batch_sizes, desc="Batch sizes"):
        for seq_len in sequence_lengths:
            # Deliberately NOT named `test_input`: that global is the probe
            # batch `original_output` was computed on, and overwriting it
            # would break re-running the model compression cells.
            memory_input = torch.randint(0, 1000, (batch_size, seq_len), device=device)

            original_memory = _peak_inference_memory_mb(original_model, memory_input)
            compressed_memory = _peak_inference_memory_mb(compressed_model, memory_input)

            # Guard against a zero baseline, which would raise a
            # ZeroDivisionError (or yield inf) in the percentage.
            if original_memory > 0:
                memory_reduction = (original_memory - compressed_memory) / original_memory * 100
            else:
                memory_reduction = float('nan')

            memory_analysis.append({
                'batch_size': batch_size,
                'sequence_length': seq_len,
                'original_memory_mb': original_memory,
                'compressed_memory_mb': compressed_memory,
                'memory_reduction_percent': memory_reduction
            })

    memory_df = pd.DataFrame(memory_analysis)
    print("\nMemory analysis complete!")
    print(memory_df.head(10))
else:
    print("Memory analysis skipped (requires CUDA)")

In [None]:
# Visualize memory usage analysis (only when the CUDA-only analysis has run)
if device == 'cuda' and 'memory_df' in locals():
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # One heatmap per metric: (memory_df column, colour map, panel title).
    # Rows are sequence lengths, columns are batch sizes.
    heatmap_specs = [
        ('original_memory_mb', 'Reds', 'Original Model Memory (MB)'),
        ('compressed_memory_mb', 'Blues', 'Compressed Model Memory (MB)'),
        ('memory_reduction_percent', 'Greens', 'Memory Reduction (%)'),
    ]
    for panel, (value_col, cmap_name, panel_title) in zip(axes, heatmap_specs):
        grid = memory_df.pivot(index='sequence_length', columns='batch_size', values=value_col)
        sns.heatmap(grid, annot=True, fmt='.1f', cmap=cmap_name, ax=panel)
        panel.set_title(panel_title)

    plt.tight_layout()
    plt.show()

    # Summary statistics over all (batch size, sequence length) combinations
    reduction_pct = memory_df['memory_reduction_percent']
    print("\nMemory Analysis Summary:")
    print(f"Average memory reduction: {reduction_pct.mean():.1f}%")
    print(f"Best memory reduction: {reduction_pct.max():.1f}%")
    print(f"Worst memory reduction: {reduction_pct.min():.1f}%")

## 6. Performance Evaluation

In [None]:
# Performance evaluator bound to the active device
evaluator = PerformanceEvaluator(device=device)

# Dataset manager (instantiated here; not used by the synthetic data below)
data_manager = DatasetManager()

# Synthetic evaluation set: 100 samples, each a 1-D tensor of random token ids
# in [0, 1000) whose length is drawn from [10, 64)
test_data = []
for _ in range(100):
    seq_len = torch.randint(low=10, high=64, size=(1,)).item()
    test_data.append({'input_ids': torch.randint(low=0, high=1000, size=(seq_len,))})

# Create dataloader
from torch.utils.data import DataLoader, Dataset

class SimpleDataset(Dataset):
    """Map-style dataset that serves samples from an in-memory sequence."""

    def __init__(self, data):
        """Keep a reference to `data`; samples are returned as stored."""
        self.data = data

    def __len__(self):
        """Number of stored samples."""
        samples = self.data
        return len(samples)

    def __getitem__(self, idx):
        """Return the sample stored at position `idx`."""
        samples = self.data
        return samples[idx]

def collate_fn(batch):
    """Right-pad each sample's `input_ids` with zeros and stack them.

    Args:
        batch: List of dicts, each holding a 1-D `input_ids` tensor.

    Returns:
        Dict with a single `input_ids` tensor of shape
        (len(batch), longest sequence in the batch).
    """
    sequences = [sample['input_ids'] for sample in batch]
    target_len = max(len(seq) for seq in sequences)

    # Append just enough zero tokens to bring every sequence up to target_len
    padded_rows = [
        torch.cat([seq, torch.zeros(target_len - len(seq), dtype=torch.long)])
        for seq in sequences
    ]

    return {'input_ids': torch.stack(padded_rows)}

# Wrap the synthetic samples and batch them four at a time in their original
# order; collate_fn pads each batch to its longest sequence.
test_dataset = SimpleDataset(test_data)
test_loader = DataLoader(
    test_dataset,
    collate_fn=collate_fn,
    batch_size=4,
    shuffle=False,
)

print(f"Created test dataset with {len(test_data)} samples")

In [None]:
# Evaluate original vs compressed model performance
original_model = SimpleTransformer().to(device)

# Create compressed model starting from the same weights as the original
compressed_model = SimpleTransformer().to(device)
compressed_model.load_state_dict(original_model.state_dict())

best_config = {
    'compression': {'compression_ratio': 0.5, 'chunk_size': 64, 'adaptive_rank': True},
    'kv_cache': {'max_cache_size': 1000000, 'compression_threshold': 0.8}
}

chunked_decomp = ChunkedDecomp(model=compressed_model, config=best_config)
chunked_decomp.apply_compression()

# Put both models in inference mode explicitly, as the other inference cells
# in this notebook do. Freshly built modules start in training mode, where
# dropout is active; whether evaluate_model() switches modes is not visible
# from here, so do not rely on it.
original_model.eval()
compressed_model.eval()

print("Evaluating original model...")
original_results = evaluator.evaluate_model(
    model=original_model,
    test_loader=test_loader,
    max_samples=50
)

print("Evaluating compressed model...")
compressed_results = evaluator.evaluate_model(
    model=compressed_model,
    test_loader=test_loader,
    max_samples=50
)

print("\nPerformance Comparison:")
# Missing timing entries are reported as 0 rather than raising
print(f"Original - Avg Inference Time: {original_results.get('avg_inference_time_ms', 0):.2f}ms")
print(f"Compressed - Avg Inference Time: {compressed_results.get('avg_inference_time_ms', 0):.2f}ms")

# Loss is only compared when both result dicts contain it
if 'loss' in original_results and 'loss' in compressed_results:
    print(f"Original - Loss: {original_results['loss']:.4f}")
    print(f"Compressed - Loss: {compressed_results['loss']:.4f}")
    print(f"Loss difference: {abs(original_results['loss'] - compressed_results['loss']):.4f}")

## 7. Visualization and Results Summary

In [None]:
# Create comprehensive summary visualization: a 3x4 grid holding six panels,
# each spanning two columns
fig = plt.figure(figsize=(20, 12))
gs = fig.add_gridspec(3, 4, hspace=0.3, wspace=0.3)

# Which optional analyses have been run in this kernel
has_chunked = 'chunked_df' in locals()
has_memory = device == 'cuda' and 'memory_df' in locals()
has_original_eval = 'original_results' in locals()
has_both_evals = has_original_eval and 'compressed_results' in locals()

# Row 0, left: SVD compression trade-off
ax1 = fig.add_subplot(gs[0, 0:2])
ax1.plot(results_df['compression_ratio'], results_df['relative_error'], 'o-', linewidth=2, markersize=6)
ax1.set_xlabel('Compression Ratio')
ax1.set_ylabel('Relative Error')
ax1.set_title('SVD Compression Trade-off')
ax1.grid(True, alpha=0.3)

# Row 0, right: memory reduction per model compression configuration
ax2 = fig.add_subplot(gs[0, 2:4])
config_positions = range(len(compression_df))
bars = ax2.bar(config_positions, compression_df['memory_reduction_mb'], alpha=0.7)
ax2.set_xlabel('Configuration')
ax2.set_ylabel('Memory Reduction (MB)')
ax2.set_title('Model Compression Results')
ax2.set_xticks(config_positions)
ax2.set_xticklabels([f'Config {pos+1}' for pos in config_positions])

# Row 1, left: chunked vs non-chunked compression time
ax3 = fig.add_subplot(gs[1, 0:2])
if has_chunked:
    ax3.plot(chunked_df['chunk_size'], chunked_df['compression_time_ms'], 'o-', linewidth=2, markersize=6, label='Chunked')
    ax3.axhline(y=non_chunked_time, color='red', linestyle='--', label='Non-chunked')
    ax3.set_xlabel('Chunk Size')
    ax3.set_ylabel('Compression Time (ms)')
    ax3.set_title('Chunked vs Non-chunked Performance')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

# Row 1, right: mean memory reduction per batch size, or a placeholder note
ax4 = fig.add_subplot(gs[1, 2:4])
if has_memory:
    memory_summary = memory_df.groupby('batch_size')['memory_reduction_percent'].mean()
    ax4.bar(memory_summary.index, memory_summary.values, alpha=0.7, color='green')
    ax4.set_xlabel('Batch Size')
    ax4.set_ylabel('Avg Memory Reduction (%)')
    ax4.set_title('Memory Reduction by Batch Size')
else:
    ax4.text(0.5, 0.5, 'Memory analysis\nnot available\n(requires CUDA)',
             transform=ax4.transAxes, ha='center', va='center', fontsize=12)
    ax4.set_title('Memory Analysis')

# Row 2, left: average inference time of original vs compressed model
ax5 = fig.add_subplot(gs[2, 0:2])
if has_both_evals:
    models = ['Original', 'Compressed']
    inference_times = [
        run.get('avg_inference_time_ms', 0)
        for run in (original_results, compressed_results)
    ]

    bars = ax5.bar(models, inference_times, alpha=0.7, color=['blue', 'orange'])
    ax5.set_ylabel('Avg Inference Time (ms)')
    ax5.set_title('Inference Speed Comparison')

    # Value label slightly above the top of each bar
    for bar_patch, elapsed_ms in zip(bars, inference_times):
        label_x = bar_patch.get_x() + bar_patch.get_width()/2
        label_y = bar_patch.get_height() + 0.1
        ax5.text(label_x, label_y, f'{elapsed_ms:.2f}ms', ha='center', va='bottom')

# Row 2, right: text-only summary panel
ax6 = fig.add_subplot(gs[2, 2:4])
ax6.axis('off')

# Assemble the summary from fragments; optional sections are appended only
# when the corresponding analysis is available
summary_parts = [
    "CHUNKEDDECOMP ANALYSIS SUMMARY\n\n",
    f"• Matrix compression tested on {len(matrices)} different sizes\n",
    f"• Best compression ratio tested: {results_df['compression_ratio'].max():.1f}\n",
    f"• Lowest relative error achieved: {results_df['relative_error'].min():.4f}\n\n",
]

if 'compression_df' in locals():
    summary_parts.extend([
        f"• Model compression configurations tested: {len(compression_df)}\n",
        f"• Best memory reduction: {compression_df['memory_reduction_mb'].max():.2f} MB\n",
        f"• Best output preservation: {compression_df['output_difference'].min():.4f}\n\n",
    ])

if has_memory:
    summary_parts.extend([
        f"• Memory analysis: {len(memory_df)} configurations tested\n",
        f"• Average memory reduction: {memory_df['memory_reduction_percent'].mean():.1f}%\n\n",
    ])

summary_parts.extend([
    f"• Device used: {device.upper()}\n",
    f"• Chunked compression: {'✓' if has_chunked else '✗'}\n",
    f"• Performance evaluation: {'✓' if has_original_eval else '✗'}",
])
summary_text = "".join(summary_parts)

ax6.text(0.05, 0.95, summary_text, transform=ax6.transAxes,
         fontsize=10, verticalalignment='top', fontfamily='monospace',
         bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))

plt.suptitle('ChunkedDecomp Comprehensive Analysis', fontsize=16, fontweight='bold')
plt.show()

In [None]:
# Final recommendations based on analysis
print("="*60)
print("CHUNKEDDECOMP RECOMMENDATIONS")
print("="*60)

# SVD recommendations: the sweep row with the lowest relative error
best_ratio_idx = results_df['relative_error'].idxmin()
best_ratio = results_df.loc[best_ratio_idx, 'compression_ratio']
best_error = results_df.loc[best_ratio_idx, 'relative_error']

print(f"\n1. SVD COMPRESSION:")
print(f"   • Best compression ratio for accuracy: {best_ratio:.1f} (error: {best_error:.4f})")
print(f"   • Memory-accuracy trade-off is most favorable around 0.5-0.7 range")

# Model compression recommendations: the configuration that saved most memory
if 'compression_df' in locals():
    best_model_idx = compression_df['memory_reduction_mb'].idxmax()
    best_model_config = compression_configs[best_model_idx]

    print(f"\n2. MODEL COMPRESSION:")
    print(f"   • Best configuration: {best_model_config}")
    print(f"   • Memory reduction: {compression_df.loc[best_model_idx, 'memory_reduction_mb']:.2f} MB")
    print(f"   • Output difference: {compression_df.loc[best_model_idx, 'output_difference']:.4f}")
    print(f"   • Adaptive rank helps maintain accuracy with aggressive compression")

# Chunking recommendations
if 'chunked_df' in locals():
    optimal_chunk_idx = chunked_df['compression_time_ms'].idxmin()
    optimal_chunk_size = chunked_df.loc[optimal_chunk_idx, 'chunk_size']
    fastest_chunked_ms = chunked_df['compression_time_ms'].min()

    print(f"\n3. CHUNKING STRATEGY:")
    print(f"   • Optimal chunk size for speed: {optimal_chunk_size}")
    # Report what was actually measured: chunking can be slower than the
    # non-chunked baseline, and a zero baseline cannot be turned into a percentage.
    if non_chunked_time > 0:
        time_change_pct = (non_chunked_time - fastest_chunked_ms) / non_chunked_time * 100
        if time_change_pct >= 0:
            print(f"   • Chunking provides {time_change_pct:.1f}% speedup")
            print(f"   • Trade-off: slight accuracy loss but significant speed improvement")
        else:
            print(f"   • Chunking is {-time_change_pct:.1f}% slower than non-chunked compression")
    else:
        print(f"   • Non-chunked baseline time is zero; relative speedup unavailable")

# Memory recommendations (only when the CUDA-only analysis has run)
if device == 'cuda' and 'memory_df' in locals():
    avg_reduction = memory_df['memory_reduction_percent'].mean()
    print(f"\n4. MEMORY EFFICIENCY:")
    print(f"   • Average memory reduction: {avg_reduction:.1f}%")
    print(f"   • Memory savings scale well with batch size and sequence length")
    print(f"   • Particularly effective for large models and long sequences")

# General recommendations
print(f"\n5. GENERAL RECOMMENDATIONS:")
print(f"   • Start with compression_ratio=0.5, chunk_size=64")
print(f"   • Enable adaptive_rank for better accuracy preservation")
print(f"   • Use chunking for large matrices (>1024x1024)")
print(f"   • Monitor output quality vs memory trade-offs for your use case")
print(f"   • Consider enabling KV cache for generation tasks")

print("\n" + "="*60)