# LLM Performance Benchmarking

> **Measure baseline performance before optimization - the foundation of any optimization project**

This notebook:

1. **Loads Multiple Models**: Test different sizes (small to large)
2. **Measures Key Metrics**: Latency, throughput, memory usage
3. **Identifies Bottlenecks**: CPU vs GPU vs Memory limitations
4. **Creates Baseline Report**: Professional performance analysis
5. **Guides Optimization**: Shows exactly what to optimize next

---

## Setup and Imports

In [None]:
# Colab authentication setup
# Detects Google Colab by attempting to import its package. No token is read
# in this cell: `userdata` is imported but not used here.
# NOTE(review): `os` is imported only on the Colab path, so code that runs
# outside Colab must not assume this cell brought `os` into scope.
try:
    import google.colab
    from google.colab import userdata
    import os
    
except ImportError:
    # google.colab could not be imported, so treat this as a local runtime.
    print("Local environment - no token setup needed")

# Essential imports for benchmarking
import torch
import time
import psutil
import json
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Plot defaults: stock matplotlib style on a 12x8 inch canvas.
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Report which device the benchmarks will run on, with GPU details if present.
print("Benchmarking tools loaded successfully!")
if torch.cuda.is_available():
    print("Device: GPU")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("Device: CPU")

## Model Selection for Benchmarking

**Strategy**: Test multiple model sizes to understand scaling behavior

In [None]:
# Checkpoints under test, ordered from smallest to largest.
# Larger variants (GPT-2 Large, 774M and GPT-2 XL, 1.5B) can be appended
# later for advanced testing.
BENCHMARK_MODELS = [
    dict(
        name='DistilGPT-2',
        model_id='distilgpt2',
        params='82M',
        description='Smallest - fast baseline',
    ),
    dict(
        name='GPT-2 Small',
        model_id='gpt2',
        params='124M',
        description='Small - good for development',
    ),
    dict(
        name='GPT-2 Medium',
        model_id='gpt2-medium',
        params='355M',
        description='Medium - realistic workload',
    ),
]

# Prompts in increasing length: short, medium, long.
TEST_PROMPTS = [
    "AI optimization is",
    "The future of artificial intelligence and machine learning technologies will",
    "In the rapidly evolving field of artificial intelligence, researchers are constantly developing new optimization techniques that",
]

# Sweep parameters for the benchmark runs.
BENCHMARK_CONFIG = dict(
    max_new_tokens=[10, 50, 100],  # generation lengths to sweep
    batch_sizes=[1, 4],            # single-request vs. batched inference
    num_runs=5,                    # timed repetitions per scenario
    warmup_runs=2,                 # untimed runs before measuring
)

# Echo the configuration so the run log records what was tested.
print(
    "Benchmark Configuration:",
    f"   Models to test: {len(BENCHMARK_MODELS)}",
    f"   Test prompts: {len(TEST_PROMPTS)}",
    f"   Generation lengths: {BENCHMARK_CONFIG['max_new_tokens']}",
    f"   Batch sizes: {BENCHMARK_CONFIG['batch_sizes']}",
    f"   Runs per test: {BENCHMARK_CONFIG['num_runs']}",
    sep="\n",
)

## Benchmarking Functions

In [None]:
def measure_memory_usage():
    """Snapshot host RAM usage and, when CUDA is available, GPU memory usage.

    Returns:
        dict: Always contains ``cpu_percent`` and ``cpu_used_gb`` (taken from
        ``psutil.virtual_memory()``, in units of 1e9 bytes). When CUDA is
        available it also contains ``gpu_allocated_gb``, ``gpu_reserved_gb``,
        ``gpu_total_gb`` and ``gpu_utilization_percent`` for the current CUDA
        device. ``gpu_utilization_percent`` is allocated memory as a share of
        total device memory, not compute utilization.
    """
    # Take one snapshot so the percentage and the byte count describe the
    # same instant instead of two slightly different ones.
    host_memory = psutil.virtual_memory()
    memory_info = {
        'cpu_percent': host_memory.percent,
        'cpu_used_gb': host_memory.used / 1e9,
    }

    if torch.cuda.is_available():
        # Query every GPU figure from the same (current) device; mixing the
        # current device's allocation with device 0's capacity would skew the
        # percentage on multi-GPU hosts.
        device_index = torch.cuda.current_device()
        allocated_gb = torch.cuda.memory_allocated(device_index) / 1e9
        reserved_gb = torch.cuda.memory_reserved(device_index) / 1e9
        total_gb = torch.cuda.get_device_properties(device_index).total_memory / 1e9

        memory_info.update({
            'gpu_allocated_gb': allocated_gb,
            'gpu_reserved_gb': reserved_gb,
            'gpu_total_gb': total_gb,
            'gpu_utilization_percent': allocated_gb / total_gb * 100,
        })

    return memory_info

def benchmark_model_loading(model_id):
    """Load a causal LM and its tokenizer, timing the load and recording memory.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        tuple: ``(model, tokenizer, loading_stats)``. ``model`` has been moved
        to CUDA when available, otherwise CPU. ``tokenizer`` has its pad token
        set to the EOS token if it had none. ``loading_stats`` is a dict with
        ``loading_time_sec``, ``param_count``, ``model_size_mb``,
        ``memory_before``, ``memory_after`` and ``device``.
    """
    print(f"Loading {model_id}...")

    use_cuda = torch.cuda.is_available()

    # Release cached GPU blocks so the "before" snapshot is not inflated
    if use_cuda:
        torch.cuda.empty_cache()

    memory_before = measure_memory_usage()

    # perf_counter is monotonic and high resolution; time.time() can jump
    # when the system clock is adjusted, which corrupts short measurements.
    start_time = time.perf_counter()

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)

    device = torch.device("cuda" if use_cuda else "cpu")
    model = model.to(device)

    if use_cuda:
        # Wait for any pending device work so the timer and the "after"
        # snapshot both describe a fully loaded model.
        torch.cuda.synchronize()

    loading_time = time.perf_counter() - start_time

    memory_after = measure_memory_usage()

    # Batched tokenization needs a pad token; fall back to EOS when missing
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Parameter count and in-memory weight size, gathered in a single pass
    param_count = 0
    model_size_bytes = 0
    for param in model.parameters():
        element_count = param.numel()
        param_count += element_count
        model_size_bytes += element_count * param.element_size()
    model_size_mb = model_size_bytes / 1e6

    loading_stats = {
        'loading_time_sec': loading_time,
        'param_count': param_count,
        'model_size_mb': model_size_mb,
        'memory_before': memory_before,
        'memory_after': memory_after,
        'device': str(device)
    }

    print(f"   Loaded in {loading_time:.2f}s, {param_count:,} parameters, {model_size_mb:.1f}MB")

    return model, tokenizer, loading_stats

def benchmark_inference(model, tokenizer, prompt, max_new_tokens=50, batch_size=1, num_runs=5,
                        warmup_runs=2):
    """Time greedy text generation and summarize latency, throughput and memory.

    Args:
        model: Model exposing ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer used to encode ``prompt`` and decode the output.
        prompt: Prompt text; it is repeated ``batch_size`` times for batches.
        max_new_tokens: Generation length passed to ``generate``.
        batch_size: Number of identical prompts generated per call.
        num_runs: Number of timed runs; must be at least 1.
        warmup_runs: Number of untimed runs executed before measuring.

    Returns:
        dict: ``prompt``, ``max_new_tokens``, ``batch_size``, latency
        statistics (``mean``/``std``/``min``/``max`` ``_latency_sec``),
        ``tokens_generated``, ``tokens_per_second``, ``sample_output`` and
        ``memory_stats`` (one before/after snapshot pair per timed run).

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    if num_runs < 1:
        # Without a timed run there is no output to count tokens from
        raise ValueError("num_runs must be at least 1")

    device = next(model.parameters()).device
    on_cuda = device.type == "cuda"

    # Prepare inputs
    if batch_size == 1:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
    else:
        # Create batch by repeating prompt
        prompts = [prompt] * batch_size
        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(device)

    # Pass the attention mask explicitly: when the pad token equals EOS,
    # generate() cannot reliably infer which positions are padding.
    generate_kwargs = {
        'input_ids': inputs['input_ids'],
        'max_new_tokens': max_new_tokens,
        'do_sample': False,  # Deterministic for consistent timing
        'pad_token_id': tokenizer.eos_token_id,
    }
    if 'attention_mask' in inputs:
        generate_kwargs['attention_mask'] = inputs['attention_mask']

    latencies = []
    memory_stats = []
    outputs = None

    with torch.no_grad():
        # Untimed warmup so one-time setup costs do not land in the first
        # measured run
        for _ in range(warmup_runs):
            model.generate(**generate_kwargs)

        for _ in range(num_runs):
            # Start each run from a clean allocator state with no pending work
            if on_cuda:
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

            memory_before = measure_memory_usage()

            # perf_counter is monotonic, so clock adjustments cannot skew a run
            start_time = time.perf_counter()
            outputs = model.generate(**generate_kwargs)
            if on_cuda:
                # CUDA kernels run asynchronously; wait before stopping the timer
                torch.cuda.synchronize()
            latency = time.perf_counter() - start_time

            memory_after = measure_memory_usage()

            latencies.append(latency)
            memory_stats.append({
                'before': memory_before,
                'after': memory_after
            })

    # Calculate statistics
    mean_latency = np.mean(latencies)
    std_latency = np.std(latencies)
    min_latency = np.min(latencies)
    max_latency = np.max(latencies)

    # Token count comes from the last run; every row of the returned batch has
    # the same length, so one row's growth times batch_size covers the batch.
    new_tokens_per_sequence = len(outputs[0]) - len(inputs['input_ids'][0])
    total_tokens_generated = new_tokens_per_sequence * batch_size
    tokens_per_second = total_tokens_generated / mean_latency

    # Decode first output for verification
    sample_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    inference_stats = {
        'prompt': prompt,
        'max_new_tokens': max_new_tokens,
        'batch_size': batch_size,
        'mean_latency_sec': mean_latency,
        'std_latency_sec': std_latency,
        'min_latency_sec': min_latency,
        'max_latency_sec': max_latency,
        'tokens_generated': total_tokens_generated,
        'tokens_per_second': tokens_per_second,
        'sample_output': sample_output,
        'memory_stats': memory_stats
    }

    return inference_stats

# Confirm the helper functions are defined and summarize what they measure.
print(
    "Benchmarking functions ready!",
    "Can measure: loading time, inference latency, memory usage, throughput",
    sep="\n",
)

## Run Comprehensive Benchmarks

In [None]:
# Container for everything measured in this session: run metadata plus one
# entry per successfully benchmarked model.
all_results = {
    'timestamp': datetime.now().isoformat(),
    'environment': {
        'device': 'cuda' if torch.cuda.is_available() else 'cpu',
        'gpu_name': torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
        'torch_version': torch.__version__,
        'cuda_version': torch.version.cuda if torch.cuda.is_available() else None
    },
    'models': []
}

print("Starting Comprehensive Benchmark Suite")
print("=" * 60)

# Benchmark each model
for model_config in BENCHMARK_MODELS:
    print(f"\nTesting {model_config['name']} ({model_config['params']})")
    print(f"   Description: {model_config['description']}")

    # Pre-bind so the cleanup in `finally` is safe even if loading itself fails
    model = tokenizer = None

    try:
        # Load model and measure loading performance
        model, tokenizer, loading_stats = benchmark_model_loading(model_config['model_id'])

        model_results = {
            'config': model_config,
            'loading_stats': loading_stats,
            'inference_results': []
        }

        print("   Running inference benchmarks...")

        # Only the short prompt is swept across generation lengths and batch sizes
        test_prompt = TEST_PROMPTS[0]

        for max_tokens in BENCHMARK_CONFIG['max_new_tokens']:
            for batch_size in BENCHMARK_CONFIG['batch_sizes']:
                print(f"      {max_tokens} tokens, batch_size={batch_size}...")

                result = benchmark_inference(
                    model, tokenizer, test_prompt,
                    max_new_tokens=max_tokens,
                    batch_size=batch_size,
                    num_runs=BENCHMARK_CONFIG['num_runs']
                )

                model_results['inference_results'].append(result)

                print(f"         Latency: {result['mean_latency_sec']:.3f}±{result['std_latency_sec']:.3f}s")
                print(f"         Throughput: {result['tokens_per_second']:.1f} tokens/sec")

        # Record the model only once every scenario has completed
        all_results['models'].append(model_results)

        print(f"   {model_config['name']} benchmarking complete!")

    except Exception as e:
        # Best-effort suite: report the failure and move on to the next model
        print(f"   Error benchmarking {model_config['name']}: {e}")

    finally:
        # Release the model on success AND failure; otherwise a failed run
        # keeps its weights resident and skews the next model's measurements.
        del model, tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print("\nBenchmark suite completed!")
print(f"Tested {len(all_results['models'])} models successfully")

## Performance Analysis and Visualization

In [None]:
# Collect one row per model for the baseline scenario
# (50 new tokens, batch_size=1). All lists below are index-aligned.
model_names = []
param_counts = []
loading_times = []
latencies_50_tokens = []  # 50 token generation, batch_size=1
throughputs_50_tokens = []
memory_usage = []
analyzed_results = []  # model results that actually contributed a row

for model_result in all_results['models']:
    # Find 50 token, batch_size=1 result
    baseline_result = None
    for inference_result in model_result['inference_results']:
        if inference_result['max_new_tokens'] == 50 and inference_result['batch_size'] == 1:
            baseline_result = inference_result
            break

    if baseline_result is None:
        # Skip the model entirely so the parallel lists cannot drift apart
        print(f"Skipping {model_result['config']['name']}: no 50-token, batch_size=1 result")
        continue

    loading_stats = model_result['loading_stats']
    analyzed_results.append(model_result)
    model_names.append(model_result['config']['name'])
    param_counts.append(loading_stats['param_count'] / 1e6)  # Convert to millions
    loading_times.append(loading_stats['loading_time_sec'])
    latencies_50_tokens.append(baseline_result['mean_latency_sec'])
    throughputs_50_tokens.append(baseline_result['tokens_per_second'])

    # Memory right after loading: GPU allocation if available, else the
    # host's used memory as recorded by the loading step.
    if torch.cuda.is_available():
        memory_usage.append(loading_stats['memory_after']['gpu_allocated_gb'])
    else:
        memory_usage.append(loading_stats['memory_after']['cpu_used_gb'])

if not model_names:
    # Fail early with a clear message instead of a bare max() error further down
    raise ValueError("No benchmark results with a 50-token, batch_size=1 run to analyze")

# Create comprehensive visualization
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('LLM Performance Baseline Report', fontsize=16, fontweight='bold')

# 1. Model Size vs Loading Time
ax1.scatter(param_counts, loading_times, s=100, alpha=0.7, color='blue')
ax1.set_xlabel('Model Size (Million Parameters)')
ax1.set_ylabel('Loading Time (seconds)')
ax1.set_title('Model Loading Performance')
ax1.grid(True, alpha=0.3)
for name, x_value, y_value in zip(model_names, param_counts, loading_times):
    ax1.annotate(name, (x_value, y_value),
                 xytext=(5, 5), textcoords='offset points', fontsize=8)

# 2. Model Size vs Inference Latency
ax2.scatter(param_counts, latencies_50_tokens, s=100, alpha=0.7, color='red')
ax2.set_xlabel('Model Size (Million Parameters)')
ax2.set_ylabel('Inference Latency (seconds)')
ax2.set_title('Inference Performance (50 tokens)')
ax2.grid(True, alpha=0.3)
for name, x_value, y_value in zip(model_names, param_counts, latencies_50_tokens):
    ax2.annotate(name, (x_value, y_value),
                 xytext=(5, 5), textcoords='offset points', fontsize=8)

# 3. Throughput Comparison
bars = ax3.bar(model_names, throughputs_50_tokens, color='green', alpha=0.7)
ax3.set_ylabel('Throughput (tokens/second)')
ax3.set_title('Generation Throughput')
ax3.tick_params(axis='x', rotation=45)
# Add value labels on bars
for bar, value in zip(bars, throughputs_50_tokens):
    ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
             f'{value:.1f}', ha='center', va='bottom', fontsize=9)

# 4. Memory Usage
bars = ax4.bar(model_names, memory_usage, color='purple', alpha=0.7)
memory_type = 'GPU' if torch.cuda.is_available() else 'CPU'
ax4.set_ylabel(f'{memory_type} Memory Usage (GB)')
ax4.set_title(f'{memory_type} Memory Consumption')
ax4.tick_params(axis='x', rotation=45)
# Add value labels on bars
for bar, value in zip(bars, memory_usage):
    ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.05,
             f'{value:.2f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
# Persist the figure before showing it so the report exists as a file that
# matches the "*benchmark*.png" pattern used when collecting results.
fig.savefig("benchmark_plots.png", dpi=150, bbox_inches="tight")
plt.show()

# Print summary statistics
print("\nPERFORMANCE SUMMARY")
print("=" * 50)
for i, model_result in enumerate(analyzed_results):
    config = model_result['config']
    loading = model_result['loading_stats']

    print(f"\n{config['name']} ({config['params']})")
    print(f"   Loading: {loading['loading_time_sec']:.2f}s")
    print(f"   Memory: {memory_usage[i]:.2f} GB")
    print(f"   Latency: {latencies_50_tokens[i]:.3f}s (50 tokens)")
    print(f"   Throughput: {throughputs_50_tokens[i]:.1f} tokens/sec")

# Identify optimization opportunities
print("\nOPTIMIZATION OPPORTUNITIES")
print("=" * 50)

# Find slowest model for optimization focus
slowest_idx = latencies_50_tokens.index(max(latencies_50_tokens))
fastest_idx = latencies_50_tokens.index(min(latencies_50_tokens))

print(f"Slowest model: {model_names[slowest_idx]} ({latencies_50_tokens[slowest_idx]:.3f}s)")
print(f"Fastest model: {model_names[fastest_idx]} ({latencies_50_tokens[fastest_idx]:.3f}s)")
print(f"Speedup potential: {latencies_50_tokens[slowest_idx]/latencies_50_tokens[fastest_idx]:.1f}x")

# Memory analysis
max_memory_idx = memory_usage.index(max(memory_usage))
print(f"Most memory hungry: {model_names[max_memory_idx]} ({memory_usage[max_memory_idx]:.2f} GB)")

# Scaling analysis
if len(param_counts) > 1:
    # Compare the last model against the first (not a fitted curve): how much
    # latency grew relative to how much the parameter count grew.
    param_ratio = param_counts[-1] / param_counts[0]
    latency_ratio = latencies_50_tokens[-1] / latencies_50_tokens[0]
    print(f"Scaling analysis: {param_ratio:.1f}x parameters → {latency_ratio:.1f}x latency")

    if latency_ratio > param_ratio:
        print("Worse than linear scaling - optimization needed!")
    else:
        print("Better than linear scaling - good baseline")

## Save Benchmark Results

In [None]:
# Save detailed results to JSON
# One timestamp is shared by both output files so they can be paired later.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"baseline_benchmark_{timestamp}.json"

# default=str stringifies any value json cannot encode natively
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=2, default=str)

print(f"Detailed results saved to: {filename}")

# Create summary report: headline numbers for the 50-token, batch_size=1 scenario
summary_report = {
    'benchmark_date': datetime.now().isoformat(),
    'environment': all_results['environment'],
    'models_tested': len(all_results['models']),
    'key_findings': {
        'fastest_model': {
            'name': model_names[fastest_idx],
            'latency_sec': latencies_50_tokens[fastest_idx],
            'throughput_tokens_per_sec': throughputs_50_tokens[fastest_idx]
        },
        'slowest_model': {
            'name': model_names[slowest_idx],
            'latency_sec': latencies_50_tokens[slowest_idx],
            'throughput_tokens_per_sec': throughputs_50_tokens[slowest_idx]
        },
        'optimization_potential': {
            'max_speedup_possible': round(latencies_50_tokens[slowest_idx]/latencies_50_tokens[fastest_idx], 1),
            'memory_range_gb': [min(memory_usage), max(memory_usage)]
        }
    },
    'next_optimization_targets': [
        'TensorRT optimization for fastest inference',
        'Quantization for memory reduction',
        'Custom CUDA kernels for specialized operations',
        'Batching optimization for throughput'
    ]
}

summary_filename = f"benchmark_summary_{timestamp}.json"
with open(summary_filename, 'w', encoding='utf-8') as f:
    json.dump(summary_report, f, indent=2)

print(f"Summary report saved to: {summary_filename}")

# Display final summary
print("\nBENCHMARKING COMPLETE!")
print("=" * 50)
print("What you now have:")
print("   Baseline performance measurements")
print("   Performance scaling analysis")
print("   Clear optimization targets")
print("   Professional benchmark reports")
print("\nNext steps: Choose optimization technique to implement!")
print("   1. TensorRT optimization (fastest path to speedup)")
print("   2. Quantization (memory reduction)")
print("   3. Custom CUDA kernels (maximum performance)")
print("   4. Batching optimization (throughput improvement)")

## Benchmark Results Analysis

### Next Optimization Steps

## Download Results from Colab

In [None]:
# Download results files to your local machine
# Imported here so this cell does not depend on the setup cell having taken
# its Colab branch (the only place `os` was imported).
import glob
import os

# Resolve the file names up front so BOTH branches below can use them.
# The detailed results are written as "baseline_benchmark_<timestamp>.json".
results_file = f"baseline_benchmark_{timestamp}.json"
summary_file = f"benchmark_summary_{timestamp}.json"
plot_files = sorted(glob.glob("*benchmark*.png"))

try:
    import google.colab
    from google.colab import files
except ImportError:
    # Not running in Colab: nothing to download, just list what is on disk
    print("Local environment - files already saved locally:")
    print(f"{results_file}")
    print(f"{summary_file}")
    # Only list plot files that actually exist
    for plot_file in plot_files:
        print(plot_file)
    print("\nNext: Commit these to your GitHub repository")
else:
    print("Available result files to download:")

    # Download benchmark results
    if os.path.exists(results_file):
        print(f"Done{results_file}")
        files.download(results_file)

    # Download summary report
    if os.path.exists(summary_file):
        print(f"Done{summary_file}")
        files.download(summary_file)

    # Download any plots that were saved
    for plot_file in plot_files:
        print(f"Done{plot_file}")
        files.download(plot_file)

    print("\nAll files downloaded to your Downloads folder!")
    print("Upload these to your GitHub repository for documentation")