# TensorRT Model Optimization

> **Convert PyTorch models to optimized TensorRT engines for 2-5x speedup**

1. **Convert Models**: PyTorch → TensorRT engines
2. **Measure Speedup**: Compare before/after performance  
3. **Optimize Settings**: Precision, batch size, sequence length
4. **Save Engines**: Reusable optimized models

---

## Setup and Imports

In [None]:
import torch
import tensorrt as trt
import numpy as np
import time
import json
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM
import matplotlib.pyplot as plt

# Report the imported toolchain and whether a CUDA device is visible to torch.
gpu_available = torch.cuda.is_available()

print("TensorRT optimization tools loaded!")
print(f"TensorRT version: {trt.__version__}")
print(f"Device: {'GPU' if gpu_available else 'CPU'}")
if gpu_available:
    # Name of CUDA device 0 as reported by torch.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## TensorRT Engine Builder

In [None]:
class TensorRTOptimizer:
    """Build serialized TensorRT engines from ONNX files.

    Args:
        precision: Requested precision. 'fp16' and 'int8' are honoured only
            when the builder reports fast support for them on this platform;
            any other value (or an unsupported one) results in FP32.
    """

    def __init__(self, precision='fp16'):
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.builder = trt.Builder(self.logger)
        self.precision = precision

    @staticmethod
    def _shape_range(shape, max_batch_size, max_seq_len):
        """Return ``(min, opt, max)`` shapes for an input with dynamic axes.

        Dynamic axes are marked with -1. Axis 0 is treated as the batch axis
        and axis 1 as the sequence axis; each ranges from 1 up to its limit,
        and the optimum is set to the maximum. Fixed axes are kept as-is.

        Returns:
            A tuple of three shape tuples, or None when an axis other than
            0 or 1 is dynamic (no limit is known for it).
        """
        limits = {0: max_batch_size, 1: max_seq_len}
        smallest = []
        largest = []
        for axis, dim in enumerate(shape):
            if dim >= 0:
                smallest.append(dim)
                largest.append(dim)
            elif axis in limits:
                smallest.append(1)
                largest.append(limits[axis])
            else:
                return None
        return tuple(smallest), tuple(largest), tuple(largest)

    def build_engine(self, onnx_path, engine_path, max_batch_size=1, max_seq_len=512):
        """Build a TensorRT engine from an ONNX model and write it to disk.

        Args:
            onnx_path: Path of the ONNX file to parse.
            engine_path: Path the serialized engine is written to.
            max_batch_size: Upper bound for a dynamic axis 0 of any input.
            max_seq_len: Upper bound for a dynamic axis 1 of any input.

        Returns:
            The serialized engine object returned by the builder, or None if
            parsing, profile creation or building failed (an ERROR line is
            printed in each case).
        """
        # Create network
        network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        network = self.builder.create_network(network_flags)
        parser = trt.OnnxParser(network, self.logger)

        # Parse ONNX model
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                print("ERROR: Failed to parse ONNX model")
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None

        # Create builder config
        config = self.builder.create_builder_config()

        # Set precision
        if self.precision == 'fp16' and self.builder.platform_has_fast_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
            print("Using FP16 precision")
        elif self.precision == 'int8' and self.builder.platform_has_fast_int8:
            # NOTE(review): no calibrator is configured here; INT8 presumably
            # needs one (or a pre-quantized model) to be accurate - confirm.
            config.set_flag(trt.BuilderFlag.INT8)
            print("Using INT8 precision")
        else:
            print("Using FP32 precision")

        # Set memory pool
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 2 << 30)  # 2GB

        # Inputs with dynamic (-1) axes need an optimization profile that
        # tells the builder which shapes to support; without one the build
        # cannot succeed for such a network.
        profile = self.builder.create_optimization_profile()
        has_dynamic_input = False
        for index in range(network.num_inputs):
            tensor = network.get_input(index)
            shape = tuple(tensor.shape)
            if -1 not in shape:
                continue
            shape_range = self._shape_range(shape, max_batch_size, max_seq_len)
            if shape_range is None:
                print(f"ERROR: Unsupported dynamic shape {shape} for input '{tensor.name}'")
                return None
            profile.set_shape(tensor.name, *shape_range)
            has_dynamic_input = True
        if has_dynamic_input:
            config.add_optimization_profile(profile)

        # Build engine
        print("Building TensorRT engine... (this may take several minutes)")
        engine = self.builder.build_serialized_network(network, config)

        if engine is None:
            print("ERROR: Failed to build engine")
            return None

        # Save engine
        with open(engine_path, 'wb') as f:
            f.write(engine)

        print(f"Engine saved to {engine_path}")
        return engine

# Initialize optimizer
# Module-level instance reused by the engine-building loop later in this
# notebook; FP16 is the precision requested here.
trt_optimizer = TensorRTOptimizer(precision='fp16')
print("TensorRT optimizer ready")

## Model to ONNX Conversion

In [None]:
def convert_to_onnx(model_name, onnx_path, max_seq_len=128, opset_version=11):
    """Export a Hugging Face causal language model to an ONNX file.

    The graph takes a single ``input_ids`` input and produces a single
    ``logits`` output; axes 0 and 1 of both are declared dynamic
    (``batch_size`` and ``sequence``).

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        onnx_path: Destination path of the ONNX file.
        max_seq_len: Length of the random dummy sequence used for tracing.
        opset_version: ONNX opset used for the export.

    Returns:
        The tokenizer loaded for ``model_name``, with ``pad_token`` set to
        ``eos_token`` when it had none.
    """
    # Load model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()

    # Export logits only: with the KV cache enabled the forward pass also
    # returns per-layer key/value tensors, which would become extra unnamed
    # graph outputs next to 'logits'.
    model.config.use_cache = False
    model.config.return_dict = False

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Create dummy input
    dummy_input = torch.randint(0, tokenizer.vocab_size, (1, max_seq_len))

    # Export to ONNX; gradients are not needed for tracing.
    with torch.no_grad():
        torch.onnx.export(
            model,
            dummy_input,
            onnx_path,
            export_params=True,
            opset_version=opset_version,
            do_constant_folding=True,
            input_names=['input_ids'],
            output_names=['logits'],
            dynamic_axes={
                'input_ids': {0: 'batch_size', 1: 'sequence'},
                'logits': {0: 'batch_size', 1: 'sequence'}
            }
        )

    print(f"Model exported to {onnx_path}")
    return tokenizer

# Export each listed model to ONNX. A failed export is reported and skipped,
# so onnx_models only ever holds the models that converted cleanly.
models_to_optimize = ['distilgpt2', 'gpt2']
onnx_models = {}

for model_name in models_to_optimize:
    # Replace '/' so the model name can be used as a file name.
    onnx_path = f"{model_name.replace('/', '_')}.onnx"
    print(f"Converting {model_name} to ONNX...")

    try:
        tokenizer = convert_to_onnx(model_name, onnx_path, max_seq_len=128)
        onnx_models[model_name] = dict(onnx_path=onnx_path, tokenizer=tokenizer)
        print(f"✓ {model_name} converted successfully")
    except Exception as exc:
        print(f"✗ Failed to convert {model_name}: {exc}")

print(f"Converted {len(onnx_models)} models to ONNX")

## Build TensorRT Engines

In [None]:
# Turn every successfully exported ONNX model into a TensorRT engine file.
# trt_engines records the engine path and tokenizer of each model that built.
trt_engines = {}

for model_name, model_info in onnx_models.items():
    engine_path = f"{model_name.replace('/', '_')}_fp16.trt"
    print(f"Building TensorRT engine for {model_name}...")

    try:
        engine = trt_optimizer.build_engine(
            onnx_path=model_info['onnx_path'],
            engine_path=engine_path,
            max_batch_size=4,
            max_seq_len=128,
        )

        # build_engine signals failure by returning None.
        if engine is None:
            print(f"✗ Failed to build engine for {model_name}")
        else:
            trt_engines[model_name] = dict(
                engine_path=engine_path,
                tokenizer=model_info['tokenizer'],
            )
            print(f"✓ {model_name} engine built successfully")

    except Exception as exc:
        print(f"✗ Error building engine for {model_name}: {exc}")

print(f"Built {len(trt_engines)} TensorRT engines")

## TensorRT Inference Engine

In [None]:
class TensorRTInference:
    """Run a serialized TensorRT engine using torch CUDA tensors as buffers.

    Device memory is held in torch tensors, so no separate CUDA binding
    package is needed. Tensors are addressed by name (the name-based
    TensorRT API), and output buffers are sized on every call after the
    input shape has been set, which is required for engines built with
    dynamic shapes.

    Attributes:
        inputs: One ``{'name', 'dtype'}`` dict per engine input.
        outputs: One ``{'name', 'dtype'}`` dict per engine output.
        bindings: Device addresses used by the most recent ``infer`` call
            (input first, then outputs).

    Raises:
        RuntimeError: If the engine file cannot be deserialized.
    """

    def __init__(self, engine_path):
        self.logger = trt.Logger(trt.Logger.WARNING)

        # Load engine
        with open(engine_path, 'rb') as f:
            engine_data = f.read()

        # Keep the runtime referenced for as long as the engine is in use.
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.runtime.deserialize_cuda_engine(engine_data)
        if self.engine is None:
            raise RuntimeError(f"Failed to deserialize TensorRT engine from {engine_path}")
        self.context = self.engine.create_execution_context()

        # Record name and dtype of every I/O tensor; buffers are allocated
        # per call because shapes may be dynamic.
        self.inputs = []
        self.outputs = []
        self.bindings = []

        for index in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(index)
            np_dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            spec = {'name': name, 'dtype': self._torch_dtype(np_dtype)}

            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                self.inputs.append(spec)
            else:
                self.outputs.append(spec)

    @staticmethod
    def _torch_dtype(np_dtype):
        """Return the torch dtype that corresponds to a numpy dtype."""
        return torch.from_numpy(np.empty(0, dtype=np_dtype)).dtype

    def infer(self, input_data):
        """Run inference with TensorRT engine.

        Only the first engine input is fed and only the first output is
        returned.

        Args:
            input_data: Array-like input (numpy array or torch tensor). It
                is cast to the dtype the engine expects for its first input.

        Returns:
            The first output as a flat numpy array on the host.

        Raises:
            ValueError: If the input shape is rejected by the engine.
            RuntimeError: If execution fails.
        """
        # Copy input data to GPU
        in_spec = self.inputs[0]
        device_input = torch.as_tensor(input_data).to(
            device='cuda', dtype=in_spec['dtype']
        ).contiguous()

        shape = tuple(device_input.shape)
        if not self.context.set_input_shape(in_spec['name'], shape):
            raise ValueError(f"Input shape {shape} is not supported by the engine")
        self.context.set_tensor_address(in_spec['name'], device_input.data_ptr())
        self.bindings = [device_input.data_ptr()]

        # Output shapes are only known once the input shape is set.
        device_outputs = []
        for out_spec in self.outputs:
            out_shape = tuple(self.context.get_tensor_shape(out_spec['name']))
            buffer = torch.empty(out_shape, dtype=out_spec['dtype'], device='cuda')
            self.context.set_tensor_address(out_spec['name'], buffer.data_ptr())
            self.bindings.append(buffer.data_ptr())
            device_outputs.append(buffer)

        # Run inference on torch's current stream so the copies above and
        # the engine execution are ordered on the same stream.
        stream = torch.cuda.current_stream()
        if not self.context.execute_async_v3(stream.cuda_stream):
            raise RuntimeError("TensorRT execution failed")
        stream.synchronize()

        # Copy output data from GPU
        return device_outputs[0].cpu().numpy().ravel()

# Note: device buffers are torch CUDA tensors, so a CUDA-enabled torch build
# and a GPU are required to actually run inference.
# Alternative: Use torch-tensorrt or other high-level interfaces
print("TensorRT inference class defined")
print("Note: Full TensorRT inference requires additional CUDA setup")

## Performance Comparison

In [None]:
def benchmark_pytorch_model(model_name, num_runs=10):
    """Benchmark greedy generation with the original PyTorch model.

    Loads the model, runs three warmup generations, then times ``num_runs``
    generations of up to 50 new tokens for a fixed prompt.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        num_runs: Number of timed generations; must be at least 1.

    Returns:
        Dict with ``mean_latency`` and ``std_latency`` (seconds),
        ``throughput`` (generated tokens per second, based on the mean
        latency) and ``tokens_generated`` (from the last timed run).

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    # Without at least one timed run there is no output to measure.
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    # Load model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_cuda = device.type == "cuda"
    model = model.to(device)

    # Test prompt
    prompt = "The future of artificial intelligence"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    def generate():
        # Pass the attention mask and pad token explicitly so generate()
        # does not have to guess them.
        with torch.no_grad():
            return model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                pad_token_id=tokenizer.pad_token_id,
                max_new_tokens=50,
                do_sample=False,
            )

    # Warmup
    for _ in range(3):
        generate()

    # Benchmark
    latencies = []
    for _ in range(num_runs):
        # CUDA kernels run asynchronously; synchronize so the timer covers
        # the whole generation and nothing left over from before it.
        if use_cuda:
            torch.cuda.synchronize()

        # perf_counter is monotonic and meant for measuring intervals.
        start_time = time.perf_counter()
        outputs = generate()

        if use_cuda:
            torch.cuda.synchronize()

        latencies.append(time.perf_counter() - start_time)

    # Calculate statistics
    mean_latency = np.mean(latencies)
    std_latency = np.std(latencies)

    # Calculate throughput
    tokens_generated = len(outputs[0]) - len(inputs.input_ids[0])
    throughput = tokens_generated / mean_latency

    return {
        'mean_latency': mean_latency,
        'std_latency': std_latency,
        'throughput': throughput,
        'tokens_generated': tokens_generated
    }

# Collect a PyTorch baseline for every model; a model that fails to benchmark
# is reported and left out of pytorch_results.
pytorch_results = {}

for model_name in models_to_optimize:
    print(f"Benchmarking PyTorch {model_name}...")
    try:
        pytorch_results[model_name] = result = benchmark_pytorch_model(model_name)
        print(f"  Latency: {result['mean_latency']:.3f}±{result['std_latency']:.3f}s")
        print(f"  Throughput: {result['throughput']:.1f} tokens/sec")
    except Exception as exc:
        print(f"  Error: {exc}")

print("PyTorch benchmarking complete")

## Results Analysis

In [None]:
# Plot and summarise the benchmark results.
# Only the PyTorch baseline is shown for now; TensorRT bars and the speedup
# (pytorch_latency / trt_latency) belong here once engine execution works.
if not pytorch_results:
    print("No benchmark results available")
else:
    models = list(pytorch_results)
    pytorch_latencies = [pytorch_results[m]['mean_latency'] for m in models]
    pytorch_throughputs = [pytorch_results[m]['throughput'] for m in models]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    x_pos = np.arange(len(models))

    # Both panels share the same layout and differ only in data and labels.
    panels = (
        (ax1, pytorch_latencies, 'Latency (seconds)', 'Inference Latency Comparison'),
        (ax2, pytorch_throughputs, 'Throughput (tokens/sec)', 'Inference Throughput Comparison'),
    )
    for axis, values, y_label, title in panels:
        axis.bar(x_pos, values, alpha=0.7, label='PyTorch')
        axis.set_xlabel('Models')
        axis.set_ylabel(y_label)
        axis.set_title(title)
        axis.set_xticks(x_pos)
        axis.set_xticklabels(models)
        axis.legend()

    plt.tight_layout()
    plt.show()

    # Print summary
    print("OPTIMIZATION RESULTS")
    print("=" * 40)

    for model, pytorch_result in pytorch_results.items():
        print(f"\n{model}:")
        print(f"  PyTorch Latency: {pytorch_result['mean_latency']:.3f}s")
        print(f"  PyTorch Throughput: {pytorch_result['throughput']:.1f} tokens/sec")

## Save Optimization Results

In [None]:
# Persist the baseline numbers and the list of built engines as JSON.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"tensorrt_optimization_{timestamp}.json"

optimization_results = dict(
    timestamp=timestamp,
    optimization_type='tensorrt',
    pytorch_results=pytorch_results,
    tensorrt_engines=list(trt_engines),
    precision='fp16',
    notes='Initial TensorRT optimization setup and PyTorch baseline',
)

with open(filename, 'w') as f:
    # default=str stringifies any value json cannot encode natively.
    f.write(json.dumps(optimization_results, indent=2, default=str))

print(f"Results saved to {filename}")
print("\nTensorRT optimization setup complete!")
print("Next steps:")
for step in (
    "1. Fine-tune TensorRT engine parameters",
    "2. Add INT8 quantization",
    "3. Optimize for specific hardware",
    "4. Implement custom CUDA kernels",
):
    print(step)

## TensorRT Optimization Summary

**Setup Complete**: