In [None]:
# Check GPU availability.
# Runs the `nvidia-smi` command-line tool in a shell and shows its output in the cell.
# NOTE(review): if the command is missing or reports no devices, the runtime
# presumably has no NVIDIA GPU attached — confirm the runtime type before
# running the CUDA benchmark further down.
!nvidia-smi

In [None]:
# Install dependencies
!pip install onnxruntime-gpu numpy -q

In [None]:
# Provide the model file: diarization_transformer_optimized.onnx
# The file name must match MODEL_PATH (defined in the next cell), otherwise
# creating the inference session will not find the model.
#
# Option 1 (default): upload it from your machine using the code below.
# Option 2: download it from your repo instead by uncommenting:
# !wget https://github.com/FCHEHIDI/VoiceFlow-Intelligence-Platform/raw/main/VoiceFlow-Intelligence-Platform/voiceflow-ml/models/diarization_transformer_optimized.onnx

# `google.colab` is only importable inside Google Colab. The `files` module
# imported here is used again at the end of the notebook to download the results.
from google.colab import files
# `uploaded` holds whatever files.upload() returns; it is not read by any
# other cell visible in this notebook.
uploaded = files.upload()  # Upload your .onnx model

In [None]:
import time
from typing import List, Optional

import numpy as np
import onnxruntime as ort

# Path of the ONNX model that benchmark_inference() loads into its session.
# Change it if the model file has a different name or location.
MODEL_PATH = "diarization_transformer_optimized.onnx"  # Adjust if needed

def _resolve_input_shape(declared_shape) -> List[int]:
    """Turn the model's declared input shape into a concrete list of ints.

    A dynamic leading (batch) dimension is pinned to 1. Any other dynamic
    dimension cannot be guessed safely, so it is reported with an explicit
    error instead of failing later inside NumPy.
    """
    resolved = list(declared_shape)
    if resolved and not isinstance(resolved[0], int):
        resolved[0] = 1  # Batch size
    dynamic_axes = [axis for axis, dim in enumerate(resolved) if not isinstance(dim, int)]
    if dynamic_axes:
        raise ValueError(
            f"Model input has dynamic dimensions at axes {dynamic_axes} "
            f"(declared shape: {list(declared_shape)}); pass an explicit "
            "input_shape to benchmark_inference()."
        )
    return resolved


def benchmark_inference(
    provider: str,
    iterations: int = 200,
    warmup_iterations: int = 10,
    input_shape: Optional[List[int]] = None,
) -> dict:
    """
    Benchmark ONNX inference with specified provider.

    Loads MODEL_PATH into a new session, feeds the model's first input with a
    fixed random float32 tensor and times `iterations` calls to session.run()
    after a warmup phase. Only the first declared input is fed.
    NOTE(review): assumes the model has a single float32 input — confirm.

    Args:
        provider: 'CUDAExecutionProvider' or 'CPUExecutionProvider'
        iterations: Number of timed inference runs (must be >= 1)
        warmup_iterations: Untimed runs executed before measuring
        input_shape: Explicit input shape. When omitted, the shape declared by
            the model is used, with a dynamic batch dimension set to 1.

    Returns:
        Dict with the keys provider, iterations, min_ms, max_ms, mean_ms,
        median_ms, p50_ms, p95_ms, p99_ms and throughput_rps.

    Raises:
        ValueError: If iterations < 1, warmup_iterations < 0, or the model
            declares dynamic non-batch dimensions and no input_shape is given.
        RuntimeError: If the session is not running on the requested provider.
    """
    # Validate up front so a bad argument fails before the model is loaded.
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")
    if warmup_iterations < 0:
        raise ValueError(f"warmup_iterations must be >= 0, got {warmup_iterations}")

    print(f"\nüöÄ Benchmarking with {provider}...")

    # Create session
    session = ort.InferenceSession(
        MODEL_PATH,
        providers=[provider]
    )

    # The session may end up on a different provider than the one requested
    # (e.g. no GPU available). Timing that run would mislabel the results.
    active_providers = session.get_providers()
    if provider not in active_providers:
        raise RuntimeError(
            f"Requested {provider} but the session is running on "
            f"{active_providers}. Check that the runtime has the required "
            "hardware and that the matching onnxruntime build is installed."
        )

    # Get input name and a fully concrete shape
    model_input = session.get_inputs()[0]
    input_name = model_input.name
    if input_shape is None:
        concrete_shape = _resolve_input_shape(model_input.shape)
    else:
        concrete_shape = [int(dim) for dim in input_shape]
    print(f"Input '{input_name}' shape: {concrete_shape}")

    # Seeded generator: every run (and every provider) sees the same tensor.
    rng = np.random.default_rng(0)
    test_input = rng.standard_normal(size=tuple(concrete_shape), dtype=np.float32)
    feed = {input_name: test_input}

    # Warmup
    print("Warming up...")
    for _ in range(warmup_iterations):
        session.run(None, feed)

    # Benchmark
    print(f"Running {iterations} iterations...")
    latencies = []

    for i in range(iterations):
        start = time.perf_counter()
        session.run(None, feed)
        latency_ms = (time.perf_counter() - start) * 1000
        latencies.append(latency_ms)

        if (i + 1) % 50 == 0:
            print(f"  Progress: {i + 1}/{iterations}")

    # Calculate statistics. Percentiles are rank lookups into the sorted
    # samples; int(count * q) is always < count, so the index is in range.
    latencies_sorted = sorted(latencies)
    count = len(latencies_sorted)
    # Plain floats keep the result dict free of NumPy scalar types.
    mean_ms = float(np.mean(latencies))
    results = {
        "provider": provider,
        "iterations": iterations,
        "min_ms": latencies_sorted[0],
        "max_ms": latencies_sorted[-1],
        "mean_ms": mean_ms,
        "median_ms": float(np.median(latencies)),
        "p50_ms": latencies_sorted[count // 2],
        "p95_ms": latencies_sorted[int(count * 0.95)],
        "p99_ms": latencies_sorted[int(count * 0.99)],
        "throughput_rps": 1000 / mean_ms
    }

    return results

def print_results(results: dict):
    """Print one benchmark summary as a framed, aligned text report.

    Args:
        results: Dict containing the keys provider, iterations, min_ms,
            mean_ms, median_ms, p50_ms, p95_ms, p99_ms, max_ms and
            throughput_rps (the layout returned by benchmark_inference).
    """
    rule = "=" * 60

    # (label including its alignment padding, results key, text after "ms")
    latency_rows = (
        ("Min:        ", "min_ms", ""),
        ("Mean:       ", "mean_ms", ""),
        ("Median:     ", "median_ms", ""),
        ("P50:        ", "p50_ms", ""),
        ("P95:        ", "p95_ms", ""),
        ("P99:        ", "p99_ms", " ‚≠ê"),
        ("Max:        ", "max_ms", ""),
    )

    # Header: blank line, rule, run identification, rule.
    print("\n" + rule)
    print("Provider: " + str(results['provider']))
    print("Iterations: " + str(results['iterations']))
    print(rule)

    # Latency figures, two decimals each.
    for label, key, suffix in latency_rows:
        print(f"{label}{results[key]:.2f} ms{suffix}")

    # Footer: throughput, closing rule and a trailing blank line.
    print(f"Throughput: {results['throughput_rps']:.1f} req/s")
    print(rule + "\n")

In [None]:
# GPU run: 200 timed iterations on the CUDA execution provider, then print the report
gpu_results = benchmark_inference(provider='CUDAExecutionProvider', iterations=200)
print_results(results=gpu_results)

In [None]:
# CPU baseline: same 200 timed iterations on the CPU execution provider, then print the report
cpu_results = benchmark_inference(provider='CPUExecutionProvider', iterations=200)
print_results(results=cpu_results)

In [None]:
# Side-by-side summary of the GPU and CPU runs, saved to JSON and downloaded.
import json

divider = '=' * 60

print("\nüéØ GPU vs CPU Comparison")
print(divider)

# Ratio of mean latencies: how many times longer the CPU run took on average.
speedup = cpu_results['mean_ms'] / gpu_results['mean_ms']
gpu_p99_ms = gpu_results['p99_ms']
cpu_p99_ms = cpu_results['p99_ms']

print(f"GPU P99:     {gpu_p99_ms:.2f} ms")
print(f"CPU P99:     {cpu_p99_ms:.2f} ms")
print(f"Speedup:     {speedup:.1f}x faster on GPU")

# The target is a GPU P99 latency strictly below 10 ms.
if gpu_p99_ms < 10:
    target_mark = '‚úÖ'
else:
    target_mark = '‚ùå'
print(f"GPU Target:  < 10 ms P99 {target_mark}")
print(divider)

# Persist both result dicts and the speedup in one JSON document.
report = {
    'gpu': gpu_results,
    'cpu': cpu_results,
    'speedup': speedup
}
with open('benchmark_results.json', 'w') as out_file:
    json.dump(report, out_file, indent=2)

print("\nüíæ Results saved to benchmark_results.json")

# Hand the file to the browser; `files` comes from the google.colab import
# in the model upload cell.
files.download('benchmark_results.json')