# vLLM Scaling Experiment Analysis

## 1. Setup and Data Loading

Load the aggregated experiment results from `summary_metrics.parquet` and prepare the data for analysis.

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Read the aggregated metrics into `df`. When the parquet file is missing,
# `df` is left undefined and only a hint is printed; the plotting cells
# check for `df` before drawing anything.
try:
    df = pd.read_parquet('../results/agg/summary_metrics.parquet')
except FileNotFoundError:
    print("Error: summary_metrics.parquet not found. Please run scripts/aggregate_metrics.py first.")
else:
    # Quick sanity summary of what was loaded.
    print("Successfully loaded summary_metrics.parquet")
    print(f"Data shape: {df.shape}")
    print("Columns:", df.columns.tolist())

# Apply seaborn's white-grid theme to the figures drawn below.
sns.set_theme(style="whitegrid")

## 2. Throughput Analysis

Analyze how throughput scales with the number of GPUs. We will look at both requests/sec and total tokens/sec.

In [None]:
# Only plot when the data-loading cell succeeded in defining `df`.
if 'df' in locals():
    # savefig() does not create missing directories, so make sure the
    # output folder exists before writing any figure.
    os.makedirs('../results/figs', exist_ok=True)

    # (metric column, figure title, y-axis label, output file) per plot.
    throughput_plots = [
        # Plot 1: Request Throughput Scaling
        (
            'request_throughput_req_s',
            'Request Throughput (req/s) vs. GPU Count',
            'Requests per Second',
            '../results/figs/throughput_scaling_req_s.png',
        ),
        # Plot 2: Total Token Throughput Scaling
        (
            'total_token_throughput_tok_s',
            'Total Token Throughput (tokens/s) vs. GPU Count',
            'Total Tokens per Second',
            '../results/figs/throughput_scaling_tok_s.png',
        ),
    ]

    for metric, title, ylabel, out_path in throughput_plots:
        plt.figure(figsize=(12, 6))
        # Bars are grouped by GPU count and coloured by prompt profile.
        # errorbar=None disables error bars; it replaces the `ci=None`
        # spelling that seaborn deprecated in 0.12.
        sns.barplot(data=df, x='gpu_count', y=metric, hue='prompt_profile', errorbar=None)
        plt.title(title)
        plt.xlabel('Number of GPUs')
        plt.ylabel(ylabel)
        # Save before show(): show() may release the figure afterwards.
        plt.savefig(out_path)
        plt.show()

## 3. Latency Analysis

Analyze two latency metrics, mean time to first token (TTFT) and mean time per output token (TPOT), across GPU counts and prompt profiles.

In [None]:
# Only plot when the data-loading cell succeeded in defining `df`.
if 'df' in locals():
    # savefig() does not create missing directories, so make sure the
    # output folder exists before writing any figure.
    os.makedirs('../results/figs', exist_ok=True)

    # (metric column, figure title, y-axis label, output file) per plot.
    latency_plots = [
        # Plot 3: Mean Time to First Token (TTFT)
        (
            'mean_ttft_ms',
            'Mean Time to First Token (TTFT) vs. GPU Count',
            'Mean TTFT (ms)',
            '../results/figs/latency_ttft.png',
        ),
        # Plot 4: Mean Time Per Output Token (TPOT)
        (
            'mean_tpot_ms',
            'Mean Time Per Output Token (TPOT) vs. GPU Count',
            'Mean TPOT (ms)',
            '../results/figs/latency_tpot.png',
        ),
    ]

    for metric, title, ylabel, out_path in latency_plots:
        plt.figure(figsize=(12, 6))
        # One dashed line per prompt profile; dodge=True offsets the points
        # of different profiles so they do not overlap at the same GPU count.
        sns.pointplot(data=df, x='gpu_count', y=metric, hue='prompt_profile', dodge=True, linestyles='--')
        plt.title(title)
        plt.xlabel('Number of GPUs')
        plt.ylabel(ylabel)
        # Save before show(): show() may release the figure afterwards.
        plt.savefig(out_path)
        plt.show()

## 4. Throughput vs. Latency Trade-off

Visualize the relationship between throughput and latency to understand the performance trade-offs.

In [None]:
# Only plot when the data-loading cell succeeded in defining `df`.
if 'df' in locals():
    # savefig() does not create missing directories, so make sure the
    # output folder exists before writing the figure.
    os.makedirs('../results/figs', exist_ok=True)

    plt.figure(figsize=(12, 8))
    # One marker per row of df: colour encodes GPU count, marker shape
    # encodes the prompt profile.
    sns.scatterplot(
        data=df,
        x='total_token_throughput_tok_s',
        y='mean_ttft_ms',
        hue='gpu_count',
        style='prompt_profile',
        s=150,
    )
    plt.title('Throughput vs. Latency (TTFT)')
    plt.xlabel('Total Token Throughput (tokens/s)')
    plt.ylabel('Mean Time to First Token (ms)')
    # Anchor the legend just outside the right edge of the axes so it does
    # not cover any data points.
    plt.legend(title='Configuration', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)
    # The legend lies outside the axes; bbox_inches='tight' grows the saved
    # canvas to include it instead of cropping it out of the PNG.
    plt.savefig('../results/figs/tradeoff_throughput_vs_latency.png', bbox_inches='tight')
    plt.show()

## 5. Amdahl's Law Analysis

Load the `amdahl_fit.csv` data to see the calculated speedup and the estimated sequential portion (`s`) of the workload.

In [None]:
# Show the Amdahl's-law fit table if it has been generated; otherwise print
# a hint naming the script that produces it.
try:
    amdahl_df = pd.read_csv('../results/agg/amdahl_fit.csv')
except FileNotFoundError:
    print("amdahl_fit.csv not found. Please run scripts/amdahl_fit.py first.")
else:
    print("Amdahl's Law Analysis Results:")
    # `display` is the rich-output helper available inside IPython/Jupyter.
    display(amdahl_df)