# vLLM Upstream vs vLLM ROCm
AMD maintains a vLLM fork: https://github.com/ROCm/vllm
Is it any faster than upstream vLLM (https://github.com/vllm-project/vllm)? This notebook runs the same throughput benchmarks against both builds to compare them.

```
# Clone the existing `vllm` conda environment
mamba create --name vllm-rocm --clone vllm
mamba activate vllm-rocm
python -m ipykernel install --user --name vllm-rocm

# Build (leave everything else the same as our previous setup)
git clone https://github.com/ROCm/vllm vllm-rocm
cd vllm-rocm
PYTORCH_ROCM_ARCH="gfx942" python setup.py develop

# vllm-rocm requirements
pip install -r requirements-rocm.txt
pip install msgspec pydantic

# without this you will get a `hipbsolidxgemm` error
cd gradlib
pip install .
cd ..
```


In [12]:
# Simple Benchmark
import time
import pandas as pd
import re

def benchmark_model(model, input_len, output_len, tp):
    """Run vLLM's throughput benchmark once and return the parsed result.

    Shells out through IPython to ``vllm/benchmarks/benchmark_throughput.py``
    (path relative to the current working directory) and scrapes the
    ``Throughput: <x> requests/s, <y> tokens/s`` line from the captured output.
    Must be called from inside an IPython/Jupyter session because it relies on
    ``get_ipython()``.

    Args:
        model: Model identifier passed to ``--model``.
        input_len: Value passed to ``--input-len``.
        output_len: Value passed to ``--output-len``.
        tp: Value passed to ``-tp``.

    Returns:
        A one-row DataFrame with the columns 'Requests per Second' and
        'Tokens per Second', or None when no throughput line was found in the
        command output.
    """
    # Initialize the DataFrame
    df = pd.DataFrame(columns=['Requests per Second', 'Tokens per Second'])

    def run_benchmark():
        """Execute the benchmark command; return (requests/s, tokens/s) or (None, None)."""
        command = f"VLLM_USE_TRITON_FLASH_ATTN=0 time python vllm/benchmarks/benchmark_throughput.py --backend vllm --input-len {input_len} --output-len {output_len} --model {model} -tp {tp}"
        # Wall-clock time of the whole subprocess, including model load
        start = time.time()
        output = get_ipython().getoutput(command)
        end = time.time()
        total_time = end-start
        # getoutput returns a list of lines; join them so the regex sees one string
        output_str = ' '.join(output)
        print(f"  Run time: {total_time:.2f} seconds")
        # Use regular expressions to extract the throughput values
        matches = re.findall(r"Throughput:\s*([\d.]+)\s*requests/s,\s*([\d.]+)\s*tokens/s", output_str)
        if matches:
            requests_per_sec, tokens_per_sec = map(float, matches[0])
            return requests_per_sec, tokens_per_sec
        # The previous message interpolated an undefined name (`tuning`), which
        # raised NameError here instead of reporting the failure.
        print(f"No throughput data found for {model} (input_len={input_len}, output_len={output_len}, tp={tp}).")
        return None, None

    # Run the benchmark once
    rps, tps = run_benchmark()
    if rps is None or tps is None:
        print("Benchmark failed.")
        return None

    # Append the result as a new row
    df.loc[len(df)] = {'Requests per Second': rps, 'Tokens per Second': tps}

    # Display the DataFrame
    print(df)
    return df

## vLLM Upstream

In [10]:
# Print the version of the vLLM package that `python` on PATH imports
!VLLM_WORKER_MULTIPROC_METHOD='spawn' python -c 'import vllm; print(vllm.__version__)'

0.6.4.dev9+g5d264f4a


In [2]:
# Increase File handles
# NOTE(review): each `!` command appears to run in its own subshell, so this
# ulimit likely does not apply to the commands that follow — confirm, and chain
# with `&&` on the same line if the limit is actually needed.
!ulimit -n 131072
# Print the environment report (PyTorch / ROCm / OS / GPU details)
!python vllm/collect_env.py

Collecting environment information...
PyTorch version: 2.6.0.dev20241015+rocm6.2
Is debug build: False
CUDA used to build PyTorch: N/A
ROCM used to build PyTorch: 6.2.41133-dd7f95766

OS: Ubuntu 22.04.5 LTS (x86_64)
GCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
Clang version: Could not collect
CMake version: version 3.30.5
Libc version: glibc-2.35

Python version: 3.11.10 | packaged by conda-forge | (main, Sep 30 2024, 18:08:57) [GCC 13.3.0] (64-bit runtime)
Python platform: Linux-6.8.0-47-generic-x86_64-with-glibc2.35
Is CUDA available: True
CUDA runtime version: Could not collect
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration: AMD Instinct MI300X (gfx942:sramecc+:xnack-)
Nvidia driver version: Could not collect
cuDNN version: Could not collect
HIP runtime version: 6.2.41133
MIOpen runtime version: 3.2.0
Is XNNPACK available: True

CPU:
Architecture:                         x86_64
CPU op-mode(s):                       32-bit, 64-bit
Address sizes:                  

In [3]:
import pandas as pd
import re
import nbformat

In [13]:
# Output-length sweep: input_len fixed at 0, tp=8, output_len doubling from 128 to 4096
configs = [
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 256, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 512, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 1024, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 2048, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 4096, 'tp': 8},
]

# Collect the one-row result frames and concatenate once at the end, instead of
# re-copying a growing DataFrame with pd.concat on every iteration.
result_frames = []

# Run benchmarks for each configuration
for config in configs:
    print(config)
    df_result = benchmark_model(**config)
    if df_result is not None:  # None means no throughput line was parsed for this run
        # Add a column for the configuration
        df_result['Config'] = f"input_len={config['input_len']}, output_len={config['output_len']}, tp={config['tp']}"
        result_frames.append(df_result)

# pd.concat raises on an empty list, so fall back to an empty frame if every run failed
all_results = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

# Display all results
display(all_results)

{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 128, 'tp': 8}
  Run time: 74.92 seconds
   Requests per Second  Tokens per Second
0                91.63            11728.8
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 256, 'tp': 8}
  Run time: 88.60 seconds
   Requests per Second  Tokens per Second
0                43.82           11217.16
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 512, 'tp': 8}
  Run time: 107.98 seconds
   Requests per Second  Tokens per Second
0                22.46           11497.47
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 1024, 'tp': 8}
  Run time: 159.48 seconds
   Requests per Second  Tokens per Second
0                10.94           11203.39
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 2048, 'tp': 8}
  Run time: 258.22 seconds
   Requests per Second  Tokens per Second
0                 5.23           1070

Unnamed: 0,Requests per Second,Tokens per Second,Config
0,91.63,11728.8,"input_len=0, output_len=128, tp=8"
1,43.82,11217.16,"input_len=0, output_len=256, tp=8"
2,22.46,11497.47,"input_len=0, output_len=512, tp=8"
3,10.94,11203.39,"input_len=0, output_len=1024, tp=8"
4,5.23,10702.26,"input_len=0, output_len=2048, tp=8"
5,2.19,8965.82,"input_len=0, output_len=4096, tp=8"


In [14]:
# Input-length sweep: output_len fixed at 128, tp=8, input_len doubling from 128 to 4096
configs = [
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 128, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 256, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 512, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 1024, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2048, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 4096, 'output_len': 128, 'tp': 8},
]

# Collect the one-row result frames and concatenate once at the end, instead of
# re-copying a growing DataFrame with pd.concat on every iteration.
result_frames = []

# Run benchmarks for each configuration
for config in configs:
    print(config)
    df_result = benchmark_model(**config)
    if df_result is not None:  # None means no throughput line was parsed for this run
        # Add a column for the configuration
        df_result['Config'] = f"input_len={config['input_len']}, output_len={config['output_len']}, tp={config['tp']}"
        result_frames.append(df_result)

# pd.concat raises on an empty list, so fall back to an empty frame if every run failed
all_results = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

# Display all results
display(all_results)

{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 128, 'output_len': 128, 'tp': 8}
  Run time: 76.39 seconds
   Requests per Second  Tokens per Second
0                83.27           21318.35
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 256, 'output_len': 128, 'tp': 8}
  Run time: 77.30 seconds
   Requests per Second  Tokens per Second
0                73.55           28244.18
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 512, 'output_len': 128, 'tp': 8}
  Run time: 81.28 seconds
   Requests per Second  Tokens per Second
0                62.58           40052.48
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 1024, 'output_len': 128, 'tp': 8}
  Run time: 86.43 seconds
   Requests per Second  Tokens per Second
0                45.13           51987.43
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2048, 'output_len': 128, 'tp': 8}
  Run time: 97.39 seconds
   Requests per Second  Tokens per Second
0                30.66        

Unnamed: 0,Requests per Second,Tokens per Second,Config
0,83.27,21318.35,"input_len=128, output_len=128, tp=8"
1,73.55,28244.18,"input_len=256, output_len=128, tp=8"
2,62.58,40052.48,"input_len=512, output_len=128, tp=8"
3,45.13,51987.43,"input_len=1024, output_len=128, tp=8"
4,30.66,66719.98,"input_len=2048, output_len=128, tp=8"
5,18.16,76695.09,"input_len=4096, output_len=128, tp=8"


In [15]:
# Matched input/output lengths (131, 2000, 2048) at tp=8
configs = [
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 131, 'output_len': 131, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2000, 'output_len': 2000, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2048, 'output_len': 2048, 'tp': 8},
]

# Collect the one-row result frames and concatenate once at the end, instead of
# re-copying a growing DataFrame with pd.concat on every iteration.
result_frames = []

# Run benchmarks for each configuration
for config in configs:
    print(config)
    df_result = benchmark_model(**config)
    if df_result is not None:  # None means no throughput line was parsed for this run
        # Add a column for the configuration
        df_result['Config'] = f"input_len={config['input_len']}, output_len={config['output_len']}, tp={config['tp']}"
        result_frames.append(df_result)

# pd.concat raises on an empty list, so fall back to an empty frame if every run failed
all_results = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

# Display all results
display(all_results)

{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 131, 'output_len': 131, 'tp': 8}
  Run time: 77.59 seconds
   Requests per Second  Tokens per Second
0                 81.1           21247.29
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2000, 'output_len': 2000, 'tp': 8}
  Run time: 298.27 seconds
   Requests per Second  Tokens per Second
0                 4.29           17156.23
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2048, 'output_len': 2048, 'tp': 8}
  Run time: 305.56 seconds
   Requests per Second  Tokens per Second
0                 4.15           16997.87


Unnamed: 0,Requests per Second,Tokens per Second,Config
0,81.1,21247.29,"input_len=131, output_len=131, tp=8"
1,4.29,17156.23,"input_len=2000, output_len=2000, tp=8"
2,4.15,16997.87,"input_len=2048, output_len=2048, tp=8"


## vLLM ROCm
We switch the kernel here to `vllm-rocm` (in the top right pull-down) and run everything again.

In [6]:
# Switch to / for the version check, then return to the previous directory.
# NOTE(review): presumably this keeps a ./vllm checkout in the working directory
# from shadowing the installed ROCm build on `import vllm` — confirm.
%pushd /
!VLLM_WORKER_MULTIPROC_METHOD='spawn' python -c 'import vllm; print(vllm.__version__)'
%popd

/
0.6.4.dev5+g4bba0922
/mnt/nvme1n1p1/MI300-testing
popd -> /mnt/nvme1n1p1/MI300-testing


In [9]:
# Increase File handles
# NOTE(review): each `!` command appears to run in its own subshell, so this
# ulimit likely does not apply to the commands that follow — confirm, and chain
# with `&&` on the same line if the limit is actually needed.
!ulimit -n 131072
# Print the environment report (PyTorch / ROCm / OS / GPU details) using the ROCm fork's script
!python ~/vllm-rocm/collect_env.py

Collecting environment information...
PyTorch version: 2.6.0.dev20241015+rocm6.2
Is debug build: False
CUDA used to build PyTorch: N/A
ROCM used to build PyTorch: 6.2.41133-dd7f95766

OS: Ubuntu 22.04.5 LTS (x86_64)
GCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
Clang version: Could not collect
CMake version: version 3.30.5
Libc version: glibc-2.35

Python version: 3.11.10 | packaged by conda-forge | (main, Sep 30 2024, 18:08:57) [GCC 13.3.0] (64-bit runtime)
Python platform: Linux-6.8.0-47-generic-x86_64-with-glibc2.35
Is CUDA available: True
CUDA runtime version: Could not collect
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration: AMD Instinct MI300X (gfx942:sramecc+:xnack-)
Nvidia driver version: Could not collect
cuDNN version: Could not collect
HIP runtime version: 6.2.41133
MIOpen runtime version: 3.2.0
Is XNNPACK available: True

CPU:
Architecture:                         x86_64
CPU op-mode(s):                       32-bit, 64-bit
Address sizes:                  

In [3]:
import pandas as pd
import re
import nbformat

In [11]:
# Simple Benchmark
import time
import pandas as pd
import re

def benchmark_model(model, input_len, output_len, tp):
    """Run vLLM's throughput benchmark once and return the parsed result.

    Shells out through IPython to ``vllm/benchmarks/benchmark_throughput.py``
    (path relative to the current working directory) and scrapes the
    ``Throughput: <x> requests/s, <y> tokens/s`` line from the captured output.
    Must be called from inside an IPython/Jupyter session because it relies on
    ``get_ipython()``.

    Args:
        model: Model identifier passed to ``--model``.
        input_len: Value passed to ``--input-len``.
        output_len: Value passed to ``--output-len``.
        tp: Value passed to ``-tp``.

    Returns:
        A one-row DataFrame with the columns 'Requests per Second' and
        'Tokens per Second', or None when no throughput line was found in the
        command output.
    """
    # Initialize the DataFrame
    df = pd.DataFrame(columns=['Requests per Second', 'Tokens per Second'])

    def run_benchmark():
        """Execute the benchmark command; return (requests/s, tokens/s) or (None, None)."""
        command = f"VLLM_USE_TRITON_FLASH_ATTN=0 time python vllm/benchmarks/benchmark_throughput.py --backend vllm --input-len {input_len} --output-len {output_len} --model {model} -tp {tp}"
        # Wall-clock time of the whole subprocess, including model load
        start = time.time()
        output = get_ipython().getoutput(command)
        end = time.time()
        total_time = end-start
        # getoutput returns a list of lines; join them so the regex sees one string
        output_str = ' '.join(output)
        print(f"  Run time: {total_time:.2f} seconds")
        # Use regular expressions to extract the throughput values
        matches = re.findall(r"Throughput:\s*([\d.]+)\s*requests/s,\s*([\d.]+)\s*tokens/s", output_str)
        if matches:
            requests_per_sec, tokens_per_sec = map(float, matches[0])
            return requests_per_sec, tokens_per_sec
        # The previous message interpolated an undefined name (`tuning`), which
        # raised NameError here instead of reporting the failure.
        print(f"No throughput data found for {model} (input_len={input_len}, output_len={output_len}, tp={tp}).")
        return None, None

    # Run the benchmark once
    rps, tps = run_benchmark()
    if rps is None or tps is None:
        print("Benchmark failed.")
        return None

    # Append the result as a new row
    df.loc[len(df)] = {'Requests per Second': rps, 'Tokens per Second': tps}

    # Display the DataFrame
    print(df)
    return df

In [13]:
# Output-length sweep: input_len fixed at 0, tp=8, output_len doubling from 128 to 4096
configs = [
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 256, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 512, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 1024, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 2048, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 4096, 'tp': 8},
]

# Collect the one-row result frames and concatenate once at the end, instead of
# re-copying a growing DataFrame with pd.concat on every iteration.
result_frames = []

# Run benchmarks for each configuration
for config in configs:
    print(config)
    df_result = benchmark_model(**config)
    if df_result is not None:  # None means no throughput line was parsed for this run
        # Add a column for the configuration
        df_result['Config'] = f"input_len={config['input_len']}, output_len={config['output_len']}, tp={config['tp']}"
        result_frames.append(df_result)

# pd.concat raises on an empty list, so fall back to an empty frame if every run failed
all_results = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

# Display all results
display(all_results)

{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 128, 'tp': 8}
  Run time: 81.67 seconds
   Requests per Second  Tokens per Second
0                85.87           10990.94
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 256, 'tp': 8}
  Run time: 91.41 seconds
   Requests per Second  Tokens per Second
0                44.13           11296.39
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 512, 'tp': 8}
  Run time: 112.58 seconds
   Requests per Second  Tokens per Second
0                22.52           11530.38
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 1024, 'tp': 8}
  Run time: 161.37 seconds
   Requests per Second  Tokens per Second
0                10.89           11154.05
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 0, 'output_len': 2048, 'tp': 8}
  Run time: 257.02 seconds
   Requests per Second  Tokens per Second
0                  5.3           1086

Unnamed: 0,Requests per Second,Tokens per Second,Config
0,85.87,10990.94,"input_len=0, output_len=128, tp=8"
1,44.13,11296.39,"input_len=0, output_len=256, tp=8"
2,22.52,11530.38,"input_len=0, output_len=512, tp=8"
3,10.89,11154.05,"input_len=0, output_len=1024, tp=8"
4,5.3,10860.82,"input_len=0, output_len=2048, tp=8"
5,2.2,9026.35,"input_len=0, output_len=4096, tp=8"


In [14]:
# Input-length sweep: output_len fixed at 128, tp=8, input_len doubling from 128 to 4096
configs = [
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 128, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 256, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 512, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 1024, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2048, 'output_len': 128, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 4096, 'output_len': 128, 'tp': 8},
]

# Collect the one-row result frames and concatenate once at the end, instead of
# re-copying a growing DataFrame with pd.concat on every iteration.
result_frames = []

# Run benchmarks for each configuration
for config in configs:
    print(config)
    df_result = benchmark_model(**config)
    if df_result is not None:  # None means no throughput line was parsed for this run
        # Add a column for the configuration
        df_result['Config'] = f"input_len={config['input_len']}, output_len={config['output_len']}, tp={config['tp']}"
        result_frames.append(df_result)

# pd.concat raises on an empty list, so fall back to an empty frame if every run failed
all_results = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

# Display all results
display(all_results)

{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 128, 'output_len': 128, 'tp': 8}
  Run time: 80.26 seconds
   Requests per Second  Tokens per Second
0                 78.4           20070.07
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 256, 'output_len': 128, 'tp': 8}
  Run time: 83.16 seconds
   Requests per Second  Tokens per Second
0                69.67           26753.28
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 512, 'output_len': 128, 'tp': 8}
  Run time: 84.45 seconds
   Requests per Second  Tokens per Second
0                60.19           38520.14
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 1024, 'output_len': 128, 'tp': 8}
  Run time: 89.50 seconds
   Requests per Second  Tokens per Second
0                44.31           51043.84
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2048, 'output_len': 128, 'tp': 8}
  Run time: 101.74 seconds
   Requests per Second  Tokens per Second
0                30.22       

Unnamed: 0,Requests per Second,Tokens per Second,Config
0,78.4,20070.07,"input_len=128, output_len=128, tp=8"
1,69.67,26753.28,"input_len=256, output_len=128, tp=8"
2,60.19,38520.14,"input_len=512, output_len=128, tp=8"
3,44.31,51043.84,"input_len=1024, output_len=128, tp=8"
4,30.22,65750.09,"input_len=2048, output_len=128, tp=8"
5,18.02,76106.02,"input_len=4096, output_len=128, tp=8"


In [12]:
# Matched input/output lengths (131, 2000, 2048) at tp=8
configs = [
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 131, 'output_len': 131, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2000, 'output_len': 2000, 'tp': 8},
    {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2048, 'output_len': 2048, 'tp': 8},
]

# Collect the one-row result frames and concatenate once at the end, instead of
# re-copying a growing DataFrame with pd.concat on every iteration.
result_frames = []

# Run benchmarks for each configuration
for config in configs:
    print(config)
    df_result = benchmark_model(**config)
    if df_result is not None:  # None means no throughput line was parsed for this run
        # Add a column for the configuration
        df_result['Config'] = f"input_len={config['input_len']}, output_len={config['output_len']}, tp={config['tp']}"
        result_frames.append(df_result)

# pd.concat raises on an empty list, so fall back to an empty frame if every run failed
all_results = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

# Display all results
display(all_results)

{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 131, 'output_len': 131, 'tp': 8}
  Run time: 82.60 seconds
   Requests per Second  Tokens per Second
0                 71.3            18679.5
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2000, 'output_len': 2000, 'tp': 8}
  Run time: 300.54 seconds
   Requests per Second  Tokens per Second
0                 4.31           17234.79
{'model': 'meta-llama/Llama-3.1-8B-Instruct', 'input_len': 2048, 'output_len': 2048, 'tp': 8}
  Run time: 307.76 seconds
   Requests per Second  Tokens per Second
0                 4.21           17257.16


Unnamed: 0,Requests per Second,Tokens per Second,Config
0,71.3,18679.5,"input_len=131, output_len=131, tp=8"
1,4.31,17234.79,"input_len=2000, output_len=2000, tp=8"
2,4.21,17257.16,"input_len=2048, output_len=2048, tp=8"
