In [1]:
from vllm import LLM, SamplingParams
import time
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# constants
max_tokens = 50
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=max_tokens)

## bp16

In [3]:
model = "/root/autodl-fs/data2/anti_fraud/models/modelscope/hub/hub/Qwen/Qwen2-7B"

llm = LLM(model=model, gpu_memory_utilization=0.9)


INFO 03-14 15:05:22 __init__.py:207] Automatically detected platform cpu.


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/root/autodl-fs/data2/anti_fraud/models/modelscope/hub/hub/Qwen/Qwen2-7B'. Use `repo_type` argument if needed.

In [None]:
# observations
durations = []
throughputs = []
latencies = []

batch_sizes = [2 ** p for p in range(10)]
for batch_size in batch_sizes:
    print(f"bs={batch_size}")

    # generate tokens for batch and record duration
    t0 = time.time()
    batch_prompt = [
        prompts[i % len(prompts)] for i in range(batch_size)
    ]

    outputs = llm.generate(batch_prompt, sampling_params)

    duration_s = time.time() - t0

    # calculate throughput
    ntokens = batch_size * max_tokens
    throughput = ntokens / duration_s
    avg_latency = duration_s / max_tokens
    print(f"duration: {duration_s}")
    print(f"throughput: {throughput} tokens/s")
    print(f"avg latency: {avg_latency}")
    print()

    durations.append(duration_s)
    throughputs.append(throughput)
    latencies.append(avg_latency)


In [None]:
# save observations and restart kernel
# save as json
import json

data = {
    "durations": durations,
    "throughputs": throughputs,
    "latencies": latencies,
    "batch_sizes": batch_sizes,
    "prompts": prompts,
    "max_tokens": max_tokens

}

with open("basic_fp16_inference_observation.json", "w") as f:
    json.dump(data, f)



In [None]:
import json

# Read the data from the JSON file
with open("basic_fp16_inference_observation.json", "r") as f:
    data = json.load(f)

# Restore the lists
durations = data["durations"]
throughputs = data["throughputs"]
latencies = data["latencies"]
batch_sizes = data["batch_sizes"]
prompts = data["prompts"]
max_tokens = data["max_tokens"]

## int8 kv cache

In [None]:
from vllm import LLM, SamplingParams
import matplotlib.pyplot as plt
import time

In [None]:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=max_tokens)

In [None]:
model = "/root/autodl-fs/data2/anti_fraud/models/modelscope/hub/hub/Qwen/Qwen2-7B"

llm_with_kv_cache = LLM(model=model,
                        kv_cache_dtype="auto",
                        calculate_kv_scales=True,
                        gpu_memory_utilization=0.9)

In [None]:
# observations
durations_with_kv_cache = []
throughputs_with_kv_cache = []
latencies_with_kv_cache = []

for batch_size in batch_sizes:
    print(f"bs={batch_size}")

    # generate tokens for batch and record duration
    t0 = time.time()
    batch_prompt = [
        prompts[i % len(prompts)] for i in range(batch_size)
    ]

    outputs = llm_with_kv_cache.generate(batch_prompt, sampling_params)

    duration_s = time.time() - t0

    # calculate throughput
    ntokens = batch_size * max_tokens
    throughput = ntokens / duration_s
    avg_latency = duration_s / max_tokens
    print(f"duration: {duration_s}")
    print(f"throughput: {throughput} tokens/s")
    print(f"avg latency: {avg_latency}")
    print()

    durations_with_kv_cache.append(duration_s)
    throughputs_with_kv_cache.append(throughput)
    latencies_with_kv_cache.append(avg_latency)

In [None]:
def render_plot(x, y1, y2, y3, y4, x_label, y1_label, y2_label):
    fig, ax1 = plt.subplots()

    # plot the first line (FP32 throughput)
    color = 'tab:red'
    ax1.set_xlabel(x_label)
    ax1.set_ylabel(y1_label, color=color)
    ax1.plot(x, y1, color=color, label='FP16 Throughput', linestyle='-')
    ax1.tick_params(axis='y', labelcolor=color)

    # plot the third line (kv cache throughput)
    ax1.plot(x, y3, color=color, label='kv cache Throughput', linestyle='--')

    # set the x-axis to be log scaled
    ax1.set_xscale('log', base=2)

    # Instantiate a second axes shares the same x-axis
    ax2 = ax1.twinx()
    color = 'tab:blue'
    ax2.set_ylabel(y2_label, color=color)
    ax2.plot(x, y2, color=color, label='FP16 Latency', linestyle='-')
    ax2.tick_params(axis='y', labelcolor=color)

    # plot the fourth line (kv cache latency)
    ax2.plot(x, y4, color=color, label='kv cache Latency', linestyle='--')

    # Add legends
    fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))

    plt.show()

In [None]:
render_plot(
    batch_sizes,
    throughputs,
    latencies,
    throughputs_with_kv_cache,
    latencies_with_kv_cache,
    "batch size",
    "throughput (tokens/s)",
    "avg latency (s)"
)