In [1]:
!pip -q install vllm
!pip -q install huggingface_hub

# Aim
In this notebook, we intend to benchmark usage of vLLM_Mistral-7B-instruct against just using the model for inference. We want to find out if using vLLM optimizes latency and throughput.

# Procedure
1. Create a batch of 60 queries which will have different response length.
2. Run the model with vLLM and check get the total inference time for the batch. Get the throughput total_words_generated / total_inference_time.
3. Repeat step 2 without using the vLLM. Compare the throughputs.

4. Select a random query and run vLLM+model on the just that query to calculate latency per word.
5. Repeat step 4 without vLLM and copare the latencies.
 Note: Dont use the vLLM metrics from first iteration as there is a cold start, run it multiple times untill it reach stability to get the correct metrics.

# Results
1. Latency decreased more than 15x with vLLM
2. Throughput increased from 18 tokens/s to 385 tk/s
3. Throughput shows significant boost on large batches


In [2]:
import os
import torch
from vllm import LLM, SamplingParams

  from .autonotebook import tqdm as notebook_tqdm
2024-01-21 07:45:06,299	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


# First we run inference with vLLM and compute throughput and latency

In [3]:
def create_prompt(sample):
  """
  This will format our question into the prompt format used by mistral-7B-instruct
  """
  bos_token = "<s>"
  original_system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
  system_message = "Use the provided input to create an instruction that could have been used to generate the response with an LLM."
  response = sample.replace(original_system_message, "").replace("\n\n### Instruction\n", "").replace("\n### Response\n", "").strip()
  eos_token = "</s>"

  full_prompt = ""
  full_prompt += bos_token
  full_prompt += "### Instruction:"
  full_prompt += "\n" + original_system_message
  full_prompt += "\n\n### Input:"
  full_prompt += "\n" + input
  full_prompt += "\n\n### Response:"
  full_prompt += "\n" + response
  full_prompt += eos_token

  return full_prompt

In [5]:
from huggingface_hub import snapshot_download
MODEL_DIR = '../../models/mistral-7b'
os.makedirs(MODEL_DIR, exist_ok=True)
token = os.getenv("API_KEY")

snapshot_download(
    'mistralai/Mistral-7B-Instruct-v0.1',
    local_dir=MODEL_DIR,
    token=token,
)

Fetching 14 files:   7%|▋         | 1/14 [00:00<00:07,  1.65it/s]
[A

[A[A


[A[A[A
[A

[A[A


[A[A[A


[A[A[A

[A[A
[A


[A[A[A

[A[A
[A


[A[A[A

[A[A
[A


[A[A[A

[A[A


[A[A[A
[A

[A[A


[A[A[A

[A[A
[A


[A[A[A

[A[A


[A[A[A
[A


[A[A[A

[A[A
[A


[A[A[A

[A[A


[A[A[A
[A


[A[A[A

[A[A
[A


[A[A[A


[A[A[A

[A[A
[A
[A


[A[A[A

[A[A

[A[A
[A


[A[A[A

[A[A


[A[A[A
[A

[A[A


[A[A[A
[A
[A

[A[A


[A[A[A
[A


[A[A[A

[A[A

[A[A
[A


[A[A[A


[A[A[A

[A[A


[A[A[A
[A


[A[A[A

[A[A


[A[A[A
[A


[A[A[A
[A

[A[A


[A[A[A

[A[A
[A

[A[A

[A[A


[A[A[A

[A[A
[A
[A

[A[A


[A[A[A

[A[A
[A

[A[A


[A[A[A


[A[A[A

[A[A
[A
[A


[A[A[A

[A[A

[A[A
[A


[A[A[A

[A[A
[A


[A[A[A
[A


[A[A[A

[A[A


[A[A[A
[A

[A[A


[A[A[A
[A

[A[A


[A[A[A

[A[A
[A


[A[A[A



In [None]:

# Sample prompts.
instructions = [
    "Describe India",
    "How did USA win her freedom?",
    "Give me a short summary for harry potter",
    "Explain the game of cricket to me",
    "Who was Emperor Norton I, and what was his significance in San Francisco's history?",
    "What is the Voynich manuscript, and why has it perplexed scholars for centuries?",
    "What was Project A119 and what were its objectives?",
    "What is the 'Dyatlov Pass incident' and why does it remain a mystery?",
    "What is the 'Emu War' that took place in Australia in the 1930s?",
    "What is the 'Phantom Time Hypothesis' proposed by Heribert Illig?",
    "Who was the 'Green Children of Woolpit' as per 12th-century English legend?",
    "What are 'zombie stars' in the context of astronomy?",
    "Who were the 'Dog-Headed Saint' and the 'Lion-Faced Saint' in medieval Christian traditions?",
    "What is the story of the 'Globsters', unidentified organic masses washed up on the shores?",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.75,
            top_p=1,
            max_tokens=8000,
            presence_penalty=1.15,)

# Create an LLM+vLLM instance.
llm = LLM(model=MODEL_DIR, dtype=torch.float16)




In [None]:
prompts = [instruction for instruction in instructions]

In [None]:
%%time
outputs = llm.generate(prompts, sampling_params)

In [None]:
# Count the total number of words generated
num_of_words = 0
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    num_of_words = num_of_words + len(generated_text.split("Generated text:")[0].split(" "))

In [None]:
total_time_taken_for_geneeration = 5
print("number of words/tokens generated by vLLM: ", num_of_words)
print("Thoroughput with vLLM: ", num_of_words / total_time_taken_for_geneeration)

Now lets compute latency which is tokens per second generated for one query for a user.

In [None]:
# Generate response for only one random prompt and calculate tokens per second. This will only use KV caching for optimization
%%time
output = llm.generate(prompts[5], sampling_params)

In [None]:
num_of_words = 0
for output in output:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    num_of_words = num_of_words + len(generated_text.split("Generated text:")[0].split(" "))

In [None]:
time_taken_for_a_query = 0.976
print("latency for a prompt: ", num_of_words / time_taken_for_a_query)

# Now we benchmark mistral-7B without vLLM

In [None]:
!pip install transformers trl accelerate torch bitsandbytes peft datasets -qU

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    device_map='auto',
    use_cache=False
)

model = model.to(dtype=torch.float16, device='cuda')

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
def generate_response(prompt, model):
  encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
  model_inputs = encoded_input.to('cuda')

  generated_ids = model.generate(**model_inputs, max_new_tokens=8000, do_sample=True, pad_token_id=tokenizer.eos_token_id)

  decoded_output = tokenizer.batch_decode(generated_ids)

  return decoded_output[0].replace(prompt, "")

Lets quickly see how much time goes into processing one prompt

In [None]:
%%time
# Count the total number of words generated
num_of_words = 0
for instruction in instructions:
  output = generate_response(instruction, model)
  num_of_words = num_of_words + len(output.split(" "))

In [None]:
total_time_taken_for_generation = 115
print("number of words/tokens generated by model: ", num_of_words)
print("Thoroughput without vLLM: ", num_of_words / total_time_taken_for_generation)

Now lets compute latency for a random query without vLLM

In [None]:
%%time
output = generate_response(instructions[5], model)

In [None]:
num_of_token = len(output.split(" "))
time_taken_to_process_a_query = 14
latency = num_of_token / time_taken_to_process_a_query

In [None]:
print("number of tokens generated for the query:", num_of_token)
print("latency without vLLM: ", latency)