<a href="https://colab.research.google.com/github/AbhiK57/LLM-Benchmarking/blob/main/SGLang_Bench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the benchmark dependencies: model/tokenizer libraries, NVML bindings for GPU telemetry, and pandas for the results table.
# NOTE(review): both "pynvml" and "nvidia-ml-py3" are requested; presumably they provide overlapping NVML bindings — confirm only one is needed.
!pip -q install "transformers>=4.44.2" "tokenizers>=0.19.1" "pynvml" "pandas" "torch" "gpustat" "nvidia-ml-py3"
# Install (or upgrade to) SGLang >= 0.3.1 with the cuda12x extra.
!pip install --upgrade "sglang[cuda12x]>=0.3.1"



In [2]:
import sglang as sgl
import asyncio
from google.colab import userdata
import os
import torch
from pynvml import *
import time
import threading
import pandas as pd
from transformers import AutoTokenizer
from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex, nvmlDeviceGetPowerUsage, nvmlDeviceGetMemoryInfo
from google.colab import files

In [3]:
# Model under test; the same identifier is used for the weights and the tokenizer.
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

# Read the Hugging Face token from the Colab secret store and export it via
# the process environment before the runtime is created.
hf_token = userdata.get('HF_TOKEN')
os.environ.update({"HUGGING_FACE_HUB_TOKEN": hf_token})

# Start an SGLang runtime (bfloat16, tensor-parallel size 1) and make it the
# default backend for every @sgl.function defined in this notebook.
backend = sgl.Runtime(
    model_path=MODEL_ID,
    tokenizer_path=MODEL_ID,
    dtype=torch.bfloat16,
    tp_size=1,
)
sgl.set_default_backend(backend)

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

`torch_dtype` is deprecated! Use `dtype` instead!


In [4]:
@sgl.function
def ask_question(s, topic):
    """SGLang program: a single user turn asking for a fun fact about ``topic``.

    The assistant turn is generated with a cap of 256 tokens and captured in
    the program state under the key ``"answer"``.
    """
    question = f"Tell me a fun fact about {topic}."
    s += sgl.user(question)
    reply = sgl.gen("answer", max_tokens=256)
    s += sgl.assistant(reply)

In [5]:
# Smoke test: run the program once against the default backend and print the
# text captured under the "answer" key.
state = ask_question(topic="the planet Jupiter")
response = state["answer"]
print(response)

Here's a fun fact about Jupiter:

Jupiter is the largest planet in our solar system by volume! That's even bigger than the rock that the planet is made of – ice and hydrogen! In fact, it's so massive that it is more than 1,300 times the mass of Earth. To put it in perspective, the diameter of Jupiter is about 11 times the diameter of the Earth. That's a golf ball-sized planet!


In [16]:
# Initialise NVML once and keep a handle to GPU index 0; the telemetry helpers
# defined in this cell read power and memory through this handle.
nvmlInit()
gpu_handle = nvmlDeviceGetHandleByIndex(0)

def read_gpu_power_watts():
    """Return the current power draw of ``gpu_handle`` in watts.

    The raw NVML reading is divided by 1000 (NVML documents the value as
    milliwatts).
    """
    milliwatts = nvmlDeviceGetPowerUsage(gpu_handle)
    return milliwatts / 1000.0

def read_gpu_mem():
    """Return a snapshot of memory usage for ``gpu_handle``.

    Returns:
        dict with ``used`` and ``total`` exactly as reported by NVML, plus
        ``util``, the fraction ``used / total``.
    """
    mem = nvmlDeviceGetMemoryInfo(gpu_handle)
    used, total = mem.used, mem.total
    return {"used": used, "total": total, "util": used / total}

class PowerSampler:
    """Background sampler that records GPU power readings at a fixed interval.

    Typical use: call ``start()`` before the workload, ``stop()`` after it,
    then ``stats()`` for the average and peak of the collected readings.

    Args:
        interval: Seconds to wait between consecutive readings.
        reader: Optional zero-argument callable returning the current power
            in watts. When omitted, the module-level ``read_gpu_power_watts``
            is used (looked up each time a reading is taken).

    Attributes:
        samples: List of ``(time.time() timestamp, watts)`` tuples collected
            since the last ``start()``.
        read_errors: Number of readings that raised and were skipped since
            the last ``start()``.
    """

    def __init__(self, interval=0.2, reader=None):
        self.interval = interval
        self.samples = []
        self.read_errors = 0
        self._reader = reader
        self._stop = threading.Event()
        self._t = None

    def _read_power(self):
        """Take one reading from the injected reader or the default NVML helper."""
        if self._reader is not None:
            return self._reader()
        return read_gpu_power_watts()

    def _run(self):
        """Thread body: sample, then wait for the interval or the stop signal."""
        while True:
            try:
                self.samples.append((time.time(), self._read_power()))
            except Exception:
                # Sampling is best effort: a failed reading must not kill the
                # benchmark, but it is counted instead of vanishing silently.
                self.read_errors += 1
            # Sampling before checking the flag guarantees one attempt per
            # run; Event.wait returns as soon as stop() is called, so shutdown
            # is not delayed by a full interval.
            if self._stop.wait(self.interval):
                break

    def start(self):
        """Discard previous samples and begin sampling in a daemon thread."""
        if self._t is not None and self._t.is_alive():
            # A second start() must not leave the earlier thread running.
            self.stop()
        self.samples.clear()
        self.read_errors = 0
        self._stop.clear()
        self._t = threading.Thread(target=self._run, daemon=True)
        self._t.start()
        time.sleep(0.05)  # tiny warmup to likely capture at least one sample

    def stop(self):
        """Signal the sampling thread to finish and wait for it.

        Safe to call when the sampler was never started or is already stopped.
        """
        self._stop.set()
        if self._t is not None:
            self._t.join()
            self._t = None

    def stats(self):
        """Return ``dict(avg_power=..., peak_power=...)`` in watts.

        Both values are ``None`` when no reading has been collected.
        """
        vals = [p for _, p in list(self.samples)]
        if not vals:
            return dict(avg_power=None, peak_power=None)
        return dict(avg_power=sum(vals) / len(vals), peak_power=max(vals))


In [17]:
# Benchmark grid: the run loop pairs every prompt size with every batch size.
PROMPT_SIZES = [10, 25, 50]                 # target prompt length in tokenizer tokens (passed to make_prompt)
BATCH_SIZES  = [1, 10, 25]                   # number of prompts submitted together in one timed run
FIXED_OUTPUT =  128  # cap on generated tokens per request (used as max_tokens in sgl.gen)

In [18]:
# Tokenizer for MODEL_ID; make_prompt uses it to count and truncate prompt tokens.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

In [19]:
# @title Helper Functions
def make_prompt(n_tokens: int, seed="You are a helpful assistant.", tok=None):
    """Build a prompt that is ``n_tokens`` tokens long.

    Starting from ``seed``, a fixed filler sentence is appended until the
    text encodes to at least ``n_tokens`` tokens (without special tokens);
    the token list is then cut to ``n_tokens`` and decoded back to text.

    Args:
        n_tokens: Target number of tokens; must be >= 0 (0 decodes an empty
            token list).
        seed: Text the prompt starts with.
        tok: Optional tokenizer exposing ``encode(text, add_special_tokens=...)``
            and ``decode(ids)``. Defaults to the notebook-level ``tokenizer``.

    Returns:
        The decoded prompt text.

    Raises:
        ValueError: If ``n_tokens`` is negative. A negative value used to
            slice from the end of the token list and silently return an
            arbitrary-length prompt.

    NOTE(review): the decoded text may not re-encode to exactly ``n_tokens``
    tokens, and any tokens added when the prompt is wrapped in a chat turn are
    not counted here — confirm if exact input lengths matter.
    """
    if n_tokens < 0:
        raise ValueError(f"n_tokens must be non-negative, got {n_tokens}")
    if tok is None:
        tok = tokenizer

    filler = " The quick brown fox jumps over the lazy dog."
    text = seed
    ids = tok.encode(text, add_special_tokens=False)
    while len(ids) < n_tokens:
        text += filler
        ids = tok.encode(text, add_special_tokens=False)
    # `ids` already holds the encoding of the final text, so it is truncated
    # directly instead of encoding the text one more time.
    return tok.decode(ids[:n_tokens])

In [20]:
@sgl.function
def generate_text(s, prompt):
    """SGLang program: send ``prompt`` as the user turn and generate a reply.

    The reply is generated with temperature 0.0, capped at ``FIXED_OUTPUT``
    tokens, and captured in the program state under the key ``"output"``.
    """
    s += sgl.user(prompt)
    completion = sgl.gen("output", max_tokens=FIXED_OUTPUT, temperature=0.0)
    s += sgl.assistant(completion)

In [21]:
# @title Benchmark Runner Logic
def run_sglang_batch(prompts, max_new_tokens=128):
    """
    Run one batch of prompts through SGLang and measure time, power and memory.

    Args:
        prompts: Non-empty sequence of prompt strings; one generation request
            is issued per prompt.
        max_new_tokens: Cap on generated tokens per request. This value is
            now actually applied; previously it was accepted but the
            generation always used the notebook-level constant.

    Returns:
        dict with the keys
          latency_s     total wall-clock time divided by the number of prompts
                        (amortised time per prompt, not per-request latency)
          total_s       wall-clock seconds for the whole batch
          qps           prompts completed per second
          avg_power_w   average of the power samples (None if none collected)
          peak_power_w  maximum of the power samples (None if none collected)
          mem_before    ``used`` from read_gpu_mem() before the batch
          mem_after     ``used`` from read_gpu_mem() after the batch

    Raises:
        ValueError: If ``prompts`` is empty (this used to surface as a
            ZeroDivisionError after the sampler thread had been started).
    """
    if len(prompts) == 0:
        raise ValueError("prompts must contain at least one prompt")

    # Defined per call so the token cap follows `max_new_tokens` instead of a
    # module-level constant; otherwise identical to the notebook's
    # generate_text program (temperature 0.0, result stored under "output").
    @sgl.function
    def _generate(s, prompt):
        s += sgl.user(prompt)
        s += sgl.assistant(sgl.gen("output", max_tokens=max_new_tokens, temperature=0.0))

    ps = PowerSampler()
    ps.start()
    try:
        mem_before = read_gpu_mem()
        # perf_counter is monotonic, so the interval cannot be distorted by
        # wall-clock adjustments.
        t0 = time.perf_counter()

        # --- SGLang Batch Inference ---
        # Issue every request first, then read each "output"; the original
        # notes describe the calls as asynchronous, so reading the variable
        # is what waits for each generation to finish.
        states = [_generate(prompt=p) for p in prompts]
        for st in states:
            st["output"]

        total = time.perf_counter() - t0
    finally:
        # Always stop the sampler thread, even when generation raises.
        ps.stop()

    mem_after = read_gpu_mem()
    power = ps.stats()

    return dict(
        latency_s=total / len(prompts),
        total_s=total,
        qps=len(prompts) / total,
        avg_power_w=power["avg_power"],
        peak_power_w=power["peak_power"],
        mem_before=mem_before["used"],
        mem_after=mem_after["used"]
    )

In [22]:
rows = []
backend_name = "sglang"

print(f"--- Running benchmarks for {backend_name.upper()} ---")

for n in PROMPT_SIZES:
    # make_prompt(n) returns the same text on every call, so build it once per
    # prompt size instead of re-tokenizing it for every slot of every batch.
    base_prompt = make_prompt(n)
    for b in BATCH_SIZES:
        # Every prompt in the batch is the same text.
        # NOTE(review): identical prompts may let the server reuse cached
        # prefixes — confirm this is the intended workload.
        prompts = [base_prompt] * b
        print(f"  → prompt={n} tokens | batch={b}")
        try:
            result = run_sglang_batch(prompts, FIXED_OUTPUT)
        except Exception as e:
            # Keep going with the remaining configurations; failed ones are
            # reported here and simply absent from the results table.
            print(f"Error for {backend_name} n={n} b={b}: {e}")
            continue
        # output_tokens records the requested cap, not the number of tokens
        # that were actually generated.
        result.update(backend=backend_name, prompt_tokens=n, batch_size=b, output_tokens=FIXED_OUTPUT)
        rows.append(result)

# @title Save and Display Results
df = pd.DataFrame(rows)
output_filename = "/content/sglang_results.csv"
df.to_csv(output_filename, index=False)
print(f"\n Benchmark complete! Saved to {output_filename}")

--- Running benchmarks for SGLANG ---
  → prompt=10 tokens | batch=1
  → prompt=10 tokens | batch=10
  → prompt=10 tokens | batch=25
  → prompt=25 tokens | batch=1
  → prompt=25 tokens | batch=10
  → prompt=25 tokens | batch=25
  → prompt=50 tokens | batch=1
  → prompt=50 tokens | batch=10
  → prompt=50 tokens | batch=25

 Benchmark complete! Saved to /content/sglang_results.csv


In [23]:
# Render the collected benchmark rows as a rich table.
display(df)

Unnamed: 0,latency_s,total_s,qps,avg_power_w,peak_power_w,mem_before,mem_after,backend,prompt_tokens,batch_size,output_tokens
0,0.233016,0.233016,4.291559,94.9875,141.375,39070859264,39070859264,sglang,10,1,128
1,0.026124,0.261242,38.278732,131.5015,199.557,39070859264,39070859264,sglang,10,10,128
2,0.011972,0.299308,83.526116,132.963,201.018,39070859264,39070859264,sglang,10,25,128
3,0.256871,0.256871,3.893001,157.6895,209.998,39070859264,39070859264,sglang,25,1,128
4,0.044908,0.44908,22.26776,156.713,201.018,39070859264,39070859264,sglang,25,10,128
5,0.013619,0.340478,73.426195,120.594,174.548,39070859264,39070859264,sglang,25,25,128
6,0.043957,0.043957,22.749755,211.458,211.458,39070859264,39070859264,sglang,50,1,128
7,0.006961,0.069609,143.660227,79.754,79.754,39070859264,39070859264,sglang,50,10,128
8,0.004084,0.102107,244.842224,64.908,64.908,39070859264,39070859264,sglang,50,25,128
