In [1]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
import requests
import json
import time
from datetime import datetime

def stream_from_vllm_server(prompt, max_chunks=100, temperature=0.7, server_url="http://localhost:8000"):
    """
    Stream response from vLLM server
    
    Args:
        prompt (str): Input prompt text
        max_chunks (int): Maximum chunks to generate
        temperature (float): Temperature for sampling
        server_url (str): vLLM server URL
    
    Returns:
        Generator yielding response chunks and usage info
    """
    payload = {
        "prompt": prompt,
        "max_tokens": 200,  # Keep reasonable max_tokens for the server
        "temperature": temperature,
        "stream": True
    }
    
    headers = {
        "Content-Type": "application/json"
    }
    
    try:
        response = requests.post(
            f"{server_url}/v1/completions",
            json=payload,
            headers=headers,
            stream=True
        )
        response.raise_for_status()
        
        first_token_received = False
        ttft = None
        generation_start_time = None
        chunk_count = 0
        usage_info = None
        
        for line in response.iter_lines():
            if line:
                line = line.decode('utf-8')
                if line.startswith('data: '):
                    data = line[6:]  # Remove 'data: ' prefix
                    if data.strip() == '[DONE]':
                        break
                    try:
                        chunk = json.loads(data)
                        if 'choices' in chunk and len(chunk['choices']) > 0:
                            delta = chunk['choices'][0].get('text', '')
                            if delta and not first_token_received:
                                generation_start_time = time.time()
                                ttft = 0  # TTFT is 0 since we start counting from first token
                                first_token_received = True
                            if delta:
                                chunk_count += 1
                                if chunk_count >= max_chunks:
                                    yield delta, ttft, usage_info, generation_start_time
                                    break
                                yield delta, ttft, usage_info, generation_start_time
                        
                        # Check for usage information in the chunk
                        if 'usage' in chunk:
                            usage_info = chunk['usage']
                            
                    except json.JSONDecodeError:
                        continue
        
        # Return final usage info
        return usage_info
                        
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to vLLM server: {e}")
        return None

# Example usage
with open('tests/book.txt') as f:
    full_text = f.read()

# Tokenize and cut the prompt to specific number of tokens
target_tokens = 15000  # Adjust this number as needed
tokens = tokenizer.encode(full_text)
truncated_tokens = tokens[:target_tokens]
prompt = tokenizer.decode(truncated_tokens)

print(f"Original text tokens: {len(tokens)}")
print(f"Truncated to: {len(truncated_tokens)} tokens")
print("Streaming response from vLLM server:")
print("-" * 50)

full_response = ""
ttft = None
final_usage = None
generation_start_time = None

stream_generator = stream_from_vllm_server(prompt, max_chunks=200)

for chunk_data in stream_generator:
    if isinstance(chunk_data, tuple):
        if len(chunk_data) == 4:
            chunk, chunk_ttft, usage, start_time = chunk_data
            if generation_start_time is None and start_time is not None:
                generation_start_time = start_time
        else:
            chunk, chunk_ttft, usage = chunk_data
        if ttft is None and chunk_ttft is not None:
            ttft = chunk_ttft
        if usage is not None:
            final_usage = usage
    else:
        chunk = chunk_data
    
    # print(chunk, end='', flush=True)
    full_response += chunk

# Get final usage info if streaming generator returns it
if hasattr(stream_generator, 'gi_frame') and stream_generator.gi_frame is None:
    try:
        final_usage = stream_generator.send(None)
    except StopIteration as e:
        if hasattr(e, 'value'):
            final_usage = e.value

end_time = time.time()

# Calculate total time only from when generation started
if generation_start_time is not None:
    total_time = end_time - generation_start_time
else:
    total_time = 0
    print("Warning: Could not determine generation start time")

print(f"\n\n--- Performance Metrics ---")
if ttft is not None:
    print(f"TTFT (Time to First Token): {ttft:.2f} seconds")
else:
    print("TTFT: Could not measure")

if final_usage is not None:
    print(f"Input tokens (from API): {final_usage.get('prompt_tokens', 'N/A')}")
    print(f"Output tokens (from API): {final_usage.get('completion_tokens', 'N/A')}")
    print(f"Total tokens (from API): {final_usage.get('total_tokens', 'N/A')}")
    
    completion_tokens = final_usage.get('completion_tokens', 0)
    if total_time > 0 and completion_tokens > 0:
        tokens_per_second = completion_tokens / total_time
        print(f"Tokens/sec: {tokens_per_second:.2f}")
    else:
        print("Tokens/sec: Could not calculate")
else:
    # Fallback to tokenizer count if usage info not available
    final_token_count = len(tokenizer.encode(full_response))
    print(f"Total response tokens (tokenizer): {final_token_count}")
    
    if total_time > 0 and final_token_count > 0:
        tokens_per_second = final_token_count / total_time
        print(f"Tokens/sec: {tokens_per_second:.2f}")
    else:
        print("Tokens/sec: Could not calculate")

print(f"Total generation time: {total_time:.2f} seconds")
print(f"Total response length: {len(full_response)} characters")
if generation_start_time is not None:
    print(f"Generation start time: {datetime.fromtimestamp(generation_start_time).strftime('%Y-%m-%d %H:%M:%S')}")
print(f"End time: {datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Tokens generated: {final_usage.get('completion_tokens', 'N/A') if final_usage else final_token_count}")
print("Generation speed formula: Tokens/sec = Total tokens / Total generation time")


Original text tokens: 181863
Truncated to: 15000 tokens
Streaming response from vLLM server:
--------------------------------------------------


--- Performance Metrics ---
TTFT (Time to First Token): 0.00 seconds
Total response tokens (tokenizer): 200
Tokens/sec: 26.58
Total generation time: 7.52 seconds
Total response length: 469 characters
Generation start time: 2025-09-22 02:56:40
End time: 2025-09-22 02:56:47
Tokens generated: 200
Generation speed formula: Tokens/sec = Total tokens / Total generation time
