In [None]:
# Shell command (not Python) that starts the vLLM OpenAI-compatible API server the cells below talk to:
# model zhiqing/Qwen3-14B-INT8 exposed under the name "qwen3" on port 8000, tensor-parallel over GPUs 0 and 1,
# 28000-token max context, at most 10 concurrent sequences, prefix caching disabled.
# NOTE(review): presumably meant to be run in a terminal before executing the notebook — confirm.
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server   --model zhiqing/Qwen3-14B-INT8   --served-model-name qwen3   --tensor-parallel-size 2   --max-model-len 28000   --port 8000 --max-num-seqs 10 --gpu-memory-utilization 0.9 --no-enable-prefix-caching

In [6]:
import requests
import time
import json
from transformers import AutoTokenizer

# Load the Qwen3 tokenizer. It is used client-side only: to truncate prompts to a
# token budget and to estimate token counts when the server reports no usage.
# NOTE(review): this is loaded from the AWQ repo while the server command serves
# zhiqing/Qwen3-14B-INT8 — presumably both share the same Qwen3 vocabulary; confirm.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-14B-AWQ")

In [7]:
import requests
# Ask the local vLLM server which models it is serving (also a quick check that it is up).
response = requests.get("http://localhost:8000/v1/models")
# Cell output: the 'root' field of the first listed model (the underlying model path).
response.json()['data'][0]['root']

'zhiqing/Qwen3-14B-INT8'

In [8]:
import os
import glob

def measure_vllm_response(file_path, vllm_url="http://localhost:8000/v1/chat/completions",
                         max_input_tokens=None, max_output_tokens=1024,
                         request_timeout=(10, 600)):
    """
    Send file content to vLLM chat endpoint and measure response metrics with streaming

    The file is sent as a single user message (thinking disabled), the reply is
    streamed to stdout as it arrives, and timing/token metrics are collected.

    Args:
        file_path: Path to the input file (read as UTF-8)
        vllm_url: vLLM endpoint URL
        max_input_tokens: Maximum tokens for input (for truncation), None for no limit
        max_output_tokens: Maximum tokens for output response
        request_timeout: ``timeout`` passed to ``requests.post``; a
            ``(connect, read)`` tuple in seconds, or None to wait forever.
            The read timeout bounds the gap between received chunks, so it
            must exceed the longest expected prefill time.

    Returns:
        ``(full_response, metrics)`` where ``full_response`` is the concatenated
        streamed text and ``metrics`` is a dict with the keys ``ttft``,
        ``gen_tokens_per_sec``, ``total_tokens``, ``total_time``,
        ``input_tokens``, ``output_tokens`` and ``generation_time``.

    Raises:
        RuntimeError: if the server answers with a non-200 status.
        requests.RequestException: on connection errors or timeouts.
    """
    # Read the file
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Tokenize once; re-tokenize only if the content is actually truncated.
    tokens = tokenizer.encode(content)
    if max_input_tokens is not None and len(tokens) > max_input_tokens:
        content = tokenizer.decode(tokens[:max_input_tokens])
        print(f"Content truncated from {len(tokens)} to {max_input_tokens} tokens")
        # decode -> encode is not guaranteed to round-trip, so recount.
        tokens = tokenizer.encode(content)

    # Client-side estimate; replaced by the server-reported count when available.
    input_tokens = len(tokens)

    # Prepare chat request with streaming
    payload = {
        "model": "qwen3",
        "messages": [
            {"role": "user", "content": content}
        ],
        "max_tokens": max_output_tokens,
        "temperature": 0.7,
        "stream": True,
        "stream_options": {"include_usage": True},  # Request usage info in stream
        "chat_template_kwargs": {
            "enable_thinking": False
        }
    }

    headers = {
        "Content-Type": "application/json"
    }

    full_response = ""
    output_tokens = 0
    actual_input_tokens = None
    actual_output_tokens = None
    ttft = None

    # perf_counter is monotonic, so intervals are immune to wall-clock adjustments.
    start_time = time.perf_counter()

    # The context manager guarantees the streamed connection is released even
    # if parsing fails or the stream is abandoned at [DONE].
    with requests.post(vllm_url, json=payload, headers=headers, stream=True,
                       timeout=request_timeout) as response:
        if response.status_code != 200:
            raise RuntimeError(f"Request failed with status {response.status_code}: {response.text}")

        print("Response streaming:")
        print("-" * 50)

        for raw_line in response.iter_lines():
            if not raw_line:
                continue  # blank separator between server-sent events

            line = raw_line.decode('utf-8')
            if not line.startswith('data:'):
                continue

            data_str = line[5:].strip()  # Remove 'data:' prefix and padding
            if data_str == '[DONE]':
                break

            try:
                data = json.loads(data_str)
            except json.JSONDecodeError:
                continue

            # Usage arrives in the final event; other chunks may omit the field
            # or carry an explicit null, so guard against both.
            usage = data.get('usage')
            if usage:
                actual_input_tokens = usage.get('prompt_tokens')
                actual_output_tokens = usage.get('completion_tokens')
                print(f"\nUsage info received: {actual_input_tokens} input tokens, {actual_output_tokens} output tokens")

            choices = data.get('choices') or []
            if not choices:
                continue

            delta = choices[0].get('delta') or {}
            content_chunk = delta.get('content')
            if not content_chunk:
                # Role-only / empty / null deltas carry no generated text and
                # must not be counted as the first token.
                continue

            # Record TTFT (time to first token) on the first real text chunk
            if ttft is None:
                ttft = time.perf_counter() - start_time

            full_response += content_chunk

            # Stream to console
            print(content_chunk, end='', flush=True)

        total_time = time.perf_counter() - start_time

    print()  # New line after streaming
    print("-" * 50)

    # If no TTFT was recorded (no tokens received), set it to total time
    received_text = ttft is not None
    if not received_text:
        ttft = total_time

    # Use actual token counts from server if available, otherwise fall back to tokenizer estimate
    if actual_input_tokens is not None and actual_output_tokens is not None:
        input_tokens = actual_input_tokens
        output_tokens = actual_output_tokens
        print(f"Using server-reported token counts: {input_tokens} input, {output_tokens} output")
    else:
        # Fallback: estimate output tokens using tokenizer
        output_tokens = len(tokenizer.encode(full_response))
        print(f"Using tokenizer estimates: {input_tokens} input, {output_tokens} output")

    # Generation speed over the post-TTFT window. Without any received text
    # there is no such window, so report 0 instead of dividing by the floor.
    generation_time = max(total_time - ttft, 1e-9)
    gen_tokens_per_sec = output_tokens / generation_time if received_text else 0.0

    # Total tokens
    total_tokens = input_tokens + output_tokens

    # Metrics dictionary
    metrics = {
        'ttft': ttft,
        'gen_tokens_per_sec': gen_tokens_per_sec,
        'total_tokens': total_tokens,
        'total_time': total_time,
        'input_tokens': input_tokens,
        'output_tokens': output_tokens,
        'generation_time': generation_time
    }

    return full_response, metrics

def save_and_print_metrics(response, metrics, output_file):
    """
    Report one request's metrics on stdout and persist them to ``output_file``.

    The file receives the metrics as ``key: value`` lines, a blank line, and
    then the raw response text. ``metrics`` must provide the keys ``ttft``,
    ``gen_tokens_per_sec``, ``total_tokens``, ``input_tokens``,
    ``output_tokens``, ``total_time`` and ``generation_time``.
    """
    # (template, metrics key) pairs in console order.
    console_layout = (
        ("TTFT: {:.2f} seconds", 'ttft'),
        ("Gen tokens/sec (post-TTFT): {:.2f}", 'gen_tokens_per_sec'),
        ("Total tokens: {}", 'total_tokens'),
        ("Input tokens: {}", 'input_tokens'),
        ("Output tokens: {}", 'output_tokens'),
        ("Total time: {:.2f} seconds", 'total_time'),
        ("Generation time: {:.2f} seconds", 'generation_time'),
    )
    for template, key in console_layout:
        print(template.format(metrics[key]))

    # (template, metrics key) pairs in file order; the last entry also emits
    # the blank line that separates the metrics header from the response.
    file_layout = (
        ("ttft: {:.2f}\n", 'ttft'),
        ("gen_tokens_per_second: {:.2f}\n", 'gen_tokens_per_sec'),
        ("total_tokens: {}\n", 'total_tokens'),
        ("total_time: {:.2f}\n", 'total_time'),
        ("input_tokens: {}\n", 'input_tokens'),
        ("output_tokens: {}\n", 'output_tokens'),
        ("generation_time: {:.2f}\n\n", 'generation_time'),
    )
    with open(output_file, 'w', encoding='utf-8') as sink:
        for template, key in file_layout:
            sink.write(template.format(metrics[key]))
        sink.write(response)

# file_path = "tests/daily.txt"
# vllm_url = "http://localhost:8000/v1/chat/completions"
# max_input_tokens =  28000
# max_output_tokens = 1024

# measure_vllm_response(file_path, vllm_url, max_input_tokens, max_output_tokens)

In [5]:
# # Create output directory if it doesn't exist
# # Get model name from running vLLM server
# try:
#     import requests
#     response = requests.get("http://localhost:8000/v1/models")
#     if response.status_code == 200:
#         models_data = response.json()
#         if models_data.get('data') and len(models_data['data']) > 0:
#             model_name = models_data['data'][0]['root'].split('/')[-1]
#         else:
#             model_name = "unknown_model"
#     else:
#         model_name = "unknown_model"
# except Exception as e:
#     print(f"Could not get model name from server: {e}")
#     model_name = "unknown_model"

# output_dir = f"summary_results/T2_x2/{model_name}"
# os.makedirs(output_dir, exist_ok=True)

# # Find all .txt files in tests folder
# txt_files = glob.glob("tests/*.txt")

# print(f"Found {len(txt_files)} .txt files in tests folder:")
# for file in txt_files:
#     print(f"  - {file}")

# print("\nStarting measurements...\n")

# # Process each .txt file
# for file_path in txt_files:
#     # Get base filename without extension
#     base_name = os.path.splitext(os.path.basename(file_path))[0]
#     output_file = os.path.join(output_dir, f"{base_name}_result.txt")
    
#     print(f"=== Processing {file_path} ===")
    
#     try:
#         full_response, metrics = measure_vllm_response(
#             file_path=file_path,
#             max_input_tokens=31000,  # No token limit
#             max_output_tokens=1024
#         )
        
#         save_and_print_metrics(full_response, metrics, output_file)
#         print(f"Results saved to: {output_file}\n")
        
#     except Exception as e:
#         print(f"Error processing {file_path}: {e}\n")
#         continue

# print("All files processed!")


In [6]:
# file_path = "tests/daily.txt"
# input_token_sizes = [25000]

# print("Testing different input token sizes:")
# for token_size in input_token_sizes:
#     print(f"\n=== Testing with {token_size} input tokens ===")
#     output_file = f"speed_tests/A2/{token_size}_length_1_parallel.txt"
    
#     response, metrics = measure_vllm_response(
#         file_path=file_path,
#         max_input_tokens=token_size,
#         max_output_tokens=100
#     )

In [None]:
# Warmup request to prepare the model: one tiny request (prompt truncated to
# 100 tokens, at most 10 generated tokens) sent before any timed run.
print("Warming up the model with a test request...")
warmup_response, warmup_metrics = measure_vllm_response(
    file_path="tests/daily.txt",
    max_input_tokens=100,
    max_output_tokens=10
)
print(f"Warmup completed in {warmup_metrics['total_time']:.2f}s")
print("Model is ready for testing.\n")

# Input document for the timed runs; the sweep below passes each size as the
# truncation limit for this file.
file_path = "tests/book.txt"
# Full sweep of prompt sizes (in tokens)...
input_token_sizes = [1000, 5000, 10000, 15000 , 20000, 25000, 30000]
# ...immediately overridden: as written, only the 25000-token case is run.
input_token_sizes = [25000]

import threading
import time
import os

def run_concurrent_test(file_path, max_input_tokens, max_output_tokens, request_id):
    """
    Execute one measured request as part of a concurrent batch.

    Delegates to ``measure_vllm_response`` and logs start, completion and any
    failure, tagged with ``request_id``. A request whose post-TTFT generation
    phase exceeds the threshold is flagged in the log but still reported.

    Returns:
        The metrics dict of the request, or None if it raised an exception.
    """
    print(f"Starting request {request_id}")
    started_at = time.time()

    try:
        _, request_metrics = measure_vllm_response(
            file_path=file_path,
            max_input_tokens=max_input_tokens,
            max_output_tokens=max_output_tokens,
        )

        # Time spent generating after the first token arrived (prefill excluded).
        decode_seconds = request_metrics['total_time'] - request_metrics['ttft']
        slow_threshold = 7
        if decode_seconds > slow_threshold:
            print(f"Request {request_id} generation took {decode_seconds:.2f}s (>{slow_threshold}s), but recording metrics...")

        finished_at = time.time()
        print(f"Request {request_id} completed in {finished_at - started_at:.2f}s")
    except Exception as e:
        # Best effort: a failed request is reported and excluded by the caller.
        print(f"Request {request_id} failed: {e}")
        return None

    return request_metrics

# Concurrency sweep: for every prompt size and every concurrency level, fire
# that many identical requests at once, wait for all of them, then write the
# averaged and per-request metrics to one result file per combination.
print("Testing different input token sizes with various concurrent requests:")

concurrent_counts = [5]

# Output-token cap applied to every request in the sweep.
concurrent_max_output_tokens = 100


def _run_request_into_slot(results, slot, file_path, token_size, concurrent_count, max_output_tokens):
    """Run one request and store its metrics (None on failure) at results[slot]."""
    request_id = f"{token_size}_{concurrent_count}_{slot + 1}"
    results[slot] = run_concurrent_test(file_path, token_size, max_output_tokens, request_id)


# Resolve the served model name once: it only names the output directory and
# does not change between test sets, so there is no need to query it per set.
try:
    import requests
    response = requests.get("http://localhost:8000/v1/models", timeout=10)
    models_data = response.json()
    model_name = models_data['data'][0]['root'].split('/')[-1] if models_data['data'] else "unknown_model"
except Exception as e:
    print(f"Failed to get model name from API: {e}")
    model_name = "Qwen3-unkown"  # fallback

output_dir = f"speed_tests/A2_x2/{model_name}"
os.makedirs(output_dir, exist_ok=True)

for token_size in input_token_sizes:
    print("\n" + "="*80)
    print(f"Testing with {token_size} input tokens")
    print("="*80)

    for concurrent_count in concurrent_counts:
        print(f"\n=== {token_size} tokens with {concurrent_count} concurrent requests ===")
        print("Waiting for all current requests to complete before starting this set...")

        # One pre-allocated slot per request so results stay in request order
        # (not completion order) and a failure leaves a None in its own slot.
        results = [None] * concurrent_count
        threads = []
        start_time = time.time()

        # Start concurrent requests
        for i in range(concurrent_count):
            thread = threading.Thread(
                target=_run_request_into_slot,
                args=(results, i, file_path, token_size, concurrent_count, concurrent_max_output_tokens),
            )
            threads.append(thread)
            thread.start()

        # Wait for ALL threads to complete before proceeding to next set
        print(f"Waiting for all {concurrent_count} threads to complete...")
        for i, thread in enumerate(threads):
            thread.join()
            print(f"Thread {i+1}/{concurrent_count} completed")

        end_time = time.time()
        total_concurrent_time = end_time - start_time

        # Filter out None results (failed requests only)
        valid_results = [r for r in results if r is not None]

        print(f"All {concurrent_count} requests completed in {total_concurrent_time:.2f}s")
        print(f"Valid results: {len(valid_results)}/{concurrent_count}")
        print(f"Average time per request: {total_concurrent_time/concurrent_count:.2f}s")

        output_file = os.path.join(output_dir, f"{token_size}_length_{concurrent_count}_parallel.txt")

        if valid_results:
            # Arithmetic mean of each metric over the successful requests.
            valid_count = len(valid_results)
            avg_metrics = {
                key: sum(r[key] for r in valid_results) / valid_count
                for key in ('ttft', 'gen_tokens_per_sec', 'total_tokens',
                            'total_time', 'input_tokens', 'output_tokens')
            }

            # Print metrics
            print(f"Average TTFT: {avg_metrics['ttft']:.2f} seconds")
            print(f"Average Tokens/sec: {avg_metrics['gen_tokens_per_sec']:.2f}")
            print(f"Average Total tokens: {avg_metrics['total_tokens']:.0f}")
            print(f"Average Total time: {avg_metrics['total_time']:.2f} seconds")

            # Save average metrics to file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(f"ttft: {avg_metrics['ttft']:.2f}\n")
                f.write(f"gen_tokens_per_second: {avg_metrics['gen_tokens_per_sec']:.2f}\n")
                f.write(f"total_tokens: {avg_metrics['total_tokens']:.0f}\n")
                f.write(f"total_time: {avg_metrics['total_time']:.2f}\n")
                f.write(f"input_tokens: {avg_metrics['input_tokens']:.0f}\n")
                f.write(f"output_tokens: {avg_metrics['output_tokens']:.0f}\n")
                f.write(f"concurrent_requests: {concurrent_count}\n")
                f.write(f"valid_requests: {len(valid_results)}\n")
                f.write(f"total_concurrent_time: {total_concurrent_time:.2f}\n\n")
                f.write(f"Average metrics for {len(valid_results)}/{concurrent_count} concurrent requests of {token_size} tokens each\n\n")

                # Write individual request metrics for ALL valid results. The
                # number is the request's launch index, matching the trailing
                # number of its request id in the console log.
                f.write("Individual request metrics:\n")
                for i, metrics in enumerate(results):
                    if metrics is None:
                        continue  # failed request: nothing to report for this slot
                    f.write(f"\nRequest {i+1}:\n")
                    f.write(f"  ttft: {metrics['ttft']:.2f}\n")
                    f.write(f"  gen_tokens_per_second: {metrics['gen_tokens_per_sec']:.2f}\n")
                    f.write(f"  total_tokens: {metrics['total_tokens']}\n")
                    f.write(f"  total_time: {metrics['total_time']:.2f}\n")
                    f.write(f"  input_tokens: {metrics['input_tokens']}\n")
                    f.write(f"  output_tokens: {metrics['output_tokens']}\n")
        else:
            print("No valid results to save")

        # Add a pause between different concurrent count sets for the same token size
        print(f"Set [{token_size} tokens - {concurrent_count} parallel] completed. Proceeding to next set...")
        time.sleep(2)  # Brief pause to ensure system is ready for next set

Warming up the model with a test request...
Content truncated from 13961 to 100 tokens
Response streaming:
--------------------------------------------------
Кажется, вы ввели фрагмент
Usage info received: 111 input tokens, 10 output tokens

--------------------------------------------------
Using server-reported token counts: 111 input, 10 output
Warmup completed in 0.63s
Model is ready for testing.

Testing different input token sizes with various concurrent requests:

Testing with 1000 input tokens

=== 1000 tokens with 1 concurrent requests ===
Waiting for all current requests to complete before starting this set...
Starting request 1000_1_1
Waiting for all 1 threads to complete...
Content truncated from 181863 to 1000 tokens
Response streaming:
--------------------------------------------------
**Огонь и лед**  
**Эрин Хантер**  
**Пролог**

Оранжевые языки пламени плясали в холодном воздухе, бросая в ночное небо снопы ослепительных искр. Отсветы огня пробегали по жесткой траве пу