In [4]:
import json
import os
import time

import requests
from transformers import AutoTokenizer

# Load the Qwen3 tokenizer once at module level. measure_vllm_response uses it
# to truncate prompts to a token budget and to count input/output tokens
# on the client side.
# NOTE(review): assumes this tokenizer matches the model served as "qwen3" — confirm.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-14B")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def measure_vllm_response(file_path, vllm_url="http://localhost:8000/v1/chat/completions", 
                         max_input_tokens=None, max_output_tokens=1024,
                         model="qwen3", timeout=None):
    """
    Send file content to a vLLM chat endpoint and measure streaming response metrics.

    The file is read as UTF-8, optionally truncated to ``max_input_tokens``
    tokens (using the module-level ``tokenizer``), and sent as a single user
    message with streaming enabled. Streamed text is echoed to stdout as it
    arrives.

    Args:
        file_path: Path to the input file
        vllm_url: vLLM endpoint URL
        max_input_tokens: Maximum tokens for input (for truncation), None for no limit
        max_output_tokens: Maximum tokens for output response
        model: Model name placed in the request payload
        timeout: Timeout passed to ``requests.post``; None waits indefinitely

    Returns:
        A ``(full_response, metrics)`` tuple. ``full_response`` is the
        concatenated streamed text. ``metrics`` is a dict with keys:
        ``ttft`` (seconds until the first non-empty content chunk, or the total
        time if none arrived), ``tokens_per_second`` (output tokens divided by
        the total time, so it includes the time to first token), ``total_tokens``,
        ``total_time`` (seconds from just before the request until the stream
        ended), ``input_tokens`` and ``output_tokens`` (estimated by tokenizing
        each streamed chunk).

    Raises:
        RuntimeError: If the endpoint answers with a non-200 status code.
    """
    # Read the file
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Encode once; the same token list serves truncation and counting.
    tokens = tokenizer.encode(content)
    if max_input_tokens is not None and len(tokens) > max_input_tokens:
        original_count = len(tokens)
        content = tokenizer.decode(tokens[:max_input_tokens])
        print(f"Content truncated from {original_count} to {max_input_tokens} tokens")
        # Decode followed by encode is not guaranteed to round-trip, so
        # recount on the text that is actually sent.
        tokens = tokenizer.encode(content)
    input_tokens = len(tokens)

    # Prepare chat request with streaming
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": content}
        ],
        "max_tokens": max_output_tokens,
        "temperature": 0.7,
        "stream": True,
        "chat_template_kwargs": {
            "enable_thinking": False
        }
    }

    headers = {
        "Content-Type": "application/json"
    }

    # perf_counter is monotonic, so durations cannot be skewed by clock adjustments.
    start_time = time.perf_counter()
    ttft = None
    chunks = []

    # The context manager releases the connection even when we stop reading
    # early (on [DONE]) or an error is raised mid-stream.
    with requests.post(vllm_url, json=payload, headers=headers,
                       stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise RuntimeError(f"Request failed with status {response.status_code}: {response.text}")

        print("Response streaming:")
        print("-" * 50)

        for raw_line in response.iter_lines():
            if not raw_line:
                continue
            line = raw_line.decode('utf-8')
            if not line.startswith('data:'):
                continue

            data_str = line[len('data:'):].strip()
            if data_str == '[DONE]':
                break

            try:
                data = json.loads(data_str)
            except json.JSONDecodeError:
                # Skip lines that are not valid JSON events.
                continue
            if not isinstance(data, dict):
                continue

            choices = data.get('choices') or []
            if not choices:
                continue
            delta = choices[0].get('delta') or {}
            content_chunk = delta.get('content')
            # Deltas may carry an empty or null content (e.g. a role-only
            # chunk); those are not generated text and must not set TTFT.
            if not content_chunk:
                continue

            # Record TTFT (time to first token)
            if ttft is None:
                ttft = time.perf_counter() - start_time

            chunks.append(content_chunk)

            # Stream to console token by token
            print(content_chunk, end='', flush=True)

    # Stop the clock before any bookkeeping so tokenization and console
    # output do not count towards the measured latency.
    end_time = time.perf_counter()
    total_time = end_time - start_time

    print()  # New line after streaming
    print("-" * 50)

    full_response = "".join(chunks)

    # Estimate the output token count chunk by chunk, outside the timed
    # section. Special tokens are disabled so a tokenizer that adds BOS/EOS
    # does not inflate the count once per chunk.
    output_tokens = sum(
        len(tokenizer.encode(chunk, add_special_tokens=False)) for chunk in chunks
    )

    # If no TTFT was recorded (no tokens received), set it to total time
    if ttft is None:
        ttft = total_time

    # Calculate tokens per second based on output tokens
    tokens_per_second = output_tokens / total_time if total_time > 0 else 0

    metrics = {
        'ttft': ttft,
        'tokens_per_second': tokens_per_second,
        'total_tokens': input_tokens + output_tokens,
        'total_time': total_time,
        'input_tokens': input_tokens,
        'output_tokens': output_tokens
    }

    return full_response, metrics

def save_and_print_metrics(response, metrics, output_file):
    """
    Print the metrics of one request and save them, with the response, to a file.

    Args:
        response: Response text; written verbatim after the metrics header.
        metrics: Dict with the keys 'ttft', 'tokens_per_second', 'total_tokens',
            'total_time', 'input_tokens' and 'output_tokens'.
        output_file: Destination path. Missing parent directories are created.
            The file is overwritten if it exists.
    """
    # Print metrics
    print(f"TTFT: {metrics['ttft']:.2f} seconds")
    print(f"Tokens/sec: {metrics['tokens_per_second']:.2f}")
    print(f"Total tokens: {metrics['total_tokens']}")
    print(f"Input tokens: {metrics['input_tokens']}")
    print(f"Output tokens: {metrics['output_tokens']}")
    print(f"Total time: {metrics['total_time']:.2f} seconds")

    # Make sure the target directory exists so the results are not lost to a
    # FileNotFoundError at the write step.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Metrics header ("key: value" per line), a blank line, then the response.
    header_lines = [
        f"ttft: {metrics['ttft']:.2f}\n",
        f"tokens_per_second: {metrics['tokens_per_second']:.2f}\n",
        f"total_tokens: {metrics['total_tokens']}\n",
        f"total_time: {metrics['total_time']:.2f}\n",
        f"input_tokens: {metrics['input_tokens']}\n",
        f"output_tokens: {metrics['output_tokens']}\n\n",
    ]

    # Save to file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(header_lines)
        f.write(response)

In [None]:
# file_path = "tests/daily.txt"
# input_token_sizes = [25000]

# print("Testing different input token sizes:")
# for token_size in input_token_sizes:
#     print(f"\n=== Testing with {token_size} input tokens ===")
#     output_file = f"speed_tests/A2/{token_size}_length_1_parallel.txt"
    
#     response, metrics = measure_vllm_response(
#         file_path=file_path,
#         max_input_tokens=token_size,
#         max_output_tokens=100
#     )

In [6]:
# Send one small request (100 input tokens, 10 output tokens) before the
# benchmark loop and report how long it took.
print("Warming up the model with a test request...")
warmup_response, warmup_metrics = measure_vllm_response(
    "tests/daily.txt", max_input_tokens=100, max_output_tokens=10
)
print(f"Warmup completed in {warmup_metrics['total_time']:.2f}s")
print("Model is ready for testing.\n")

# Benchmark inputs: the prompt source file and the prompt lengths (in tokens)
# that the benchmark loop sweeps over.
file_path = "tests/daily.txt"
input_token_sizes = [1000, 5000, 10000, 15000, 20000, 25000]

import threading
import time

def run_concurrent_test(file_path, max_input_tokens, max_output_tokens, request_id):
    """
    Issue one benchmark request and return its metrics dict.

    Prints a start line and a completion line tagged with ``request_id``; the
    response text itself is discarded.
    """
    print(f"Starting request {request_id}")
    began = time.time()
    _, request_metrics = measure_vllm_response(
        file_path=file_path,
        max_input_tokens=max_input_tokens,
        max_output_tokens=max_output_tokens,
    )
    elapsed = time.time() - began
    print(f"Request {request_id} completed in {elapsed:.2f}s")

    return request_metrics

print("Testing different input token sizes with various concurrent requests:")

# Sweep configuration.
concurrent_counts = [1, 2, 5, 10]
max_output_tokens_per_request = 100   # generation budget of every benchmark request
output_dir = "speed_tests/A2"         # one result file per (token_size, concurrency) pair
metric_keys = ('ttft', 'tokens_per_second', 'total_tokens',
               'total_time', 'input_tokens', 'output_tokens')


def _run_and_collect(results, file_path, max_input_tokens, max_output_tokens, request_id):
    """Thread target: run one request and append its metrics to ``results``."""
    results.append(
        run_concurrent_test(file_path, max_input_tokens, max_output_tokens, request_id)
    )


for token_size in input_token_sizes:
    print("\n" + "="*80)
    print(f"Testing with {token_size} input tokens")
    print("="*80)

    for concurrent_count in concurrent_counts:
        print(f"\n=== {token_size} tokens with {concurrent_count} concurrent requests ===")

        results = []
        # Arguments are bound explicitly per thread instead of through a
        # lambda that reads the loop variables when it eventually runs.
        threads = [
            threading.Thread(
                target=_run_and_collect,
                args=(results, file_path, token_size, max_output_tokens_per_request,
                      f"{token_size}_{concurrent_count}_{i+1}"),
            )
            for i in range(concurrent_count)
        ]

        # Start concurrent requests
        start_time = time.time()
        for thread in threads:
            thread.start()

        # Wait for all threads to complete
        for thread in threads:
            thread.join()

        end_time = time.time()
        total_concurrent_time = end_time - start_time

        print(f"All {concurrent_count} requests completed in {total_concurrent_time:.2f}s")
        print(f"Average time per request: {total_concurrent_time/concurrent_count:.2f}s")

        # A worker that raised contributes no entry to `results`; surface
        # that instead of silently averaging over fewer requests.
        failed_count = concurrent_count - len(results)
        if failed_count:
            print(f"WARNING: {failed_count} of {concurrent_count} requests failed; "
                  f"averages cover the {len(results)} successful request(s) only")

        # Save aggregated metrics
        output_file = f"{output_dir}/{token_size}_length_{concurrent_count}_parallel.txt"

        if results:
            # Arithmetic mean of every metric over the successful requests.
            avg_metrics = {
                key: sum(r[key] for r in results) / len(results)
                for key in metric_keys
            }

            # Print metrics
            print(f"Average TTFT: {avg_metrics['ttft']:.2f} seconds")
            print(f"Average Tokens/sec: {avg_metrics['tokens_per_second']:.2f}")
            print(f"Average Total tokens: {avg_metrics['total_tokens']:.0f}")
            print(f"Average Total time: {avg_metrics['total_time']:.2f} seconds")

            # Create the output directory on demand so a finished benchmark
            # run is not lost to a FileNotFoundError.
            os.makedirs(output_dir, exist_ok=True)

            # Save average metrics to file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(f"ttft: {avg_metrics['ttft']:.2f}\n")
                f.write(f"tokens_per_second: {avg_metrics['tokens_per_second']:.2f}\n")
                f.write(f"total_tokens: {avg_metrics['total_tokens']:.0f}\n")
                f.write(f"total_time: {avg_metrics['total_time']:.2f}\n")
                f.write(f"input_tokens: {avg_metrics['input_tokens']:.0f}\n")
                f.write(f"output_tokens: {avg_metrics['output_tokens']:.0f}\n")
                f.write(f"concurrent_requests: {concurrent_count}\n")
                f.write(f"total_concurrent_time: {total_concurrent_time:.2f}\n\n")
                f.write(f"Average metrics for {concurrent_count} concurrent requests of {token_size} tokens each\n\n")

                # Write individual request metrics (in completion order)
                f.write("Individual request metrics:\n")
                for i, metrics in enumerate(results):
                    f.write(f"\nRequest {i+1}:\n")
                    f.write(f"  ttft: {metrics['ttft']:.2f}\n")
                    f.write(f"  tokens_per_second: {metrics['tokens_per_second']:.2f}\n")
                    f.write(f"  total_tokens: {metrics['total_tokens']}\n")
                    f.write(f"  total_time: {metrics['total_time']:.2f}\n")
                    f.write(f"  input_tokens: {metrics['input_tokens']}\n")
                    f.write(f"  output_tokens: {metrics['output_tokens']}\n")

Testing different input token sizes with various concurrent requests:

Testing with 1000 input tokens

=== 1000 tokens with 1 concurrent requests ===
Starting request 1000_1_1
Content truncated from 13961 to 1000 tokens
Response streaming:
--------------------------------------------------
Вот текст в более структурированном и понятном виде, с сохранением сути разговора и основных моментов:

---

**Speaker 2 (ведущий):**  
В общем, наверное, можем потихоньку начинать. Петя ещё подключится, Роман сегодня не подключится — он сейчас проверяет задачи, и у него, как обычно, очередь тестирования никогда не закан
--------------------------------------------------
Request 1000_1_1 completed in 6.25s
All 1 requests completed in 6.26s
Average time per request: 6.26s
Average TTFT: 0.70 seconds
Average Tokens/sec: 16.08
Average Total tokens: 1100
Average Total time: 6.22 seconds

=== 1000 tokens with 2 concurrent requests ===
Starting request 1000_2_1
Starting request 1000_2_2
Content truncated fr