In [2]:
import requests
import time
import json
from transformers import AutoTokenizer

# Load Qwen3 tokenizer
# It is used below to truncate the input text to a token budget and to estimate
# input/output token counts for the benchmark metrics.
# NOTE(review): this loads the Qwen3-14B tokenizer while measure_ollama_response
# sends requests to "qwen3:8b" — presumably both share one vocabulary; confirm.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-14B")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import requests
import json
import time

def measure_ollama_response(file_path, ollama_url="http://localhost:11434/api/chat",
                           max_input_tokens=None, max_output_tokens=1024, time_limit=7,
                           model="qwen3:8b", request_timeout=None):
    """
    Send file content to Ollama chat endpoint and measure response metrics with streaming

    The file is read, optionally truncated to a token budget with the module-level
    ``tokenizer``, and sent as a single user message. The streamed reply is echoed
    to stdout while timings are collected.

    Args:
        file_path: Path to the input file (read as UTF-8)
        ollama_url: Ollama endpoint URL
        max_input_tokens: Maximum tokens for input (for truncation), None for no limit
        max_output_tokens: Maximum tokens for output response (sent as ``num_predict``)
        time_limit: Maximum time in seconds for generation (after prefill), None for no limit.
            It is only checked when a chunk arrives, so a stalled stream is not interrupted by it.
        model: Model name placed in the request payload
        request_timeout: Passed to ``requests.post(timeout=...)``; None keeps the
            previous behaviour of waiting indefinitely

    Returns:
        Tuple ``(full_response, metrics)`` where ``full_response`` is the concatenated
        streamed text and ``metrics`` is a dict with the keys ``ttft``,
        ``gen_tokens_per_sec``, ``total_tokens``, ``total_time``, ``input_tokens``,
        ``output_tokens``, ``generation_time`` and ``generation_timeout``.
        Token counts are tokenizer estimates, not server-reported counts.

    Raises:
        Exception: If the HTTP status is not 200, or the stream carries an
            ``{"error": ...}`` object.
    """
    # Read the file
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Tokenize once; the count is reused when no truncation is needed
    tokens = tokenizer.encode(content)
    input_tokens = len(tokens)

    # Truncate content if max_input_tokens is specified
    if max_input_tokens is not None and input_tokens > max_input_tokens:
        content = tokenizer.decode(tokens[:max_input_tokens])
        print(f"Content truncated from {len(tokens)} to {max_input_tokens} tokens")
        # Re-count the text that is actually sent instead of assuming the budget
        input_tokens = len(tokenizer.encode(content))

    # Prepare chat request with streaming
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": content}
        ],
        "stream": True,
        "think": False,
        "options": {
            "num_predict": max_output_tokens,
            "temperature": 0.7,
            "num_ctx": 32768
        }
    }

    # Start timing; perf_counter is monotonic, so durations are immune to clock adjustments
    start_time = time.perf_counter()
    ttft = None
    generation_timeout = False
    chunks = []

    # The context manager closes the streamed response on every exit path.
    # Without it, breaking out early leaves the connection open.
    # NOTE(review): presumably Ollama also stops generating once the client
    # disconnects, which keeps an aborted run from skewing the next one — confirm.
    with requests.post(ollama_url, json=payload, stream=True, timeout=request_timeout) as response:
        if response.status_code != 200:
            raise Exception(f"Request failed with status {response.status_code}: {response.text}")

        print("Response streaming:")
        print("-" * 50)

        for line in response.iter_lines():
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: skip lines that are not valid JSON
                continue

            if not isinstance(data, dict):
                continue

            # Surface an error object sent mid-stream instead of reporting an empty result
            if 'error' in data:
                raise Exception(f"Ollama returned an error while streaming: {data['error']}")

            message = data.get('message')
            if isinstance(message, dict) and 'content' in message:
                # Record TTFT (time to first token)
                if ttft is None:
                    ttft = time.perf_counter() - start_time
                    print(f"\nFirst token received at {ttft:.2f}s")

                # Check time limit after first token (generation phase)
                if time_limit is not None:
                    generation_time = time.perf_counter() - start_time - ttft
                    if generation_time > time_limit:
                        print(f"\nGeneration timeout reached ({time_limit}s limit)")
                        generation_timeout = True
                        break

                content_chunk = message['content']
                chunks.append(content_chunk)

                # Stream to console
                print(content_chunk, end='', flush=True)

            # Check if this is the final message
            if data.get('done', False):
                break

        # Stop the clock before closing the connection and printing the footer
        end_time = time.perf_counter()

    print()  # New line after streaming
    print("-" * 50)

    full_response = "".join(chunks)
    total_time = end_time - start_time

    # If no TTFT was recorded (no tokens received), set it to total time
    if ttft is None:
        ttft = total_time

    # Estimate output tokens using tokenizer
    output_tokens = len(tokenizer.encode(full_response))
    print(f"Using tokenizer estimates: {input_tokens} input, {output_tokens} output")

    # Calculate generation speed; the floor avoids division by zero when nothing was generated
    generation_time = max(total_time - ttft, 1e-9)
    gen_tokens_per_sec = output_tokens / generation_time

    # Total tokens
    total_tokens = input_tokens + output_tokens

    # Metrics dictionary
    metrics = {
        'ttft': ttft,
        'gen_tokens_per_sec': gen_tokens_per_sec,
        'total_tokens': total_tokens,
        'total_time': total_time,
        'input_tokens': input_tokens,
        'output_tokens': output_tokens,
        'generation_time': generation_time,
        'generation_timeout': generation_timeout
    }

    return full_response, metrics

def save_and_print_metrics(response, metrics, output_file):
    """
    Print a metrics summary and write the metrics plus the response text to a file.

    Args:
        response: Generated text, written after the metrics header
        metrics: Dict with the keys ttft, gen_tokens_per_sec, total_tokens,
            input_tokens, output_tokens, total_time and generation_time;
            generation_timeout is optional
        output_file: Path of the UTF-8 text file to (over)write
    """
    timed_out = metrics.get('generation_timeout', False)

    # Human-readable summary for the console
    console_lines = [
        f"TTFT: {metrics['ttft']:.2f} seconds",
        f"Gen tokens/sec (post-TTFT): {metrics['gen_tokens_per_sec']:.2f}",
        f"Total tokens: {metrics['total_tokens']}",
        f"Input tokens: {metrics['input_tokens']}",
        f"Output tokens: {metrics['output_tokens']}",
        f"Total time: {metrics['total_time']:.2f} seconds",
        f"Generation time: {metrics['generation_time']:.2f} seconds",
    ]
    if timed_out:
        console_lines.append("Generation was interrupted due to timeout")
    for console_line in console_lines:
        print(console_line)

    # "key: value" header for the file, in the format similar to the examples
    header_lines = [
        f"ttft: {metrics['ttft']:.2f}\n",
        f"gen_tokens_per_second: {metrics['gen_tokens_per_sec']:.2f}\n",
        f"total_tokens: {metrics['total_tokens']}\n",
        f"total_time: {metrics['total_time']:.2f}\n",
        f"input_tokens: {metrics['input_tokens']}\n",
        f"output_tokens: {metrics['output_tokens']}\n",
        f"generation_time: {metrics['generation_time']:.2f}\n",
    ]
    if timed_out:
        header_lines.append("generation_timeout: true\n")
    header_lines.append("\n")  # blank line separates the header from the response

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(header_lines)
        f.write(response)

# Example usage: benchmark one truncated prompt, then print the metrics and
# save them together with the response text.
file_path = "tests/book.txt"
ollama_url = "http://localhost:11434/api/chat"
max_input_tokens = 10_000
max_output_tokens = 100

response, metrics = measure_ollama_response(
    file_path=file_path,
    ollama_url=ollama_url,
    max_input_tokens=max_input_tokens,
    max_output_tokens=max_output_tokens,
)
save_and_print_metrics(response, metrics, output_file="ollama_test_output.txt")


Content truncated from 181863 to 10000 tokens
Response streaming:
--------------------------------------------------

First token received at 8.37s
Огнегрив с волнением наблюдал, как в лесной чаще скрывается тень Ячменя. В его глазах мелькали тревога и неуверенность. Он знал, что Синяя Звезда не станет сомневаться в его словах, но не мог не чувствовать собственного смятения. Горелый — не просто кот, а человек, и его
--------------------------------------------------
Using tokenizer estimates: 10000 input, 98 output
TTFT: 8.37 seconds
Gen tokens/sec (post-TTFT): 19.12
Total tokens: 10098
Input tokens: 10000
Output tokens: 98
Total time: 13.50 seconds
Generation time: 5.12 seconds


In [8]:
# Warmup: send one tiny request to prepare the model before the measured runs
# (presumably so that model loading is not counted in the first measurement).
print("Warming up the model with a test request...")
warmup_response, warmup_metrics = measure_ollama_response(
    "tests/daily.txt",
    "http://localhost:11434/api/chat",
    max_input_tokens=100,
    max_output_tokens=10,
)
print(f"Warmup completed in {warmup_metrics['total_time']:.2f}s")
print("Model is ready for testing.\n")

# Inputs for the sweep below
file_path = "tests/book.txt"
ollama_url = "http://localhost:11434/api/chat"

# Prompt lengths (in tokens) to benchmark: 1000, then 5000..30000 in steps of 5000
input_token_sizes = [1000, *range(5000, 30001, 5000)]
# Shorter sweeps kept for quick reruns:
# input_token_sizes = [20000, 25000, 30000]
# input_token_sizes = [25000]

import threading
import time
import os

def run_concurrent_test(file_path, ollama_url, max_input_tokens, max_output_tokens, request_id):
    """
    Run one benchmark request and log its start, duration and any failure.

    Args:
        file_path: Path to the input file forwarded to measure_ollama_response
        ollama_url: Ollama endpoint URL
        max_input_tokens: Input token budget for truncation
        max_output_tokens: Output token budget
        request_id: Label used only in the log messages

    Returns:
        The metrics dict from measure_ollama_response, or None if it raised.
    """
    print(f"Starting request {request_id}")
    began_at = time.time()

    try:
        # Only the metrics are needed here; the generated text is discarded
        _, request_metrics = measure_ollama_response(
            file_path,
            ollama_url,
            max_input_tokens,
            max_output_tokens,
            time_limit=7,
        )
        elapsed = time.time() - began_at
        print(f"Request {request_id} completed in {elapsed:.2f}s")
        return request_metrics
    except Exception as e:
        # A failed request is reported and signalled with None so callers can filter it out
        print(f"Request {request_id} failed: {e}")
        return None

print("Testing different input token sizes with various concurrent requests:")

concurrent_counts = [1] #,  2, 5] 

# Output-token budget passed to every benchmarked request
benchmark_output_tokens = 100

# Metrics that are averaged over the successful requests of one set
averaged_metric_keys = (
    'ttft', 'gen_tokens_per_sec', 'total_tokens',
    'total_time', 'input_tokens', 'output_tokens',
)


def _detect_model_name(chat_url):
    """Return a directory-safe model name from the server's /api/tags listing.

    The tags URL is derived from ``chat_url`` so the lookup goes to the same
    server as the benchmark requests. Falls back to "unknown_model" when the
    listing is empty and to "ollama-unknown" when the lookup fails.

    NOTE(review): this takes the first model the server lists, which is not
    necessarily the model measure_ollama_response sends requests to — confirm
    that the output directory is labelled with the intended model.
    """
    from urllib.parse import urljoin

    try:
        # ".../api/chat" + "tags" resolves to ".../api/tags"
        tags_response = requests.get(urljoin(chat_url, "tags"), timeout=10)
        listed_models = tags_response.json()['models']
        return listed_models[0]['name'].replace(":", "-") if listed_models else "unknown_model"
    except Exception as e:
        print(f"Failed to get model name from Ollama API: {e}")
        return "ollama-unknown"  # fallback


def _run_into_slot(slots, index, token_size, request_id):
    """Run one request and store its metrics (or None) at ``slots[index]``."""
    slots[index] = run_concurrent_test(
        file_path, ollama_url, token_size, benchmark_output_tokens, request_id
    )


# The model name and output directory do not change between sets, so resolve them once
model_name = _detect_model_name(ollama_url)

# ----------------------------
output_dir = f"speed_tests/A2_x1/{model_name}"
# ----------------------------

os.makedirs(output_dir, exist_ok=True)

for token_size in input_token_sizes:
    print("\n" + "="*80)
    print(f"Testing with {token_size} input tokens")
    print("="*80)

    for concurrent_count in concurrent_counts:
        print(f"\n=== {token_size} tokens with {concurrent_count} concurrent requests ===")
        print("Waiting for all current requests to complete before starting this set...")

        # One slot per request, so results[i] always belongs to request i+1
        # regardless of the order in which the threads finish
        results = [None] * concurrent_count
        start_time = time.time()

        # Start concurrent requests
        threads = []
        for i in range(concurrent_count):
            thread = threading.Thread(
                target=_run_into_slot,
                args=(results, i, token_size, f"{token_size}_{concurrent_count}_{i+1}"),
            )
            threads.append(thread)
            thread.start()

        # Wait for ALL threads to complete before proceeding to next set
        print(f"Waiting for all {concurrent_count} threads to complete...")
        for i, thread in enumerate(threads):
            thread.join()
            print(f"Thread {i+1}/{concurrent_count} completed")

        end_time = time.time()
        total_concurrent_time = end_time - start_time

        # Filter out None results (failed requests only)
        valid_results = [r for r in results if r is not None]

        print(f"All {concurrent_count} requests completed in {total_concurrent_time:.2f}s")
        print(f"Valid results: {len(valid_results)}/{concurrent_count}")
        print(f"Average time per request: {total_concurrent_time/concurrent_count:.2f}s")

        # Save aggregated metrics
        output_file = os.path.join(output_dir, f"{token_size}_length_{concurrent_count}_parallel.txt")

        if valid_results:
            # Calculate average metrics
            avg_metrics = {
                key: sum(r[key] for r in valid_results) / len(valid_results)
                for key in averaged_metric_keys
            }

            # Print metrics
            print(f"Average TTFT: {avg_metrics['ttft']:.2f} seconds")
            print(f"Average Tokens/sec: {avg_metrics['gen_tokens_per_sec']:.2f}")
            print(f"Average Total tokens: {avg_metrics['total_tokens']:.0f}")
            print(f"Average Total time: {avg_metrics['total_time']:.2f} seconds")

            # Save average metrics to file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(f"ttft: {avg_metrics['ttft']:.2f}\n")
                f.write(f"gen_tokens_per_second: {avg_metrics['gen_tokens_per_sec']:.2f}\n")
                f.write(f"total_tokens: {avg_metrics['total_tokens']:.0f}\n")
                f.write(f"total_time: {avg_metrics['total_time']:.2f}\n")
                f.write(f"input_tokens: {avg_metrics['input_tokens']:.0f}\n")
                f.write(f"output_tokens: {avg_metrics['output_tokens']:.0f}\n")
                f.write(f"concurrent_requests: {concurrent_count}\n")
                f.write(f"valid_requests: {len(valid_results)}\n")
                f.write(f"total_concurrent_time: {total_concurrent_time:.2f}\n\n")
                f.write(f"Average metrics for {len(valid_results)}/{concurrent_count} concurrent requests of {token_size} tokens each\n\n")

                # Write individual request metrics for ALL valid results, numbered
                # by the request they belong to (failed requests are skipped)
                f.write("Individual request metrics:\n")
                for request_number, request_metrics in enumerate(results, start=1):
                    if request_metrics is None:
                        continue
                    f.write(f"\nRequest {request_number}:\n")
                    f.write(f"  ttft: {request_metrics['ttft']:.2f}\n")
                    f.write(f"  gen_tokens_per_second: {request_metrics['gen_tokens_per_sec']:.2f}\n")
                    f.write(f"  total_tokens: {request_metrics['total_tokens']}\n")
                    f.write(f"  total_time: {request_metrics['total_time']:.2f}\n")
                    f.write(f"  input_tokens: {request_metrics['input_tokens']}\n")
                    f.write(f"  output_tokens: {request_metrics['output_tokens']}\n")
        else:
            print("No valid results to save")

        # Add a pause between different concurrent count sets for the same token size
        print(f"Set [{token_size} tokens - {concurrent_count} parallel] completed. Proceeding to next set...")
        time.sleep(2)  # Brief pause to ensure system is ready for next set


Warming up the model with a test request...
Content truncated from 13961 to 100 tokens
Response streaming:
--------------------------------------------------

First token received at 5.50s
Спасибо, я готов! Д
--------------------------------------------------
Using tokenizer estimates: 100 input, 9 output
Warmup completed in 5.87s
Model is ready for testing.

Testing different input token sizes with various concurrent requests:

Testing with 1000 input tokens

=== 1000 tokens with 1 concurrent requests ===
Waiting for all current requests to complete before starting this set...
Starting request 1000_1_1
Waiting for all 1 threads to complete...
Content truncated from 181863 to 1000 tokens
Response streaming:
--------------------------------------------------

First token received at 1.80s
Очень интересный фрагмент из книги **«Огонь и лед»** Эрины Хантер, которая является частью серии **«Племена Котов»**. Этот пролог дает первые впечатления о напряжённой обстановке в мире котов, о конфли