In [2]:
import requests
import time
import json
from transformers import AutoTokenizer

# Load the Qwen3 tokenizer once at module level. measure_ollama_response()
# uses it to truncate the prompt to a token budget and to count input and
# output tokens for the reported metrics.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-14B")

In [8]:
import requests
import json
import time

def measure_ollama_response(file_path, ollama_url="http://localhost:11434/api/chat",
                           max_input_tokens=None, max_output_tokens=1024, time_limit=7):
    """
    Send file content to Ollama chat endpoint and measure response metrics with streaming

    The file content is sent as a single user message. Token counts are
    estimates computed with the module-level ``tokenizer``, not values
    reported by the server.

    Args:
        file_path: Path to the input file
        ollama_url: Ollama endpoint URL
        max_input_tokens: Maximum tokens for input (for truncation), None for no limit
        max_output_tokens: Maximum tokens for output response
        time_limit: Maximum time in seconds for generation (after prefill), None for no limit

    Returns:
        Tuple ``(full_response, metrics)`` where ``full_response`` is the
        concatenated streamed text and ``metrics`` is a dict with keys
        ``ttft``, ``gen_tokens_per_sec``, ``total_tokens``, ``total_time``,
        ``input_tokens``, ``output_tokens``, ``generation_time`` and
        ``generation_timeout``.

    Raises:
        RuntimeError: If the endpoint answers with a non-200 status code.
    """
    # Read the file
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Tokenize once; the result serves both the truncation check and the
    # input token count.
    tokens = tokenizer.encode(content)
    input_tokens = len(tokens)

    if max_input_tokens is not None and input_tokens > max_input_tokens:
        content = tokenizer.decode(tokens[:max_input_tokens])
        print(f"Content truncated from {input_tokens} to {max_input_tokens} tokens")
        # Re-encode the decoded text so the reported count reflects what is
        # actually sent rather than assuming it equals max_input_tokens.
        input_tokens = len(tokenizer.encode(content))

    # Prepare chat request with streaming
    payload = {
        "model": "qwen3:14b",
        "messages": [
            {"role": "user", "content": content}
        ],
        "stream": True,
        "think": False,
        "options": {
            "num_predict": max_output_tokens,
            "temperature": 0.7
        }
    }

    # perf_counter is monotonic, so the measured intervals are not affected
    # by system clock adjustments.
    start_time = time.perf_counter()
    ttft = None
    generation_timeout = False
    chunks = []

    # The context manager closes the streamed connection on every exit path,
    # including the early break on timeout and any exception.
    with requests.post(ollama_url, json=payload, stream=True) as response:
        if response.status_code != 200:
            raise RuntimeError(
                f"Request failed with status {response.status_code}: {response.text}"
            )

        print("Response streaming:")
        print("-" * 50)

        for line in response.iter_lines():
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: skip lines that are not valid JSON.
                continue

            if 'message' in data and 'content' in data['message']:
                # Record TTFT (time to first token)
                if ttft is None:
                    ttft = time.perf_counter() - start_time
                    print(f"\nFirst token received at {ttft:.2f}s")

                # Check time limit after first token (generation phase).
                # The chunk that arrives past the limit is not appended.
                if time_limit is not None:
                    elapsed_generation = time.perf_counter() - start_time - ttft
                    if elapsed_generation > time_limit:
                        print(f"\nGeneration timeout reached ({time_limit}s limit)")
                        generation_timeout = True
                        break

                content_chunk = data['message']['content']
                chunks.append(content_chunk)

                # Stream to console
                print(content_chunk, end='', flush=True)

            # Check if this is the final message
            if data.get('done', False):
                break

        end_time = time.perf_counter()

    print()  # New line after streaming
    print("-" * 50)

    full_response = "".join(chunks)
    total_time = end_time - start_time

    # If no TTFT was recorded (no tokens received), set it to total time
    if ttft is None:
        ttft = total_time

    # Estimate output tokens using tokenizer
    output_tokens = len(tokenizer.encode(full_response))
    print(f"Using tokenizer estimates: {input_tokens} input, {output_tokens} output")

    # Clamp to a tiny positive value so the division below never hits zero
    generation_time = max(total_time - ttft, 1e-9)
    gen_tokens_per_sec = output_tokens / generation_time

    metrics = {
        'ttft': ttft,
        'gen_tokens_per_sec': gen_tokens_per_sec,
        'total_tokens': input_tokens + output_tokens,
        'total_time': total_time,
        'input_tokens': input_tokens,
        'output_tokens': output_tokens,
        'generation_time': generation_time,
        'generation_timeout': generation_timeout
    }

    return full_response, metrics

def save_and_print_metrics(response, metrics, output_file):
    """
    Print the metrics to the console and write metrics plus response to a file.

    Args:
        response: Response text, written after the metrics block in the file
        metrics: Dict with keys 'ttft', 'gen_tokens_per_sec', 'total_tokens',
            'total_time', 'input_tokens', 'output_tokens', 'generation_time'
            and, optionally, 'generation_timeout'
        output_file: Path of the file to (over)write, encoded as UTF-8
    """
    # (template, metrics key) pairs, in the order they appear on the console
    console_layout = (
        ("TTFT: {:.2f} seconds", 'ttft'),
        ("Gen tokens/sec (post-TTFT): {:.2f}", 'gen_tokens_per_sec'),
        ("Total tokens: {}", 'total_tokens'),
        ("Input tokens: {}", 'input_tokens'),
        ("Output tokens: {}", 'output_tokens'),
        ("Total time: {:.2f} seconds", 'total_time'),
        ("Generation time: {:.2f} seconds", 'generation_time'),
    )
    # The file uses its own labels and field order
    file_layout = (
        ("ttft: {:.2f}\n", 'ttft'),
        ("gen_tokens_per_second: {:.2f}\n", 'gen_tokens_per_sec'),
        ("total_tokens: {}\n", 'total_tokens'),
        ("total_time: {:.2f}\n", 'total_time'),
        ("input_tokens: {}\n", 'input_tokens'),
        ("output_tokens: {}\n", 'output_tokens'),
        ("generation_time: {:.2f}\n", 'generation_time'),
    )

    for template, key in console_layout:
        print(template.format(metrics[key]))

    timed_out = metrics.get('generation_timeout', False)
    if timed_out:
        print("Generation was interrupted due to timeout")

    with open(output_file, 'w', encoding='utf-8') as out:
        for template, key in file_layout:
            out.write(template.format(metrics[key]))
        if timed_out:
            out.write("generation_timeout: true\n")
        # A blank line separates the metrics header from the response body
        out.write("\n")
        out.write(response)

# Example usage: send a truncated prompt from the book file, cap the reply
# length, then print the metrics and save them with the response.
file_path = "tests/book.txt"
ollama_url = "http://localhost:11434/api/chat"
max_input_tokens, max_output_tokens = 300, 100

# time_limit is left at the function's default.
response, metrics = measure_ollama_response(
    file_path,
    ollama_url=ollama_url,
    max_input_tokens=max_input_tokens,
    max_output_tokens=max_output_tokens,
)
save_and_print_metrics(response, metrics, output_file="ollama_test_output.txt")


Content truncated from 181863 to 300 tokens
Response streaming:
--------------------------------------------------

First token received at 0.15s
К сожалению, ваш запрос содержит неполный фрагмент текста книги **«Огонь и лед»** Эрин Хантер, и вы не указали, что именно вы хотите получить в результате. Чтобы помоч

KeyboardInterrupt: 