In [1]:
import time
from openai import OpenAI
from transformers import AutoTokenizer

# Load the Qwen2.5-7B-Instruct tokenizer; it is used locally to truncate
# inputs and to count tokens.
# NOTE(review): the chat requests in this notebook target a served model
# named "qwen3" - confirm this tokenizer matches that model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Function to read file and truncate to max tokens
def read_file_with_token_limit(file_path, max_tokens, tokenizer):
    """Read a text file and cap its length at ``max_tokens`` tokens.

    The file is decoded with the first encoding that succeeds, then encoded
    with ``tokenizer``. If the token count exceeds the budget, the first
    ``max_tokens`` tokens are decoded back to text and returned; otherwise
    the decoded file content is returned unchanged.

    Args:
        file_path: Path of the file to read.
        max_tokens: Maximum number of tokens to keep. Negative values are
            treated as 0 (a plain slice with a negative bound would silently
            keep almost the whole text instead of truncating it).
        tokenizer: Object exposing ``encode(text)`` returning a sequence and
            ``decode(tokens)`` returning a string.

    Returns:
        The (possibly truncated) text as ``str``.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    # Order matters:
    #   * 'utf-8-sig' decodes plain UTF-8 and also strips a leading BOM, so a
    #     stray '\ufeff' never reaches the prompt.
    #   * 'cp1251' covers legacy Cyrillic text files.
    #   * 'latin-1' maps every byte value, so it cannot raise
    #     UnicodeDecodeError and serves as the last resort. Any encoding
    #     listed after it would be unreachable.
    content = None
    for encoding in ('utf-8-sig', 'cp1251', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()
            break
        except UnicodeDecodeError:
            continue

    # Tokenize the content and truncate if it exceeds the budget
    budget = max(max_tokens, 0)
    tokens = tokenizer.encode(content)
    if len(tokens) > budget:
        content = tokenizer.decode(tokens[:budget])

    return content

# Function to count input tokens
def count_input_tokens(messages, tokenizer):
    """Return the token count of ``messages`` rendered as plain text.

    Each message is rendered as ``"<role>: <content>\\n"`` and the rendered
    lines are concatenated before being passed to ``tokenizer.encode``. This
    is a flat rendering, not the model's chat template.

    Args:
        messages: Iterable of dicts with ``'role'`` and ``'content'`` keys.
        tokenizer: Object exposing ``encode(text)`` returning a sized sequence.

    Returns:
        Number of tokens produced by the tokenizer for the rendered text.
    """
    rendered = "".join(
        f"{entry['role']}: {entry['content']}\n" for entry in messages
    )
    return len(tokenizer.encode(rendered))

# Function to call LLM with file input
def call_llm(file_path, max_input_tokens, max_output_tokens, tokenizer, client):
    """Summarize one file through a streaming chat completion and time it.

    The file is read and truncated to ``max_input_tokens``, sent together
    with a fixed system prompt to the model named "qwen3", and the streamed
    answer is echoed to stdout while latency figures are collected.

    Args:
        file_path: Path of the file whose content is sent as the user message.
        max_input_tokens: Token budget applied to the file content.
        max_output_tokens: Passed as ``max_tokens`` to the completion call.
        tokenizer: Tokenizer used for truncation and input token counting.
        client: Client exposing ``chat.completions.create``.

    Returns:
        ``(content, metrics)``: the concatenated streamed text and a dict
        with ``ttft``, ``tokens_per_second``, ``total_tokens``,
        ``total_time`` and ``input_tokens``. ``metrics`` is an empty dict
        when the stream produced no content.
    """
    # Read content from file with token limit
    note_content = read_file_with_token_limit(file_path, max_input_tokens, tokenizer)

    # Prepare messages with content from the file
    messages = [
        {"role": "system", "content": "Твоя задача - рассказать о том, что происходит в тексте, который будет передан в поле content. Суммаризация должна быть краткой, но и содержательной, не упускающей никаких деталей, структурированный ответ по пунктам. Суммаризация должна быть на русском языке."},
        {"role": "user", "content": note_content},
    ]

    # Count input tokens
    input_token_count = count_input_tokens(messages, tokenizer)
    print(f"Input tokens: {input_token_count}")

    # perf_counter is monotonic, so the measured intervals cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    first_token_time = None
    ttft = None
    # Counts streamed content chunks; this equals the number of generated
    # tokens only if the server emits one token per chunk - TODO confirm.
    token_count = 0

    chat_response = client.chat.completions.create(
        model="qwen3",
        messages=messages,
        max_tokens=max_output_tokens,
        stream=True,
        extra_body={
            "chat_template_kwargs": {
                "enable_thinking": False
                }
            }
    )

    pieces = []
    for chunk in chat_response:
        # A stream may contain chunks with an empty `choices` list (the
        # OpenAI API documents this for the usage chunk); indexing [0] on
        # those would raise IndexError, so skip them.
        if not chunk.choices:
            continue
        delta_text = chunk.choices[0].delta.content
        if not delta_text:
            continue

        if first_token_time is None:
            first_token_time = time.perf_counter()
            ttft = first_token_time - start_time

        pieces.append(delta_text)
        token_count += 1
        print(delta_text, end="", flush=True)

    end_time = time.perf_counter()
    content = "".join(pieces)
    total_time = end_time - start_time
    metrics = {}
    # Compare against None explicitly: a timer reading is a float and must
    # not be judged by truthiness.
    if first_token_time is not None:
        generation_time = end_time - first_token_time
        tokens_per_second = token_count / generation_time if generation_time > 0 else 0
        metrics = {
            "ttft": ttft,
            "tokens_per_second": tokens_per_second,
            "total_tokens": token_count,
            "total_time": total_time,
            "input_tokens": input_token_count
        }
        print(f"\n\nTime to first token (TTFT): {ttft:.3f} seconds")
        print(f"Tok/sec: {tokens_per_second:.2f}")
        print(f"Total tokens: {token_count}")
        print(f"Total time: {total_time:.3f} seconds")

    return content, metrics

# Async function to call LLM with file input
async def call_llm_async(file_path, max_input_tokens, max_output_tokens, tokenizer, async_client):
    """Async variant of the file summarization call, with timing metrics.

    The file is read and truncated to ``max_input_tokens``, sent with a fixed
    system prompt to the model named "qwen3", and the streamed answer is
    echoed to stdout while latency figures are collected. Any ``Exception``
    raised along the way is caught, printed, and reported in the return
    value instead of propagating.

    Args:
        file_path: Path of the file whose content is sent as the user message.
        max_input_tokens: Token budget applied to the file content.
        max_output_tokens: Passed as ``max_tokens`` to the completion call.
        tokenizer: Tokenizer used for truncation and input token counting.
        async_client: Client whose ``chat.completions.create`` is awaitable
            and yields an async-iterable stream.

    Returns:
        ``(content, metrics)``. On success ``metrics`` holds ``ttft``,
        ``tokens_per_second``, ``total_tokens``, ``total_time`` and
        ``input_tokens`` (or is empty when the stream produced no content).
        On failure ``content`` is an error message and ``metrics`` contains
        zeroed values plus an ``error`` key.
    """
    try:
        # Read content from file with token limit
        note_content = read_file_with_token_limit(file_path, max_input_tokens, tokenizer)

        # Prepare messages with content from the file
        messages = [
            {"role": "system", "content": "Твоя задача - рассказать о том, что происходит в тексте, который будет передан в поле content. Суммаризация должна быть краткой, но и содержательной, не упускающей никаких деталей, структурированный ответ по пунктам. Суммаризация должна быть на русском языке."},
            {"role": "user", "content": note_content},
        ]

        # Count input tokens
        input_token_count = count_input_tokens(messages, tokenizer)
        print(f"Input tokens for {file_path}: {input_token_count}")

        # perf_counter is monotonic, so the measured intervals cannot be
        # skewed by system clock adjustments.
        start_time = time.perf_counter()
        first_token_time = None
        ttft = None
        # Counts streamed content chunks; this equals the number of generated
        # tokens only if the server emits one token per chunk - TODO confirm.
        token_count = 0

        chat_response = await async_client.chat.completions.create(
            model="qwen3",
            messages=messages,
            max_tokens=max_output_tokens,
            stream=True,
            extra_body={
                "chat_template_kwargs": {
                    "enable_thinking": False
                    }
                }
        )

        pieces = []
        async for chunk in chat_response:
            # A stream may contain chunks with an empty `choices` list (the
            # OpenAI API documents this for the usage chunk); indexing [0]
            # on those would raise IndexError, so skip them.
            if not chunk.choices:
                continue
            delta_text = chunk.choices[0].delta.content
            if not delta_text:
                continue

            if first_token_time is None:
                first_token_time = time.perf_counter()
                ttft = first_token_time - start_time

            pieces.append(delta_text)
            token_count += 1
            print(delta_text, end="", flush=True)

        end_time = time.perf_counter()
        content = "".join(pieces)
        total_time = end_time - start_time
        metrics = {}
        # Compare against None explicitly: a timer reading is a float and
        # must not be judged by truthiness.
        if first_token_time is not None:
            generation_time = end_time - first_token_time
            tokens_per_second = token_count / generation_time if generation_time > 0 else 0
            print(f"\n\nTime to first token (TTFT) for {file_path}: {ttft:.3f} seconds")
            print(f"Tok/sec: {tokens_per_second:.2f}")
            print(f"Total tokens: {token_count}")
            print(f"Total time: {total_time:.3f} seconds")

            metrics = {
                "ttft": ttft,
                "tokens_per_second": tokens_per_second,
                "total_tokens": token_count,
                "total_time": total_time,
                "input_tokens": input_token_count
            }

        return content, metrics
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        # Return empty result and metrics with error info
        return f"Error processing file: {str(e)}", {
            "ttft": 0,
            "tokens_per_second": 0,
            "total_tokens": 0,
            "total_time": 0,
            "input_tokens": 0,
            "error": str(e)
        }

# Set OpenAI's API key and API base to use vLLM's API server.
from openai import AsyncOpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

# Synchronous client, passed to call_llm
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

# Asynchronous client built with the same key and base URL
async_client = AsyncOpenAI(base_url=openai_api_base, api_key=openai_api_key)

# Example usage
import os
import asyncio

# Token budgets: input is truncated to max_input_tokens before sending;
# max_output_tokens is passed as the completion's max_tokens.
max_input_tokens = 25000
max_output_tokens = 2768

# Get all files in tests folder.
# os.listdir returns entries in arbitrary order, so sort them to make the
# processing order (and the order of the result dicts) reproducible.
tests_folder = "tests"
test_files = sorted(
    f for f in os.listdir(tests_folder)
    if os.path.isfile(os.path.join(tests_folder, f))
)
# test_files = ["daily.txt"]

# Synchronous version: one request per file, keyed by file name
results = {}
metrics = {}
for test_file in test_files:
    full_path = os.path.join(tests_folder, test_file)
    result, metric = call_llm(full_path, max_input_tokens, max_output_tokens, tokenizer, client)
    results[test_file] = result
    metrics[test_file] = metric

# Async version with concurrent calls
# async def process_files_async():
#     tasks = []
#     for test_file in test_files:
#         full_path = os.path.join(tests_folder, test_file)
#         task = call_llm_async(full_path, max_input_tokens, max_output_tokens, tokenizer, async_client)
#         tasks.append((test_file, task))
    
#     # Run all tasks concurrently and collect results
#     task_results = await asyncio.gather(*[task for _, task in tasks], return_exceptions=True)
    
#     # Map results back to filenames
#     results_async = {}
#     metrics_async = {}
#     for i, (test_file, _) in enumerate(tasks):
#         if isinstance(task_results[i], Exception):
#             print(f"Exception for {test_file}: {task_results[i]}")
#             results_async[test_file] = f"Error: {str(task_results[i])}"
#             metrics_async[test_file] = {
#                 "ttft": 0,
#                 "tokens_per_second": 0,
#                 "total_tokens": 0,
#                 "total_time": 0,
#                 "input_tokens": 0,
#                 "error": str(task_results[i])
#             }
#         else:
#             content, metrics = task_results[i]
#             results_async[test_file] = content
#             metrics_async[test_file] = metrics
    
#     return results_async, metrics_async

# # Run async version
# results_async, metrics_async = await process_files_async()

Input tokens: 2260
**Суммаризация текста:**

1. **Общие требования к стикерам для Telegram:**
   - Формат: TGC (экспортируется из After Effects с плагином Budimovim).
   - Максимальный размер файла: 64 КБ.
   - Максимальная продолжительность: 3 секунды.
   - Частота кадров: до 60 FPS.
   - Размер холста: 512x512 пикселей.
   - Прозрачность фона.

2. **Общие визуальные требования:**
   - Все стикеры должны быть в формате PNG с прозрачным фоном.
   - Размер: 512x512 пикселей.
   - Обводка: светлая (чтобы не терялась на темном фоне).
   - Мимика: живая, но не переигрывающая.
   - Фон: только в сюжетных стикерах и должен быть легким.

3. **Стикер-пак без текста (универсальные эмоции):**
   - **Медитация:** Маскот в позе лотоса, закрытые глаза, эмоция спокойствия и принятия.
   - **Фокус:** Маскот смотрит в ноутбук, эмоция концентрации.
   - **Радостное приветствие:** Маскот улыбается, машет крылышком, используется как приветствие.
   - **Усталость:** Маскот лежит, глаза прикрыты, язык высо

In [2]:
import json
import os
# Get the model name from the API
import requests
try:
    # Bound the wait so an unreachable server fails fast instead of hanging
    # the cell indefinitely (requests applies no timeout by default).
    response = requests.get('http://localhost:8000/v1/models', timeout=10)
    models_data = response.json()
    # Use the last path component of the first model's 'root' as the name
    model_name = models_data['data'][0]['root'].split('/')[-1] if models_data.get('data') else 'unknown-model'
except Exception as e:
    print(f"Error getting model name: {e}")
    model_name = 'unknown-model'

# Create results directory if it doesn't exist
results_dir = f'./summary_results/A2_x2/{model_name}'
os.makedirs(results_dir, exist_ok=True)

# Write one "<stem>_result.txt" per processed file: metrics first, then the
# model output. Requires `results` and `metrics` from the run cell.
for test_file, result in results.items():
    # splitext strips only the final extension, so "a.b.txt" becomes "a.b"
    # (split(".")[0] would give "a" and could collide with "a.txt") and
    # dotfiles keep their name instead of collapsing to an empty stem.
    stem = os.path.splitext(test_file)[0]
    metrics_str = "".join(
        f"{key}: {value:.2f}\n" if isinstance(value, float) else f"{key}: {value}\n"
        for key, value in metrics[test_file].items()
    )
    with open(os.path.join(results_dir, f'{stem}_result.txt'), 'w', encoding='utf-8') as f:
        f.write(metrics_str + "\n\n" + result)

NameError: name 'results' is not defined

In [None]:
# # Create results directory if it doesn't exist
# results_dir = './results/Qwen3-4B'
# # os.makedirs(results_dir, exist_ok=False)

# for test_file, result in results_async.items():
#     with open(f'{results_dir}/{test_file.split(".")[0]}_result.txt', 'w', encoding='utf-8') as f:
#         f.write(str(metrics_async[test_file]))
#         f.write(result)