# ⚡ Optimalizace LLM Inference

**Autor:** Praut s.r.o. - AI Integration & Business Automation

V tomto notebooku se naučíme optimalizovat inference velkých jazykových modelů:

- Kvantizace modelů (4-bit, 8-bit)
- Flash Attention a další optimalizace
- Batching a throughput optimalizace
- Model caching a KV-cache
- Praktické benchmarky a měření výkonu

## Proč optimalizovat?

| Technika | Úspora paměti | Rychlost | Kvalita |
|----------|---------------|----------|---------|
| FP32 (základ) | - | 1x | 100% |
| FP16 | 50% | 1.5-2x | ~100% |
| INT8 | 75% | 2-3x | 99%+ |
| INT4 | 87.5% | 3-4x | 95-99% |

In [None]:
# Install the third-party libraries used by this notebook (IPython shell escape).
!pip install -q transformers accelerate bitsandbytes optimum auto-gptq scipy

In [None]:
import torch
import time
import gc
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
import numpy as np

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
    TextStreamer
)

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Zařízení: {device}")

# On a CUDA machine, report the name and total memory (GB, base 10) of GPU 0.
if device.type == "cuda":
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    print(f"Paměť GPU: {gpu_memory:.1f} GB")

## 1. Utility pro Měření Výkonu

In [None]:
@dataclass
class BenchmarkResult:
    """Outcome of benchmarking one model in one quantization mode."""
    model_name: str  # model identifier the benchmark was run for
    quantization: str  # free-form label of the precision/quantization (e.g. "FP16")
    memory_gb: float  # GPU memory reported when the result was created, in GB
    load_time: float  # model load time in seconds; filled in by the caller after the benchmark
    tokens_per_second: float  # generation speed in new tokens per second
    first_token_latency: float  # approximate latency of the first token, in seconds
    total_time: float  # average wall-clock time of one generation run, in seconds
    output_text: str  # sample of the generated text


def get_gpu_memory_usage() -> float:
    """Return the GPU memory currently allocated by PyTorch tensors, in GB.

    The value is ``torch.cuda.memory_allocated()`` divided by 1e9; without a
    CUDA device the function returns ``0.0``.
    """
    if not torch.cuda.is_available():
        return 0.0
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / 1e9


def clear_gpu_memory():
    """Run the Python garbage collector and release PyTorch's cached GPU memory.

    The CUDA steps (``empty_cache`` followed by ``synchronize``) are skipped
    when no CUDA device is available.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()


class InferenceBenchmark:
    """
    Benchmark harness for LLM text generation.

    Each call to :meth:`benchmark_model` appends one ``BenchmarkResult`` to
    ``self.results``; :meth:`print_results` and :meth:`compare_results`
    report on the accumulated list.
    """

    def __init__(self, test_prompt: Optional[str] = None):
        """Store the prompt used for every run (a default is used when omitted)."""
        self.test_prompt = test_prompt or "Vysvětli stručně, co je umělá inteligence:"
        self.results: List["BenchmarkResult"] = []

    @staticmethod
    def _timed_generate(model, tokenizer, inputs, max_new_tokens: int):
        """Run one greedy ``generate`` call and return ``(outputs, seconds)``.

        CUDA work is synchronized before and after the call so that the
        wall-clock interval covers the GPU work that was actually queued.
        """
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        started = time.perf_counter()
        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return outputs, time.perf_counter() - started

    def benchmark_model(
        self,
        model,
        tokenizer,
        model_name: str,
        quantization: str,
        max_new_tokens: int = 100,
        num_runs: int = 3
    ) -> "BenchmarkResult":
        """Benchmark greedy generation of ``self.test_prompt`` with ``model``.

        Args:
            model: causal LM exposing ``generate`` and ``device``.
            tokenizer: tokenizer matching ``model``.
            model_name: label stored in the result.
            quantization: label stored in the result.
            max_new_tokens: generation length of every timed run.
            num_runs: number of timed runs to average over.

        Returns:
            The ``BenchmarkResult`` that was also appended to ``self.results``.
            ``load_time`` is left at 0 for the caller to fill in.

        Raises:
            ValueError: if ``num_runs`` or ``max_new_tokens`` is below 1.
        """
        if num_runs < 1:
            raise ValueError("num_runs must be at least 1")
        if max_new_tokens < 1:
            raise ValueError("max_new_tokens must be at least 1")

        # The prompt never changes, so tokenize it once for all runs.
        inputs = tokenizer(self.test_prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        # Warm-up run: keeps one-off initialisation cost out of the timings.
        self._timed_generate(model, tokenizer, inputs, 10)

        # Time to first token: prompt processing plus exactly one new token.
        _, first_token_latency = self._timed_generate(model, tokenizer, inputs, 1)

        times = []
        total_tokens = 0
        output_text = ""

        for _ in range(num_runs):
            outputs, elapsed = self._timed_generate(
                model, tokenizer, inputs, max_new_tokens
            )

            # Keep only the newly generated part of the sequence.
            generated_tokens = outputs[0][prompt_length:]
            output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

            times.append(elapsed)
            total_tokens += len(generated_tokens)

        total_elapsed = sum(times)
        avg_time = total_elapsed / num_runs
        tokens_per_sec = total_tokens / total_elapsed if total_elapsed > 0 else 0.0

        result = BenchmarkResult(
            model_name=model_name,
            quantization=quantization,
            memory_gb=get_gpu_memory_usage(),
            load_time=0,  # Set externally
            tokens_per_second=tokens_per_sec,
            first_token_latency=first_token_latency,
            total_time=avg_time,
            output_text=output_text[:200]
        )

        self.results.append(result)
        return result

    def print_results(self):
        """Print memory, speed and total time of every stored result."""
        print("\n" + "="*80)
        print("BENCHMARK VÝSLEDKY")
        print("="*80)

        for r in self.results:
            print(f"\n{r.model_name} ({r.quantization})")
            print("-" * 40)
            print(f"  Paměť GPU: {r.memory_gb:.2f} GB")
            print(f"  Rychlost: {r.tokens_per_second:.1f} tokens/s")
            print(f"  Celkový čas: {r.total_time:.2f}s")

    def compare_results(self):
        """Print memory and speed of every result relative to the first one.

        Nothing is printed when there are no results. A ratio is reported as
        0 when the baseline value is not positive. Always returns ``None``.
        """
        if not self.results:
            return None

        baseline = self.results[0]

        print("\nPOROVNÁNÍ (relativně k prvnímu modelu):")
        print("-" * 60)

        for r in self.results:
            mem_ratio = r.memory_gb / baseline.memory_gb if baseline.memory_gb > 0 else 0
            speed_ratio = r.tokens_per_second / baseline.tokens_per_second if baseline.tokens_per_second > 0 else 0

            print(f"{r.quantization:15} | Paměť: {mem_ratio:.2f}x | Rychlost: {speed_ratio:.2f}x")

## 2. Kvantizace Modelů

### 2.1 Načtení v různých precizích

In [None]:
class ModelLoader:
    """
    Factory helpers that load a causal LM and its tokenizer at a given precision.

    Every public loader returns ``(model, tokenizer, load_time)`` where
    ``load_time`` is the wall-clock time in seconds spent loading both.
    """

    @staticmethod
    def _timed_load(model_name: str, **model_kwargs):
        """Load tokenizer and model with ``device_map="auto"``, timing both together."""
        start = time.time()
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            **model_kwargs
        )
        load_time = time.time() - start

        print(f"  Načteno za {load_time:.1f}s, paměť: {get_gpu_memory_usage():.2f} GB")
        return model, tokenizer, load_time

    @staticmethod
    def load_fp16(model_name: str):
        """Load the model with half-precision (float16) weights."""
        print(f"Načítám {model_name} (FP16)...")
        clear_gpu_memory()

        return ModelLoader._timed_load(model_name, torch_dtype=torch.float16)

    @staticmethod
    def load_8bit(model_name: str):
        """Load the model with bitsandbytes 8-bit quantization (threshold 6.0)."""
        print(f"Načítám {model_name} (8-bit)...")
        clear_gpu_memory()

        int8_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0
        )
        return ModelLoader._timed_load(model_name, quantization_config=int8_config)

    @staticmethod
    def load_4bit(model_name: str, use_double_quant: bool = True):
        """Load the model with bitsandbytes 4-bit NF4 quantization.

        Computation runs in float16; ``use_double_quant`` toggles the
        bitsandbytes double-quantization option.
        """
        print(f"Načítám {model_name} (4-bit)...")
        clear_gpu_memory()

        nf4_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",  # NormalFloat4
            bnb_4bit_use_double_quant=use_double_quant
        )
        return ModelLoader._timed_load(model_name, quantization_config=nf4_config)

In [None]:
# Benchmark of the different quantization modes.
# A smaller model is used for the demonstration.
MODEL_NAME = "microsoft/phi-2"  # 2.7B parameters

# Alternatives for different GPU sizes:
# "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - 1.1B (small GPU)
# "microsoft/phi-2" - 2.7B (medium GPU)
# "mistralai/Mistral-7B-Instruct-v0.2" - 7B (large GPU)

# Shared benchmark instance; the cells below append their results to it.
benchmark = InferenceBenchmark(
    test_prompt="Vysvětli stručně v jedné větě, co je strojové učení:"
)

In [None]:
# FP16 benchmark (skipped when no CUDA device is available).
if torch.cuda.is_available():
    # Pre-bind the names so the clean-up in `finally` works even if loading fails.
    model_fp16 = tokenizer_fp16 = None
    try:
        model_fp16, tokenizer_fp16, load_time_fp16 = ModelLoader.load_fp16(MODEL_NAME)
        result_fp16 = benchmark.benchmark_model(
            model_fp16, tokenizer_fp16, MODEL_NAME, "FP16", max_new_tokens=50
        )
        result_fp16.load_time = load_time_fp16

        print(f"\nVýstup: {result_fp16.output_text}")
    except Exception as e:
        # Best-effort demo cell: report the failure and let the notebook continue.
        print(f"FP16 test selhal: {e}")
    finally:
        # Always release the model, also after a failure, so it does not stay
        # on the GPU and distort the memory readings of the following cells.
        del model_fp16, tokenizer_fp16
        clear_gpu_memory()

In [None]:
# 8-bit benchmark (skipped when no CUDA device is available).
if torch.cuda.is_available():
    # Pre-bind the names so the clean-up in `finally` works even if loading fails.
    model_8bit = tokenizer_8bit = None
    try:
        model_8bit, tokenizer_8bit, load_time_8bit = ModelLoader.load_8bit(MODEL_NAME)
        result_8bit = benchmark.benchmark_model(
            model_8bit, tokenizer_8bit, MODEL_NAME, "8-bit", max_new_tokens=50
        )
        result_8bit.load_time = load_time_8bit

        print(f"\nVýstup: {result_8bit.output_text}")
    except Exception as e:
        # Best-effort demo cell: report the failure and let the notebook continue.
        print(f"8-bit test selhal: {e}")
    finally:
        # Always release the model, also after a failure, so it does not stay
        # on the GPU and distort the memory readings of the following cells.
        del model_8bit, tokenizer_8bit
        clear_gpu_memory()

In [None]:
# 4-bit benchmark (skipped when no CUDA device is available).
if torch.cuda.is_available():
    # Pre-bind the names so the clean-up in `finally` works even if loading fails.
    model_4bit = tokenizer_4bit = None
    try:
        model_4bit, tokenizer_4bit, load_time_4bit = ModelLoader.load_4bit(MODEL_NAME)
        result_4bit = benchmark.benchmark_model(
            model_4bit, tokenizer_4bit, MODEL_NAME, "4-bit", max_new_tokens=50
        )
        result_4bit.load_time = load_time_4bit

        print(f"\nVýstup: {result_4bit.output_text}")
    except Exception as e:
        # Best-effort demo cell: report the failure and let the notebook continue.
        print(f"4-bit test selhal: {e}")
    finally:
        # Always release the model, also after a failure, so it does not stay
        # on the GPU and distort the memory readings of the following cells.
        del model_4bit, tokenizer_4bit
        clear_gpu_memory()

In [None]:
# Benchmark results: per-model summary, then ratios relative to the first result.
benchmark.print_results()
benchmark.compare_results()

## 3. Optimalizace Inference

### 3.1 Optimalizovaný Generator

In [None]:
class OptimizedGenerator:
    """
    Text generator that combines quantization, optional Flash Attention 2
    and batched generation.
    """

    def __init__(
        self,
        model_name: str,
        quantization: str = "4bit",  # fp16, 8bit, 4bit
        use_flash_attention: bool = True
    ):
        """Load the tokenizer and the model.

        Args:
            model_name: model identifier passed to ``from_pretrained``.
            quantization: ``"8bit"`` or ``"4bit"`` select a bitsandbytes
                config, ``"fp16"`` loads float16 weights; any other value
                loads the model without quantization or dtype override.
            use_flash_attention: try to load with Flash Attention 2 and fall
                back to the default attention when that fails.
        """
        self.model_name = model_name
        self.quantization = quantization

        # Tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Generation continues from the last position of every row, so batched
        # prompts must be padded on the left; batch_generate() also relies on
        # this when it slices the prompt off the output.
        self.tokenizer.padding_side = "left"

        quant_config = self._get_quant_config()

        # NOTE(security): trust_remote_code=True executes Python code shipped
        # with the model repository; only use it with repositories you trust.
        model_kwargs = {
            "device_map": "auto",
            "trust_remote_code": True
        }

        if quant_config is not None:
            model_kwargs["quantization_config"] = quant_config
        elif quantization == "fp16":
            model_kwargs["torch_dtype"] = torch.float16

        print(f"Načítám model {model_name} ({quantization})...")
        self.model = self._load_model(model_name, model_kwargs, use_flash_attention)
        print(f"Model načten. Paměť: {get_gpu_memory_usage():.2f} GB")

    @staticmethod
    def _load_model(model_name: str, model_kwargs: dict, use_flash_attention: bool):
        """Load the model, preferring Flash Attention 2 when requested.

        Requesting the attention implementation only fails inside
        ``from_pretrained``, so the fallback has to wrap the load itself.
        """
        if use_flash_attention:
            try:
                return AutoModelForCausalLM.from_pretrained(
                    model_name,
                    attn_implementation="flash_attention_2",
                    **model_kwargs
                )
            except (ImportError, ValueError):
                print("Flash Attention není k dispozici, používám standardní attention.")
        return AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

    def _get_quant_config(self):
        """Return the bitsandbytes config for ``self.quantization`` or ``None``."""
        if self.quantization == "8bit":
            return BitsAndBytesConfig(load_in_8bit=True)
        if self.quantization == "4bit":
            return BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True
            )
        return None

    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 256,
        temperature: float = 0.7,
        top_p: float = 0.9,
        top_k: int = 50,
        repetition_penalty: float = 1.1,
        do_sample: bool = True,
        stream: bool = False
    ) -> str:
        """Generate a continuation of ``prompt`` and return only the new text.

        The sampling parameters (``temperature``, ``top_p``, ``top_k``,
        ``repetition_penalty``) are applied only when ``do_sample`` is true.
        With ``stream=True`` the tokens are additionally written out through a
        ``TextStreamer`` while they are generated.
        """
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(self.model.device)

        generation_config = {
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
        }

        if do_sample:
            generation_config.update({
                "temperature": temperature,
                "top_p": top_p,
                "top_k": top_k,
                "repetition_penalty": repetition_penalty
            })

        if stream:
            generation_config["streamer"] = TextStreamer(
                self.tokenizer, skip_special_tokens=True
            )

        # inference_mode disables autograd bookkeeping during generation.
        with torch.inference_mode():
            outputs = self.model.generate(**inputs, **generation_config)

        # Drop the prompt tokens and decode only what was generated.
        prompt_length = inputs["input_ids"].shape[1]
        return self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

    def batch_generate(
        self,
        prompts: List[str],
        max_new_tokens: int = 256,
        **kwargs
    ) -> List[str]:
        """Generate continuations for several prompts in one batch.

        Args:
            prompts: prompts to complete; an empty list yields an empty list.
            max_new_tokens: upper bound of new tokens per prompt.
            **kwargs: forwarded unchanged to ``model.generate``.

        Returns:
            The generated texts (prompt excluded), in the order of ``prompts``.
        """
        if not prompts:
            return []

        inputs = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(self.model.device)

        with torch.inference_mode():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                pad_token_id=self.tokenizer.pad_token_id,
                **kwargs
            )

        # With left padding every row's prompt ends at the same index, so one
        # slice position is valid for the whole batch.
        prompt_length = inputs["input_ids"].shape[1]
        return [
            self.tokenizer.decode(row[prompt_length:], skip_special_tokens=True)
            for row in outputs
        ]

In [None]:
# Smoke test of the optimized generator (requires a CUDA device).
if torch.cuda.is_available():
    generator = OptimizedGenerator(
        model_name=MODEL_NAME,
        quantization="4bit"
    )
    
    # Simple single-prompt test
    prompt = "Napiš tři výhody umělé inteligence:"
    
    print(f"Prompt: {prompt}\n")
    print("Generuji...")
    
    # Wall-clock time of one generate() call, including decoding.
    start = time.time()
    output = generator.generate(prompt, max_new_tokens=150)
    elapsed = time.time() - start
    
    print(f"\nVýstup:\n{output}")
    print(f"\nČas generování: {elapsed:.2f}s")

In [None]:
# Batch generation test; runs only when the `generator` cell above has executed.
if torch.cuda.is_available() and "generator" in globals():
    prompts = ["Co je Python?", "Co je JavaScript?", "Co je machine learning?"]

    print("Batch generování:")
    print("=" * 50)

    # Time one greedy batched call for all prompts.
    start = time.time()
    outputs = generator.batch_generate(prompts, max_new_tokens=50, do_sample=False)
    elapsed = time.time() - start

    # Show each question with the first 150 characters of its answer.
    for prompt, output in zip(prompts, outputs):
        print(f"\nQ: {prompt}")
        print(f"A: {output[:150]}...")

    print(f"\nCelkový čas: {elapsed:.2f}s")
    print(f"Průměr na prompt: {elapsed/len(prompts):.2f}s")

## 4. KV-Cache a Optimalizace Paměti

In [None]:
class CachedGenerator:
    """
    Generator that pre-computes the KV-cache of a fixed context and reuses it
    for several continuations.
    """

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer; no context is cached yet."""
        self.model = model
        self.tokenizer = tokenizer
        self.past_key_values = None
        self.context_tokens = None

    def set_context(self, context: str):
        """Run the context through the model once and keep its KV-cache."""
        inputs = self.tokenizer(context, return_tensors="pt").to(self.model.device)

        # no_grad (rather than inference_mode) keeps the cached tensors ordinary
        # tensors, so the cache can be copied for every later generation.
        with torch.no_grad():
            outputs = self.model(**inputs, use_cache=True)

        self.past_key_values = outputs.past_key_values
        self.context_tokens = inputs["input_ids"]

        print(f"Kontext uložen ({inputs['input_ids'].shape[1]} tokenů)")

    def generate_with_cache(
        self,
        continuation: str,
        max_new_tokens: int = 50
    ) -> str:
        """Generate text for ``context + continuation`` using the cached context.

        Args:
            continuation: text appended after the cached context.
            max_new_tokens: upper bound of newly generated tokens.

        Returns:
            The generated text, without the context and the continuation.

        Raises:
            ValueError: if no context has been set, or if ``continuation``
                produces no tokens.
        """
        import copy

        if self.past_key_values is None:
            raise ValueError("Nejprve nastavte kontext pomocí set_context()")

        # The continuation is appended to an existing sequence, so it must not
        # get its own beginning-of-sequence / special tokens.
        new_inputs = self.tokenizer(
            continuation,
            return_tensors="pt",
            add_special_tokens=False
        ).to(self.model.device)

        if new_inputs["input_ids"].shape[1] == 0:
            raise ValueError("Pokračování nesmí být prázdné.")

        combined_ids = torch.cat([self.context_tokens, new_inputs["input_ids"]], dim=1)

        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.tokenizer.eos_token_id

        # generate() may extend a cache object in place; working on a copy keeps
        # the stored context cache pristine for the next call.
        cache_copy = copy.deepcopy(self.past_key_values)

        with torch.inference_mode():
            outputs = self.model.generate(
                input_ids=combined_ids,
                # Single unpadded sequence: every position is a real token.
                attention_mask=torch.ones_like(combined_ids),
                past_key_values=cache_copy,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=pad_token_id
            )

        generated = outputs[0][combined_ids.shape[1]:]
        return self.tokenizer.decode(generated, skip_special_tokens=True)

    def clear_cache(self):
        """Drop the cached context and release GPU memory."""
        self.past_key_values = None
        self.context_tokens = None
        clear_gpu_memory()

In [None]:
# KV-cache demonstration (explanatory only: it prints a description and a
# usage example and does not run a model).
print("KV-Cache umožňuje:")
print("  1. Uložit kontext (systémový prompt, dokument)")
print("  2. Rychle generovat odpovědi bez přepočítávání kontextu")
print("  3. Úspora 50-80% času při opakovaných dotazech")
print("\nPříklad použití:")
print("""
cached_gen = CachedGenerator(model, tokenizer)
cached_gen.set_context("Dlouhý dokument o AI...")

# Rychlé odpovědi na otázky
answer1 = cached_gen.generate_with_cache("Otázka: Co je AI?")
answer2 = cached_gen.generate_with_cache("Otázka: Jaké jsou výhody?") 
""")

## 5. Throughput Optimalizace

In [None]:
class ThroughputOptimizer:
    """
    Throughput measurements for batched generation.
    """

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer used by the measurements."""
        self.model = model
        self.tokenizer = tokenizer

    def _sync(self):
        """Wait for queued CUDA work so wall-clock timings are meaningful."""
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def find_optimal_batch_size(self, max_batch: int = 32, max_tokens: int = 100) -> int:
        """Find the batch size with the highest request throughput.

        Batch sizes 1, 2, 4, ... up to ``max_batch`` are tried with a fixed
        test prompt; the search stops at the first out-of-memory error.

        Args:
            max_batch: largest batch size to try.
            max_tokens: number of new tokens generated per request.

        Returns:
            The batch size with the best requests-per-second value, or 1 when
            no batch size could be measured.
        """
        test_prompt = "Test prompt for batch sizing."

        optimal_batch = 1
        best_throughput = 0

        batch_size = 1
        while batch_size <= max_batch:
            try:
                clear_gpu_memory()
                # Identical prompts have identical length, so no padding occurs.
                inputs = self.tokenizer(
                    [test_prompt] * batch_size, return_tensors="pt", padding=True
                ).to(self.model.device)

                self._sync()
                start = time.perf_counter()
                with torch.inference_mode():
                    _ = self.model.generate(
                        **inputs,
                        max_new_tokens=max_tokens,
                        do_sample=False,
                        pad_token_id=self.tokenizer.pad_token_id
                    )
                self._sync()
                elapsed = time.perf_counter() - start

                throughput = batch_size / elapsed

                print(f"  Batch {batch_size}: {throughput:.2f} req/s, paměť: {get_gpu_memory_usage():.2f} GB")

                if throughput > best_throughput:
                    best_throughput = throughput
                    optimal_batch = batch_size

            except RuntimeError as e:
                # Out-of-memory ends the search; anything else is a real error.
                if "out of memory" in str(e).lower():
                    print(f"  Batch {batch_size}: OOM")
                    clear_gpu_memory()
                    break
                raise

            batch_size *= 2

        return optimal_batch

    def benchmark_throughput(
        self,
        prompts: List[str],
        batch_size: int,
        max_tokens: int = 100
    ) -> Dict[str, float]:
        """Measure throughput of greedy generation over ``prompts``.

        Args:
            prompts: prompts to process; they are split into chunks of
                ``batch_size``.
            batch_size: number of prompts per ``generate`` call (at least 1).
            max_tokens: upper bound of new tokens per prompt.

        Returns:
            A dict with ``total_requests``, ``total_time`` (seconds),
            ``requests_per_second``, ``tokens_per_second`` (generated tokens
            only; prompt and padding tokens are not counted) and
            ``avg_latency`` (total time divided by the number of requests).
            All values are zero for an empty ``prompts`` list.

        Raises:
            ValueError: if ``batch_size`` is below 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        total_requests = len(prompts)
        if total_requests == 0:
            return {
                "total_requests": 0,
                "total_time": 0.0,
                "requests_per_second": 0.0,
                "tokens_per_second": 0.0,
                "avg_latency": 0.0
            }

        total_tokens = 0
        pad_token_id = self.tokenizer.pad_token_id

        # Generation continues from the last position of each row, so batches
        # of unequal prompts need left padding. The caller's tokenizer setting
        # is restored afterwards.
        previous_side = self.tokenizer.padding_side
        self.tokenizer.padding_side = "left"
        try:
            self._sync()
            start = time.perf_counter()

            for i in range(0, total_requests, batch_size):
                batch = prompts[i:i + batch_size]

                inputs = self.tokenizer(
                    batch, return_tensors="pt", padding=True, truncation=True
                ).to(self.model.device)
                prompt_length = inputs["input_ids"].shape[1]

                with torch.inference_mode():
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=max_tokens,
                        do_sample=False,
                        pad_token_id=pad_token_id
                    )

                # Count only newly generated tokens; rows that finished early
                # are filled with the pad token, which is excluded.
                generated = outputs[:, prompt_length:]
                if pad_token_id is None:
                    total_tokens += generated.numel()
                else:
                    total_tokens += int((generated != pad_token_id).sum())

            self._sync()
            elapsed = time.perf_counter() - start
        finally:
            self.tokenizer.padding_side = previous_side

        return {
            "total_requests": total_requests,
            "total_time": elapsed,
            "requests_per_second": total_requests / elapsed,
            "tokens_per_second": total_tokens / elapsed,
            "avg_latency": elapsed / total_requests
        }

In [None]:
# Throughput test; runs only when the `generator` cell above has executed.
if torch.cuda.is_available() and "generator" in globals():
    optimizer = ThroughputOptimizer(generator.model, generator.tokenizer)

    # Search batch sizes up to 16 with 50 new tokens per request.
    print("Hledám optimální batch size...")
    optimal = optimizer.find_optimal_batch_size(max_batch=16, max_tokens=50)
    print(f"\nOptimální batch size: {optimal}")

## 6. Produkční Inference Server

In [None]:
from queue import Queue
from threading import Thread
from dataclasses import field
import uuid

@dataclass
class InferenceRequest:
    """A single generation request waiting in the server queue."""
    request_id: str  # unique identifier used to look up the response
    prompt: str  # text to generate a continuation for
    max_tokens: int = 256  # requested limit of newly generated tokens
    temperature: float = 0.7  # NOTE(review): not read by the server code visible in this file — confirm intent
    created_at: float = field(default_factory=time.time)  # time.time() at creation; basis for queue_time


@dataclass
class InferenceResponse:
    """Result of one processed inference request."""
    request_id: str  # identifier of the request this response belongs to
    text: str  # decoded generated text
    tokens_generated: int  # number of tokens in the generated part of the output
    latency: float  # generation time of the batch divided by the batch size, in seconds
    queue_time: float  # seconds between request creation and response creation


class InferenceServer:
    """
    Simple in-process inference server with batched processing.

    Requests are put on a queue by :meth:`submit`; one background worker
    thread groups them into batches, runs greedy generation and stores the
    responses for :meth:`get_response`.
    """

    def __init__(
        self,
        model,
        tokenizer,
        batch_size: int = 4,
        max_wait_time: float = 0.1
    ):
        """Create a stopped server.

        Args:
            model: causal LM exposing ``generate`` and ``device``.
            tokenizer: tokenizer matching ``model``.
            batch_size: maximum number of requests per batch.
            max_wait_time: seconds to wait for more requests after the first
                request of a batch has arrived.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_wait_time = max_wait_time

        self.request_queue = Queue()
        self.response_dict = {}
        self.running = False
        self.worker_thread = None
        self.stats = {
            "total_requests": 0,
            "total_tokens": 0,
            "total_time": 0
        }

    def start(self):
        """Start the worker thread; does nothing if it is already running."""
        if self.worker_thread is not None and self.worker_thread.is_alive():
            return
        self.running = True
        self.worker_thread = Thread(target=self._process_loop, daemon=True)
        self.worker_thread.start()
        print("Inference server spuštěn.")

    def stop(self):
        """Ask the worker to stop and wait (up to 5 s) for it to finish."""
        self.running = False
        worker = self.worker_thread
        if worker is not None and worker.is_alive():
            # A batch in progress is finished first; do not block forever on it.
            worker.join(timeout=5.0)
        print("Inference server zastaven.")

    def submit(self, prompt: str, max_tokens: int = 256) -> str:
        """Queue a request and return its ``request_id``."""
        request_id = str(uuid.uuid4())
        request = InferenceRequest(
            request_id=request_id,
            prompt=prompt,
            max_tokens=max_tokens
        )
        self.request_queue.put(request)
        return request_id

    def get_response(self, request_id: str, timeout: float = 30) -> Optional["InferenceResponse"]:
        """Poll for the response of ``request_id``.

        Returns the response (removing it from the server), or ``None`` when
        it does not arrive within ``timeout`` seconds.
        """
        start = time.time()
        while time.time() - start < timeout:
            if request_id in self.response_dict:
                return self.response_dict.pop(request_id)
            time.sleep(0.01)
        return None

    def _process_loop(self):
        """Worker loop: collect requests into batches and process them."""
        from queue import Empty

        while self.running:
            batch = []
            deadline = None

            # Collect a batch. The waiting window starts with the first
            # request, and the loop also ends when the server is stopped.
            while self.running and len(batch) < self.batch_size:
                if deadline is not None and time.monotonic() >= deadline:
                    break
                try:
                    request = self.request_queue.get(timeout=0.01)
                except Empty:
                    continue
                batch.append(request)
                if deadline is None:
                    deadline = time.monotonic() + self.max_wait_time

            if batch:
                try:
                    self._process_batch(batch)
                except Exception as exc:
                    # Thread boundary: report the failure and keep serving the
                    # following requests instead of silently losing the worker.
                    print(f"Zpracování batche selhalo: {exc}")

    def _process_batch(self, batch: List["InferenceRequest"]):
        """Run generation for one batch and store a response per request."""
        prompts = [r.prompt for r in batch]
        max_tokens = max(r.max_tokens for r in batch)
        pad_token_id = self.tokenizer.pad_token_id

        # Generation continues from the last position of each row, so prompts
        # of different length need left padding. The caller's tokenizer
        # setting is restored right after tokenization.
        previous_side = self.tokenizer.padding_side
        self.tokenizer.padding_side = "left"
        try:
            inputs = self.tokenizer(
                prompts,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(self.model.device)
        finally:
            self.tokenizer.padding_side = previous_side
        prompt_length = inputs["input_ids"].shape[1]

        # Inference
        start = time.time()
        with torch.inference_mode():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=False,
                pad_token_id=pad_token_id
            )
        inference_time = time.time() - start

        for request, output in zip(batch, outputs):
            # The batch runs with the largest max_tokens of its requests; cut
            # every output back to the limit of its own request.
            generated = output[prompt_length:prompt_length + request.max_tokens]
            # Rows that finished early are filled with the pad token.
            if pad_token_id is not None:
                generated = generated[generated != pad_token_id]
            text = self.tokenizer.decode(generated, skip_special_tokens=True)

            response = InferenceResponse(
                request_id=request.request_id,
                text=text,
                tokens_generated=len(generated),
                # Batch time amortized over the requests of the batch.
                latency=inference_time / len(batch),
                queue_time=time.time() - request.created_at
            )

            self.response_dict[request.request_id] = response

            # Statistics
            self.stats["total_requests"] += 1
            self.stats["total_tokens"] += len(generated)
            self.stats["total_time"] += response.latency

    def get_stats(self) -> Dict[str, float]:
        """Return a copy of the counters, plus averages once requests exist."""
        if self.stats["total_requests"] == 0:
            return dict(self.stats)

        return {
            **self.stats,
            "avg_latency": self.stats["total_time"] / self.stats["total_requests"],
            "avg_tokens": self.stats["total_tokens"] / self.stats["total_requests"]
        }

In [None]:
# Inference server test; runs only when the `generator` cell above has executed.
if torch.cuda.is_available() and 'generator' in globals():
    server = InferenceServer(
        model=generator.model,
        tokenizer=generator.tokenizer,
        batch_size=4,
        max_wait_time=0.05
    )

    server.start()

    # try/finally guarantees the worker is stopped even if the cell fails.
    try:
        # Submit the requests
        prompts = [
            "Co je Python?",
            "Vysvětli AI:",
            "Co je cloud computing?",
            "Popiš machine learning:"
        ]

        request_ids = []
        for prompt in prompts:
            rid = server.submit(prompt, max_tokens=50)
            request_ids.append(rid)
            print(f"Odesláno: {prompt[:30]}... -> {rid[:8]}")

        # Wait for the responses
        print("\nOdpovědi:")
        print("-" * 50)

        for rid in request_ids:
            response = server.get_response(rid, timeout=30)
            if response is None:
                # Report a missing answer instead of skipping it silently.
                print(f"\n[{rid[:8]}] Odpověď nedorazila do 30 s.")
                continue
            print(f"\n[{response.tokens_generated} tokenů, {response.latency:.2f}s]")
            print(f"{response.text[:150]}...")

        # Statistics
        print("\n" + "="*50)
        print("STATISTIKY SERVERU:")
        for k, v in server.get_stats().items():
            print(f"  {k}: {v:.3f}" if isinstance(v, float) else f"  {k}: {v}")
    finally:
        server.stop()

## Shrnutí

V tomto notebooku jsme se naučili:

1. **Kvantizace** - FP16, 8-bit, 4-bit pro úsporu paměti
2. **Benchmarking** - měření výkonu a porovnání
3. **Optimalizovaný generátor** - s flash attention a dalšími technikami
4. **KV-Cache** - pro efektivní konverzace
5. **Throughput optimalizace** - nalezení optimálního batch size
6. **Inference server** - pro produkční nasazení

### Doporučení pro produkci:
- Použijte 4-bit kvantizaci pro maximální úsporu paměti
- Optimalizujte batch size podle vaší GPU
- Implementujte request batching pro vysoký throughput
- Sledujte metriky: latence, throughput, využití GPU