# 15. Multimodální Modely

**Autor:** Praut s.r.o. - AI Integration & Business Automation

V tomto notebooku se naučíme:
- Základy multimodálních modelů (text + obraz)
- Image Captioning - popis obrázků
- Visual Question Answering (VQA)
- CLIP - spojení textu a obrazu
- Multimodální embeddingy
- Praktické aplikace v byznysu

In [None]:
# Install the required libraries
!pip install transformers accelerate pillow requests torch torchvision -q
!pip install open_clip_torch sentence-transformers -q

In [None]:
import torch
import requests
from PIL import Image
from io import BytesIO
import numpy as np
from typing import List, Dict, Any, Optional, Tuple
import matplotlib.pyplot as plt
from transformers import (
    AutoProcessor, 
    AutoModelForCausalLM,
    BlipProcessor,
    BlipForConditionalGeneration,
    BlipForQuestionAnswering,
    CLIPProcessor,
    CLIPModel,
    AutoTokenizer
)
import warnings
warnings.filterwarnings('ignore')

# Select the compute device: a CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Pou≈æ√≠v√°m za≈ô√≠zen√≠: {device}")
if device == "cuda":
    # Report which GPU (index 0) will be used
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Helper functions for loading and displaying images

def load_image_from_url(url: str) -> Image.Image:
    """Download the image at ``url`` and return it converted to RGB.

    The request uses a 10-second timeout, and ``raise_for_status`` is
    called on the response before the body is decoded.
    """
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    payload = BytesIO(resp.content)
    picture = Image.open(payload)
    return picture.convert("RGB")

def show_image(image: Image.Image, title: str = ""):
    """Show a single image in an 8x8 inch figure with an optional title."""
    _, axis = plt.subplots(figsize=(8, 8))
    axis.imshow(image)
    axis.set_title(title)
    # Hide ticks and frame; only the picture matters here
    axis.axis("off")
    plt.show()

def show_images_grid(images: List[Image.Image], titles: Optional[List[str]] = None, cols: int = 3):
    """Display images in a grid of subplots with ``cols`` images per row.

    Args:
        images: Images to draw, laid out row by row.
        titles: Optional per-image titles. Images beyond the end of this
            list are drawn without a title.
        cols: Number of grid columns; must be at least 1.

    Raises:
        ValueError: If ``cols`` is smaller than 1.

    Nothing is drawn when ``images`` is empty.
    """
    if cols < 1:
        raise ValueError("cols must be at least 1")

    n = len(images)
    if n == 0:
        # A grid with zero rows cannot be created, so there is nothing to show.
        return

    rows = (n + cols - 1) // cols  # ceiling division

    # squeeze=False always returns a 2-D array of axes, so flattening works
    # for every rows/cols combination. Without it, a single image in a
    # multi-column grid yields a 1-D array that the old `[axes]` wrapping
    # turned into a list holding an array instead of individual axes.
    fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows), squeeze=False)
    axes = axes.flatten()

    for i, (ax, img) in enumerate(zip(axes, images)):
        ax.imshow(img)
        if titles and i < len(titles):
            ax.set_title(titles[i], fontsize=10)
        ax.axis("off")

    # Hide the unused subplots at the end of the last row
    for ax in axes[n:]:
        ax.axis("off")

    plt.tight_layout()
    plt.show()

## 1. Image Captioning - BLIP

BLIP (Bootstrapping Language-Image Pre-training) je model pro:
- Generování popisů obrázků
- Visual Question Answering
- Image-Text Matching

In [None]:
class ImageCaptioner:
    """Generate natural-language captions for images with a BLIP model."""

    def __init__(self, model_name: str = "Salesforce/blip-image-captioning-base"):
        """Load the BLIP processor and captioning model.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.
                The model is moved to the module-level ``device`` and put
                into eval mode.
        """
        print(f"Naƒç√≠t√°m model: {model_name}")
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)
        self.model.eval()
        print("Model naƒçten!")

    def generate_caption(self,
                        image: Image.Image,
                        conditional_text: Optional[str] = None,
                        max_length: int = 50,
                        num_beams: int = 4) -> str:
        """
        Generate one caption for an image using beam search.

        Args:
            image: PIL image to describe.
            conditional_text: Optional text prefix that steers generation;
                an empty string or None means unconditional captioning.
            max_length: Maximum length passed to ``generate``.
            num_beams: Number of beams for beam search.

        Returns:
            The decoded caption with special tokens removed.
        """
        # Build model inputs, with or without the text prefix
        if conditional_text:
            inputs = self.processor(image, conditional_text, return_tensors="pt").to(device)
        else:
            inputs = self.processor(image, return_tensors="pt").to(device)

        # Generate without tracking gradients (inference only)
        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_length=max_length,
                num_beams=num_beams
            )

        return self.processor.decode(output[0], skip_special_tokens=True)

    def generate_multiple_captions(self,
                                   image: Image.Image,
                                   num_captions: int = 3,
                                   temperature: float = 0.9,
                                   max_length: int = 50,
                                   top_p: float = 0.9) -> List[str]:
        """
        Sample several captions for the same image.

        Args:
            image: PIL image to describe.
            num_captions: Number of sampling attempts.
            temperature: Sampling temperature passed to ``generate``.
            max_length: Maximum length passed to ``generate``.
            top_p: Nucleus-sampling threshold passed to ``generate``.

        Returns:
            The distinct captions in the order they were first produced.
            Duplicates are dropped, so the list may be shorter than
            ``num_captions``.
        """
        # The image is encoded once and reused for every sample
        inputs = self.processor(image, return_tensors="pt").to(device)

        captions = []
        with torch.no_grad():
            for _ in range(num_captions):
                output = self.model.generate(
                    **inputs,
                    max_length=max_length,
                    do_sample=True,
                    temperature=temperature,
                    top_p=top_p
                )
                caption = self.processor.decode(output[0], skip_special_tokens=True)
                if caption not in captions:
                    captions.append(caption)

        return captions

In [None]:
# Instantiate the captioner with its default BLIP captioning model
captioner = ImageCaptioner()

In [None]:
# Try the captioner on a few sample images

test_urls = [
    "https://images.unsplash.com/photo-1587300003388-59208cc962cb?w=400",  # Dog
    "https://images.unsplash.com/photo-1460925895917-afdab827c52f?w=400",  # Office
    "https://images.unsplash.com/photo-1551288049-bebda4e38f71?w=400",  # Dashboard
]

print("Generuji popisy obr√°zk≈Ø...")
print("=" * 60)

for i, url in enumerate(test_urls, 1):
    try:
        image = load_image_from_url(url)
        caption = captioner.generate_caption(image)
        
        print(f"\nObr√°zek {i}:")
        print(f"Popis: {caption}")
        
        # Show the image with its caption as the title
        show_image(image, caption)
        
    except Exception as e:
        # Report the failure and continue with the remaining images
        print(f"Chyba p≈ôi zpracov√°n√≠ obr√°zku {i}: {e}")

In [None]:
# Guided generation: steer the caption with a text prefix

url = "https://images.unsplash.com/photo-1460925895917-afdab827c52f?w=400"
image = load_image_from_url(url)

# Prefixes passed as conditional_text, one caption per prefix
prefixes = [
    "a photo of",
    "this image shows",
    "in this picture we can see",
    "the scene depicts"
]

print("≈ò√≠zen√© generov√°n√≠ s r≈Øzn√Ωmi prefixy:")
print("=" * 60)
for prefix in prefixes:
    caption = captioner.generate_caption(image, conditional_text=prefix)
    print(f"Prefix '{prefix}': {caption}")

## 2. Visual Question Answering (VQA)

VQA umožňuje klást otázky o obsahu obrázku.

In [None]:
class VisualQA:
    """Answer free-form questions about an image with a BLIP VQA model."""

    def __init__(self, model_name: str = "Salesforce/blip-vqa-base"):
        """Load the processor and VQA model and move the model to ``device``."""
        print(f"Naƒç√≠t√°m VQA model: {model_name}")
        self.processor = BlipProcessor.from_pretrained(model_name)
        vqa_model = BlipForQuestionAnswering.from_pretrained(model_name)
        self.model = vqa_model.to(device)
        self.model.eval()
        print("VQA model naƒçten!")

    def answer_question(self,
                       image: Image.Image,
                       question: str,
                       max_length: int = 30) -> str:
        """Return the model's decoded answer to ``question`` about ``image``."""
        encoded = self.processor(image, question, return_tensors="pt").to(device)

        # Inference only, so gradient tracking is switched off
        with torch.no_grad():
            generated = self.model.generate(**encoded, max_length=max_length)

        return self.processor.decode(generated[0], skip_special_tokens=True)

    def batch_qa(self,
                image: Image.Image,
                questions: List[str]) -> List[Dict[str, str]]:
        """Answer each question in turn.

        Returns one ``{"question": ..., "answer": ...}`` dict per question,
        in the order the questions were given.
        """
        return [
            {"question": item, "answer": self.answer_question(image, item)}
            for item in questions
        ]

In [None]:
# Instantiate the VQA helper with its default BLIP VQA model
vqa = VisualQA()

In [None]:
# Try VQA on a sample image

url = "https://images.unsplash.com/photo-1587300003388-59208cc962cb?w=400"
image = load_image_from_url(url)

questions = [
    "What animal is in the picture?",
    "What color is it?",
    "Is it inside or outside?",
    "How many animals are there?",
    "What is the animal doing?"
]

print("Visual Question Answering:")
print("=" * 60)
show_image(image, "Test Image")

# Ask every question about the same image and print the Q/A pairs
results = vqa.batch_qa(image, questions)
for r in results:
    print(f"Q: {r['question']}")
    print(f"A: {r['answer']}")
    print()

## 3. CLIP - Contrastive Language-Image Pre-training

CLIP spojuje text a obrázky ve společném embedding prostoru:
- Vyhledávání obrázků podle textu
- Zero-shot klasifikace obrázků
- Similarity matching

In [None]:
class CLIPSystem:
    """Wrapper around a CLIP model for embeddings, similarity and zero-shot tasks."""

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
        """Load the CLIP processor and model and move the model to ``device``."""
        print(f"Naƒç√≠t√°m CLIP model: {model_name}")
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model = CLIPModel.from_pretrained(model_name).to(device)
        self.model.eval()
        print("CLIP model naƒçten!")

    def get_image_embeddings(self, images: List[Image.Image]) -> torch.Tensor:
        """Return image embeddings scaled to unit length along the last dimension."""
        inputs = self.processor(images=images, return_tensors="pt", padding=True).to(device)

        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        return image_features

    def get_text_embeddings(self, texts: List[str]) -> torch.Tensor:
        """Return text embeddings scaled to unit length along the last dimension."""
        inputs = self.processor(text=texts, return_tensors="pt", padding=True).to(device)

        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        return text_features

    def compute_similarity(self,
                          images: List[Image.Image],
                          texts: List[str]) -> np.ndarray:
        """Return the image-by-text similarity matrix, multiplied by 100.

        Both embedding sets are unit-normalised, so the matrix product is
        the cosine similarity; rows follow ``images``, columns ``texts``.
        """
        image_emb = self.get_image_embeddings(images)
        text_emb = self.get_text_embeddings(texts)

        # Cosine similarity of unit vectors is just their dot product
        similarity = (image_emb @ text_emb.T).cpu().numpy()
        return similarity * 100  # scaled for readability

    def zero_shot_classify(self,
                          image: Image.Image,
                          labels: List[str],
                          template: str = "a photo of {}") -> Dict[str, float]:
        """Classify an image against arbitrary labels without training.

        Args:
            image: PIL image to classify.
            labels: Candidate labels.
            template: Prompt template; ``{}`` is replaced by each label.

        Returns:
            Mapping of each label to its softmax probability, in the order
            of ``labels``. An empty ``labels`` list yields an empty dict.
        """
        if not labels:
            return {}

        # Turn the labels into text prompts
        texts = [template.format(label) for label in labels]

        # Scores of the single image against every prompt
        similarity = self.compute_similarity([image], texts)[0]

        # Softmax with the maximum subtracted first: the scores are scaled
        # by 100, so exponentiating them directly can overflow and produce
        # inf/NaN. The shift does not change the resulting probabilities.
        exp_scores = np.exp(similarity - similarity.max())
        probs = exp_scores / exp_scores.sum()

        return {label: float(prob) for label, prob in zip(labels, probs)}

    def search_images(self,
                     images: List[Image.Image],
                     query: str,
                     top_k: int = 5) -> List[Tuple[int, float]]:
        """Rank ``images`` against a text query.

        Returns up to ``top_k`` ``(index, score)`` pairs, best match first,
        where ``index`` refers to the position in ``images``.
        """
        similarity = self.compute_similarity(images, [query])[:, 0]

        # Descending order of similarity
        indices = np.argsort(similarity)[::-1][:top_k]

        return [(int(idx), float(similarity[idx])) for idx in indices]

In [None]:
# Instantiate the CLIP wrapper with its default model
clip_system = CLIPSystem()

In [None]:
# Zero-shot classification demo

url = "https://images.unsplash.com/photo-1587300003388-59208cc962cb?w=400"
image = load_image_from_url(url)

labels = ["dog", "cat", "bird", "horse", "elephant", "fish"]

results = clip_system.zero_shot_classify(image, labels)

print("Zero-shot klasifikace:")
print("=" * 40)
show_image(image, "Test Image")

# Print labels from most to least probable with a text bar (30 chars = 100 %)
for label, prob in sorted(results.items(), key=lambda x: x[1], reverse=True):
    bar = "‚ñà" * int(prob * 30)
    print(f"{label:12} {prob:6.1%} {bar}")

In [None]:
# Image-Text Similarity Matrix

# Load a few images
image_urls = [
    "https://images.unsplash.com/photo-1587300003388-59208cc962cb?w=300",  # Dog
    "https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba?w=300",  # Cat
    "https://images.unsplash.com/photo-1460925895917-afdab827c52f?w=300",  # Office
]

images = [load_image_from_url(url) for url in image_urls]
texts = ["a dog", "a cat", "an office workspace", "a landscape"]

# Compute the similarity matrix (rows: images, columns: texts)
similarity = clip_system.compute_similarity(images, texts)

# Plot the matrix as a heatmap
plt.figure(figsize=(10, 6))
plt.imshow(similarity, cmap='Blues')
plt.colorbar(label='Similarity Score')
plt.xticks(range(len(texts)), texts, rotation=45, ha='right')
plt.yticks(range(len(images)), [f'Image {i+1}' for i in range(len(images))])
plt.title('Image-Text Similarity Matrix')

# Write the numeric score into every cell
for i in range(len(images)):
    for j in range(len(texts)):
        plt.text(j, i, f'{similarity[i,j]:.1f}', 
                ha='center', va='center', fontsize=10)

plt.tight_layout()
plt.show()

# Show the images themselves for reference
show_images_grid(images, [f"Image {i+1}" for i in range(len(images))])

## 4. Multimodální Vyhledávání

Kombinace CLIP s vektorovou databází pro vyhledávání obrázků.

In [None]:
class MultimodalSearchEngine:
    """In-memory image search index built on CLIP embeddings."""

    def __init__(self, clip_model: Optional[CLIPSystem] = None):
        """Create an empty index.

        Args:
            clip_model: CLIP wrapper used for embeddings; a new
                ``CLIPSystem`` is created when omitted.
        """
        self.clip = clip_model if clip_model is not None else CLIPSystem()
        self.images: List[Image.Image] = []
        # One embedding row per indexed image; None until the first add
        self.image_embeddings: Optional[torch.Tensor] = None
        self.metadata: List[Dict] = []

    def add_images(self,
                  images: List[Image.Image],
                  metadata: Optional[List[Dict]] = None):
        """Embed ``images`` and append them to the index.

        Args:
            images: Images to index.
            metadata: Optional list with one dict per image. When omitted
                or empty, every image gets an empty dict.

        Raises:
            ValueError: If ``metadata`` is given but its length differs
                from ``images``; otherwise search results would be paired
                with the wrong metadata.
        """
        if metadata and len(metadata) != len(images):
            raise ValueError(
                f"metadata has {len(metadata)} entries but {len(images)} images were given"
            )

        # Compute embeddings before touching any state so a failure here
        # leaves the index unchanged
        new_embeddings = self.clip.get_image_embeddings(images)

        self.images.extend(images)

        if metadata:
            self.metadata.extend(metadata)
        else:
            self.metadata.extend([{} for _ in images])

        if self.image_embeddings is None:
            self.image_embeddings = new_embeddings
        else:
            self.image_embeddings = torch.cat([self.image_embeddings, new_embeddings])

        print(f"P≈ôid√°no {len(images)} obr√°zk≈Ø. Celkem: {len(self.images)}")

    def _rank(self, query_emb: torch.Tensor, top_k: int) -> List[Dict]:
        """Score every indexed image against a single query embedding."""
        # The product has one row per indexed image and one column for the
        # query. Select that column explicitly: a bare squeeze() would also
        # drop the image axis when exactly one image is indexed, leaving a
        # 0-d array that cannot be indexed below.
        similarity = (self.image_embeddings @ query_emb.T)[:, 0].cpu().numpy()

        # Best matches first
        top_indices = np.argsort(similarity)[::-1][:top_k]

        return [
            {
                "index": int(idx),
                "score": float(similarity[idx]),
                "image": self.images[idx],
                "metadata": self.metadata[idx],
            }
            for idx in top_indices
        ]

    def search_by_text(self,
                      query: str,
                      top_k: int = 5) -> List[Dict]:
        """Find the indexed images that best match a text query.

        Returns up to ``top_k`` dicts with keys ``index``, ``score``,
        ``image`` and ``metadata``, best match first; an empty list when
        the index is empty.
        """
        if not self.images:
            return []

        text_emb = self.clip.get_text_embeddings([query])
        return self._rank(text_emb, top_k)

    def search_by_image(self,
                       query_image: Image.Image,
                       top_k: int = 5) -> List[Dict]:
        """Find the indexed images most similar to ``query_image``.

        Returns up to ``top_k`` dicts with keys ``index``, ``score``,
        ``image`` and ``metadata``, best match first; an empty list when
        the index is empty.
        """
        if not self.images:
            return []

        query_emb = self.clip.get_image_embeddings([query_image])
        return self._rank(query_emb, top_k)

In [None]:
# Create the search engine on top of the CLIP wrapper created earlier
search_engine = MultimodalSearchEngine(clip_system)

# Images to index, each paired with its metadata
sample_urls = [
    ("https://images.unsplash.com/photo-1587300003388-59208cc962cb?w=300", {"category": "animals", "tags": ["dog", "pet"]}),
    ("https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba?w=300", {"category": "animals", "tags": ["cat", "pet"]}),
    ("https://images.unsplash.com/photo-1460925895917-afdab827c52f?w=300", {"category": "work", "tags": ["office", "computer"]}),
    ("https://images.unsplash.com/photo-1551288049-bebda4e38f71?w=300", {"category": "work", "tags": ["dashboard", "data"]}),
    ("https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=300", {"category": "nature", "tags": ["mountains", "landscape"]}),
]

images = []
metadata = []
for url, meta in sample_urls:
    try:
        img = load_image_from_url(url)
    except Exception as e:
        # Best effort: skip images that fail to load, but report why.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        print(f"Nepoda≈ôilo se naƒç√≠st: {url} ({e})")
    else:
        # Append both together so images and metadata stay aligned
        images.append(img)
        metadata.append(meta)

search_engine.add_images(images, metadata)

In [None]:
# Search the index with text queries

queries = [
    "a cute pet",
    "working at desk with computer",
    "beautiful nature scenery"
]

for query in queries:
    print(f"\nüîç Dotaz: '{query}'")
    print("-" * 40)
    
    results = search_engine.search_by_text(query, top_k=3)
    
    # Show the hits in a grid, titled with their score and tags
    result_images = [r["image"] for r in results]
    result_titles = [f"Score: {r['score']:.2f}\n{r['metadata'].get('tags', [])}" for r in results]
    
    show_images_grid(result_images, result_titles)

## 5. Praktické Aplikace pro Byznys

In [None]:
class ProductAnalyzer:
    """Analyse product photos for e-commerce: caption, attributes, category, photo quality."""

    def __init__(self,
                 captioner_model: Optional[ImageCaptioner] = None,
                 vqa_model: Optional[VisualQA] = None,
                 clip_model: Optional[CLIPSystem] = None):
        """Wire up the three models used for the analysis.

        Each argument is optional. When one is omitted, the notebook-level
        instance (``captioner``, ``vqa`` or ``clip_system``) is used, so
        ``ProductAnalyzer()`` keeps working as before.
        """
        self.captioner = captioner_model if captioner_model is not None else captioner
        self.vqa = vqa_model if vqa_model is not None else vqa
        self.clip = clip_model if clip_model is not None else clip_system

    def analyze_product(self, image: Image.Image) -> Dict[str, Any]:
        """Run the full analysis on one product photo.

        Returns:
            A dict with keys ``description`` (caption), ``attributes``
            (VQA answers for color, material and brand_visible),
            ``categories`` and ``quality_checks`` (zero-shot label
            probabilities).
        """
        results = {
            "description": None,
            "attributes": {},
            "categories": {},
            "quality_checks": {}
        }

        # 1. Product description, steered by a fixed prefix
        results["description"] = self.captioner.generate_caption(
            image,
            conditional_text="this product is"
        )

        # 2. Product attributes, one VQA question per attribute
        attribute_questions = [
            ("color", "What is the main color?"),
            ("material", "What material is this made of?"),
            ("brand_visible", "Is there a brand logo visible?"),
        ]

        for attr_name, question in attribute_questions:
            results["attributes"][attr_name] = self.vqa.answer_question(image, question)

        # 3. Product category
        categories = ["electronics", "clothing", "furniture", "food", "toys", "sports"]
        results["categories"] = self.clip.zero_shot_classify(image, categories)

        # 4. Photo quality checks
        quality_labels = ["professional photo", "amateur photo", "blurry image", "well-lit image"]
        results["quality_checks"] = self.clip.zero_shot_classify(image, quality_labels)

        return results

    def generate_listing(self, image: Image.Image) -> str:
        """Build a text listing draft from the analysis of ``image``."""
        analysis = self.analyze_product(image)

        # The most probable category becomes the listing category
        category_scores = analysis["categories"]
        top_category = max(category_scores, key=category_scores.get)

        listing = f"""
üì¶ PRODUKTOV√ù LISTING
{'='*40}

üìù Popis:
{analysis['description']}

üè∑Ô∏è Kategorie: {top_category}

üìã Atributy:
- Barva: {analysis['attributes'].get('color', 'N/A')}
- Materi√°l: {analysis['attributes'].get('material', 'N/A')}

üìä Kvalita fotografie:
- Profesion√°ln√≠: {analysis['quality_checks'].get('professional photo', 0):.1%}
- Dobr√© osvƒõtlen√≠: {analysis['quality_checks'].get('well-lit image', 0):.1%}
"""
        return listing

In [None]:
# Try the product analyzer

analyzer = ProductAnalyzer()

# Run it on a sample image
url = "https://images.unsplash.com/photo-1523275335684-37898b6baf30?w=400"  # Watch
try:
    product_image = load_image_from_url(url)
    show_image(product_image, "Produkt k anal√Ωze")
    
    listing = analyzer.generate_listing(product_image)
    print(listing)
except Exception as e:
    # Report download or analysis failures instead of stopping the notebook
    print(f"Chyba: {e}")

In [None]:
class ContentModerator:
    """Zero-shot content moderation built on a CLIP classifier."""

    def __init__(self, clip_model: CLIPSystem):
        """Store the classifier and the label sets used for moderation."""
        # Labels counted towards the "safe" score
        safe_labels = [
            "safe content",
            "family friendly image",
            "professional photo",
            "nature scene",
            "product photo"
        ]
        # Labels counted towards the "unsafe" score
        unsafe_labels = [
            "violent content",
            "inappropriate content",
            "spam or advertisement",
            "low quality image"
        ]

        self.clip = clip_model
        self.safe_categories = safe_labels
        self.unsafe_categories = unsafe_labels

    def moderate(self, image: Image.Image) -> Dict[str, Any]:
        """Score an image against the safe and unsafe label sets.

        Returns a dict with ``is_safe``, ``safe_score``, ``unsafe_score``,
        ``detailed_scores`` (per-label scores) and ``recommendation``
        ("approve", "review" or "reject").
        """
        candidate_labels = self.safe_categories + self.unsafe_categories
        scores = self.clip.zero_shot_classify(image, candidate_labels)

        # Total score collected by each group of labels
        safe_score = sum(scores.get(name, 0) for name in self.safe_categories)
        unsafe_score = sum(scores.get(name, 0) for name in self.unsafe_categories)

        # Map the safe score onto a three-level recommendation
        if safe_score > 0.6:
            recommendation = "approve"
        elif safe_score > 0.4:
            recommendation = "review"
        else:
            recommendation = "reject"

        verdict = {
            "is_safe": safe_score > unsafe_score,
            "safe_score": safe_score,
            "unsafe_score": unsafe_score,
            "detailed_scores": scores,
            "recommendation": recommendation,
        }
        return verdict

# Try the moderator on a sample image
moderator = ContentModerator(clip_system)

url = "https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=400"
test_image = load_image_from_url(url)

result = moderator.moderate(test_image)
# Print the verdict, the recommendation and the safe score
print("V√Ωsledek moderace:")
print(f"  Bezpeƒçn√Ω: {result['is_safe']}")
print(f"  Doporuƒçen√≠: {result['recommendation']}")
print(f"  Safe score: {result['safe_score']:.2%}")

## 6. Produkční Třída pro Multimodální AI

In [None]:
class MultimodalAIService:
    """Facade over captioning, VQA and CLIP that loads each model on first use."""

    def __init__(self):
        """Set up empty model slots; nothing is loaded until it is needed."""
        print("Inicializuji MultimodalAIService...")
        self._captioner = None
        self._vqa = None
        self._clip = None
        self._cache = {}
        print("Slu≈æba p≈ôipravena (lazy loading)")

    @property
    def captioner(self):
        """The ``ImageCaptioner``, created on first access."""
        if self._captioner is not None:
            return self._captioner
        self._captioner = ImageCaptioner()
        return self._captioner

    @property
    def vqa(self):
        """The ``VisualQA`` helper, created on first access."""
        if self._vqa is not None:
            return self._vqa
        self._vqa = VisualQA()
        return self._vqa

    @property
    def clip(self):
        """The ``CLIPSystem`` wrapper, created on first access."""
        if self._clip is not None:
            return self._clip
        self._clip = CLIPSystem()
        return self._clip

    def process_image(self,
                     image: Image.Image,
                     tasks: List[str] = None) -> Dict[str, Any]:
        """
        Run the selected tasks on one image.

        Args:
            image: PIL image to process.
            tasks: Any of 'caption', 'vqa', 'classify'; defaults to
                ['caption'] when None. Other names are ignored.

        Returns:
            Dict with the keys 'caption', 'vqa' and 'classification' for
            whichever of the corresponding tasks were requested.
        """
        selected = ["caption"] if tasks is None else tasks
        output = {}

        if "caption" in selected:
            output["caption"] = self.captioner.generate_caption(image)

        if "vqa" in selected:
            # Fixed generic questions asked about every image
            generic_questions = [
                "What is the main subject?",
                "What colors are visible?"
            ]
            output["vqa"] = self.vqa.batch_qa(image, generic_questions)

        if "classify" in selected:
            # Fixed coarse label set for zero-shot classification
            coarse_labels = ["product", "person", "nature", "document", "food"]
            output["classification"] = self.clip.zero_shot_classify(image, coarse_labels)

        return output

    def batch_process(self,
                     images: List[Image.Image],
                     tasks: List[str] = None) -> List[Dict[str, Any]]:
        """Run ``process_image`` on every image, returning results in input order."""
        collected = []
        for picture in images:
            collected.append(self.process_image(picture, tasks))
        return collected

In [None]:
# Try the production service

service = MultimodalAIService()

# Load an image
url = "https://images.unsplash.com/photo-1551288049-bebda4e38f71?w=400"
image = load_image_from_url(url)

# Run captioning and classification in one call
results = service.process_image(image, tasks=["caption", "classify"])

print("V√Ωsledky zpracov√°n√≠:")
print("=" * 40)
print(f"\nPopis: {results.get('caption')}")

if "classification" in results:
    print("\nKlasifikace:")
    # Labels from most to least probable
    for label, prob in sorted(results["classification"].items(), key=lambda x: x[1], reverse=True):
        print(f"  {label}: {prob:.1%}")

show_image(image, results.get('caption', ''))

## Shrnutí

V tomto notebooku jsme se naučili:

1. **Image Captioning** - automatické generování popisů obrázků
2. **Visual QA** - odpovídání na otázky o obrázcích
3. **CLIP** - spojení textu a obrazu ve společném prostoru
4. **Multimodální vyhledávání** - hledání obrázků podle textu
5. **Praktické aplikace** - produktová analýza, moderace obsahu

### Další modely k prozkoumání:
- **BLIP-2** - vylepšený BLIP s LLM
- **LLaVA** - Large Language and Vision Assistant
- **GPT-4V** - multimodální GPT
- **Flamingo** - few-shot multimodální learning