<a href="https://colab.research.google.com/github/Eric-Chung-0511/Learning-Record/blob/main/Data%20Science%20Projects/VisionScout/(Llama_3_2_3B_Instruct)_Vision_Scout_Model_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Read the Hugging Face access token stored under the Colab secret name
# "HF_TOKEN" and use it to log in through huggingface_hub.
hf_token = userdata.get("HF_TOKEN")  # already added to Colab Secrets
login(token=hf_token)

In [None]:
!pip install opencv-python gradio



In [None]:
!pip install yt-dlp requests



In [None]:
!pip install torch clip git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-bytg3gdk
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-bytg3gdk
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
!pip install ultralytics==8.3.128



In [None]:
!pip install -U transformers accelerate bitsandbytes sentencepiece



In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive/ via google.colab's drive helper.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# %%writefile detection_model.py
from ultralytics import YOLO
from typing import Any, List, Dict, Optional
import torch
import numpy as np
import os

class DetectionModel:
    """Core detection model class for object detection using YOLOv8.

    Wraps a single ``YOLO`` instance together with the confidence / IoU
    thresholds used for inference, and offers helpers to swap or reload the
    underlying weights. Loading errors are reported with ``print`` and
    reflected in ``is_model_loaded`` instead of being raised.
    """

    # Static catalogue of the selectable weight files and their display metadata.
    # NOTE(review): "size_mb" for the m/x variants mirrors the parameter count in
    # millions rather than an obvious file size - confirm against the real weights.
    MODEL_INFO = {
        "yolov8n.pt": {
            "name": "YOLOv8n (Nano)",
            "description": "Fastest model with smallest size (3.2M parameters). Best for speed-critical applications.",
            "size_mb": 6,
            "inference_speed": "Very Fast"
        },
        "yolov8m.pt": {
            "name": "YOLOv8m (Medium)",
            "description": "Balanced model with good accuracy-speed tradeoff (25.9M parameters). Recommended for general use.",
            "size_mb": 25,
            "inference_speed": "Medium"
        },
        "yolov8x.pt": {
            "name": "YOLOv8x (XLarge)",
            "description": "Most accurate but slower model (68.2M parameters). Best for accuracy-critical applications.",
            "size_mb": 68,
            "inference_speed": "Slower"
        }
    }

    def __init__(self, model_name: str = 'yolov8m.pt', confidence: float = 0.25, iou: float = 0.25):
        """
        Initialize the detection model and load its weights immediately.

        Args:
            model_name: Model name or path, default is yolov8m.pt
            confidence: Confidence threshold, default is 0.25
            iou: IoU threshold for non-maximum suppression, default is 0.25
        """
        self.model_name = model_name
        self.confidence = confidence
        self.iou = iou
        self.model = None
        self.class_names = {}
        self.is_model_loaded = False

        # Load model on initialization
        self._load_model()

    def _unload_model(self) -> None:
        """Drop the reference to the current model and release cached GPU memory."""
        if self.model is None:
            return

        self.model = None

        # Clean GPU memory if available
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _load_model(self) -> None:
        """Load the YOLO model named by ``self.model_name``.

        On success ``model``, ``class_names`` and ``is_model_loaded`` are
        updated. On failure the error is printed and the instance is left in a
        clean "nothing loaded" state so that no stale model or class names
        from a previous load remain visible.
        """
        try:
            print(f"Loading model: {self.model_name}")
            self.model = YOLO(self.model_name)
            self.class_names = self.model.names
            self.is_model_loaded = True
            print(f"Successfully loaded model: {self.model_name}")
            print(f"Number of classes the model can recognize: {len(self.class_names)}")
        except Exception as e:
            print(f"Error occurred when loading the model: {e}")
            # Reset everything that describes a loaded model; otherwise the
            # class names of the previously loaded model would still be served.
            self.model = None
            self.class_names = {}
            self.is_model_loaded = False

    def change_model(self, new_model_name: str) -> bool:
        """
        Change the currently loaded model

        Args:
            new_model_name: Name of the new model to load

        Returns:
            bool: True if model changed successfully, False otherwise
        """
        if self.model_name == new_model_name and self.is_model_loaded:
            print(f"Model {new_model_name} is already loaded")
            return True

        print(f"Changing model from {self.model_name} to {new_model_name}")

        # Unload current model to free memory before loading the next one
        self._unload_model()

        # Update model name and load new model
        self.model_name = new_model_name
        self._load_model()

        return self.is_model_loaded

    def reload_model(self):
        """Reload the model (useful for changing model or after error)"""
        self._unload_model()
        self._load_model()

    def detect(self, image_input: Any) -> Optional[Any]:
        """
        Perform object detection on a single image

        Args:
            image_input: Image path (str), PIL Image, or numpy array

        Returns:
            Detection result object (first element of the model output) or
            None if the model could not be loaded or inference raised
        """
        if self.model is None or not self.is_model_loaded:
            # One lazy reload attempt before giving up.
            print("Model not found or not loaded. Attempting to reload...")
            self._load_model()
            if self.model is None or not self.is_model_loaded:
                print("Failed to load model. Cannot perform detection.")
                return None

        try:
            results = self.model(image_input, conf=self.confidence, iou=self.iou)
            return results[0]
        except Exception as e:
            print(f"Error occurred during detection: {e}")
            return None

    def get_class_names(self, class_id: int) -> str:
        """Get class name for a given class ID ("Unknown Class" if the ID is not known)"""
        return self.class_names.get(class_id, "Unknown Class")

    def get_supported_classes(self) -> Dict[int, str]:
        """Get all supported classes as a dictionary of {id: class_name}"""
        return self.class_names

    @classmethod
    def get_available_models(cls) -> List[Dict]:
        """
        Get list of available models with their information

        Returns:
            List of dictionaries containing model information, one per entry
            of MODEL_INFO, in declaration order
        """
        return [
            {
                "model_file": model_file,
                "name": info["name"],
                "description": info["description"],
                "size_mb": info["size_mb"],
                "inference_speed": info["inference_speed"]
            }
            for model_file, info in cls.MODEL_INFO.items()
        ]

    @classmethod
    def get_model_description(cls, model_name: str) -> str:
        """Get a one-line description for a specific model, or a placeholder if it is unknown"""
        info = cls.MODEL_INFO.get(model_name)
        if info is None:
            return "Model information not available"
        return f"{info['name']}: {info['description']} (Size: ~{info['size_mb']}MB, Speed: {info['inference_speed']})"

In [None]:
# %%writefile color_mapper.py
import numpy as np
from typing import Dict, List, Tuple, Union, Any

class ColorMapper:
    """
    A class for consistent color mapping of object detection classes
    Provides color schemes for visualization in RGB, BGR and hex formats

    Every COCO class gets a color derived from the base color of its
    category, with a small hue/saturation shift based on its position inside
    the category so that classes of one category look related but distinct.
    """

    # Class categories for better organization (COCO class IDs per category)
    CATEGORIES = {
        "person": [0],
        "vehicles": [1, 2, 3, 4, 5, 6, 7, 8],
        "traffic": [9, 10, 11, 12],
        "animals": [14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
        "outdoor": [13, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33],
        "sports": [34, 35, 36, 37, 38],
        "kitchen": [39, 40, 41, 42, 43, 44, 45],
        "food": [46, 47, 48, 49, 50, 51, 52, 53, 54, 55],
        "furniture": [56, 57, 58, 59, 60, 61],
        "electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
        "household": [71, 72, 73, 74, 75, 76, 77, 78, 79]
    }

    # Base colors for each category (in HSV for easier variation)
    # HSV:  Hue, Saturation, Value
    CATEGORY_COLORS = {
        "person": (0, 0.8, 0.9),       # Red
        "vehicles": (210, 0.8, 0.9),   # Blue
        "traffic": (45, 0.8, 0.9),     # Orange
        "animals": (120, 0.7, 0.8),    # Green
        "outdoor": (180, 0.7, 0.9),    # Cyan
        "sports": (270, 0.7, 0.8),     # Purple
        "kitchen": (30, 0.7, 0.9),     # Light Orange
        "food": (330, 0.7, 0.85),      # Pink
        "furniture": (150, 0.5, 0.85), # Light Green
        "electronics": (240, 0.6, 0.9), # Light Blue
        "household": (60, 0.6, 0.9)    # Yellow
    }

    def __init__(self):
        """Initialize the ColorMapper with COCO class mappings"""
        self.class_names = self._get_coco_classes()
        self.color_map = self._generate_color_map()

    def _get_coco_classes(self) -> Dict[int, str]:
        """Get the standard COCO class names with their IDs"""
        return {
            0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
            5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
            10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
            14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
            20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
            25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
            30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
            35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket',
            39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife',
            44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich',
            49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza',
            54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant',
            59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop',
            64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave',
            69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book',
            74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier',
            79: 'toothbrush'
        }

    def _hsv_to_rgb(self, h: float, s: float, v: float) -> Tuple[int, int, int]:
        """
        Convert HSV color to RGB

        Args:
            h: Hue in degrees; values outside [0, 360) are wrapped around
            s: Saturation (0-1)
            v: Value (0-1)

        Returns:
            Tuple of (R, G, B) values (0-255), channel values truncated to int
        """
        # Wrap the hue so that 360 (and negative angles) map onto the same
        # color wheel position instead of falling into the wrong sector.
        sector_pos = (h % 360) / 60
        i = int(sector_pos)
        f = sector_pos - i
        # Guards against float rounding pushing the wrapped hue to exactly 6.0.
        i %= 6

        p = v * (1 - s)
        q = v * (1 - s * f)
        t = v * (1 - s * (1 - f))

        if i == 0:
            r, g, b = v, t, p
        elif i == 1:
            r, g, b = q, v, p
        elif i == 2:
            r, g, b = p, v, t
        elif i == 3:
            r, g, b = p, q, v
        elif i == 4:
            r, g, b = t, p, v
        else:
            r, g, b = v, p, q

        return (int(r * 255), int(g * 255), int(b * 255))

    def _rgb_to_hex(self, rgb: Tuple[int, int, int]) -> str:
        """
        Convert RGB color to hex color code

        Args:
            rgb: Tuple of (R, G, B) values (0-255)

        Returns:
            Lowercase hex color code (e.g. '#ff0000')
        """
        return f'#{rgb[0]:02x}{rgb[1]:02x}{rgb[2]:02x}'

    def _find_category(self, class_id: int) -> str:
        """
        Find the category for a given class ID

        Args:
            class_id: Class ID (0-79)

        Returns:
            Category name, or "other" if the ID is in no category
        """
        for category, ids in self.CATEGORIES.items():
            if class_id in ids:
                return category
        return "other"  # Fallback

    @staticmethod
    def _format_color(color_info: Dict, format: str) -> Any:
        """Return a stored color entry as hex string, BGR tuple or (default) RGB tuple."""
        if format == 'hex':
            return color_info['hex']
        if format == 'bgr':
            # Convert RGB to BGR for OpenCV
            r, g, b = color_info['rgb']
            return (b, g, r)
        # 'rgb' and any unrecognized format both yield RGB
        return color_info['rgb']

    def _generate_color_map(self) -> Dict:
        """
        Generate a color map for all 80 COCO classes

        Returns:
            Dictionary with three sub-maps: 'by_id' and 'by_name' (class ->
            rgb/hex/category) and 'categories' (category -> base rgb/hex)
        """
        color_map = {
            'by_id': {},      # Map class ID to RGB and hex
            'by_name': {},    # Map class name to RGB and hex
            'categories': {}  # Map category to base color
        }

        # Generate colors for categories
        for category, hsv in self.CATEGORY_COLORS.items():
            rgb = self._hsv_to_rgb(hsv[0], hsv[1], hsv[2])
            color_map['categories'][category] = {
                'rgb': rgb,
                'hex': self._rgb_to_hex(rgb)
            }

        # Generate variations for each class within a category
        for class_id, class_name in self.class_names.items():
            category = self._find_category(class_id)
            base_hsv = self.CATEGORY_COLORS.get(category, (0, 0, 0.8))  # Default gray

            # Slightly vary the hue and saturation within the category
            ids_in_category = self.CATEGORIES.get(category, [])
            if ids_in_category:
                position = ids_in_category.index(class_id) if class_id in ids_in_category else 0
                variation = position / max(1, len(ids_in_category) - 1)  # 0 to 1

                # Vary hue slightly (±15°) and saturation
                h_offset = 30 * variation - 15  # -15 to +15
                s_offset = 0.2 * variation  # 0 to 0.2

                h = (base_hsv[0] + h_offset) % 360
                s = min(1.0, base_hsv[1] + s_offset)
                v = base_hsv[2]
            else:
                h, s, v = base_hsv

            rgb = self._hsv_to_rgb(h, s, v)
            hex_color = self._rgb_to_hex(rgb)

            # Store in both mappings (independent dicts, so edits to one
            # lookup table never leak into the other)
            color_map['by_id'][class_id] = {
                'rgb': rgb,
                'hex': hex_color,
                'category': category
            }

            color_map['by_name'][class_name] = {
                'rgb': rgb,
                'hex': hex_color,
                'category': category
            }

        return color_map

    def _lookup(self, class_identifier: Union[int, str]) -> Any:
        """Return the stored color entry for a class ID or class name, or None."""
        # NumPy integer scalars (e.g. class IDs taken straight from an array)
        # are not instances of int, so they are accepted explicitly.
        if isinstance(class_identifier, (int, np.integer)):
            return self.color_map['by_id'].get(int(class_identifier))
        return self.color_map['by_name'].get(class_identifier)

    def get_color(self, class_identifier: Union[int, str], format: str = 'hex') -> Any:
        """
        Get color for a specific class

        Args:
            class_identifier: Class ID (int or NumPy integer) or name (str)
            format: Color format ('hex', 'rgb', or 'bgr')

        Returns:
            Color in requested format; unknown classes get light gray
            ('#CCCCCC' for hex, (204, 204, 204) otherwise)
        """
        color_info = self._lookup(class_identifier)

        if not color_info:
            # Fallback color if not found
            return '#CCCCCC' if format == 'hex' else (204, 204, 204)

        return self._format_color(color_info, format)

    def get_all_colors(self, format: str = 'hex') -> Dict:
        """
        Get all colors in the specified format

        Args:
            format: Color format ('hex', 'rgb', or 'bgr')

        Returns:
            Dictionary mapping class names to colors
        """
        return {
            class_name: self.get_color(class_id, format)
            for class_id, class_name in self.class_names.items()
        }

    def get_category_colors(self, format: str = 'hex') -> Dict:
        """
        Get base colors for each category

        Args:
            format: Color format ('hex', 'rgb', or 'bgr')

        Returns:
            Dictionary mapping categories to colors
        """
        return {
            category: self._format_color(color_info, format)
            for category, color_info in self.color_map['categories'].items()
        }

    def get_category_for_class(self, class_identifier: Union[int, str]) -> str:
        """
        Get the category for a specific class

        Args:
            class_identifier: Class ID (int or NumPy integer) or name (str)

        Returns:
            Category name, or 'other' if the class is unknown
        """
        color_info = self._lookup(class_identifier) or {}
        return color_info.get('category', 'other')

In [None]:
# %%writefile visualization_helper.py
import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
from typing import Any, List, Dict, Tuple, Optional
import io
from PIL import Image

class VisualizationHelper:
    """Helper class for visualizing detection results"""

    @staticmethod
    def visualize_detection(image: Any, result: Any, color_mapper: Optional[Any] = None,
                            figsize: Tuple[int, int] = (12, 12),
                            return_pil: bool = False,
                            filter_classes: Optional[List[int]] = None) -> Optional[Image.Image]:
        """
        Visualize detection results on a single image

        Args:
            image: Image path, PIL Image, or numpy array (3-channel numpy
                arrays are assumed to be in OpenCV BGR order)
            result: Detection result object
            color_mapper: ColorMapper instance for consistent colors
            figsize: Figure size
            return_pil: If True, returns a PIL Image object
            filter_classes: Optional list of class IDs to draw; when given
                (and non-empty), detections of other classes are skipped

        Returns:
            PIL Image if return_pil is True, otherwise displays the plot and
            returns None. Also returns None when there is no result or the
            image file cannot be read.
        """
        if result is None:
            print('No data for visualization')
            return None

        if isinstance(image, str):
            # Read image if path is provided
            img = cv2.imread(image)
            if img is None:
                # cv2.imread signals an unreadable/missing file with None
                print(f'Could not read image file: {image}')
                return None
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        elif isinstance(image, Image.Image):
            # PIL images are not in OpenCV's BGR order, so no channel swap
            img = np.array(image)
        else:
            img = image
            if len(img.shape) == 3 and img.shape[2] == 3:
                # Check if BGR format (OpenCV) and convert to RGB if needed
                if isinstance(img, np.ndarray):
                    # Assuming BGR format from OpenCV
                    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Create figure
        fig, ax = plt.subplots(figsize=figsize)
        ax.imshow(img)

        # Get bounding boxes, classes and confidences
        boxes = result.boxes.xyxy.cpu().numpy()
        classes = result.boxes.cls.cpu().numpy()
        confs = result.boxes.conf.cpu().numpy()

        # Get class names
        names = result.names

        # Create a default color mapper if none is provided
        if color_mapper is None:
            # For backward compatibility, fallback to a simple color function
            from matplotlib import colormaps
            cmap = colormaps['tab10']
            def get_color(class_id):
                return cmap(class_id % 10)
        else:
            # Use the provided color mapper
            def get_color(class_id):
                hex_color = color_mapper.get_color(class_id)
                # Convert hex to RGB float values for matplotlib
                hex_color = hex_color.lstrip('#')
                return tuple(int(hex_color[i:i+2], 16) / 255 for i in (0, 2, 4)) + (1.0,)

        # Draw detection results
        for box, cls, conf in zip(boxes, classes, confs):
            x1, y1, x2, y2 = box
            cls_id = int(cls)

            if filter_classes and cls_id not in filter_classes:
                continue

            cls_name = names[cls_id]

            # Get color for this class
            box_color = get_color(cls_id)

            box_width = x2 - x1
            box_height = y2 - y1
            box_area = box_width * box_height

            # Scale the label font size with the box area, clamped to [10, 14]
            adaptive_fontsize = max(10, min(14, int(10 + box_area / 10000)))

            ax.text(x1, y1 - 8, f'{cls_name}: {conf:.2f}',
                    color='white', fontsize=adaptive_fontsize, fontweight="bold",
                    bbox=dict(facecolor=box_color[:3], alpha=0.85, pad=3, boxstyle="round,pad=0.3"),
                    path_effects=[path_effects.withStroke(linewidth=1.5, foreground="black")])

            # Add bounding box
            ax.add_patch(plt.Rectangle((x1, y1), box_width, box_height,
                                    fill=False, edgecolor=box_color[:3], linewidth=2))

        ax.axis('off')
        # ax.set_title('Detection Result')
        plt.tight_layout()

        if not return_pil:
            plt.show()
            return None

        # Convert plot to PIL Image; always close the figure, even if
        # rendering fails, so pyplot does not keep it alive.
        buf = io.BytesIO()
        try:
            fig.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
        finally:
            plt.close(fig)
        buf.seek(0)
        return Image.open(buf)

    @staticmethod
    def create_summary(result: Any) -> Dict:
        """
        Create a summary of detection results

        Args:
            result: Detection result object

        Returns:
            Dictionary with detection summary statistics: "total_objects",
            "class_counts" (per class name: "count" and "average_confidence")
            and "unique_classes"; or {"error": ...} when result is None
        """
        if result is None:
            return {"error": "No detection result provided"}

        # Get classes and confidences
        classes = result.boxes.cls.cpu().numpy().astype(int)
        confidences = result.boxes.conf.cpu().numpy()
        names = result.names

        # Count detections by class, collecting confidences for averaging
        class_counts = {}
        for cls, conf in zip(classes, confidences):
            entry = class_counts.setdefault(names[int(cls)], {"count": 0, "confidences": []})
            entry["count"] += 1
            entry["confidences"].append(float(conf))

        # Calculate average confidence for each class
        for entry in class_counts.values():
            if entry["confidences"]:
                entry["average_confidence"] = float(np.mean(entry["confidences"]))
                entry.pop("confidences")  # Remove detailed confidences list to keep summary concise

        # Prepare summary
        return {
            "total_objects": len(classes),
            "class_counts": class_counts,
            "unique_classes": len(class_counts)
        }

In [None]:
# %%writefile evaluation_metrics.py
import numpy as np
import matplotlib.pyplot as plt
from typing import Dict, List, Any, Optional, Tuple

class EvaluationMetrics:
    """Class for computing detection metrics, generating statistics and visualization data"""

    @staticmethod
    def calculate_basic_stats(result: Any) -> Dict:
        """
        Calculate basic statistics for a single detection result

        Args:
            result: Detection result object

        Returns:
            Dictionary with basic statistics: "total_objects",
            "class_statistics" (per class name: "count", "confidences",
            "average_confidence", "confidence_std") and the overall
            "average_confidence"; or {"error": ...} when result is None
        """
        if result is None:
            return {"error": "No detection result provided"}

        # Get classes and confidences
        classes = result.boxes.cls.cpu().numpy().astype(int)
        confidences = result.boxes.conf.cpu().numpy()
        names = result.names

        # Count by class
        class_counts = {}
        for cls, conf in zip(classes, confidences):
            cls_name = names[int(cls)]
            if cls_name not in class_counts:
                class_counts[cls_name] = {"count": 0, "total_confidence": 0, "confidences": []}

            class_counts[cls_name]["count"] += 1
            class_counts[cls_name]["total_confidence"] += float(conf)
            class_counts[cls_name]["confidences"].append(float(conf))

        # Calculate average confidence
        for cls_stats in class_counts.values():
            if cls_stats["count"] > 0:
                cls_stats["average_confidence"] = cls_stats["total_confidence"] / cls_stats["count"]
                # A single sample has no spread; report 0 instead of calling np.std
                cls_stats["confidence_std"] = float(np.std(cls_stats["confidences"])) if len(cls_stats["confidences"]) > 1 else 0
                cls_stats.pop("total_confidence")  # Remove intermediate calculation

        # Prepare summary
        return {
            "total_objects": len(classes),
            "class_statistics": class_counts,
            "average_confidence": float(np.mean(confidences)) if len(confidences) > 0 else 0
        }

    @staticmethod
    def generate_visualization_data(result: Any, class_colors: Optional[Dict] = None) -> Dict:
        """
        Generate structured data suitable for visualization

        Args:
            result: Detection result object
            class_colors: Dictionary mapping class names to color codes (optional)

        Returns:
            Dictionary with visualization-ready data: totals plus a
            "class_data" list sorted by detection count (descending)
        """
        if result is None:
            return {"error": "No detection result provided"}

        # Get basic stats first
        stats = EvaluationMetrics.calculate_basic_stats(result)

        # Create visualization-specific data structure
        viz_data = {
            "total_objects": stats["total_objects"],
            "average_confidence": stats["average_confidence"],
            "class_data": []
        }

        # Sort classes by count (descending)
        sorted_classes = sorted(
            stats["class_statistics"].items(),
            key=lambda x: x[1]["count"],
            reverse=True
        )

        # Reverse lookup name -> class ID, built once. setdefault keeps the
        # first ID seen for a name, matching a first-match linear search.
        name_to_id = {}
        for idx, name in result.names.items():
            name_to_id.setdefault(name, idx)

        # Create class-specific visualization data
        for cls_name, cls_stats in sorted_classes:
            viz_data["class_data"].append({
                "name": cls_name,
                "class_id": name_to_id.get(cls_name, -1),
                "count": cls_stats["count"],
                "average_confidence": cls_stats.get("average_confidence", 0),
                "confidence_std": cls_stats.get("confidence_std", 0),
                "color": class_colors.get(cls_name, "#CCCCCC") if class_colors else "#CCCCCC"
            })

        return viz_data

    @staticmethod
    def create_stats_plot(viz_data: Dict, figsize: Tuple[int, int] = (10, 7), max_classes: int = 30) -> plt.Figure:
        """
        Create a horizontal bar chart showing detection statistics

        Args:
            viz_data: Visualization data generated by generate_visualization_data
            figsize: Figure size (width, height) in inches
            max_classes: Maximum number of classes to display

        Returns:
            Matplotlib figure object
        """
        # Use the enhanced version
        return EvaluationMetrics.create_enhanced_stats_plot(viz_data, figsize, max_classes)

    @staticmethod
    def _create_message_figure(message: str, figsize: Tuple[int, int]) -> plt.Figure:
        """Create an axis-less figure that only shows a centered text message."""
        fig, ax = plt.subplots(figsize=figsize)
        ax.text(0.5, 0.5, message,
                ha='center', va='center', fontsize=14)
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.axis('off')
        return fig

    @staticmethod
    def create_enhanced_stats_plot(viz_data: Dict, figsize: Tuple[int, int] = (10, 7), max_classes: int = 30) -> plt.Figure:
        """
        Create an enhanced horizontal bar chart with larger fonts and better styling

        Args:
            viz_data: Visualization data dictionary
            figsize: Figure size (width, height) in inches
            max_classes: Maximum number of classes to display

        Returns:
            Matplotlib figure with enhanced styling; a text-only placeholder
            figure when viz_data carries an error or no class data
        """
        if "error" in viz_data:
            # Create empty plot if error
            return EvaluationMetrics._create_message_figure(viz_data["error"], figsize)

        if "class_data" not in viz_data or not viz_data["class_data"]:
            # Create empty plot if no data
            return EvaluationMetrics._create_message_figure("No detection data available", figsize)

        # Limit to max_classes
        class_data = viz_data["class_data"][:max_classes]

        # Extract data for plotting
        class_names = [item["name"] for item in class_data]
        counts = [item["count"] for item in class_data]
        colors = [item["color"] for item in class_data]

        # Create figure and horizontal bar chart with improved styling
        fig, ax = plt.subplots(figsize=figsize)

        # Set background color to white
        fig.patch.set_facecolor('white')
        ax.set_facecolor('white')

        y_pos = np.arange(len(class_names))

        # Create horizontal bars with class-specific colors
        bars = ax.barh(y_pos, counts, color=colors, alpha=0.8, height=0.6)

        # Add count values at end of each bar with larger font
        for item, bar in zip(class_data, bars):
            width = bar.get_width()
            conf = item["average_confidence"]
            ax.text(width + 0.3, bar.get_y() + bar.get_height()/2,
                    f"{width:.0f} (conf: {conf:.2f})",
                    va='center', fontsize=12)

        # Customize axis and labels with larger fonts
        ax.set_yticks(y_pos)
        ax.set_yticklabels(class_names, fontsize=14)
        ax.invert_yaxis()  # Labels read top-to-bottom
        ax.set_xlabel('Count', fontsize=14)
        ax.set_title(f'Objects Detected: {viz_data["total_objects"]} Total',
                    fontsize=16, fontweight='bold')

        # Add grid for better readability
        ax.set_axisbelow(True)
        ax.grid(axis='x', linestyle='--', alpha=0.7, color='#E5E7EB')

        # Increase tick label font size
        ax.tick_params(axis='both', which='major', labelsize=12)

        # Add detection summary as a text box with improved styling
        summary_text = (
            f"Total Objects: {viz_data['total_objects']}\n"
            f"Average Confidence: {viz_data['average_confidence']:.2f}\n"
            f"Unique Classes: {len(viz_data['class_data'])}"
        )
        # Address the figure directly instead of pyplot's global "current
        # figure", so the annotation and layout always apply to the figure
        # that is returned.
        fig.text(0.02, 0.02, summary_text, fontsize=12,
                bbox=dict(facecolor='white', alpha=0.9, boxstyle='round,pad=0.5',
                            edgecolor='#E5E7EB'))

        fig.tight_layout()
        return fig

    @staticmethod
    def format_detection_summary(viz_data: Dict) -> str:
        """
        Format visualization data as a human-readable, multi-line text summary

        Args:
            viz_data: Visualization data dictionary

        Returns:
            The error message if viz_data carries one, a placeholder if it has
            no "total_objects", otherwise a summary with one bullet per class
        """
        if "error" in viz_data:
            return viz_data["error"]

        if "total_objects" not in viz_data:
            return "No detection data available."

        total_objects = viz_data["total_objects"]
        # Tolerate partially filled dictionaries instead of raising KeyError
        avg_confidence = viz_data.get("average_confidence", 0)

        lines = [
            f"Detected {total_objects} objects.",
            f"Average confidence: {avg_confidence:.2f}",
            "Objects by class:"
        ]

        if "class_data" in viz_data and viz_data["class_data"]:
            for item in viz_data["class_data"]:
                count = item['count']
                item_text = "item" if count == 1 else "items"
                lines.append(f"• {item['name']}: {count} {item_text} (Confidence: {item['average_confidence']:.2f})")
        else:
            lines.append("No class information available.")

        return "\n".join(lines)

    @staticmethod
    def calculate_distance_metrics(result: Any) -> Dict:
        """
        Calculate distance-related metrics for detected objects

        Args:
            result: Detection result object

        Returns:
            Dictionary with distance metrics:
            "proximity" (list of up to 5 closest class pairs with their
            minimum center distance divided by the image diagonal),
            "spatial_distribution" and "size_distribution" (both empty dicts
            when there are no detections); or {"error": ...} when result is None
        """
        if result is None:
            return {"error": "No detection result provided"}

        boxes = result.boxes.xyxy.cpu().numpy()
        classes = result.boxes.cls.cpu().numpy().astype(int)
        names = result.names

        # Initialize metrics
        metrics = {
            "proximity": [],  # Classes that appear close to each other
            "spatial_distribution": {},  # Distribution across the image
            "size_distribution": {}  # Size distribution of objects
        }

        # Calculate image dimensions (assuming normalized coordinates or extract from result)
        img_width, img_height = 1, 1
        if hasattr(result, "orig_shape"):
            img_height, img_width = result.orig_shape[:2]

        # Calculate bounding box areas and centers
        areas = []
        centers = []
        class_names = []

        for box, cls in zip(boxes, classes):
            x1, y1, x2, y2 = box
            width, height = x2 - x1, y2 - y1
            area = width * height
            center_x, center_y = (x1 + x2) / 2, (y1 + y2) / 2

            areas.append(area)
            centers.append((center_x, center_y))
            class_names.append(names[int(cls)])

        # Calculate spatial distribution
        if centers:
            x_coords = [c[0] for c in centers]
            y_coords = [c[1] for c in centers]

            metrics["spatial_distribution"] = {
                "x_mean": float(np.mean(x_coords)) / img_width,
                "y_mean": float(np.mean(y_coords)) / img_height,
                "x_std": float(np.std(x_coords)) / img_width,
                "y_std": float(np.std(y_coords)) / img_height
            }

        # Calculate size distribution
        if areas:
            img_area = img_width * img_height
            metrics["size_distribution"] = {
                "mean_area": float(np.mean(areas)) / img_area,
                "std_area": float(np.std(areas)) / img_area,
                "min_area": float(np.min(areas)) / img_area,
                "max_area": float(np.max(areas)) / img_area
            }

        # Group object centers by class name
        class_centers = {}
        for cls_name, center in zip(class_names, centers):
            class_centers.setdefault(cls_name, []).append(center)

        # Image diagonal used to normalize distances (constant for all pairs)
        img_diagonal = np.sqrt(img_width**2 + img_height**2)

        # Find classes that appear close to each other
        proximity_pairs = []
        ordered_classes = list(class_centers.keys())
        for i, cls1 in enumerate(ordered_classes):
            # Only pair with later classes: avoids duplicates and self-comparison
            for cls2 in ordered_classes[i + 1:]:
                # Calculate minimum distance between any two objects of these classes
                min_distance = float('inf')
                for center1 in class_centers[cls1]:
                    for center2 in class_centers[cls2]:
                        dist = np.sqrt((center1[0] - center2[0])**2 + (center1[1] - center2[1])**2)
                        min_distance = min(min_distance, dist)

                proximity_pairs.append({
                    "class1": cls1,
                    "class2": cls2,
                    "distance": float(min_distance / img_diagonal)
                })

        # Sort by distance and keep the closest pairs
        proximity_pairs.sort(key=lambda x: x["distance"])
        metrics["proximity"] = proximity_pairs[:5]  # Keep top 5 closest pairs

        return metrics

In [None]:
# %%writefile style.py

class Style:
    """Holder for the custom stylesheet used by the app's web interface."""

    @staticmethod
    def get_css():
        """Return the complete custom stylesheet as one CSS string.

        The text is a fixed literal: it takes no parameters and nothing is
        interpolated into it. It contains rules for the page body, the
        ``.gradio-container`` wrapper, header/footer, input/output panels,
        the detect button, JSON and plot areas, tabs, text boxes, the scene
        description and video summary areas, a ``fadeIn`` animation, and a
        ``max-width: 768px`` media query.

        Returns:
            str: The CSS source, including the leading newline and the
            per-line indentation of the literal.
        """
        return """
        /* Base styles and typography */
        body {
            font-family: Arial, sans-serif;
            background: linear-gradient(135deg, #f0f9ff, #e1f5fe);
            margin: 0;
            padding: 0;
            display: flex;
            justify-content: center;
            min-height: 100vh;
        }

        /* Typography improvements */
        h1, h2, h3, h4, h5, h6, p, span, div, label, button {
            font-family: Arial, sans-serif;
        }

        /* Container styling */
        .gradio-container {
            max-width: 1200px !important;
            margin: auto !important;
            padding: 1rem;
            width: 100%;
        }

        /* Header area styling with gradient background */
        .app-header {
            text-align: center;
            margin-bottom: 2rem;
            background: linear-gradient(135deg, #f8f9fa, #e9ecef);
            padding: 1.5rem;
            border-radius: 10px;
            box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05);
            width: 100%;
        }

        .app-title {
            color: #2D3748;
            font-size: 2.5rem;
            margin-bottom: 0.5rem;
            background: linear-gradient(90deg, #38b2ac, #4299e1);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            font-weight: bold;
        }

        .app-subtitle {
            color: #4A5568;
            font-size: 1.2rem;
            font-weight: normal;
            margin-top: 0.25rem;
        }

        .app-divider {
            width: 80px;
            height: 3px;
            background: linear-gradient(90deg, #38b2ac, #4299e1);
            margin: 1rem auto;
        }

        /* Panel styling - gradient background */
        .input-panel, .output-panel {
            background: white;
            border-radius: 10px;
            padding: 1.5rem;
            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
            margin: 0 auto 1rem auto;
        }

        /* 修改輸出面板確保內容能夠完整顯示 */
        .output-panel {
            display: flex;
            flex-direction: column;
            width: 100%;
            padding: 0 !important;
        }

        /* 確保輸出面板內的元素寬度可以適應面板 */
        .output-panel > * {
            width: 100%;
        }

        /* How-to-use section with gradient background */
        .how-to-use {
            background: linear-gradient(135deg, #f8fafc, #e8f4fd);
            border-radius: 10px;
            padding: 1.5rem;
            margin-top: 1rem;
            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
            color: #2d3748;
        }

        /* Detection button styling */
        .detect-btn {
            background: linear-gradient(90deg, #38b2ac, #4299e1) !important;
            color: white !important;
            border: none !important;
            border-radius: 8px !important;
            transition: transform 0.3s, box-shadow 0.3s !important;
            font-weight: bold !important;
            letter-spacing: 0.5px !important;
            padding: 0.75rem 1.5rem !important;
            width: 100%;
            margin: 1rem auto !important;
            font-family: Arial, sans-serif !important;
        }

        .detect-btn:hover {
            transform: translateY(-2px) !important;
            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2) !important;
        }

        .detect-btn:active {
            transform: translateY(1px) !important;
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2) !important;
        }

        /* JSON display improvements */
        .json-display {
            width: 98% !important;
            margin: 0.5rem auto 1.5rem auto !important;
            padding: 1rem !important;
            border-radius: 8px !important;
            background-color: white !important;
            border: 1px solid #E2E8F0 !important;
            box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.05) !important;
        }

        .json-key {
            color: #e53e3e;
        }

        .json-value {
            color: #2b6cb0;
        }

        .json-string {
            color: #38a169;
        }

        /* Chart/plot styling improvements */
        .plot-container {
            background: white;
            border-radius: 8px;
            padding: 0.5rem;
            box-shadow: 0 2px 6px rgba(0, 0, 0, 0.05);
        }

        /* Larger font for plots */
        .plot-container text {
            font-family: Arial, sans-serif !important;
            font-size: 14px !important;
        }

        /* Title styling for charts */
        .plot-title {
            font-family: Arial, sans-serif !important;
            font-size: 16px !important;
            font-weight: bold !important;
        }

        /* Tab styling with subtle gradient */
        .tabs {
            width: 100%;
            display: flex;
            justify-content: center;
        }

        .tabs > div:first-child {
            background: linear-gradient(to right, #f8fafc, #e8f4fd) !important;
            border-radius: 8px 8px 0 0;
        }

        /* Tab content styling - 確保內容區域有足夠寬度 */
        .tab-content {
            width: 100% !important;
            box-sizing: border-box !important;
            padding: 0 !important;
        }

        /* Footer styling with gradient background */
        .footer {
            text-align: center;
            margin-top: 2rem;
            font-size: 0.9rem;
            color: #4A5568;
            padding: 1rem;
            background: linear-gradient(135deg, #f8f9fa, #e1effe);
            border-radius: 10px;
            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
            width: 100%;
        }

        /* Ensure centering works for all elements */
        .container, .gr-container, .gr-row, .gr-col {
            display: flex;
            flex-direction: column;
            align-items: center;
            justify-content: center;
            width: 100%;
        }

        /* 統一文本框樣式，確保寬度一致 */
        .gr-textbox, .gr-textarea, .gr-text-input {
            width: 100% !important;
            max-width: 100% !important;
            min-width: 100% !important;
            box-sizing: border-box !important;
        }

        /* 確保文本區域可以適應容器寬度 */
        textarea.gr-textarea, .gr-textbox textarea, .gr-text-input textarea {
            width: 100% !important;
            max-width: 100% !important;
            min-width: 100% !important;
            box-sizing: border-box !important;
            padding: 16px !important;
            font-family: 'Arial', sans-serif !important;
            font-size: 14px !important;
            line-height: 1.6 !important;
            white-space: pre-wrap !important;
            word-wrap: break-word !important;
            word-break: normal !important;
        }

        /* 特別針對場景描述文本框樣式增強 */
        #scene-description-text, #detection-details {
            width: 100% !important;
            min-width: 100% !important;
            box-sizing: border-box !important;
            padding: 16px !important;
            line-height: 1.8 !important;
            white-space: pre-wrap !important;
            word-wrap: break-word !important;
            border-radius: 8px !important;
            min-height: 250px !important;
            overflow-y: auto !important;
            border: 1px solid #e2e8f0 !important;
            background-color: white !important;
            display: block !important;
            font-family: 'Arial', sans-serif !important;
            font-size: 14px !important;
            margin: 0 !important;
        }

        /* 針對場景描述容器的樣式 */
        .scene-description-container {
            width: 100% !important;
            max-width: 100% !important;
            box-sizing: border-box !important;
            padding: 0 !important;
            margin: 0 !important;
        }

        /* Scene Understanding Tab 特定樣式 */
        .scene-understanding-tab .result-details-box {
            display: flex !important;
            flex-direction: column !important;
            align-items: stretch !important;
            width: 100% !important;
            box-sizing: border-box !important;
            padding: 0 !important;
        }

        /* 場景分析描述區域樣式 */
        .scene-description-box {
            background-color: #f8f9fa !important;
            border: 1px solid #e2e8f0 !important;
            border-radius: 8px !important;
            padding: 15px !important;
            margin: 10px 0 20px 0 !important;
            box-shadow: 0 1px 3px rgba(0,0,0,0.05) !important;
            font-family: Arial, sans-serif !important;
            line-height: 1.7 !important;
            color: #2D3748 !important;
            font-size: 16px !important;
            width: 100% !important;
            box-sizing: border-box !important;
        }

        #scene_analysis_description_text {
            background-color: #f0f0f0 !important; /* 淺灰色背景 */
            padding: 15px !important;             /* 內邊距，讓文字和邊框有點空間 */
            border-radius: 8px !important;        /* 圓角 */
            margin: 10px 0 20px 0 !important;     /* 其他元素的間距，特別是上下的part */
            display: block !important;
            width: 100% !important;
            box-sizing: border-box !important;
        }

        #scene_analysis_description_text p {
            margin: 0 !important;
            color: #2D3748 !important; /* 確保文字顏色 */
            font-family: Arial, sans-serif !important;
            font-size: 16px !important; /* 你可以調整文字大小 */
            line-height: 1.7 !important;
        }

        /* 結果容器樣式 */
        .result-container {
            width: 100% !important;
            padding: 1rem !important;
            border-radius: 8px !important;
            border: 1px solid #E2E8F0 !important;
            margin-bottom: 1.5rem !important;
            background-color: #F8FAFC !important;
            box-sizing: border-box !important;
        }

        /* 結果文本框的樣式 */
        .wide-result-text {
            width: 100% !important;
            min-width: 100% !important;
            box-sizing: border-box !important;
            padding: 0 !important;
            margin: 0 !important;
        }

        /* 片段標題樣式 */
        .section-heading {
            font-size: 1.25rem !important;
            font-weight: 600 !important;
            color: #2D3748 !important;
            margin: 1rem auto !important;
            padding: 0.75rem 1rem !important;
            background: linear-gradient(to right, #e6f3fc, #f0f9ff) !important;
            border-radius: 8px !important;
            width: 98% !important;
            display: inline-block !important;
            box-sizing: border-box !important;
            text-align: center !important;
            overflow: visible !important;
            line-height: 1.5 !important;
            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
        }

        /* JSON 顯示區域樣式 */
        .json-box {
            width: 100% !important;
            min-height: 200px !important;
            overflow-y: auto !important;
            background: white !important;
            padding: 1rem !important;
            border-radius: 8px !important;
            box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
            font-family: monospace !important;
            box-sizing: border-box !important;
        }

        /* 欄佈局調整 */
        .plot-column, .stats-column {
            display: flex;
            flex-direction: column;
            padding: 1rem;
            box-sizing: border-box !important;
            width: 100% !important;
        }

        /* statistics plot */
        .large-plot-container {
            width: 100% !important;
            min-height: 400px !important;
            box-sizing: border-box !important;
        }

        /* 增強 JSON 顯示 */
        .enhanced-json-display {
            background: white !important;
            border-radius: 8px !important;
            padding: 1rem !important;
            box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
            width: 100% !important;
            min-height: 300px !important;
            max-height: 500px !important;
            overflow-y: auto !important;
            font-family: monospace !important;
            box-sizing: border-box !important;
        }

        /* 確保全寬元素真正占滿整個寬度 */
        .full-width-element {
            width: 100% !important;
            max-width: 100% !important;
            box-sizing: border-box !important;
        }

        /* Video summary HTML 容器與內容樣式 */
        #video-summary-html-output {
            width: 100% !important;
            box-sizing: border-box !important;
            padding: 0 !important;
            margin: 0 !important;
        }

        .video-summary-content-wrapper {
            width: 100% !important;
            padding: 16px !important;
            line-height: 1.8 !important;
            white-space: pre-wrap !important;
            word-wrap: break-word !important;
            border-radius: 8px !important;
            min-height: 250px !important;
            max-height: 600px !important;
            overflow-y: auto !important;
            border: 1px solid #e2e8f0 !important;
            background-color: white !important;
            display: block !important;
            font-family: 'Arial', sans-serif !important;
            font-size: 14px !important;
            margin: 0 !important;
        }

        .video-summary-content-wrapper pre {
            white-space: pre-wrap !important;
            word-wrap: break-word !important;
            margin: 0 !important;
            padding: 0 !important;
            font-family: 'Arial', sans-serif !important;
            font-size: 14px !important;
            line-height: 1.8 !important;
            color: #2D3748 !important;
        }

        /* 視頻結果面板相關樣式 */
        .video-result-panel {
            padding: 1rem !important;
            background: white !important;
            border-radius: 10px !important;
            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08) !important;
        }

        .video-output-container {
            width: 100% !important;
            margin-bottom: 1.5rem !important;
            border-radius: 8px !important;
            overflow: hidden !important;
            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
        }

        /* 視頻統計資料顯示增強 */
        .video-stats-display {
            background: white !important;
            border-radius: 8px !important;
            padding: 1rem !important;
            box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
            width: 100% !important;
            min-height: 200px !important;
            max-height: 400px !important;
            overflow-y: auto !important;
            font-family: monospace !important;
            box-sizing: border-box !important;
            color: #2D3748 !important;
        }

        .custom-video-url-input {
            width: 100% !important;
        }

        .custom-video-url-input textarea {
            width: 100% !important;
            min-height: 120px !important;
            padding: 15px !important;
            font-size: 16px !important;
            line-height: 1.6 !important;
            background-color: #F7FAFC !important;
            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
            border: 2px solid #CBD5E0 !important;
            border-radius: 8px !important;
        }

        .custom-video-url-input textarea:focus {
            border-color: #4299E1 !important;
            box-shadow: 0 0 0 3px rgba(66, 153, 225, 0.2) !important;
        }

        /* 輸入框容器100%寬度 */
        .custom-video-url-input > div {
            width: 100% !important;
            max-width: 100% !important;
        }

        /* LLM 增強描述樣式 */
        #llm_enhanced_description_text {
            padding: 15px !important;
            background-color: #ffffff !important;
            border-radius: 8px !important;
            border: 1px solid #e2e8f0 !important;
            margin-bottom: 20px !important;
            box-shadow: 0 1px 3px rgba(0,0,0,0.05) !important;
            font-family: Arial, sans-serif !important;
            line-height: 1.7 !important;
            color: #2D3748 !important;
            font-size: 16px !important;
            width: 100% !important;
            box-sizing: border-box !important;
            min-height: 200px !important;
        }

        /* 原始描述折疊區域樣式 */
        #original_scene_analysis_accordion {
            margin-top: 10px !important;
            margin-bottom: 20px !important;
            background-color: #f8f9fa !important;
            border-radius: 8px !important;
            border: 1px solid #e2e8f0 !important;
        }

        /* 確保折疊區域內容與頁面樣式協調 */
        #original_scene_analysis_accordion > div:nth-child(2) {
            padding: 15px !important;
        }

        /* 動畫效果, 增加互動感 */
        @keyframes fadeIn {
            from { opacity: 0; }
            to { opacity: 1; }
        }

        .video-result-panel > * {
            animation: fadeIn 0.5s ease-in-out;
        }

        /* 響應式調整 */
        @media (max-width: 768px) {
            .app-title {
                font-size: 2rem;
            }

            .app-subtitle {
                font-size: 1rem;
            }

            .gradio-container {
                padding: 0.5rem;
            }

            /* 在小螢幕上調整文本區域的高度 */
            #scene-description-text, #detection-details {
                min-height: 150px !important;
            }
        }

        """

In [None]:
# %%writefile scene_type.py

SCENE_TYPES = {
    "living_room": {
        "name": "Living Room",
        "required_objects": [57, 62],  # couch, tv
        "optional_objects": [56, 60, 73, 75],  # chair, dining table, book, vase
        "minimum_required": 2,
        "description": "A living room area with furniture for relaxation and entertainment"
    },
    "bedroom": {
        "name": "Bedroom",
        "required_objects": [59],  # bed
        "optional_objects": [56, 60, 73, 74, 75],  # chair, dining table, book, clock, vase
        "minimum_required": 1,
        "description": "A bedroom with sleeping furniture"
    },
    "dining_area": {
        "name": "Dining Area",
        "required_objects": [60],  # dining table
        "optional_objects": [56, 39, 41, 42, 43, 44, 45],  # chair, bottle, cup, fork, knife, spoon, bowl
        "minimum_required": 1,
        "description": "A dining area for meals"
    },
    "kitchen": {
        "name": "Kitchen",
        "required_objects": [72, 68, 69, 71],  # refrigerator, microwave, oven, sink
        "optional_objects": [39, 41, 42, 43, 44, 45],  # bottle, cup, fork, knife, spoon, bowl
        "minimum_required": 1,
        "description": "A kitchen area for food preparation"
    },
    "office_workspace": {
        "name": "Office Workspace",
        "required_objects": [56, 63, 66, 64, 73],  # chair, laptop, keyboard, mouse, book
        "optional_objects": [60, 74, 75, 67],  # dining table, clock, vase, cell phone
        "minimum_required": 2,
        "description": "A workspace with computer equipment for office work"
    },
    "meeting_room": {
        "name": "Meeting Room",
        "required_objects": [56, 60],  # chair, dining table
        "optional_objects": [63, 62, 67],  # laptop, tv, cell phone
        "minimum_required": 2,
        "description": "A room set up for meetings with multiple seating"
    },
    "city_street": {
        "name": "City Street",
        "required_objects": [0, 1, 2, 3, 5, 7, 9],  # person, bicycle, car, motorcycle, bus, truck, traffic light
        "optional_objects": [10, 11, 12, 24, 25, 26, 28],  # fire hydrant, stop sign, parking meter, backpack, umbrella, handbag, suitcase
        "minimum_required": 2,
        "description": "A city street with traffic and pedestrians"
    },
    "parking_lot": {
        "name": "Parking Lot",
        "required_objects": [2, 3, 5, 7],  # car, motorcycle, bus, truck
        "optional_objects": [0, 11, 12],  # person, stop sign, parking meter
        "minimum_required": 3,
        "description": "A parking area with multiple vehicles"
    },
    "park_area": {
        "name": "Park or Recreation Area",
        "required_objects": [0, 13],  # person, bench
        "optional_objects": [1, 14, 16, 25, 33],  # bicycle, bird, dog, umbrella, kite
        "minimum_required": 2,
        "description": "An outdoor recreational area for leisure activities"
    },
    "retail_store": {
        "name": "Retail Store",
        "required_objects": [0, 24, 26, 28],  # person, backpack, handbag, suitcase
        "optional_objects": [39, 45, 67],  # bottle, bowl, cell phone
        "minimum_required": 2,
        "description": "A retail environment with shoppers and merchandise"
    },
    "supermarket": {
        "name": "Supermarket",
        "required_objects": [0, 24, 39, 46, 47, 49],  # person, backpack, bottle, banana, apple, orange
        "optional_objects": [26, 37, 45, 48, 51, 52, 53, 54, 55],  # handbag, surfboard, bowl, sandwich, carrot, hot dog, pizza, donut, cake
        "minimum_required": 3,
        "description": "A supermarket with food items and shoppers"
    },
    "classroom": {
        "name": "Classroom",
        "required_objects": [56, 60, 73],  # chair, dining table, book
        "optional_objects": [63, 66, 67],  # laptop, keyboard, cell phone
        "minimum_required": 2,
        "description": "A classroom environment set up for educational activities"
    },
    "conference_room": {
        "name": "Conference Room",
        "required_objects": [56, 60, 63],  # chair, dining table, laptop
        "optional_objects": [62, 67, 73],  # tv, cell phone, book
        "minimum_required": 2,
        "description": "A conference room designed for meetings and presentations"
    },
    "cafe": {
        "name": "Cafe",
        "required_objects": [56, 60, 41],  # chair, dining table, cup
        "optional_objects": [39, 40, 63, 67, 73],  # bottle, wine glass, laptop, cell phone, book
        "minimum_required": 2,
        "description": "A cafe setting with seating and beverages"
    },
    "library": {
        "name": "Library",
        "required_objects": [56, 60, 73],  # chair, dining table, book
        "optional_objects": [63, 67, 75],  # laptop, cell phone, vase
        "minimum_required": 2,
        "description": "A library with books and reading areas"
    },
    "gym": {
        "name": "Gym",
        "required_objects": [0, 32],  # person, sports ball
        "optional_objects": [24, 25, 28, 38],  # backpack, umbrella, suitcase, tennis racket
        "minimum_required": 1,
        "description": "A gym or fitness area for physical activities"
    },
    "beach": {
        "name": "Beach",
        "required_objects": [0, 25, 29, 33, 37],  # person, umbrella, frisbee, kite, surfboard
        "optional_objects": [1, 24, 26, 38],  # bicycle, backpack, handbag, tennis racket
        "minimum_required": 2,
        "description": "A beach area with people and recreational items"
    },
    "restaurant": {
        "name": "Restaurant",
        "required_objects": [56, 60, 41, 42, 43, 44, 45],  # chair, dining table, cup, fork, knife, spoon, bowl
        "optional_objects": [39, 40, 48, 49, 50, 51, 52, 53, 54, 55],  # bottle, wine glass, sandwich, orange, broccoli, carrot, hot dog, pizza, donut, cake
        "minimum_required": 3,
        "description": "A restaurant setting for dining with tables and eating utensils"
    },
    "train_station": {
        "name": "Train Station",
        "required_objects": [0, 6],  # person, train
        "optional_objects": [1, 2, 24, 28, 67],  # bicycle, car, backpack, suitcase, cell phone
        "minimum_required": 1,
        "description": "A train station with train and passengers"
    },
    "airport": {
        "name": "Airport",
        "required_objects": [0, 4, 28],  # person, airplane, suitcase
        "optional_objects": [24, 25, 26, 67],  # backpack, umbrella, handbag, cell phone
        "minimum_required": 2,
        "description": "An airport with planes and travelers carrying luggage"
    },
      "upscale_dining": {
        "name": "Upscale Dining Area",
        "required_objects": [56, 60, 40, 41],  # chair, dining table, wine glass, cup
        "optional_objects": [39, 42, 43, 44, 45, 62, 75],  # bottle, fork, knife, spoon, bowl, tv, vase
        "minimum_required": 2,
        "description": "An elegantly designed dining space with refined furniture and decorative elements"
    },
    "asian_commercial_street": {
        "name": "Asian Commercial Street",
        "required_objects": [0, 67],  # person, cell phone
        "optional_objects": [1, 2, 3, 24, 25, 26, 28],  # bicycle, car, motorcycle, backpack, umbrella, handbag, suitcase
        "minimum_required": 1,
        "description": "A bustling commercial street with shops, signage, and pedestrians in an Asian urban setting"
    },
    "financial_district": {
        "name": "Financial District",
        "required_objects": [2, 5, 7, 9],  # car, bus, truck, traffic light
        "optional_objects": [0, 1, 3, 8],  # person, bicycle, motorcycle, boat
        "minimum_required": 2,
        "description": "A major thoroughfare in a business district with high-rise buildings and traffic"
    },
    "urban_intersection": {
        "name": "Urban Intersection",
        "required_objects": [0, 9],  # person, traffic light
        "optional_objects": [1, 2, 3, 5, 7],  # bicycle, car, motorcycle, bus, truck
        "minimum_required": 1,
        "description": "A busy urban crossroad with pedestrian crossings and multiple traffic flows"
    },
    "transit_hub": {
        "name": "Transit Hub",
        "required_objects": [0, 5, 6, 7],  # person, bus, train, truck
        "optional_objects": [1, 2, 3, 9, 24, 28],  # bicycle, car, motorcycle, traffic light, backpack, suitcase
        "minimum_required": 2,
        "description": "A transportation center where multiple modes of transit converge"
    },
    "shopping_district": {
        "name": "Shopping District",
        "required_objects": [0, 24, 26],  # person, backpack, handbag
        "optional_objects": [1, 2, 3, 25, 27, 28, 39, 67],  # bicycle, car, motorcycle, umbrella, tie, suitcase, bottle, cell phone
        "minimum_required": 2,
        "description": "A retail-focused area with shops, pedestrians, and commercial activity"
    },
     "bus_stop": {
        "name": "Bus Stop",
        "required_objects": [0, 5],  # person, bus
        "optional_objects": [1, 2, 7, 24],  # bicycle, car, truck, backpack
        "minimum_required": 2,
        "description": "A roadside bus stop with waiting passengers and buses"
    },
    "bus_station": {
        "name": "Bus Station",
        "required_objects": [0, 5, 7],  # person, bus, truck
        "optional_objects": [24, 28, 67],  # backpack, suitcase, cell phone
        "minimum_required": 2,
        "description": "A bus terminal with multiple buses and travelers"
    },
    "zoo": {
        "name": "Zoo",
        "required_objects": [20, 22, 23],  # elephant, zebra, giraffe
        "optional_objects": [0, 14, 16],  # person, bird, dog
        "minimum_required": 2,
        "description": "A zoo environment featuring large animal exhibits and visitors"
    },
    "harbor": {
        "name": "Harbor",
        "required_objects": [8],  # boat
        "optional_objects": [0, 2, 3, 39],  # person, car, motorcycle, bottle
        "minimum_required": 1,
        "description": "A harbor area with boats docked and surrounding traffic"
    },
    "playground": {
        "name": "Playground",
        "required_objects": [0, 32],  # person, sports ball
        "optional_objects": [33, 24, 1],  # kite, backpack, bicycle
        "minimum_required": 1,
        "description": "An outdoor playground with people playing sports and games"
    },
    "sports_field": {
        "name": "Sports Field",
        "required_objects": [32],  # sports ball
        "optional_objects": [38, 34, 35],  # tennis racket, baseball bat, baseball glove
        "minimum_required": 1,
        "description": "A sports field set up for various ball games"
    },
     "narrow_commercial_alley": {
        "name": "Narrow Commercial Alley",
        "required_objects": [0, 3],  # person, motorcycle
        "optional_objects": [2, 7, 24, 26],  # car, truck, backpack, handbag
        "minimum_required": 2,
        "description": "A tight urban alley lined with shops, with pedestrians and light vehicles"
    },
    "daytime_shopping_street": {
        "name": "Daytime Shopping Street",
        "required_objects": [0, 2],  # person, car
        "optional_objects": [1, 3, 24, 26],  # bicycle, motorcycle, backpack, handbag
        "minimum_required": 2,
        "description": "A busy pedestrian street during daytime, featuring shops, vehicles, and shoppers"
    },
    "urban_pedestrian_crossing": {
        "name": "Urban Pedestrian Crossing",
        "required_objects": [0, 9],  # person, traffic light
        "optional_objects": [2, 3, 5],  # car, motorcycle, bus
        "minimum_required": 1,
        "description": "A city street crossing with pedestrians and traffic signals"
    },
    "aerial_view_intersection": {
    "name": "Aerial View Intersection",
    "required_objects": [0, 9],  # person, traffic light
    "optional_objects": [1, 2, 3, 5, 7],  # bicycle, car, motorcycle, bus, truck
    "minimum_required": 1,
    "description": "An intersection viewed from above, showing crossing patterns and pedestrian movement"
    },
    "aerial_view_commercial_area": {
        "name": "Aerial View Commercial Area",
        "required_objects": [0, 2],  # person, car
        "optional_objects": [1, 3, 5, 7, 24, 26],  # bicycle, motorcycle, bus, truck, backpack, handbag
        "minimum_required": 2,
        "description": "A commercial or shopping area viewed from above showing pedestrians and urban layout"
    },
    "aerial_view_plaza": {
        "name": "Aerial View Plaza",
        "required_objects": [0],  # person
        "optional_objects": [1, 2, 24, 25, 26],  # bicycle, car, backpack, umbrella, handbag
        "minimum_required": 1,
        "description": "An urban plaza or public square viewed from above with pedestrian activity"
    },

    # specific cultural item
    "asian_night_market": {
        "name": "Asian Night Market",
        "required_objects": [0, 67],  # person, cell phone
        "optional_objects": [1, 3, 24, 26, 39, 41],  # bicycle, motorcycle, backpack, handbag, bottle, cup
        "minimum_required": 1,
        "description": "A vibrant night market scene typical in Asian cities with food stalls and crowds"
    },
    "asian_temple_area": {
        "name": "Asian Temple Area",
        "required_objects": [0],  # person
        "optional_objects": [24, 25, 26, 67, 75],  # backpack, umbrella, handbag, cell phone, vase
        "minimum_required": 1,
        "description": "A traditional Asian temple complex with visitors and cultural elements"
    },

    # specific time item
    "nighttime_street": {
        "name": "Nighttime Street",
        "required_objects": [0, 9],  # person, traffic light
        "optional_objects": [1, 2, 3, 5, 7, 67],  # bicycle, car, motorcycle, bus, truck, cell phone
        "minimum_required": 1,
        "description": "An urban street at night with artificial lighting and nighttime activity"
    },
    "nighttime_commercial_district": {
        "name": "Nighttime Commercial District",
        "required_objects": [0, 67],  # person, cell phone
        "optional_objects": [1, 2, 3, 24, 26],  # bicycle, car, motorcycle, backpack, handbag
        "minimum_required": 1,
        "description": "A commercial district illuminated at night with neon signs and evening activity"
    },

    # mixed-environment items
    "indoor_outdoor_cafe": {
        "name": "Indoor-Outdoor Cafe",
        "required_objects": [56, 60, 41],  # chair, dining table, cup
        "optional_objects": [39, 40, 63, 67, 73],  # bottle, wine glass, laptop, cell phone, book
        "minimum_required": 2,
        "description": "A cafe setting with both indoor elements and outdoor patio or sidewalk seating"
    },
    "transit_station_platform": {
        "name": "Transit Station Platform",
        "required_objects": [0],  # person
        "optional_objects": [5, 6, 7, 24, 28, 67],  # bus, train, truck, backpack, suitcase, cell phone
        "minimum_required": 1,
        "description": "A transit platform with waiting passengers and arriving/departing vehicles"
    },
    "sports_stadium": {
        "name": "Sports Stadium",
        "required_objects": [0, 32],  # person, sports ball
        "optional_objects": [24, 38, 39, 41, 67],  # backpack, tennis racket, bottle, cup, cell phone
        "minimum_required": 1,
        "description": "A sports stadium or arena with spectators and athletic activities"
    },
    "construction_site": {
        "name": "Construction Site",
        "required_objects": [0, 7],  # person, truck
        "optional_objects": [2, 3, 11, 76, 77, 78],  # car, motorcycle, fire hydrant, scissors, teddy bear, hair drier
        "minimum_required": 1,
        "description": "A construction site with workers, equipment, and building materials"
    },
    "medical_facility": {
        "name": "Medical Facility",
        "required_objects": [0, 56, 60],  # person, chair, dining table
        "optional_objects": [63, 64, 66, 67, 73],  # laptop, mouse, keyboard, cell phone, book
        "minimum_required": 2,
        "description": "A medical facility such as hospital, clinic or doctor's office with medical staff and patients"
    },
    "educational_setting": {
        "name": "Educational Setting",
        "required_objects": [0, 56, 60, 73],  # person, chair, dining table, book
        "optional_objects": [63, 64, 66, 67, 74],  # laptop, mouse, keyboard, cell phone, clock
        "minimum_required": 2,
        "description": "An educational environment such as classroom, lecture hall or study area"
    },
    "aerial_view_intersection": {
        "name": "Aerial View Intersection",
        "required_objects": [0, 9],  # person, traffic light
        "optional_objects": [1, 2, 3, 5, 7],  # bicycle, car, motorcycle, bus, truck
        "minimum_required": 1,
        "description": "An intersection viewed from above, showing crossing patterns and pedestrian movement",
        "viewpoint_indicator": "aerial", # view side
        "key_features": ["crosswalk_pattern", "pedestrian_flow", "intersection_layout"],  # key feature
        "detection_priority": 10  # priority
    },
    "perpendicular_crosswalk_intersection": {
        "name": "Perpendicular Crosswalk Intersection",
        "required_objects": [0],  # person
        "optional_objects": [1, 2, 3, 5, 7, 9],  # bicycle, car, motorcycle, bus, truck, traffic light
        "minimum_required": 1,
        "description": "An intersection with perpendicular crosswalks where pedestrians cross in multiple directions",
        "viewpoint_indicator": "aerial",
        "key_features": ["perpendicular_crosswalks", "pedestrian_crossing", "multi_directional_movement"],
        "pattern_detection": True, # specific pattern
        "detection_priority": 15  #
    },
    "beach_water_recreation": {
    "name": "Beach/Water Recreation Area",
    "required_objects": [0, 37],  # person, surfboard
    "optional_objects": [25, 33, 1, 8, 29, 24, 26, 39, 41],  # umbrella, kite, bicycle, boat, frisbee, backpack, handbag, bottle, cup
    "minimum_required": 2,
    "description": "A beach or water recreation area with water sports equipment and beach accessories"
    },
    "sports_venue": {
    "name": "Sports Venue",
    "required_objects": [0, 32],  # person, sports ball
    "optional_objects": [34, 35, 38, 25, 24, 26, 39, 41],  # baseball bat, baseball glove, tennis racket, umbrella, backpack, handbag, bottle, cup
    "minimum_required": 2,
    "description": "A professional sports venue with specialized sports equipment and spectator areas"
    },
    "professional_kitchen": {
    "name": "Professional Kitchen",
    "required_objects": [43, 44, 45],  # knife, spoon, bowl
    "optional_objects": [42, 39, 41, 68, 69, 71, 72, 0],  # fork, bottle, cup, microwave, oven, sink, refrigerator, person
    "minimum_required": 3,
    "description": "A commercial kitchen with professional cooking equipment and food preparation areas"
    },
}

In [None]:
# %%writefile confifence_templates.py

# Sentence skeletons keyed by confidence tier: "high", "medium", "low".
# Every skeleton carries a {description} and a {details} placeholder; the
# "high" tier adds no wording of its own, the other two prepend hedging text.
# Placeholder substitution is done by the consumer, not in this table.
CONFIDENCE_TEMPLATES = dict(
    high="{description} {details}",
    medium="This appears to be {description} {details}",
    low="This might be {description}, but the confidence is low. {details}",
)

In [None]:
# %%writefile scene_detail_templates.py

# Detail-sentence templates keyed by scene-type identifier.
# Each entry is a list of three alternative sentences. Most sentences embed
# {placeholder} tokens; the sentences for "zoo", "harbor", "playground" and
# "sports_field" are fixed text with no tokens. Which sentence is used and how
# tokens are substituted is decided by the consuming code, not by this table.
SCENE_DETAIL_TEMPLATES = {
    "living_room": [
        "The space is arranged for relaxation with {furniture}.",
        "There is {electronics} for entertainment.",
        "The room has a seating area with {seating}.",
    ],
    "bedroom": [
        "The room contains {bed_type} in the {bed_location}.",
        "This sleeping area has {bed_description}.",
        "A personal space with {bed_type} and {extras}.",
    ],
    "dining_area": [
        "A space set up for meals with {table_setup}.",
        "The dining area contains {table_description}.",
        "A place for eating with {dining_items}.",
    ],
    "kitchen": [
        "A food preparation area with {appliances}.",
        "The kitchen contains {kitchen_items}.",
        "A cooking space equipped with {cooking_equipment}.",
    ],
    "office_workspace": [
        "A work environment with {office_equipment}.",
        "A space designed for productivity with {desk_setup}.",
        "A workspace containing {computer_equipment}.",
    ],
    "city_street": [
        "An urban thoroughfare with {traffic_description}.",
        "A street scene with {people_and_vehicles}.",
        "A city path with {street_elements}.",
    ],
    "park_area": [
        "An outdoor recreational space with {park_features}.",
        "A leisure area featuring {outdoor_elements}.",
        "A public outdoor space with {park_description}.",
    ],
    "retail_store": [
        "A shopping environment with {store_elements}.",
        "A commercial space where {shopping_activity}.",
        "A retail area containing {store_items}.",
    ],
    "upscale_dining": [
        "The space features {furniture} with {design_elements} for an elegant dining experience.",
        "This sophisticated dining area includes {lighting} illuminating {table_setup}.",
        "A stylish dining environment with {seating} arranged around {table_description}.",
    ],
    "asian_commercial_street": [
        "A vibrant street lined with {storefront_features} and filled with {pedestrian_flow}.",
        "This urban commercial area displays {asian_elements} with {cultural_elements}.",
        "A lively shopping street characterized by {signage} and busy with {street_activities}.",
    ],
    "financial_district": [
        "A canyon of {buildings} with {traffic_elements} moving through the urban landscape.",
        "This business district features {skyscrapers} along {road_features}.",
        "A downtown corridor with {architectural_elements} framing views of {city_landmarks}.",
    ],
    "urban_intersection": [
        "A busy crossroad with {crossing_pattern} where {pedestrian_behavior} is observed.",
        "This urban junction features {pedestrian_density} navigating the {traffic_pattern}.",
        "A well-marked intersection designed for {pedestrian_flow} across multiple directions.",
    ],
    "transit_hub": [
        "A transportation nexus where {transit_vehicles} arrive and depart amid {passenger_activity}.",
        "This transit center accommodates {transportation_modes} with facilities for {passenger_needs}.",
        "A busy transport hub featuring {transit_infrastructure} and areas for {passenger_movement}.",
    ],
    "shopping_district": [
        "A commercial zone filled with {retail_elements} and {shopping_activity}.",
        "This shopping area features {store_types} along {walkway_features}.",
        "A retail district characterized by {commercial_signage} and {consumer_behavior}.",
    ],
    "bus_stop": [
        "Passengers waiting at a roadside stop served by {transit_vehicles}.",
        "A designated bus stop with shelters and {passenger_activity}.",
        "Commuters boarding or alighting from {transit_vehicles} at the curb.",
    ],
    "bus_station": [
        "Multiple buses parked in a terminal where {passenger_activity}.",
        "A busy station hub featuring {transit_vehicles} and traveler luggage.",
        "A transit center with waiting areas and various {transportation_modes}.",
    ],
    "zoo": [
        "Enclosures showcasing elephants, zebras, and giraffes with visitors observing.",
        "A wildlife exhibit area where families watch animal displays.",
        "A recreational space featuring large animal exhibits and strolling guests.",
    ],
    "harbor": [
        "Boats docked along the waterfront with nearby vehicular traffic.",
        "A maritime area where vessels anchor beside roads busy with cars and motorcycles.",
        "A coastal dock featuring moored boats and passing traffic elements.",
    ],
    "playground": [
        "An open play area equipped with balls and recreational structures.",
        "People engaging in games and sports in a communal space.",
        "A leisure area featuring playground equipment and active participants.",
    ],
    "sports_field": [
        "An athletic field marked for various ball games and matches.",
        "Players using equipment like bats, gloves, and rackets on a grassy pitch.",
        "A designated sports area with goalposts or markings for competitive play.",
    ],
    "narrow_commercial_alley": [
        "A tight alley lined with {storefront_features} and light vehicles.",
        "Pedestrians navigate a confined lane flanked by shops and {street_activities}.",
        "An urban passage featuring {storefront_features} with {people_and_vehicles}.",
    ],
    "daytime_shopping_street": [
        "A bustling street during daytime with {storefront_features} and {pedestrian_flow}.",
        "Shoppers and vehicles move along a retail strip marked by {signage}.",
        "An open commercial avenue filled with {people_and_vehicles} amid shops.",
    ],
    "urban_pedestrian_crossing": [
        "A marked crosswalk with {crossing_pattern} under {lighting_modifier} sky.",
        "Pedestrians use designated crossing with {traffic_pattern} at the intersection.",
        "People waiting at a signal-controlled crossing next to {street_elements}.",
    ],
    "aerial_view_intersection": [
        "The crossing pattern shows {crossing_pattern} with {pedestrian_flow} across multiple directions.",
        "From above, this intersection reveals {traffic_pattern} with {pedestrian_density} navigating through defined paths.",
        "This bird's-eye view shows {street_elements} converging at a junction where {pedestrian_behavior} is visible.",
    ],
    "aerial_view_commercial_area": [
        "From above, this commercial zone shows {storefront_features} with {pedestrian_flow} moving between establishments.",
        "This overhead view reveals {shopping_activity} amid {walkway_features} connecting different businesses.",
        "The aerial perspective captures {retail_elements} organized along {commercial_layout} with visible customer activity.",
    ],
    "aerial_view_plaza": [
        "This overhead view of the plaza shows {pedestrian_pattern} across an open public space.",
        "From above, the plaza reveals {gathering_features} where people congregate in {movement_pattern}.",
        "The aerial perspective captures {urban_elements} arranged around a central area where {public_activity} occurs.",
    ],
    "asian_night_market": [
        "This bustling night market features {stall_elements} illuminated by {lighting_features} with crowds enjoying {food_elements}.",
        "Rows of {vendor_stalls} line this vibrant market where {nighttime_activity} continues under {cultural_lighting}.",
        "The market atmosphere is created by {asian_elements} and {night_market_sounds} amid {evening_crowd_behavior}.",
    ],
    "asian_temple_area": [
        "This sacred space features {architectural_elements} displaying {cultural_symbols} with visitors engaging in {ritual_activities}.",
        "The temple area contains {religious_structures} adorned with {decorative_features} where people practice {cultural_practices}.",
        "Traditional {temple_architecture} creates a spiritual atmosphere enhanced by {sensory_elements} and {visitor_activities}.",
    ],
    "european_plaza": [
        "This historic plaza is framed by {architectural_style} surrounding an open space where {public_activities} take place.",
        "The European square features {historic_elements} and {urban_design} creating a space for {social_behaviors}.",
        "Classical {european_features} define this public space where {tourist_activities} blend with {local_customs}.",
    ],
    "nighttime_street": [
        "The night transforms this street with {lighting_effects} casting {shadow_patterns} across {urban_features}.",
        "After dark, this urban corridor is defined by {illuminated_elements} with {evening_activities} visible in the artificial light.",
        "The nocturnal street scene captures {light_sources} creating contrast between {lit_areas} and {shadowed_zones}.",
    ],
    "nighttime_commercial_district": [
        "After sunset, this commercial area comes alive with {illuminated_signage} and {evening_activities} under {colorful_lighting}.",
        "The district's nighttime character is defined by {neon_elements} highlighting {storefront_features} amid {night_crowd_behavior}.",
        "Evening transforms this zone through {light_displays} that accentuate {building_features} and frame {nightlife_activities}.",
    ],
    "indoor_outdoor_cafe": [
        "This cafe blends indoor comfort with outdoor atmosphere through {transitional_elements} connecting {indoor_features} with {outdoor_setting}.",
        "Customers enjoy both {interior_amenities} and {exterior_features} in this space that bridges indoor comfort and outdoor ambiance.",
        "The cafe design creates flow between {inside_elements} and {outside_spaces} allowing patrons to experience {dual_environment_benefits}.",
    ],
    "transit_station_platform": [
        "This transit platform combines covered areas with open sections where {passenger_activities} occur while awaiting {transportation_types}.",
        "The station design balances {sheltered_elements} with {exposed_areas} for passengers engaged in {waiting_behaviors}.",
        "Commuters navigate between {indoor_facilities} and {platform_features} while {transit_routines} unfold around arriving vehicles.",
    ],
    "sports_stadium": [
        "This athletic venue features {seating_arrangement} surrounding {playing_surface} where {sporting_activities} take place.",
        "The stadium design incorporates {spectator_facilities} overlooking {competition_space} designed for {sports_events}.",
        "Fans occupy {viewing_areas} arranged to maximize visibility of {field_elements} where athletes engage in {game_activities}.",
    ],
    "construction_site": [
        "This development area shows {construction_equipment} amid {building_materials} where workers conduct {construction_activities}.",
        "The construction process is visible through {work_elements} positioned around {structural_components} in various stages of completion.",
        "Workers utilize {site_equipment} to transform {raw_materials} following {construction_process} stages.",
    ],
    "medical_facility": [
        "This healthcare environment features {medical_elements} arranged to support {clinical_activities} in a {facility_design}.",
        "The medical space incorporates {healthcare_features} where {patient_interactions} occur in a controlled environment.",
        "Professional medical staff utilize {equipment_types} while conducting {care_procedures} in specialized {treatment_spaces}.",
    ],
    "educational_setting": [
        "This learning environment contains {educational_furniture} arranged to facilitate {learning_activities} through {instructional_design}.",
        "The educational space features {classroom_elements} organized for {teaching_methods} and {student_engagement}.",
        "Students and educators interact within {learning_spaces} equipped with {educational_tools} supporting {knowledge_transfer}.",
    ],
    "beach_water_recreation": [
        "A coastal recreation area with {beach_equipment} and people enjoying {water_activities}.",
        "This shoreline space features {beach_equipment} where visitors engage in {water_activities}.",
        "An outdoor water recreation zone with {beach_equipment} set up for {water_activities}.",
    ],
    "sports_venue": [
        "A professional sports facility with {sports_equipment} arranged for {competitive_activities}.",
        "This athletics venue features {sports_equipment} with spaces designated for {competitive_activities}.",
        "A specialized sports arena containing {sports_equipment} designed for {competitive_activities}.",
    ],
    "professional_kitchen": [
        "A commercial cooking space with {kitchen_equipment} organized for {food_preparation}.",
        "This professional culinary area contains {kitchen_equipment} arranged in stations for {food_preparation}.",
        "An industrial kitchen featuring {kitchen_equipment} designed for efficient {food_preparation}.",
    ],
}

In [None]:
# %%writefile object_template_fillers.py

# Candidate phrases for the {placeholder} tokens that appear in the scene
# detail templates: placeholder name -> list of interchangeable phrases.
#
# Each key must appear exactly once. A dict literal keeps only the LAST value
# given for a repeated key, so an earlier list under the same name is dropped
# without any warning. Four names ("pedestrian_flow", "asian_elements",
# "architectural_elements", "crossing_pattern") used to be defined twice; their
# lists are merged below. In each merged list the phrases that were effective
# before the merge come first, so positional lookups keep their old result,
# followed by the phrases that had been shadowed.
OBJECT_TEMPLATE_FILLERS = {
    # Placeholders appearing in the "upscale_dining" templates
    "furniture": ["designer chairs", "wooden dining table", "stylish seating", "upholstered armchairs", "elegant dining furniture"],
    "design_elements": ["art pieces", "decorative wreaths", "statement lighting", "seasonal decorations", "sophisticated decor"],
    "lighting": ["pendant lights", "decorative fixtures", "geometric lighting", "modern chandeliers", "ambient illumination"],
    "table_setup": ["elegantly set table", "tabletop decorations", "seasonal centerpieces", "formal place settings", "floral arrangements"],
    "seating": ["upholstered chairs", "accent armchairs", "mixed seating styles", "designer dining chairs", "comfortable dining seats"],
    "table_description": ["solid wood table", "designer dining table", "expansive dining surface", "artisanal table", "statement dining table"],

    # Placeholders appearing in the "asian_commercial_street" templates
    "storefront_features": ["multi-story shops", "illuminated signs", "merchandise displays", "compact storefronts", "vertical retail spaces"],
    "pedestrian_flow": [
        # previously effective (intersection-oriented) phrases
        "people crossing", "directional movement", "coordinated crossing", "timed pedestrian traffic", "intersection navigation",
        # previously shadowed (street-oriented) phrases
        "people walking", "shoppers", "pedestrians", "locals and tourists", "urban foot traffic",
    ],
    "asian_elements": [
        # previously effective phrases
        "hanging red lanterns", "character-based signage", "ornate temple decorations", "traditional paper decorations", "stylized gateway arches",
        # previously shadowed phrases
        "Asian language signage", "decorative lanterns", "local storefronts", "character-based text", "regional design elements",
    ],
    "cultural_elements": ["red lanterns", "local typography", "distinctive architecture", "cultural symbols", "traditional decorations"],
    "signage": ["bright store signs", "multilingual text", "vertical signboards", "neon displays", "electronic advertisements"],
    "street_activities": ["shopping", "commuting", "socializing", "vendor transactions", "urban navigation"],

    # Placeholders appearing in the "financial_district" templates
    "buildings": ["high-rise office buildings", "corporate towers", "skyscrapers", "financial institutions", "commercial headquarters"],
    "traffic_elements": ["vehicle lights", "trams/street cars", "lane markers", "traffic signals", "urban transit"],
    "skyscrapers": ["glass and steel buildings", "tall structures", "modern architecture", "office towers", "urban high-rises"],
    "road_features": ["wide avenues", "tram tracks", "traffic lanes", "median dividers", "urban throughways"],
    # NOTE(review): this placeholder is used by both the "financial_district"
    # and the "asian_temple_area" templates, so city and temple phrases share
    # one pool. Giving each scene its own placeholder name would require a
    # matching change in the templates.
    "architectural_elements": [
        # previously effective (temple-oriented) phrases
        "tiered pagoda roofs", "ornate dragon sculptures", "stone guardian statues", "intricately carved railings", "traditional wooden beams",
        # previously shadowed (city-oriented) phrases
        "contemporary buildings", "urban design", "varied architectural styles", "corporate architecture", "city planning features",
    ],
    "city_landmarks": ["distant bridge", "skyline features", "iconic structures", "urban monuments", "signature buildings"],

    # Placeholders appearing in the "urban_intersection" templates
    "crossing_pattern": [
        # previously effective (aerial-view) phrases
        "grid-like pedestrian crossings", "multi-directional crosswalks", "cross-shaped intersection design", "perpendicular crossing lanes", "zebra-striped crosswalks viewed from above",
        # previously shadowed (street-level) phrases
        "zebra crosswalks", "pedestrian walkways", "crosswalk markings", "intersection design", "safety stripes",
    ],
    "pedestrian_density": ["groups of people", "commuters", "diverse pedestrians", "urban crowds", "varying foot traffic"],
    "pedestrian_behavior": ["walking in different directions", "crossing together", "waiting for signals", "navigating intersections", "following traffic rules"],
    "traffic_pattern": ["four-way intersection", "crossroad", "junction", "multi-directional traffic", "regulated crossing"],

    # Placeholders appearing in the "transit_hub" templates
    "transit_vehicles": ["buses", "trams", "trains", "taxis", "shuttles"],
    "passenger_activity": ["boarding", "waiting", "exiting vehicles", "checking schedules", "navigating stations"],
    "transportation_modes": ["public transit", "private vehicles", "ride services", "light rail", "bus systems"],
    "passenger_needs": ["waiting areas", "information displays", "ticketing services", "transit connections", "seating"],
    "transit_infrastructure": ["stations", "platforms", "boarding areas", "transit lanes", "signaling systems"],
    "passenger_movement": ["transfers", "entrances and exits", "queueing", "platform access", "terminal navigation"],

    # Placeholders appearing in the "shopping_district" templates
    "retail_elements": ["storefronts", "display windows", "shopping bags", "merchandise", "retail signage"],
    "shopping_activity": ["browsing", "carrying purchases", "window shopping", "social shopping", "consumer activities"],
    "store_types": ["boutiques", "brand stores", "local shops", "chain retailers", "specialty stores"],
    "walkway_features": ["pedestrian paths", "shopping promenades", "retail corridors", "commercial walkways", "shopping streets"],
    "commercial_signage": ["brand logos", "sale announcements", "store names", "advertising displays", "digital signage"],
    "consumer_behavior": ["shopping in groups", "individual browsing", "carrying bags", "examining products", "moving between stores"],

    # Placeholders appearing in the beach, sports-venue and professional-kitchen templates
    "beach_equipment": ["beach umbrellas", "surfboards", "beach towels", "sun protection", "recreational equipment"],
    "water_activities": ["water sports", "surfing", "beach recreation", "sun bathing", "coastal leisure"],
    "sports_equipment": ["game balls", "professional equipment", "athletic gear", "sports apparatus", "competition items"],
    "competitive_activities": ["team sports", "athletic contests", "competitive games", "sporting events", "professional matches"],
    "kitchen_equipment": ["professional appliances", "cooking stations", "preparation surfaces", "culinary tools", "industrial equipment"],
    "food_preparation": ["meal production", "culinary operations", "food service preparation", "commercial cooking", "kitchen workflow"],

    # Placeholders appearing in the aerial-view templates
    "pedestrian_pattern": ["scattered distribution of people", "organized flow of pedestrians", "clustered gatherings", "radial movement patterns", "linear procession of individuals"],
    "commercial_layout": ["parallel shopping streets", "interconnected shopping blocks", "radial marketplace design", "grid-like retail arrangement", "meandering commercial pathways"],
    "movement_pattern": ["circular crowd motion", "directional pedestrian flow", "scattered individual movement", "converging foot traffic", "diverging pedestrian patterns"],

    # Placeholders appearing in the night-market, temple and European-plaza templates
    "stall_elements": ["food vendors with steaming woks", "trinket sellers with colorful displays", "lantern-lit stalls", "bamboo-framed shops", "canvas-covered market stands"],
    "cultural_lighting": ["paper lantern illumination", "neon character signs", "strung festival lights", "hanging light chains", "colorful shop front lighting"],
    "cultural_symbols": ["dharma wheels", "lotus motifs", "yin-yang symbols", "zodiac animal representations", "traditional calligraphy"],
    "architectural_style": ["Baroque facades", "Gothic spires", "Renaissance colonnades", "Neoclassical pediments", "Medieval archways"],
    "european_features": ["cobblestone paving", "ornate fountains", "bronze statuary", "wrought iron lampposts", "cafe terraces"],

    # Placeholders appearing in the nighttime templates
    "lighting_effects": ["streetlamp pools of light", "neon sign glow", "illuminated window squares", "headlight streams", "traffic signal flashes"],
    "illuminated_elements": ["lit storefront windows", "glowing traffic signals", "illuminated advertising", "headlight-lit streets", "backlit silhouettes"],
    "neon_elements": ["colorful shop signs", "animated light displays", "illuminated brand logos", "glowing storefront outlines", "digital advertising screens"],
    "illuminated_signage": ["bright LED displays", "glowing brand names", "projected light advertisements", "illuminated menu boards", "digital information screens"],
    "colorful_lighting": ["multi-colored neon", "warm ambient illumination", "cool blue accent lights", "festive string lighting", "dynamic color-changing displays"],

    # Placeholders appearing in the cafe, stadium, construction, medical and educational templates
    "transitional_elements": ["retractable glass walls", "indoor-outdoor bar counters", "terraced seating areas", "threshold planters", "partial canopy coverage"],
    "indoor_features": ["climate-controlled spaces", "soft seating arrangements", "interior decor accents", "mood lighting fixtures", "sound-dampened areas"],
    "outdoor_setting": ["sidewalk tables", "patio seating", "garden furniture", "open-air counters", "courtyard arrangements"],
    "seating_arrangement": ["tiered spectator stands", "premium viewing boxes", "courtside seating", "general admission benches", "stadium chair rows"],
    "playing_surface": ["marked court boundaries", "manicured field turf", "running tracks", "competition equipment", "sports field markers"],
    "construction_equipment": ["tower cranes", "excavators", "cement mixers", "scaffolding structures", "construction barriers"],
    "medical_elements": ["examination furniture", "monitoring equipment", "sanitation stations", "privacy screens", "medical supply carts"],
    "educational_furniture": ["student desks", "lecture podiums", "laboratory benches", "learning stations", "collaborative workspace tables"],
}

In [None]:
# %%writefile safety_templates.py
# One-sentence advisory skeletons keyed by message category:
# "general", "warning", "notice". Each skeleton has exactly one placeholder
# ({safety_element}, {hazard}, {element_of_interest} respectively).
SAFETY_TEMPLATES = dict(
    general="Pay attention to {safety_element}.",
    warning="Be cautious of {hazard} in this environment.",
    notice="Note the presence of {element_of_interest}.",
)

In [None]:
# %%writefile activity_templates.py

ACTIVITY_TEMPLATES = {
            "living_room": [
                "Watching TV",
                "Relaxing on the sofa",
                "Reading",
                "Socializing"
            ],
            "bedroom": [
                "Sleeping",
                "Resting",
                "Getting dressed",
                "Reading in bed"
            ],
            "dining_area": [
                "Eating a meal",
                "Having a conversation",
                "Working at table"
            ],
            "kitchen": [
                "Cooking",
                "Food preparation",
                "Cleaning dishes"
            ],
            "office_workspace": [
                "Working on computer",
                "Office work",
                "Virtual meetings",
                "Reading documents"
            ],
            "meeting_room": [
                "Group meeting",
                "Presentation",
                "Team discussion",
                "Collaboration"
            ],
            "city_street": [
                "Walking",
                "Commuting",
                "Shopping",
                "Waiting for transportation"
            ],
            "parking_lot": [
                "Parking vehicles",
                "Loading/unloading items",
                "Entering/exiting vehicles"
            ],
            "park_area": [
                "Walking",
                "Relaxing outdoors",
                "Exercising",
                "Social gathering"
            ],
            "retail_store": [
                "Shopping",
                "Browsing products",
                "Purchasing items"
            ],
            "supermarket": [
                "Grocery shopping",
                "Selecting products",
                "Checking out"
            ],
            "upscale_dining": [
                "Fine dining",
                "Social gathering",
                "Special occasion meal",
                "Family dinner",
                "Business meeting",
                "Celebratory meal"
            ],
            "asian_commercial_street": [
                "Shopping",
                "Sightseeing",
                "Walking to destinations",
                "Visiting local shops",
                "Cultural exploration",
                "Urban commuting",
                "Meeting friends"
            ],
            "financial_district": [
                "Commuting",
                "Business travel",
                "Urban transit",
                "Sightseeing",
                "City navigation",
                "Professional activities",
                "Corporate meetings"
            ],
            "urban_intersection": [
                "Street crossing",
                "Waiting for signals",
                "Urban navigation",
                "Commuting",
                "Group movement",
                "Following traffic patterns",
                "Pedestrian coordination"
            ],
            "transit_hub": [
                "Commuting",
                "Waiting for transportation",
                "Transferring between vehicles",
                "Starting/ending journeys",
                "Meeting travelers",
                "Checking transit schedules",
                "Urban transportation"
            ],
            "shopping_district": [
                "Retail shopping",
                "Window browsing",
                "Social shopping",
                "Product comparison",
                "Making purchases",
                "Brand exploration",
                "Recreational shopping"
            ],
            "bus_stop": [
                "Waiting for the bus",
                "Checking schedules",
                "Boarding or alighting",
                "Standing under shelter"
            ],
            "bus_station": [
                "Navigating between platforms",
                "Handling luggage",
                "Boarding buses",
                "Gathering at waiting areas"
            ],
            "zoo": [
                "Watching animal exhibits",
                "Taking photos of wildlife",
                "Walking along enclosures",
                "Reading informational signs"
            ],
            "harbor": [
                "Observing docked boats",
                "Commuting by watercraft",
                "Loading or unloading cargo",
                "Strolling along the pier"
            ],
            "playground": [
                "Playing ball games",
                "Swinging or sliding",
                "Running around",
                "Socializing with friends"
            ],
            "sports_field": [
                "Practicing ball drills",
                "Competing in matches",
                "Warming up or stretching",
                "Team training sessions"
            ],
            "narrow_commercial_alley": [
                "Walking through alley",
                "Browsing storefronts",
                "Navigating light traffic",
                "Carrying shopping bags"
            ],
            "daytime_shopping_street": [
                "Shopping",
                "Window browsing",
                "Street photography",
                "Commuting by vehicle"
            ],
            "urban_pedestrian_crossing": [
                "Crossing the street",
                "Waiting for signal",
                "Following traffic rules",
                "Checking for vehicles"
            ],
            "aerial_view_intersection": [
                "Crossing multiple directions",
                "Following traffic signals",
                "Navigating pedestrian paths",
                "Traffic management",
                "Multi-directional movement",
                "Organized crossing patterns",
                "Waiting at signals"
            ],
            "aerial_view_commercial_area": [
                "Shopping district navigation",
                "Retail browsing",
                "Store-to-store movement",
                "Commercial zone foot traffic",
                "Shopping center traversal",
                "Retail area engagement",
                "Walking between stores"
            ],
            "aerial_view_plaza": [
                "Public gathering",
                "Open space traversal",
                "Community congregation",
                "Plaza navigation",
                "Public square activities",
                "Urban space utilization"
            ],
            "asian_night_market": [
                "Street food sampling",
                "Night market browsing",
                "Evening shopping",
                "Cultural food exploration",
                "Vendor interaction",
                "Social night dining",
                "Market stall hopping"
            ],
            "asian_temple_area": [
                "Temple visiting",
                "Cultural site exploration",
                "Spiritual observance",
                "Traditional rituals",
                "Historical site appreciation",
                "Religious tourism",
                "Cultural photography"
            ],
            "european_plaza": [
                "Urban sightseeing",
                "Historical appreciation",
                "Tourist photography",
                "Public space relaxation",
                "Casual strolling"
            ],
            "nighttime_street": [
                "Evening commuting",
                "Night walking",
                "After-hours travel",
                "Nighttime navigation",
                "Evening errands",
                "Late-night transportation",
                "Nocturnal urban movement"
            ],
            "nighttime_commercial_district": [
                "Evening shopping",
                "Nightlife participation",
                "Nighttime entertainment",
                "After-dark dining",
                "Evening social gathering",
                "Night market browsing",
                "Illumination appreciation"
            ],
            "indoor_outdoor_cafe": [
                "Al fresco dining",
                "Sidewalk coffee enjoyment",
                "Indoor-outdoor socializing",
                "Patio relaxation",
                "Open-air refreshment",
                "Transitional space usage",
                "Weather-dependent positioning"
            ],
            "transit_station_platform": [
                "Transit waiting",
                "Platform navigation",
                "Boarding preparation",
                "Arrival monitoring",
                "Schedule checking",
                "Departure positioning",
                "Platform traversal"
            ],
            "sports_stadium": [
                "Spectator viewing",
                "Sports fan cheering",
                "Game attendance",
                "Stadium navigation",
                "Athletic event watching",
                "Audience participation",
                "Sports appreciation"
            ],
            "construction_site": [
                "Construction work",
                "Building development",
                "Site management",
                "Material handling",
                "Construction supervision",
                "Safety monitoring",
                "Building process"
            ],
            "medical_facility": [
                "Healthcare consultation",
                "Medical treatment",
                "Patient waiting",
                "Healthcare delivery",
                "Medical examination",
                "Professional care",
                "Health monitoring"
            ],
            "educational_setting": [
                "Classroom learning",
                "Educational instruction",
                "Student participation",
                "Academic engagement",
                "Knowledge acquisition",
                "Educational discussion",
                "Scholastic activities"
            ],
            "beach_water_recreation": [
                "Surfing",
                "Sunbathing",
                "Beach volleyball",
                "Swimming",
                "Relaxing by the water",
                "Flying beach kites",
                "Beach picnicking",
                "Coastal walking"
            ],
            "sports_venue": [
                "Professional game playing",
                "Sports competition",
                "Athletic training",
                "Team practice",
                "Spectator viewing",
                "Sports coaching",
                "Tournament participation",
                "Athletic performance"
            ],
            "professional_kitchen": [
                "Professional cooking",
                "Food preparation",
                "Meal service coordination",
                "Kitchen operations",
                "Culinary production",
                "Chef activities",
                "Commercial food handling",
                "Restaurant meal preparation"
            ]
        }

In [None]:
# %%writefile object_categories.py
# Maps a category name to the list of detector class ids that belong to it.
# NOTE(review): the ids look like COCO indices as used by the pretrained YOLO
# models — confirm against the class_names mapping actually in use.
OBJECT_CATEGORIES = {
    "furniture": list(range(56, 62)),
    "electronics": list(range(62, 71)),
    "kitchen_items": list(range(39, 46)),
    "food": list(range(46, 56)),
    "vehicles": list(range(1, 9)),
    "personal_items": [*range(24, 29), 73, 78, 79],
}

In [None]:
# %%writefile lighting_conditions.py

# Text fragments describing lighting, keyed by lighting condition.
#
# Layout:
#   "time_descriptions":  condition -> {"general", "bright", "medium", "dim"} sentences
#                         ("unknown" only provides "general").
#   "template_modifiers": condition -> short adjective phrase.
#   "activity_modifiers": condition -> list of activity adjectives.
#   top-level condition keys ("indoor_commercial", "indoor_restaurant", "neon_night",
#   "stadium_lighting", "mixed_lighting"): same sentence schema as the entries of
#   "time_descriptions".
#
# NOTE(review): "template_modifiers" uses the keys "beach_lighting",
# "sports_venue_lighting" and "professional_kitchen_lighting" while
# "time_descriptions" uses "beach_daylight", "sports_arena" and "kitchen_working";
# confirm with the consumers which spelling is intended.
LIGHTING_CONDITIONS = {
    "time_descriptions": {
        "day_clear": {
            "general": "The scene is captured during clear daylight hours with bright natural lighting.",
            "bright": "The scene is brightly lit with strong, clear daylight.",
            "medium": "The scene is illuminated with moderate daylight under clear conditions.",
            "dim": "The scene is captured in soft daylight on a clear day."
        },
        "day_cloudy": {
            "general": "The scene is captured during daytime under overcast conditions.",
            "bright": "The scene has the diffused bright lighting of an overcast day.",
            "medium": "The scene has even, soft lighting typical of a cloudy day.",
            "dim": "The scene has the muted lighting of a heavily overcast day."
        },
        "sunset/sunrise": {
            "general": "The scene is captured during golden hour with warm lighting.",
            "bright": "The scene is illuminated with bright golden hour light with long shadows.",
            "medium": "The scene has the warm orange-yellow glow typical of sunset or sunrise.",
            "dim": "The scene has soft, warm lighting characteristic of early sunrise or late sunset."
        },
        "night": {
            "general": "The scene is captured at night with limited natural lighting.",
            "bright": "The scene is captured at night but well-lit with artificial lighting.",
            "medium": "The scene is captured at night with moderate artificial lighting.",
            "dim": "The scene is captured in low-light night conditions with minimal illumination."
        },
        "indoor_bright": {
            "general": "The scene is captured indoors with ample lighting.",
            "bright": "The indoor space is brightly lit, possibly with natural light from windows.",
            "medium": "The indoor space has good lighting conditions.",
            "dim": "The indoor space has adequate lighting."
        },
        "indoor_moderate": {
            "general": "The scene is captured indoors with moderate lighting.",
            "bright": "The indoor space has comfortable, moderate lighting.",
            "medium": "The indoor space has standard interior lighting.",
            "dim": "The indoor space has somewhat subdued lighting."
        },
        "indoor_dim": {
            "general": "The scene is captured indoors with dim or mood lighting.",
            "bright": "The indoor space has dim but sufficient lighting.",
            "medium": "The indoor space has low, atmospheric lighting.",
            "dim": "The indoor space has very dim, possibly mood-oriented lighting."
        },
        "beach_daylight": {
            "general": "The scene is captured during daytime at a beach with bright natural sunlight.",
            "bright": "The beach scene is intensely illuminated by direct sunlight.",
            "medium": "The coastal area has even natural daylight.",
            "dim": "The beach has softer lighting, possibly from a partially cloudy sky."
        },
        "sports_arena": {
            "general": "The scene is captured in a sports venue with specialized arena lighting.",
            "bright": "The sports facility is brightly illuminated with powerful overhead lights.",
            "medium": "The venue has standard sports event lighting providing clear visibility.",
            "dim": "The sports area has reduced illumination, possibly before or after an event."
        },
        "kitchen_working": {
            "general": "The scene is captured in a professional kitchen with task-oriented lighting.",
            "bright": "The kitchen is intensely illuminated with clear, functional lighting.",
            "medium": "The culinary space has standard working lights focused on preparation areas.",
            "dim": "The kitchen has reduced lighting, possibly during off-peak hours."
        },
        "unknown": {
            "general": "The lighting conditions in this scene are not easily determined."
        }
    },
    "template_modifiers": {
        "day_clear": "brightly-lit",
        "day_cloudy": "softly-lit",
        "sunset/sunrise": "warmly-lit",
        "night": "night-time",
        "indoor_bright": "well-lit indoor",
        "indoor_moderate": "indoor",
        "indoor_dim": "dimly-lit indoor",
        "indoor_commercial": "retail-lit",
        "indoor_restaurant": "atmospherically-lit",
        "neon_night": "neon-illuminated",
        "stadium_lighting": "flood-lit",
        "mixed_lighting": "transitionally-lit",
        "beach_lighting": "sun-drenched",
        "sports_venue_lighting": "arena-lit",
        "professional_kitchen_lighting": "kitchen-task lit",
        "unknown": ""
    },
    "activity_modifiers": {
        "day_clear": ["active", "lively", "busy"],
        "day_cloudy": ["calm", "relaxed", "casual"],
        "sunset/sunrise": ["peaceful", "transitional", "atmospheric"],
        "night": ["quiet", "subdued", "nocturnal"],
        "indoor_bright": ["focused", "productive", "engaged"],
        "indoor_moderate": ["comfortable", "social", "casual"],
        "indoor_dim": ["intimate", "relaxed", "private"],
        "indoor_commercial": ["shopping", "browsing", "consumer-oriented"],
        "indoor_restaurant": ["dining", "social", "culinary"],
        "neon_night": ["vibrant", "energetic", "night-life"],
        "stadium_lighting": ["event-focused", "spectator-oriented", "performance-based"],
        "mixed_lighting": ["transitional", "adaptable", "variable"],
        "unknown": []
    },
    "indoor_commercial": {
        "general": "The scene is captured inside a commercial setting with retail-optimized lighting.",
        "bright": "The space is brightly illuminated with commercial display lighting to highlight merchandise.",
        "medium": "The commercial interior has standard retail lighting that balances visibility and ambiance.",
        "dim": "The commercial space has subdued lighting creating an upscale or intimate shopping atmosphere."
    },
    "indoor_restaurant": {
        "general": "The scene is captured inside a restaurant with characteristic dining lighting.",
        "bright": "The restaurant is well-lit with clear illumination emphasizing food presentation.",
        "medium": "The dining space has moderate lighting striking a balance between functionality and ambiance.",
        "dim": "The restaurant features soft, low lighting creating an intimate dining atmosphere."
    },
    "neon_night": {
        "general": "The scene is captured at night with colorful neon lighting typical of entertainment districts.",
        "bright": "The night scene is illuminated by vibrant neon signs creating a lively, colorful atmosphere.",
        "medium": "The evening setting features moderate neon lighting creating a characteristic urban nightlife scene.",
        "dim": "The night area has subtle neon accents against the darkness, creating a moody urban atmosphere."
    },
    "stadium_lighting": {
        "general": "The scene is captured under powerful stadium lights designed for spectator events.",
        "bright": "The venue is intensely illuminated by stadium floodlights creating daylight-like conditions.",
        "medium": "The sports facility has standard event lighting providing clear visibility across the venue.",
        "dim": "The stadium has reduced illumination typical of pre-event or post-event conditions."
    },
    "mixed_lighting": {
        "general": "The scene features a mix of indoor and outdoor lighting creating transitional illumination.",
        "bright": "The space blends bright natural and artificial light sources across indoor-outdoor boundaries.",
        "medium": "The area combines moderate indoor lighting with outdoor illumination in a balanced way.",
        "dim": "The transition space features subtle lighting gradients between indoor and outdoor zones."
    }
}

# The five top-level description entries use the same schema as the entries of
# "time_descriptions" and have matching keys in the modifier tables, so mirror
# them there as well. The top-level keys are kept for existing lookups; copies
# are stored so that mutating one view does not silently change the other.
LIGHTING_CONDITIONS["time_descriptions"].update({
    condition: dict(LIGHTING_CONDITIONS[condition])
    for condition in (
        "indoor_commercial",
        "indoor_restaurant",
        "neon_night",
        "stadium_lighting",
        "mixed_lighting",
    )
})

In [None]:
# %%writefile viewpoint_templates.py

# Sentence fragments keyed by camera viewpoint. Each entry has a "prefix" and an
# "observation"; the observation text contains a {scene_elements} placeholder.
VIEWPOINT_TEMPLATES = dict(
    eye_level=dict(
        prefix="From a standard eye-level perspective, ",
        observation="the scene shows {scene_elements} arranged in a typical front-facing view.",
    ),
    aerial=dict(
        prefix="From an aerial perspective, ",
        observation="the scene shows {scene_elements} as viewed from above, revealing the spatial layout.",
    ),
    elevated=dict(
        prefix="From an elevated viewpoint, ",
        observation="the scene presents {scene_elements} with a slight downward angle.",
    ),
    low_angle=dict(
        prefix="From a low angle, ",
        observation="the scene depicts {scene_elements} from below, emphasizing vertical elements.",
    ),
)

In [None]:
# %%writefile cultural_templates.py

# Descriptive building blocks keyed by cultural context. Each entry lists typical
# "elements" and a "description" sentence containing an {elements} placeholder.
CULTURAL_TEMPLATES = dict(
    asian=dict(
        elements=["character signage", "lanterns", "dense urban layout"],
        description="The scene shows distinctive Asian cultural elements such as {elements}.",
    ),
    european=dict(
        elements=["classical architecture", "cobblestone streets", "café terraces"],
        description="The environment has European characteristics including {elements}.",
    ),
    middle_eastern=dict(
        elements=["ornate archways", "geometric patterns", "domed structures"],
        description="The scene contains Middle Eastern architectural features such as {elements}.",
    ),
    north_american=dict(
        elements=["grid street pattern", "modern skyscrapers", "wide boulevards"],
        description="The layout shows typical North American urban design with {elements}.",
    ),
)

In [None]:
# %%writefile spatial_analyzer.py

import os
import numpy as np
from typing import Dict, List, Tuple, Any, Optional

# from scene_type import SCENE_TYPES
# from enhance_scene_describer import EnhancedSceneDescriber

class SpatialAnalyzer:
    """
    Analyzes spatial relationships between objects in an image.
    Handles region assignment, object positioning, and functional zone identification.
    """

    def __init__(self, class_names: Dict[int, str] = None, object_categories=None):
        """
        Set up the 3x3 image-region grid and the helpers used for spatial analysis.

        Args:
            class_names: Mapping from class id to class name.
            object_categories: Mapping from category name to the class ids it
                contains; an empty dict is used when nothing (or a falsy value)
                is supplied.
        """
        # Grid line positions in normalized image coordinates.
        bounds = (0, 1/3, 2/3, 1)
        row_labels = ("top", "middle", "bottom")
        col_labels = ("left", "center", "right")

        # Every region is stored as (x_min, y_min, x_max, y_max), enumerated row by row.
        self.regions = {
            f"{row}_{col}": (bounds[c], bounds[r], bounds[c + 1], bounds[r + 1])
            for r, row in enumerate(row_labels)
            for c, col in enumerate(col_labels)
        }

        self.class_names = class_names
        self.OBJECT_CATEGORIES = object_categories if object_categories else {}
        self.enhance_descriptor = EnhancedSceneDescriber(scene_types=SCENE_TYPES)

        # Distance threshold for proximity analysis (normalized units)
        self.proximity_threshold = 0.2


    def _determine_region(self, x: float, y: float) -> str:
        """
        Determine which region a point falls into.

        Args:
            x: Normalized x-coordinate (0-1)
            y: Normalized y-coordinate (0-1)

        Returns:
            Region name
        """
        for region_name, (x1, y1, x2, y2) in self.regions.items():
            if x1 <= x < x2 and y1 <= y < y2:
                return region_name

        return "unknown"

    def _analyze_regions(self, detected_objects: List[Dict]) -> Dict:
        """
        Analyze object distribution across image regions.

        Args:
            detected_objects: List of detected objects with position information

        Returns:
            Dictionary with region analysis
        """
        # Count objects in each region
        region_counts = {region: 0 for region in self.regions.keys()}
        region_objects = {region: [] for region in self.regions.keys()}

        for obj in detected_objects:
            region = obj["region"]
            if region in region_counts:
                region_counts[region] += 1
                region_objects[region].append({
                    "class_id": obj["class_id"],
                    "class_name": obj["class_name"]
                })

        # Determine main focus regions (top 1-2 regions by object count)
        sorted_regions = sorted(region_counts.items(), key=lambda x: x[1], reverse=True)
        main_regions = [region for region, count in sorted_regions if count > 0][:2]

        return {
            "counts": region_counts,
            "main_focus": main_regions,
            "objects_by_region": region_objects
        }

    def _extract_detected_objects(self, detection_result: Any, confidence_threshold: float = 0.25) -> List[Dict]:
        """
        Extract detected objects from detection result with position information.

        Args:
            detection_result: Detection result from YOLOv8
            confidence_threshold: Minimum confidence threshold

        Returns:
            List of dictionaries with detected object information
        """
        boxes = detection_result.boxes.xyxy.cpu().numpy()
        classes = detection_result.boxes.cls.cpu().numpy().astype(int)
        confidences = detection_result.boxes.conf.cpu().numpy()

        # Image dimensions
        img_height, img_width = detection_result.orig_shape[:2]

        detected_objects = []
        for box, class_id, confidence in zip(boxes, classes, confidences):
            # Skip objects with confidence below threshold
            if confidence < confidence_threshold:
                continue

            x1, y1, x2, y2 = box
            width = x2 - x1
            height = y2 - y1

            # Center point
            center_x = (x1 + x2) / 2
            center_y = (y1 + y2) / 2

            # Normalized positions (0-1)
            norm_x = center_x / img_width
            norm_y = center_y / img_height
            norm_width = width / img_width
            norm_height = height / img_height

            # Area calculation
            area = width * height
            norm_area = area / (img_width * img_height)

            # Region determination
            object_region = self._determine_region(norm_x, norm_y)

            detected_objects.append({
                "class_id": int(class_id),
                "class_name": self.class_names[int(class_id)],
                "confidence": float(confidence),
                "box": [float(x1), float(y1), float(x2), float(y2)],
                "center": [float(center_x), float(center_y)],
                "normalized_center": [float(norm_x), float(norm_y)],
                "size": [float(width), float(height)],
                "normalized_size": [float(norm_width), float(norm_height)],
                "area": float(area),
                "normalized_area": float(norm_area),
                "region": object_region
            })

        return detected_objects


    def _detect_scene_viewpoint(self, detected_objects: List[Dict]) -> Dict:
        """
        檢測場景視角並識別特殊場景模式。

        Args:
            detected_objects: 檢測到的物體列表

        Returns:
            Dict: 包含視角和場景模式信息的字典
        """
        if not detected_objects:
            return {"viewpoint": "eye_level", "patterns": []}

        # 從物體位置中提取信息
        patterns = []

        # 檢測行人位置模式
        pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]

        # 檢查是否有足夠的行人來識別模式
        if len(pedestrian_objs) >= 4:
            pedestrian_positions = [obj["normalized_center"] for obj in pedestrian_objs]

            # 檢測十字交叉模式
            if self._detect_cross_pattern(pedestrian_positions):
                patterns.append("crosswalk_intersection")

            # 檢測多方向行人流
            directions = self._analyze_movement_directions(pedestrian_positions)
            if len(directions) >= 2:
                patterns.append("multi_directional_movement")

        # 檢查物體的大小一致性 - 在空中俯視圖中，物體大小通常更一致
        if len(detected_objects) >= 5:
            sizes = [obj.get("normalized_area", 0) for obj in detected_objects]
            size_variance = np.var(sizes) / (np.mean(sizes) ** 2)  # 標準化變異數，不會受到平均值影響

            if size_variance < 0.3:  # 低變異表示大小一致
                patterns.append("consistent_object_size")

        # 基本視角檢測
        viewpoint = self.enhance_descriptor._detect_viewpoint(detected_objects)

        # 根據檢測到的模式增強視角判斷
        if "crosswalk_intersection" in patterns and viewpoint != "aerial":
            # 如果檢測到斑馬線交叉但視角判斷不是空中視角，優先採用模式判斷
            viewpoint = "aerial"

        return {
            "viewpoint": viewpoint,
            "patterns": patterns
        }

    def _detect_cross_pattern(self, positions):
        """
        檢測位置中的十字交叉模式

        Args:
            positions: 位置列表 [[x1, y1], [x2, y2], ...]

        Returns:
            bool: 是否檢測到十字交叉模式
        """
        if len(positions) < 8:  # 需要足夠多的點
            return False

        # 提取 x 和 y 坐標
        x_coords = [pos[0] for pos in positions]
        y_coords = [pos[1] for pos in positions]

        # 檢測 x 和 y 方向的聚類
        x_clusters = []
        y_clusters = []

        # 簡化的聚類分析
        x_mean = np.mean(x_coords)
        y_mean = np.mean(y_coords)

        # 計算在中心線附近的點
        near_x_center = sum(1 for x in x_coords if abs(x - x_mean) < 0.1)
        near_y_center = sum(1 for y in y_coords if abs(y - y_mean) < 0.1)

        # 如果有足夠的點在中心線附近，可能是十字交叉
        return near_x_center >= 3 and near_y_center >= 3

    def _analyze_movement_directions(self, positions):
        """
        分析位置中的移動方向

        Args:
            positions: 位置列表 [[x1, y1], [x2, y2], ...]

        Returns:
            list: 檢測到的主要方向
        """
        if len(positions) < 6:
            return []

        # extract x 和 y 坐標
        x_coords = [pos[0] for pos in positions]
        y_coords = [pos[1] for pos in positions]

        directions = []

        # horizontal move (left --> right)
        x_std = np.std(x_coords)
        x_range = max(x_coords) - min(x_coords)

        # vertical move(up --> down)
        y_std = np.std(y_coords)
        y_range = max(y_coords) - min(y_coords)

        # 足夠大的範圍表示該方向有運動
        if x_range > 0.4:
            directions.append("horizontal")
        if y_range > 0.4:
            directions.append("vertical")

        return directions

    def _identify_functional_zones(self, detected_objects: List[Dict], scene_type: str) -> Dict:
        """
        Identify functional zones within the scene with improved detection for different viewpoints
        and cultural contexts.

        Args:
            detected_objects: List of detected objects
            scene_type: Identified scene type

        Returns:
            Dictionary of functional zones with their descriptions
        """
        # Group objects by category and region
        category_regions = {}

        for obj in detected_objects:
            # Find object category
            category = "other"
            for cat_name, cat_ids in self.OBJECT_CATEGORIES.items():
                if obj["class_id"] in cat_ids:
                    category = cat_name
                    break

            # Add to category-region mapping
            if category not in category_regions:
                category_regions[category] = {}

            region = obj["region"]
            if region not in category_regions[category]:
                category_regions[category][region] = []

            category_regions[category][region].append(obj)

        # Identify zones based on object groupings
        zones = {}

        # Detect viewpoint to adjust zone identification strategy
        viewpoint = self._detect_scene_viewpoint(detected_objects)

        # Choose appropriate zone identification strategy based on scene type and viewpoint
        if scene_type in ["living_room", "bedroom", "dining_area", "kitchen", "office_workspace", "meeting_room"]:
            # Indoor scenes
            zones.update(self._identify_indoor_zones(category_regions, detected_objects, scene_type))
        elif scene_type in ["city_street", "parking_lot", "park_area"]:
            # Outdoor general scenes
            zones.update(self._identify_outdoor_general_zones(category_regions, detected_objects, scene_type))
        elif "aerial" in scene_type or viewpoint == "aerial":
            # Aerial viewpoint scenes
            zones.update(self._identify_aerial_view_zones(category_regions, detected_objects, scene_type))
        elif "asian" in scene_type:
            # Asian cultural context scenes
            zones.update(self._identify_asian_cultural_zones(category_regions, detected_objects, scene_type))
        elif scene_type == "urban_intersection":
            # Specific urban intersection logic
            zones.update(self._identify_intersection_zones(category_regions, detected_objects, viewpoint))
        elif scene_type == "financial_district":
            # Financial district specific logic
            zones.update(self._identify_financial_district_zones(category_regions, detected_objects))
        elif scene_type == "upscale_dining":
            # Upscale dining specific logic
            zones.update(self._identify_upscale_dining_zones(category_regions, detected_objects))
        else:
            # Default zone identification for other scene types
            zones.update(self._identify_default_zones(category_regions, detected_objects))

        # If no zones were identified, try the default approach
        if not zones:
            zones.update(self._identify_default_zones(category_regions, detected_objects))

        return zones

    def _identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
        """
        Identify functional zones for indoor scenes.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects
            scene_type: Specific indoor scene type

        Returns:
            Dict: Indoor functional zones
        """
        zones = {}

        # Seating/social zone
        if "furniture" in category_regions:
            furniture_regions = category_regions["furniture"]
            main_furniture_region = max(furniture_regions.items(),
                                    key=lambda x: len(x[1]),
                                    default=(None, []))

            if main_furniture_region[0] is not None and len(main_furniture_region[1]) >= 2:
                zone_objects = [obj["class_name"] for obj in main_furniture_region[1]]
                zones["social_zone"] = {
                    "region": main_furniture_region[0],
                    "objects": zone_objects,
                    "description": f"Social or seating area with {', '.join(zone_objects)}"
                }

        # Entertainment zone
        if "electronics" in category_regions:
            electronics_items = []
            for region_objects in category_regions["electronics"].values():
                electronics_items.extend([obj["class_name"] for obj in region_objects])

            if electronics_items:
                zones["entertainment_zone"] = {
                    "region": self._find_main_region(category_regions.get("electronics", {})),
                    "objects": electronics_items,
                    "description": f"Entertainment or media area with {', '.join(electronics_items)}"
                }

        # Dining/food zone
        food_zone_categories = ["kitchen_items", "food"]
        food_items = []
        food_regions = {}

        for category in food_zone_categories:
            if category in category_regions:
                for region, objects in category_regions[category].items():
                    if region not in food_regions:
                        food_regions[region] = []
                    food_regions[region].extend(objects)
                    food_items.extend([obj["class_name"] for obj in objects])

        if food_items:
            main_food_region = max(food_regions.items(),
                                key=lambda x: len(x[1]),
                                default=(None, []))

            if main_food_region[0] is not None:
                zones["dining_zone"] = {
                    "region": main_food_region[0],
                    "objects": list(set(food_items)),
                    "description": f"Dining or food preparation area with {', '.join(list(set(food_items))[:3])}"
                }

        # Work/study zone - enhanced to detect even when scene_type is not explicitly office
        work_items = []
        work_regions = {}

        for obj in detected_objects:
            if obj["class_id"] in [56, 60, 63, 64, 66, 73]:  # chair, table, laptop, mouse, keyboard, book
                region = obj["region"]
                if region not in work_regions:
                    work_regions[region] = []
                work_regions[region].append(obj)
                work_items.append(obj["class_name"])

        # Check for laptop and table/chair combinations that suggest a workspace
        has_laptop = any(obj["class_id"] == 63 for obj in detected_objects)
        has_keyboard = any(obj["class_id"] == 66 for obj in detected_objects)
        has_table = any(obj["class_id"] == 60 for obj in detected_objects)
        has_chair = any(obj["class_id"] == 56 for obj in detected_objects)

        # If we have electronics with furniture in the same region, likely a workspace
        workspace_detected = (has_laptop or has_keyboard) and (has_table or has_chair)

        if (workspace_detected or scene_type in ["office_workspace", "meeting_room"]) and work_items:
            main_work_region = max(work_regions.items(),
                                key=lambda x: len(x[1]),
                                default=(None, []))

            if main_work_region[0] is not None:
                zones["workspace_zone"] = {
                    "region": main_work_region[0],
                    "objects": list(set(work_items)),
                    "description": f"Work or study area with {', '.join(list(set(work_items))[:3])}"
                }

        # Bedroom-specific zones
        if scene_type == "bedroom":
            bed_objects = [obj for obj in detected_objects if obj["class_id"] == 59]  # Bed
            if bed_objects:
                bed_region = bed_objects[0]["region"]
                zones["sleeping_zone"] = {
                    "region": bed_region,
                    "objects": ["bed"],
                    "description": "Sleeping area with bed"
                }

        # Kitchen-specific zones
        if scene_type == "kitchen":
            # Look for appliances (refrigerator, oven, microwave, sink)
            appliance_ids = [68, 69, 71, 72]  # microwave, oven, sink, refrigerator
            appliance_objects = [obj for obj in detected_objects if obj["class_id"] in appliance_ids]

            if appliance_objects:
                appliance_regions = {}
                for obj in appliance_objects:
                    region = obj["region"]
                    if region not in appliance_regions:
                        appliance_regions[region] = []
                    appliance_regions[region].append(obj)

                if appliance_regions:
                    main_appliance_region = max(appliance_regions.items(),
                                            key=lambda x: len(x[1]),
                                            default=(None, []))

                    if main_appliance_region[0] is not None:
                        appliance_names = [obj["class_name"] for obj in main_appliance_region[1]]
                        zones["kitchen_appliance_zone"] = {
                            "region": main_appliance_region[0],
                            "objects": appliance_names,
                            "description": f"Kitchen appliance area with {', '.join(appliance_names)}"
                        }

        return zones

    def _identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
        """
        Identify functional zones for urban intersections with enhanced spatial awareness.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects
            viewpoint: Detected viewpoint

        Returns:
            Dict: Refined intersection functional zones
        """
        zones = {}

        # Get pedestrians, vehicles and traffic signals
        pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
        vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 7]]  # bicycle, car, motorcycle, bus, truck
        traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]

        # Create distribution maps for better spatial understanding
        regions_distribution = self._create_distribution_map(detected_objects)

        # Analyze pedestrian crossing patterns
        crossing_zones = self._analyze_crossing_patterns(pedestrian_objs, traffic_light_objs, regions_distribution)
        zones.update(crossing_zones)

        # Analyze vehicle traffic zones with directional awareness
        traffic_zones = self._analyze_traffic_zones(vehicle_objs, regions_distribution)
        zones.update(traffic_zones)

        # Identify traffic control zones based on signal placement
        if traffic_light_objs:
            # Group traffic lights by region for better organization
            signal_regions = {}
            for obj in traffic_light_objs:
                region = obj["region"]
                if region not in signal_regions:
                    signal_regions[region] = []
                signal_regions[region].append(obj)

            # Create traffic control zones for each region with signals
            for idx, (region, signals) in enumerate(signal_regions.items()):
                # Check if this region has a directional name
                direction = self._get_directional_description(region)

                zones[f"traffic_control_zone_{idx+1}"] = {
                    "region": region,
                    "objects": ["traffic light"] * len(signals),
                    "description": f"Traffic control area with {len(signals)} traffic signals" +
                                (f" in {direction} area" if direction else "")
                }

        return zones

    def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict],
                                region_distribution: Dict) -> Dict:
        """
        Analyze pedestrian crossing patterns to identify crosswalk zones.

        Args:
            pedestrians: List of pedestrian objects
            traffic_lights: List of traffic light objects
            region_distribution: Distribution of objects by region

        Returns:
            Dict: Identified crossing zones
        """
        crossing_zones = {}

        if not pedestrians:
            return crossing_zones

        # Group pedestrians by region
        pedestrian_regions = {}
        for p in pedestrians:
            region = p["region"]
            if region not in pedestrian_regions:
                pedestrian_regions[region] = []
            pedestrian_regions[region].append(p)

        # Sort regions by pedestrian count to find main crossing areas
        sorted_regions = sorted(pedestrian_regions.items(), key=lambda x: len(x[1]), reverse=True)

        # Create crossing zones for regions with pedestrians
        for idx, (region, peds) in enumerate(sorted_regions[:2]):  # Focus on top 2 regions
            # Check if there are traffic lights nearby to indicate a crosswalk
            has_nearby_signals = any(t["region"] == region for t in traffic_lights)

            # Create crossing zone with descriptive naming
            zone_name = f"crossing_zone_{idx+1}"
            direction = self._get_directional_description(region)

            description = f"Pedestrian crossing area with {len(peds)} "
            description += "person" if len(peds) == 1 else "people"
            if direction:
                description += f" in {direction} direction"
            if has_nearby_signals:
                description += " near traffic signals"

            crossing_zones[zone_name] = {
                "region": region,
                "objects": ["pedestrian"] * len(peds),
                "description": description
            }

        return crossing_zones

    def _analyze_traffic_zones(self, vehicles: List[Dict], region_distribution: Dict) -> Dict:
        """
        Analyze vehicle distribution to identify traffic zones with directional awareness.

        Args:
            vehicles: List of vehicle objects
            region_distribution: Distribution of objects by region

        Returns:
            Dict: Identified traffic zones
        """
        traffic_zones = {}

        if not vehicles:
            return traffic_zones

        # Group vehicles by region
        vehicle_regions = {}
        for v in vehicles:
            region = v["region"]
            if region not in vehicle_regions:
                vehicle_regions[region] = []
            vehicle_regions[region].append(v)

        # Create traffic zones for regions with vehicles
        main_traffic_region = max(vehicle_regions.items(), key=lambda x: len(x[1]), default=(None, []))

        if main_traffic_region[0] is not None:
            region = main_traffic_region[0]
            vehicles_in_region = main_traffic_region[1]

            # Get a list of vehicle types for description
            vehicle_types = [v["class_name"] for v in vehicles_in_region]
            unique_types = list(set(vehicle_types))

            # Get directional description
            direction = self._get_directional_description(region)

            # Create descriptive zone
            traffic_zones["vehicle_zone"] = {
                "region": region,
                "objects": vehicle_types,
                "description": f"Vehicle traffic area with {', '.join(unique_types[:3])}" +
                            (f" in {direction} area" if direction else "")
            }

            # If vehicles are distributed across multiple regions, create secondary zones
            if len(vehicle_regions) > 1:
                # Get second most populated region
                sorted_regions = sorted(vehicle_regions.items(), key=lambda x: len(x[1]), reverse=True)
                if len(sorted_regions) > 1:
                    second_region, second_vehicles = sorted_regions[1]
                    direction = self._get_directional_description(second_region)
                    vehicle_types = [v["class_name"] for v in second_vehicles]
                    unique_types = list(set(vehicle_types))

                    traffic_zones["secondary_vehicle_zone"] = {
                        "region": second_region,
                        "objects": vehicle_types,
                        "description": f"Secondary traffic area with {', '.join(unique_types[:2])}" +
                                    (f" in {direction} direction" if direction else "")
                    }

        return traffic_zones

    def _get_directional_description(self, region: str) -> str:
        """
        Convert region name to a directional description.

        Args:
            region: Region name from the grid

        Returns:
            str: Directional description
        """
        if "top" in region and "left" in region:
            return "northwest"
        elif "top" in region and "right" in region:
            return "northeast"
        elif "bottom" in region and "left" in region:
            return "southwest"
        elif "bottom" in region and "right" in region:
            return "southeast"
        elif "top" in region:
            return "north"
        elif "bottom" in region:
            return "south"
        elif "left" in region:
            return "west"
        elif "right" in region:
            return "east"
        else:
            return "central"

    def _create_distribution_map(self, detected_objects: List[Dict]) -> Dict:
        """
        Create a distribution map of objects across regions for spatial analysis.

        Args:
            detected_objects: List of detected objects

        Returns:
            Dict: Distribution map of objects by region and class
        """
        distribution = {}

        # Initialize all regions
        for region in self.regions.keys():
            distribution[region] = {
                "total": 0,
                "objects": {},
                "density": 0
            }

        # Populate the distribution
        for obj in detected_objects:
            region = obj["region"]
            class_id = obj["class_id"]
            class_name = obj["class_name"]

            distribution[region]["total"] += 1

            if class_id not in distribution[region]["objects"]:
                distribution[region]["objects"][class_id] = {
                    "name": class_name,
                    "count": 0,
                    "positions": []
                }

            distribution[region]["objects"][class_id]["count"] += 1

            # Store position for spatial relationship analysis
            if "normalized_center" in obj:
                distribution[region]["objects"][class_id]["positions"].append(obj["normalized_center"])

        # Calculate object density for each region
        for region, data in distribution.items():
            # Assuming all regions are equal size in the grid
            data["density"] = data["total"] / 1

        return distribution

    def _identify_asian_cultural_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
        """
        Identify functional zones for scenes with Asian cultural context.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects
            scene_type: Specific scene type

        Returns:
            Dict: Asian cultural functional zones
        """
        zones = {}

        # Identify storefront zone
        storefront_items = []
        storefront_regions = {}

        # Since storefronts aren't directly detectable, infer from context
        # For example, look for regions with signs, people, and smaller objects
        sign_regions = set()
        for obj in detected_objects:
            if obj["class_id"] == 0:  # Person
                region = obj["region"]
                if region not in storefront_regions:
                    storefront_regions[region] = []
                storefront_regions[region].append(obj)

                # Add regions with people as potential storefront areas
                sign_regions.add(region)

        # Use the areas with most people as storefront zones
        if storefront_regions:
            main_storefront_regions = sorted(storefront_regions.items(),
                                        key=lambda x: len(x[1]),
                                        reverse=True)[:2]  # Top 2 regions

            for idx, (region, objs) in enumerate(main_storefront_regions):
                zones[f"commercial_zone_{idx+1}"] = {
                    "region": region,
                    "objects": [obj["class_name"] for obj in objs],
                    "description": f"Asian commercial storefront with pedestrian activity"
                }

        # Identify pedestrian pathway - enhanced to better detect linear pathways
        pathway_items = []
        pathway_regions = {}

        # Extract people for pathway analysis
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]

        # Analyze if people form a line (typical of shopping streets)
        people_positions = [obj["normalized_center"] for obj in people_objs]

        structured_path = False
        if len(people_positions) >= 3:
            # Check if people are arranged along a similar y-coordinate (horizontal path)
            y_coords = [pos[1] for pos in people_positions]
            y_mean = sum(y_coords) / len(y_coords)
            y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)

            horizontal_path = y_variance < 0.05  # Low variance indicates horizontal alignment

            # Check if people are arranged along a similar x-coordinate (vertical path)
            x_coords = [pos[0] for pos in people_positions]
            x_mean = sum(x_coords) / len(x_coords)
            x_variance = sum((x - x_mean)**2 for x in x_coords) / len(x_coords)

            vertical_path = x_variance < 0.05  # Low variance indicates vertical alignment

            structured_path = horizontal_path or vertical_path
            path_direction = "horizontal" if horizontal_path else "vertical" if vertical_path else "meandering"

        # Collect pathway objects (people, bicycles, motorcycles in middle area)
        for obj in detected_objects:
            if obj["class_id"] in [0, 1, 3]:  # Person, bicycle, motorcycle
                y_pos = obj["normalized_center"][1]
                # Group by vertical position (middle of image likely pathway)
                if 0.25 <= y_pos <= 0.75:
                    region = obj["region"]
                    if region not in pathway_regions:
                        pathway_regions[region] = []
                    pathway_regions[region].append(obj)
                    pathway_items.append(obj["class_name"])

        if pathway_items:
            path_desc = "Pedestrian walkway with people moving through the commercial area"
            if structured_path:
                path_desc = f"{path_direction.capitalize()} pedestrian walkway with organized foot traffic"

            zones["pedestrian_pathway"] = {
                "region": "middle_center",  # Assumption: pathway often in middle
                "objects": list(set(pathway_items)),
                "description": path_desc
            }

        # Identify vendor zone (small stalls/shops - inferred from context)
        has_small_objects = any(obj["class_id"] in [24, 26, 39, 41] for obj in detected_objects)  # bags, bottles, cups
        has_people = any(obj["class_id"] == 0 for obj in detected_objects)

        if has_small_objects and has_people:
            # Likely vendor areas are where people and small objects cluster
            small_obj_regions = {}

            for obj in detected_objects:
                if obj["class_id"] in [24, 26, 39, 41, 67]:  # bags, bottles, cups, phones
                    region = obj["region"]
                    if region not in small_obj_regions:
                        small_obj_regions[region] = []
                    small_obj_regions[region].append(obj)

            if small_obj_regions:
                main_vendor_region = max(small_obj_regions.items(),
                                    key=lambda x: len(x[1]),
                                    default=(None, []))

                if main_vendor_region[0] is not None:
                    vendor_items = [obj["class_name"] for obj in main_vendor_region[1]]
                    zones["vendor_zone"] = {
                        "region": main_vendor_region[0],
                        "objects": list(set(vendor_items)),
                        "description": "Vendor or market stall area with small merchandise"
                    }

        # For night markets, identify illuminated zones
        if scene_type == "asian_night_market":
            # Night markets typically have bright spots for food stalls
            # This would be enhanced with lighting analysis integration
            zones["food_stall_zone"] = {
                "region": "middle_center",
                "objects": ["inferred food stalls"],
                "description": "Food stall area typical of Asian night markets"
            }

        return zones

    def _identify_upscale_dining_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
        """
        Identify functional zones for upscale dining settings.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects

        Returns:
            Dict: Upscale dining functional zones
        """
        zones = {}

        # Identify dining table zone
        dining_items = []
        dining_regions = {}

        for obj in detected_objects:
            if obj["class_id"] in [40, 41, 42, 43, 44, 45, 60]:  # Wine glass, cup, fork, knife, spoon, bowl, table
                region = obj["region"]
                if region not in dining_regions:
                    dining_regions[region] = []
                dining_regions[region].append(obj)
                dining_items.append(obj["class_name"])

        if dining_items:
            main_dining_region = max(dining_regions.items(),
                                key=lambda x: len(x[1]),
                                default=(None, []))

            if main_dining_region[0] is not None:
                zones["formal_dining_zone"] = {
                    "region": main_dining_region[0],
                    "objects": list(set(dining_items)),
                    "description": f"Formal dining area with {', '.join(list(set(dining_items))[:3])}"
                }

        # Identify decorative zone with enhanced detection
        decor_items = []
        decor_regions = {}

        # Look for decorative elements (vases, wine glasses, unused dishes)
        for obj in detected_objects:
            if obj["class_id"] in [75, 40]:  # Vase, wine glass
                region = obj["region"]
                if region not in decor_regions:
                    decor_regions[region] = []
                decor_regions[region].append(obj)
                decor_items.append(obj["class_name"])

        if decor_items:
            main_decor_region = max(decor_regions.items(),
                                key=lambda x: len(x[1]),
                                default=(None, []))

            if main_decor_region[0] is not None:
                zones["decorative_zone"] = {
                    "region": main_decor_region[0],
                    "objects": list(set(decor_items)),
                    "description": f"Decorative area with {', '.join(list(set(decor_items)))}"
                }

        # Identify seating arrangement zone
        chairs = [obj for obj in detected_objects if obj["class_id"] == 56]  # chairs
        if len(chairs) >= 2:
            chair_regions = {}
            for obj in chairs:
                region = obj["region"]
                if region not in chair_regions:
                    chair_regions[region] = []
                chair_regions[region].append(obj)

            if chair_regions:
                main_seating_region = max(chair_regions.items(),
                                    key=lambda x: len(x[1]),
                                    default=(None, []))

                if main_seating_region[0] is not None:
                    zones["dining_seating_zone"] = {
                        "region": main_seating_region[0],
                        "objects": ["chair"] * len(main_seating_region[1]),
                        "description": f"Formal dining seating arrangement with {len(main_seating_region[1])} chairs"
                    }

        # Identify serving area (if different from dining area)
        serving_items = []
        serving_regions = {}

        # Serving areas might have bottles, bowls, containers
        for obj in detected_objects:
            if obj["class_id"] in [39, 45]:  # Bottle, bowl
                # Check if it's in a different region from the main dining table
                if "formal_dining_zone" in zones and obj["region"] != zones["formal_dining_zone"]["region"]:
                    region = obj["region"]
                    if region not in serving_regions:
                        serving_regions[region] = []
                    serving_regions[region].append(obj)
                    serving_items.append(obj["class_name"])

        if serving_items:
            main_serving_region = max(serving_regions.items(),
                                key=lambda x: len(x[1]),
                                default=(None, []))

            if main_serving_region[0] is not None:
                zones["serving_zone"] = {
                    "region": main_serving_region[0],
                    "objects": list(set(serving_items)),
                    "description": f"Serving or sideboard area with {', '.join(list(set(serving_items)))}"
                }

        return zones

    def _identify_financial_district_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
        """
        Identify functional zones for financial district scenes.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects

        Returns:
            Dict: Financial district functional zones
        """
        zones = {}

        # Identify traffic zone
        traffic_items = []
        traffic_regions = {}

        for obj in detected_objects:
            if obj["class_id"] in [1, 2, 3, 5, 6, 7, 9]:  # Various vehicles and traffic lights
                region = obj["region"]
                if region not in traffic_regions:
                    traffic_regions[region] = []
                traffic_regions[region].append(obj)
                traffic_items.append(obj["class_name"])

        if traffic_items:
            main_traffic_region = max(traffic_regions.items(),
                                key=lambda x: len(x[1]),
                                default=(None, []))

            if main_traffic_region[0] is not None:
                zones["traffic_zone"] = {
                    "region": main_traffic_region[0],
                    "objects": list(set(traffic_items)),
                    "description": f"Urban traffic area with {', '.join(list(set(traffic_items))[:3])}"
                }

        # Building zones on the sides (inferred from scene context)
        # Enhanced to check if there are actual regions that might contain buildings
        # Check for regions without vehicles or pedestrians - likely building areas
        left_side_regions = ["top_left", "middle_left", "bottom_left"]
        right_side_regions = ["top_right", "middle_right", "bottom_right"]

        # Check left side
        left_building_evidence = True
        for region in left_side_regions:
            # If many vehicles or people in this region, less likely to be buildings
            vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
                                for obj in detected_objects)
            people_in_region = any(obj["region"] == region and obj["class_id"] == 0
                                for obj in detected_objects)

            if vehicle_in_region or people_in_region:
                left_building_evidence = False
                break

        # Check right side
        right_building_evidence = True
        for region in right_side_regions:
            # If many vehicles or people in this region, less likely to be buildings
            vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
                                for obj in detected_objects)
            people_in_region = any(obj["region"] == region and obj["class_id"] == 0
                                for obj in detected_objects)

            if vehicle_in_region or people_in_region:
                right_building_evidence = False
                break

        # Add building zones if evidence supports them
        if left_building_evidence:
            zones["building_zone_left"] = {
                "region": "middle_left",
                "objects": ["building"],  # Inferred
                "description": "Tall buildings line the left side of the street"
            }

        if right_building_evidence:
            zones["building_zone_right"] = {
                "region": "middle_right",
                "objects": ["building"],  # Inferred
                "description": "Tall buildings line the right side of the street"
            }

        # Identify pedestrian zone if people are present
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
        if people_objs:
            people_regions = {}
            for obj in people_objs:
                region = obj["region"]
                if region not in people_regions:
                    people_regions[region] = []
                people_regions[region].append(obj)

            if people_regions:
                main_pedestrian_region = max(people_regions.items(),
                                        key=lambda x: len(x[1]),
                                        default=(None, []))

                if main_pedestrian_region[0] is not None:
                    zones["pedestrian_zone"] = {
                        "region": main_pedestrian_region[0],
                        "objects": ["person"] * len(main_pedestrian_region[1]),
                        "description": f"Pedestrian area with {len(main_pedestrian_region[1])} people navigating the financial district"
                    }

        return zones

    def _identify_aerial_view_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
        """
        Identify functional zones for scenes viewed from an aerial perspective.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects
            scene_type: Specific scene type

        Returns:
            Dict: Aerial view functional zones
        """
        zones = {}

        # For aerial views, we focus on patterns and flows rather than specific zones

        # Identify pedestrian patterns
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
        if people_objs:
            # Convert positions to arrays for pattern analysis
            positions = np.array([obj["normalized_center"] for obj in people_objs])

            if len(positions) >= 3:
                # Calculate distribution metrics
                x_coords = positions[:, 0]
                y_coords = positions[:, 1]

                x_mean = np.mean(x_coords)
                y_mean = np.mean(y_coords)
                x_std = np.std(x_coords)
                y_std = np.std(y_coords)

                # Determine if people are organized in a linear pattern
                if x_std < 0.1 or y_std < 0.1:
                    # Linear distribution along one axis
                    pattern_direction = "vertical" if x_std < y_std else "horizontal"

                    zones["pedestrian_pattern"] = {
                        "region": "central",
                        "objects": ["person"] * len(people_objs),
                        "description": f"Aerial view shows a {pattern_direction} pedestrian movement pattern"
                    }
                else:
                    # More dispersed pattern
                    zones["pedestrian_distribution"] = {
                        "region": "wide",
                        "objects": ["person"] * len(people_objs),
                        "description": f"Aerial view shows pedestrians distributed across the area"
                    }

        # Identify vehicle patterns for traffic analysis
        vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
        if vehicle_objs:
            # Convert positions to arrays for pattern analysis
            positions = np.array([obj["normalized_center"] for obj in vehicle_objs])

            if len(positions) >= 2:
                # Calculate distribution metrics
                x_coords = positions[:, 0]
                y_coords = positions[:, 1]

                x_mean = np.mean(x_coords)
                y_mean = np.mean(y_coords)
                x_std = np.std(x_coords)
                y_std = np.std(y_coords)

                # Determine if vehicles are organized in lanes
                if x_std < y_std * 0.5:
                    # Vehicles aligned vertically - indicates north-south traffic
                    zones["vertical_traffic_flow"] = {
                        "region": "central_vertical",
                        "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
                        "description": "North-south traffic flow visible from aerial view"
                    }
                elif y_std < x_std * 0.5:
                    # Vehicles aligned horizontally - indicates east-west traffic
                    zones["horizontal_traffic_flow"] = {
                        "region": "central_horizontal",
                        "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
                        "description": "East-west traffic flow visible from aerial view"
                    }
                else:
                    # Vehicles in multiple directions - indicates intersection
                    zones["intersection_traffic"] = {
                        "region": "central",
                        "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
                        "description": "Multi-directional traffic at intersection visible from aerial view"
                    }

        # For intersection specific aerial views, identify crossing patterns
        if "intersection" in scene_type:
            # Check for traffic signals
            traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]
            if traffic_light_objs:
                zones["traffic_control_pattern"] = {
                    "region": "intersection",
                    "objects": ["traffic light"] * len(traffic_light_objs),
                    "description": f"Intersection traffic control with {len(traffic_light_objs)} signals visible from above"
                }

            # Crosswalks are inferred from context in aerial views
            zones["crossing_pattern"] = {
                "region": "central",
                "objects": ["inferred crosswalk"],
                "description": "Crossing pattern visible from aerial perspective"
            }

        # For plaza aerial views, identify gathering patterns
        if "plaza" in scene_type:
            # Plazas typically have central open area with people
            if people_objs:
                # Check if people are clustered in central region
                central_people = [obj for obj in people_objs
                                if "middle" in obj["region"]]

                if central_people:
                    zones["central_gathering"] = {
                        "region": "middle_center",
                        "objects": ["person"] * len(central_people),
                        "description": f"Central plaza gathering area with {len(central_people)} people viewed from above"
                    }

        return zones

    def _identify_outdoor_general_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
        """
        Identify functional zones for general outdoor scenes.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects
            scene_type: Specific outdoor scene type

        Returns:
            Dict: Outdoor functional zones
        """
        zones = {}

        # Identify pedestrian zones
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
        if people_objs:
            people_regions = {}
            for obj in people_objs:
                region = obj["region"]
                if region not in people_regions:
                    people_regions[region] = []
                people_regions[region].append(obj)

            if people_regions:
                # Find main pedestrian areas
                main_people_regions = sorted(people_regions.items(),
                                        key=lambda x: len(x[1]),
                                        reverse=True)[:2]  # Top 2 regions

                for idx, (region, objs) in enumerate(main_people_regions):
                    if len(objs) > 0:
                        zones[f"pedestrian_zone_{idx+1}"] = {
                            "region": region,
                            "objects": ["person"] * len(objs),
                            "description": f"Pedestrian area with {len(objs)} {'people' if len(objs) > 1 else 'person'}"
                        }

        # Identify vehicle zones for streets and parking lots
        vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
        if vehicle_objs:
            vehicle_regions = {}
            for obj in vehicle_objs:
                region = obj["region"]
                if region not in vehicle_regions:
                    vehicle_regions[region] = []
                vehicle_regions[region].append(obj)

            if vehicle_regions:
                main_vehicle_region = max(vehicle_regions.items(),
                                    key=lambda x: len(x[1]),
                                    default=(None, []))

                if main_vehicle_region[0] is not None:
                    vehicle_types = [obj["class_name"] for obj in main_vehicle_region[1]]
                    zones["vehicle_zone"] = {
                        "region": main_vehicle_region[0],
                        "objects": vehicle_types,
                        "description": f"Traffic area with {', '.join(list(set(vehicle_types))[:3])}"
                    }

        # For park areas, identify recreational zones
        if scene_type == "park_area":
            # Look for recreational objects (sports balls, kites, etc.)
            rec_items = []
            rec_regions = {}

            for obj in detected_objects:
                if obj["class_id"] in [32, 33, 34, 35, 38]:  # sports ball, kite, baseball bat, glove, tennis racket
                    region = obj["region"]
                    if region not in rec_regions:
                        rec_regions[region] = []
                    rec_regions[region].append(obj)
                    rec_items.append(obj["class_name"])

            if rec_items:
                main_rec_region = max(rec_regions.items(),
                                key=lambda x: len(x[1]),
                                default=(None, []))

                if main_rec_region[0] is not None:
                    zones["recreational_zone"] = {
                        "region": main_rec_region[0],
                        "objects": list(set(rec_items)),
                        "description": f"Recreational area with {', '.join(list(set(rec_items)))}"
                    }

        # For parking lots, identify parking zones
        if scene_type == "parking_lot":
            # Look for parked cars with consistent spacing
            car_objs = [obj for obj in detected_objects if obj["class_id"] == 2]  # cars

            if len(car_objs) >= 3:
                # Check if cars are arranged in patterns (simplified)
                car_positions = [obj["normalized_center"] for obj in car_objs]

                # Check for row patterns by analyzing vertical positions
                y_coords = [pos[1] for pos in car_positions]
                y_clusters = {}

                # Simplified clustering - group cars by similar y-coordinates
                for i, y in enumerate(y_coords):
                    assigned = False
                    for cluster_y in y_clusters.keys():
                        if abs(y - cluster_y) < 0.1:  # Within 10% of image height
                            y_clusters[cluster_y].append(i)
                            assigned = True
                            break

                    if not assigned:
                        y_clusters[y] = [i]

                # If we have row patterns
                if max(len(indices) for indices in y_clusters.values()) >= 2:
                    zones["parking_row"] = {
                        "region": "central",
                        "objects": ["car"] * len(car_objs),
                        "description": f"Organized parking area with vehicles arranged in rows"
                    }
                else:
                    zones["parking_area"] = {
                        "region": "wide",
                        "objects": ["car"] * len(car_objs),
                        "description": f"Parking area with {len(car_objs)} vehicles"
                    }

        return zones

    def _identify_default_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
        """
        Identify general functional zones when no specific scene type is matched.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects

        Returns:
            Dict: Default functional zones
        """
        zones = {}

        # Group objects by category and find main concentrations
        for category, regions in category_regions.items():
            if not regions:
                continue

            # Find region with most objects in this category
            main_region = max(regions.items(),
                        key=lambda x: len(x[1]),
                        default=(None, []))

            if main_region[0] is None or len(main_region[1]) < 2:
                continue

            # Create zone based on object category
            zone_objects = [obj["class_name"] for obj in main_region[1]]

            # Skip if too few objects
            if len(zone_objects) < 2:
                continue

            # Create appropriate zone name and description based on category
            if category == "furniture":
                zones["furniture_zone"] = {
                    "region": main_region[0],
                    "objects": zone_objects,
                    "description": f"Area with furniture including {', '.join(zone_objects[:3])}"
                }
            elif category == "electronics":
                zones["electronics_zone"] = {
                    "region": main_region[0],
                    "objects": zone_objects,
                    "description": f"Area with electronic devices including {', '.join(zone_objects[:3])}"
                }
            elif category == "kitchen_items":
                zones["dining_zone"] = {
                    "region": main_region[0],
                    "objects": zone_objects,
                    "description": f"Dining or food area with {', '.join(zone_objects[:3])}"
                }
            elif category == "vehicles":
                zones["vehicle_zone"] = {
                    "region": main_region[0],
                    "objects": zone_objects,
                    "description": f"Area with vehicles including {', '.join(zone_objects[:3])}"
                }
            elif category == "personal_items":
                zones["personal_items_zone"] = {
                    "region": main_region[0],
                    "objects": zone_objects,
                    "description": f"Area with personal items including {', '.join(zone_objects[:3])}"
                }

        # Check for people groups
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
        if len(people_objs) >= 2:
            people_regions = {}
            for obj in people_objs:
                region = obj["region"]
                if region not in people_regions:
                    people_regions[region] = []
                people_regions[region].append(obj)

            if people_regions:
                main_people_region = max(people_regions.items(),
                                    key=lambda x: len(x[1]),
                                    default=(None, []))

                if main_people_region[0] is not None:
                    zones["people_zone"] = {
                        "region": main_people_region[0],
                        "objects": ["person"] * len(main_people_region[1]),
                        "description": f"Area with {len(main_people_region[1])} people"
                    }

        return zones

    def _find_main_region(self, region_objects_dict: Dict) -> str:
        """Find the main region with the most objects"""
        if not region_objects_dict:
            return "unknown"

        return max(region_objects_dict.items(),
                key=lambda x: len(x[1]),
                default=("unknown", []))[0]

    def _find_main_region(self, region_objects_dict: Dict) -> str:
        """Find the main region with the most objects"""
        if not region_objects_dict:
            return "unknown"

        return max(region_objects_dict.items(),
                 key=lambda x: len(x[1]),
                 default=("unknown", []))[0]

In [None]:
# %%writefile enhance_scene_describer.py
import os
import re
import json
import random
import numpy as np
from typing import Dict, List, Tuple, Any, Optional

# from scene_type import SCENE_TYPES
# from scene_detail_templates import SCENE_DETAIL_TEMPLATES
# from object_template_fillers import OBJECT_TEMPLATE_FILLERS
# from lighting_conditions import LIGHTING_CONDITIONS
# from viewpoint_templates import VIEWPOINT_TEMPLATES
# from cultural_templates import CULTURAL_TEMPLATES
# from confifence_templates import CONFIDENCE_TEMPLATES

class EnhancedSceneDescriber:
    """
    Enhanced scene description generator with improved template handling,
    viewpoint awareness, and cultural context recognition.
    Provides detailed natural language descriptions of scenes based on
    detection results and scene classification.
    """

    def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None):
        """
        Initialize the enhanced scene describer.

        Args:
            templates_db: Optional custom templates database
            scene_types: Dictionary of scene type definitions
        """
        # Load or use provided scene types
        self.scene_types = scene_types or self._load_default_scene_types()

        # Load templates database
        self.templates = templates_db or self._load_templates()

        # Initialize viewpoint detection parameters
        self._initialize_viewpoint_parameters()

    def _load_default_scene_types(self) -> Dict:
        """
        Return the default scene type definitions.

        The definitions are read from the module-level ``SCENE_TYPES`` name,
        which must already be defined when this method runs (its import is
        commented out at the top of this cell).

        Returns:
            Dict: Scene type definitions
        """

        return SCENE_TYPES

    def _load_templates(self) -> Dict:
        """
        Assemble the template collections used to build descriptions.

        The collections come from module-level constants (SCENE_DETAIL_TEMPLATES,
        OBJECT_TEMPLATE_FILLERS, VIEWPOINT_TEMPLATES, CULTURAL_TEMPLATES and
        LIGHTING_CONDITIONS), which must already be defined when this runs.

        Returns:
            Dict: Template collections for different description components
        """
        templates = {
            # Collections taken as-is from the template constants
            "scene_detail_templates": SCENE_DETAIL_TEMPLATES,
            "object_template_fillers": OBJECT_TEMPLATE_FILLERS,
            "viewpoint_templates": VIEWPOINT_TEMPLATES,
            "cultural_templates": CULTURAL_TEMPLATES,
            # Lighting text: the "general" entry of every time-of-day description
            "lighting_templates": {
                time_key: time_data["general"]
                for time_key, time_data in LIGHTING_CONDITIONS.get("time_descriptions", {}).items()
            },
            # Default confidence wording
            "confidence_templates": {
                "high": "{description} {details}",
                "medium": "This appears to be {description} {details}",
                "low": "This might be {description}, but the confidence is low. {details}"
            },
        }

        # Safety net: fill in any collection that is still missing
        self._initialize_default_templates(templates)

        return templates

    def _initialize_default_templates(self, templates: Dict):
        """
        檢查模板字典並填充任何缺失的默認模板。

        在將模板移至專門的模組後，此方法主要作為安全機制，
        確保即使導入失敗或某些模板未在外部定義，系統仍能正常運行。

        Args:
            templates: 要檢查和更新的模板字典
        """
        # 檢查關鍵模板類型是否存在，如果不存在則添加默認值

        # 置信度模板 - 用於控制描述的語氣
        if "confidence_templates" not in templates:
            templates["confidence_templates"] = {
                "high": "{description} {details}",
                "medium": "This appears to be {description} {details}",
                "low": "This might be {description}, but the confidence is low. {details}"
            }

        # 場景細節模板 - 如果未從外部導入
        if "scene_detail_templates" not in templates:
            templates["scene_detail_templates"] = {
                "default": ["A space with various objects."]
            }

        # 物體填充模板 - 用於生成物體描述
        if "object_template_fillers" not in templates:
            templates["object_template_fillers"] = {
                "default": ["various items"]
            }

        # 視角模板 - 雖然我們現在從專門模組導入，但作為備份
        if "viewpoint_templates" not in templates:
            # 使用簡化版的默認視角模板
            templates["viewpoint_templates"] = {
                "eye_level": {
                    "prefix": "From eye level, ",
                    "observation": "the scene is viewed straight on."
                },
                "aerial": {
                    "prefix": "From above, ",
                    "observation": "the scene is viewed from a bird's-eye perspective."
                }
            }

        # 文化模板
        if "cultural_templates" not in templates:
            templates["cultural_templates"] = {
                "asian": {
                    "elements": ["cultural elements"],
                    "description": "The scene has Asian characteristics."
                },
                "european": {
                    "elements": ["architectural features"],
                    "description": "The scene has European characteristics."
                }
            }

        # 照明模板 - 用於描述光照條件
        if "lighting_templates" not in templates:
            templates["lighting_templates"] = {
                "day_clear": "The scene is captured during daylight.",
                "night": "The scene is captured at night.",
                "unknown": "The lighting conditions are not easily determined."
            }

    def _initialize_viewpoint_parameters(self):
        """
        Initialize parameters used for viewpoint detection.
        """
        self.viewpoint_params = {
            # Parameters for detecting aerial views
            "aerial_threshold": 0.7,  # High object density viewed from top
            "aerial_size_variance_threshold": 0.15,  # Low size variance in aerial views

            # Parameters for detecting low angle views
            "low_angle_threshold": 0.3,  # Bottom-heavy object distribution
            "vertical_size_ratio_threshold": 1.8,  # Vertical objects appear taller

            # Parameters for detecting elevated views
            "elevated_threshold": 0.6,  # Objects mostly in middle/bottom
            "elevated_top_threshold": 0.3  # Few objects at top of frame
        }


    def generate_description(self,
                        scene_type: str,
                        detected_objects: List[Dict],
                        confidence: float,
                        lighting_info: Optional[Dict] = None,
                        functional_zones: Optional[Dict] = None) -> str:
        """
        Generate an enhanced natural language description of a scene.

        The description is assembled step by step: base description for the
        scene type, optional secondary description, people count, cultural
        context (non-aerial viewpoints only), scene details, lighting,
        viewpoint prefix/observation and functional zones. Afterwards,
        sentences that mention a smaller people count than the detected total
        are removed (only when more than 5 people were detected), the text is
        passed through ``_format_final_description`` and a line filter drops
        lines that look like leaked code or docstring fragments.

        Args:
            scene_type: Identified scene type; "unknown" triggers the generic description
            detected_objects: List of detected objects; each entry is read for "class_id"
            confidence: Scene classification confidence; values below 0.4
                trigger the generic description
            lighting_info: Optional lighting condition information; only the
                "time_of_day" key is read here
            functional_zones: Optional identified functional zones

        Returns:
            str: Natural language description of the scene
        """
        # Handle unknown scene type or very low confidence
        if scene_type == "unknown" or confidence < 0.4:
            return self._format_final_description(self._generate_generic_description(detected_objects, lighting_info))

        # Detect viewpoint
        viewpoint = self._detect_viewpoint(detected_objects)

        # Process aerial viewpoint scene types: remap scene_type to an aerial variant.
        # Anything that is not commercial or plaza falls back to the intersection variant.
        if viewpoint == "aerial":
            if "intersection" in scene_type or self._is_intersection(detected_objects):
                scene_type = "aerial_view_intersection"
            elif any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
                scene_type = "aerial_view_commercial_area"
            elif any(keyword in scene_type for keyword in ["plaza", "square"]):
                scene_type = "aerial_view_plaza"
            else:
                scene_type = "aerial_view_intersection"

        # Detect cultural context - only for non-aerial viewpoints
        cultural_context = None
        if viewpoint != "aerial":
            cultural_context = self._detect_cultural_context(scene_type, detected_objects)

        # Select appropriate template based on confidence
        # NOTE(review): confidence_level is assigned here but never read again in
        # this method, so the confidence templates are not applied to the output.
        if confidence > 0.75:
            confidence_level = "high"
        elif confidence > 0.5:
            confidence_level = "medium"
        else:
            confidence_level = "low"

        # Get base description for the scene type
        if viewpoint == "aerial":
            # NOTE(review): base_description is not assigned anywhere before this
            # point, so this locals() check is always true and aerial views always
            # get the fixed sentence below.
            if 'base_description' not in locals():
                base_description = "An aerial view showing the layout and movement patterns from above"
        elif scene_type in self.scene_types:
            base_description = self.scene_types[scene_type].get("description", "A scene")
        else:
            base_description = "A scene"

        # Generate detailed scene information
        scene_details = self._generate_scene_details(
            scene_type,
            detected_objects,
            lighting_info,
            viewpoint
        )

        # Start with the base description
        description = base_description

        # If there's a secondary description from the scene type template, append it properly
        if scene_type in self.scene_types and "secondary_description" in self.scene_types[scene_type]:
            secondary_desc = self.scene_types[scene_type]["secondary_description"]
            if secondary_desc:
                description = self._smart_append(description, secondary_desc)

        # Improve description based on people count
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]  # Person class
        if people_objs:
            people_count = len(people_objs)
            if people_count > 5:
                people_phrase = f"numerous people ({people_count})"
            else:
                people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"

            # Add people information to the scene details if not already mentioned
            if "people" not in description.lower() and "pedestrian" not in description.lower():
                description = self._smart_append(description, f"The scene includes {people_phrase}")

        # Apply cultural context if detected (only for non-aerial viewpoints)
        if cultural_context and viewpoint != "aerial":
            cultural_elements = self._generate_cultural_elements(cultural_context)
            if cultural_elements:
                description = self._smart_append(description, cultural_elements)

        # Now append the detailed scene information if available
        if scene_details:
            # Use smart_append to ensure proper formatting between base description and details
            description = self._smart_append(description, scene_details)

        # Include lighting information if available
        lighting_description = ""
        if lighting_info and "time_of_day" in lighting_info:
            lighting_type = lighting_info["time_of_day"]
            if lighting_type in self.templates.get("lighting_templates", {}):
                lighting_description = self.templates["lighting_templates"][lighting_type]

        # Add lighting description if available
        if lighting_description and lighting_description not in description:
            description = self._smart_append(description, lighting_description)

        # Process viewpoint information (eye level is treated as the default and gets no extra text)
        if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
            viewpoint_template = self.templates["viewpoint_templates"][viewpoint]

            # Special handling for viewpoint prefix
            prefix = viewpoint_template.get('prefix', '')
            if prefix and not description.startswith(prefix):
                # Prefix is a phrase like "From above, " that should precede the description
                if description and description[0].isupper():
                    # Maintain the flow by lowercasing the first letter after the prefix
                    description = prefix + description[0].lower() + description[1:]
                else:
                    description = prefix + description

            # Get appropriate scene elements description based on viewpoint
            if viewpoint == "aerial":
                scene_elements = "the crossing patterns and pedestrian movement"
            else:
                scene_elements = "objects and layout"

            # The observation text may contain a {scene_elements} placeholder
            viewpoint_desc = viewpoint_template.get("observation", "").format(
                scene_elements=scene_elements
            )

            # Add viewpoint observation if not already included
            if viewpoint_desc and viewpoint_desc not in description:
                description = self._smart_append(description, viewpoint_desc)

        # Add information about functional zones if available
        if functional_zones and len(functional_zones) > 0:
            zones_desc = self._describe_functional_zones(functional_zones)
            if zones_desc:
                description = self._smart_append(description, zones_desc)

        # Calculate actual people count
        people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])

        # Check for inconsistencies in people count descriptions
        if people_count > 5:
            # Identify fragments that might contain smaller people counts
            small_people_patterns = [
                r"Area with \d+ people\.",
                r"Area with \d+ person\.",
                r"with \d+ people",
                r"with \d+ person"
            ]

            # Check and remove each pattern
            filtered_description = description
            for pattern in small_people_patterns:
                matches = re.findall(pattern, filtered_description)
                for match in matches:
                    # Extract the number from the match
                    number_match = re.search(r'\d+', match)
                    if number_match:
                        try:
                            people_mentioned = int(number_match.group())
                            # If the mentioned count is less than total, remove the entire sentence
                            if people_mentioned < people_count:
                                # Split description into sentences
                                sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
                                # Remove sentences containing the match
                                filtered_sentences = []
                                for sentence in sentences:
                                    if match not in sentence:
                                        filtered_sentences.append(sentence)
                                # Recombine the description
                                filtered_description = " ".join(filtered_sentences)
                        except ValueError:
                            # Failed number conversion, continue processing
                            continue

            # Use the filtered description
            description = filtered_description

        # Final formatting to ensure correct punctuation and capitalization
        description = self._format_final_description(description)

        # Line-based cleanup: drop lines that look like code or docstring fragments
        description_lines = description.split('\n')
        clean_description = []
        skip_block = False  # True while the lines of an unwanted block are being dropped

        for line in description_lines:
            # Check whether this line has to be skipped
            if line.strip().startswith(':param') or line.strip().startswith('"""'):
                continue
            if line.strip().startswith("Exercise") or "class SceneDescriptionSystem" in line:
                skip_block = True
                continue
            if ('def generate_scene_description' in line or
                'def enhance_scene_descriptions' in line or
                'def __init__' in line):
                skip_block = True
                continue
            if line.strip().startswith('#TEST'):
                skip_block = True
                continue

            # A blank line ends skip mode (the blank line itself is kept below)
            if skip_block and line.strip() == "":
                skip_block = False

            # Keep the line when it is not part of a skipped block
            if not skip_block:
                clean_description.append(line)

        # If filtering removed everything, fall back to the unfiltered description
        if not clean_description:
            return description
        else:
            return '\n'.join(clean_description)

    def _smart_append(self, current_text: str, new_fragment: str) -> str:
        """
        Intelligently append a new text fragment to the current text,
        handling punctuation and capitalization correctly.

        Args:
            current_text: The existing text to append to
            new_fragment: The new text fragment to append

        Returns:
            str: The combined text with proper formatting
        """
        # Handle empty cases
        if not new_fragment:
            return current_text

        if not current_text:
            # Ensure first character is uppercase for the first fragment
            return new_fragment[0].upper() + new_fragment[1:] if new_fragment else ""

        # Clean up existing text
        current_text = current_text.rstrip()

        # Check for ending punctuation
        ends_with_sentence = current_text.endswith(('.', '!', '?'))
        ends_with_comma = current_text.endswith(',')

        # Specifically handle the "A xxx A yyy" pattern that's causing issues
        if (current_text.startswith("A ") or current_text.startswith("An ")) and \
        (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
            return current_text + ". " + new_fragment

        # Decide how to join the texts
        if ends_with_sentence:
            # After a sentence, start with uppercase and add proper spacing
            joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
        elif ends_with_comma:
            # After a comma, maintain flow with lowercase unless it's a proper noun or special case
            if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
                joined_text = current_text + " " + new_fragment
            else:
                joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
        elif "scene is" in new_fragment.lower() or "scene includes" in new_fragment.lower():
            # When adding a new sentence about the scene, use a period
            joined_text = current_text + ". " + new_fragment
        else:
            # For other cases, decide based on the content
            if self._is_related_phrases(current_text, new_fragment):
                if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
                    joined_text = current_text + ", " + new_fragment
                else:
                    joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
            else:
                # Use period for unrelated phrases
                joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:])

        return joined_text

    def _is_related_phrases(self, text1: str, text2: str) -> bool:
        """
        Determine if two phrases are related and should be connected with a comma
        rather than separated with a period.

        Args:
            text1: The first text fragment
            text2: The second text fragment to be appended

        Returns:
            bool: Whether the phrases appear to be related
        """
        # Check if either phrase starts with "A" or "An" - these are likely separate descriptions
        if (text1.startswith("A ") or text1.startswith("An ")) and \
        (text2.startswith("A ") or text2.startswith("An ")):
            return False  # These are separate descriptions, not related phrases

        # Check if the second phrase starts with a connecting word
        connecting_words = ["which", "where", "who", "whom", "whose", "with", "without",
                        "this", "these", "that", "those", "and", "or", "but"]

        first_word = text2.split()[0].lower() if text2 else ""
        if first_word in connecting_words:
            return True

        # Check if the first phrase ends with something that suggests continuity
        ending_patterns = ["such as", "including", "like", "especially", "particularly",
                        "for example", "for instance", "namely", "specifically"]

        for pattern in ending_patterns:
            if text1.lower().endswith(pattern):
                return True

        # Check if both phrases are about the scene
        if "scene" in text1.lower() and "scene" in text2.lower():
            return False  # Separate statements about the scene should be separate sentences

        return False

    def _format_final_description(self, text: str) -> str:
        """
        Format the final description text to ensure correct punctuation,
        capitalization, and spacing.

        Args:
            text: The text to format

        Returns:
            str: The properly formatted text
        """
        import re

        if not text:
            return ""

        # 1. 特別處理連續以"A"開頭的片段 (這是一個常見問題)
        text = re.sub(r'(A\s[^.!?]+?)\s+(A\s)', r'\1. \2', text, flags=re.IGNORECASE)
        text = re.sub(r'(An\s[^.!?]+?)\s+(An?\s)', r'\1. \2', text, flags=re.IGNORECASE)

        # 2. 確保第一個字母大寫
        text = text[0].upper() + text[1:] if text else ""

        # 3. 修正詞之間的空格問題
        text = re.sub(r'\s{2,}', ' ', text)  # 多個空格改為一個
        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # 小寫後大寫間加空格

        # 4. 修正詞連接問題
        text = re.sub(r'([a-zA-Z])and', r'\1 and', text)  # "xxx"和"and"間加空格
        text = re.sub(r'([a-zA-Z])with', r'\1 with', text)  # "xxx"和"with"間加空格
        text = re.sub(r'plants(and|with|or)', r'plants \1', text)  # 修正"plantsand"這類問題

        # 5. 修正標點符號後的大小寫問題
        text = re.sub(r'\.(\s+)([a-z])', lambda m: f'.{m.group(1)}{m.group(2).upper()}', text)  # 句號後大寫

        # 6. 修正逗號後接大寫單詞的問題
        def fix_capitalization_after_comma(match):
            word = match.group(2)
            # 例外情況：保留專有名詞、人稱代詞等的大寫
            if word in ["I", "I'm", "I've", "I'd", "I'll"]:
                return match.group(0)  # 保持原樣

            # 保留月份、星期、地名等專有名詞的大寫
            proper_nouns = ["January", "February", "March", "April", "May", "June", "July",
                            "August", "September", "October", "November", "December",
                            "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
            if word in proper_nouns:
                return match.group(0)  # 保持原樣

            # 其他情況：將首字母改為小寫
            return match.group(1) + word[0].lower() + word[1:]

        # 匹配逗號後接空格再接大寫單詞的模式
        text = re.sub(r'(,\s+)([A-Z][a-zA-Z]*)', fix_capitalization_after_comma, text)


        common_phrases = [
            (r'Social or seating area', r'social or seating area'),
            (r'Sleeping area', r'sleeping area'),
            (r'Dining area', r'dining area'),
            (r'Living space', r'living space')
        ]

        for phrase, replacement in common_phrases:
            # 只修改句中的術語，保留句首的大寫
            text = re.sub(r'(?<=[.!?]\s)' + phrase, replacement, text)
            # 修改句中的術語，但保留句首的大寫
            text = re.sub(r'(?<=,\s)' + phrase, replacement, text)

        # 7. 確保標點符號後有空格
        text = re.sub(r'\s+([.,;:!?])', r'\1', text)  # 標點符號前不要空格
        text = re.sub(r'([.,;:!?])([a-zA-Z0-9])', r'\1 \2', text)  # 標點符號後要有空格

        # 8. 修正重複標點符號
        text = re.sub(r'\.{2,}', '.', text)  # 多個句號變一個
        text = re.sub(r',{2,}', ',', text)  # 多個逗號變一個

        # 9. 確保文本以標點結束
        if text and not text[-1] in '.!?':
            text += '.'

        return text

    def _is_intersection(self, detected_objects: List[Dict]) -> bool:
        """
        通過分析物體分佈來判斷場景是否為十字路口
        """
        # 檢查行人分佈模式
        pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]

        if len(pedestrians) >= 8:  # 需要足夠的行人來形成十字路口
            # 抓取行人位置
            positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]

            # 分析 x 和 y 坐標分佈
            x_coords = [pos[0] for pos in positions]
            y_coords = [pos[1] for pos in positions]

            # 計算 x 和 y 坐標的變異數
            x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
            y_variance = np.var(y_coords) if len(y_coords) > 1 else 0

            # 計算範圍
            x_range = max(x_coords) - min(x_coords)
            y_range = max(y_coords) - min(y_coords)

            # 如果 x 和 y 方向都有較大範圍且範圍相似，那就有可能是十字路口
            if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
                return True

        return False

    def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
        """
        Generate a generic description when scene type is unknown or confidence is very low.

        Args:
            detected_objects: List of detected objects
            lighting_info: Optional lighting condition information

        Returns:
            str: Generic description based on detected objects
        """
        # Count object occurrences
        obj_counts = {}
        for obj in detected_objects:
            class_name = obj["class_name"]
            if class_name not in obj_counts:
                obj_counts[class_name] = 0
            obj_counts[class_name] += 1

        # Get top objects by count
        top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5]

        if not top_objects:
            base_desc = "No clearly identifiable objects are visible in this scene."
        else:
            # Format object list
            objects_text = []
            for name, count in top_objects:
                if count > 1:
                    objects_text.append(f"{count} {name}s")
                else:
                    objects_text.append(name)

            if len(objects_text) == 1:
                objects_list = objects_text[0]
            elif len(objects_text) == 2:
                objects_list = f"{objects_text[0]} and {objects_text[1]}"
            else:
                objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}"

            base_desc = f"This scene contains {objects_list}."

        # Add lighting information if available
        if lighting_info and "time_of_day" in lighting_info:
            lighting_type = lighting_info["time_of_day"]
            if lighting_type in self.templates.get("lighting_templates", {}):
                lighting_desc = self.templates["lighting_templates"][lighting_type]
                base_desc += f" {lighting_desc}"

        return base_desc

    def _generate_scene_details(self,
                              scene_type: str,
                              detected_objects: List[Dict],
                              lighting_info: Optional[Dict] = None,
                              viewpoint: str = "eye_level") -> str:
        """
        Generate detailed description based on scene type and detected objects.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects
            lighting_info: Optional lighting condition information
            viewpoint: Detected viewpoint (aerial, eye_level, etc.)

        Returns:
            str: Detailed scene description
        """
        # Get scene-specific templates
        scene_details = ""
        scene_templates = self.templates.get("scene_detail_templates", {})

        # Handle specific scene types
        if scene_type in scene_templates:
            # Select a template appropriate for the viewpoint if available
            viewpoint_key = f"{scene_type}_{viewpoint}"

            if viewpoint_key in scene_templates:
                # We have a viewpoint-specific template
                templates_list = scene_templates[viewpoint_key]
            else:
                # Fall back to general templates for this scene type
                templates_list = scene_templates[scene_type]

            # Select a random template from the list
            if templates_list:
                detail_template = random.choice(templates_list)

                # Fill the template with object information
                scene_details = self._fill_detail_template(
                    detail_template,
                    detected_objects,
                    scene_type
                )
        else:
            # Use default templates if specific ones aren't available
            if "default" in scene_templates:
                detail_template = random.choice(scene_templates["default"])
                scene_details = self._fill_detail_template(
                    detail_template,
                    detected_objects,
                    "default"
                )
            else:
                # Fall back to basic description if no templates are available
                scene_details = self._generate_basic_details(scene_type, detected_objects)

        return scene_details

    def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str) -> str:
        """
        Fill a template with specific details based on detected objects.

        Args:
            template: Template string with placeholders
            detected_objects: List of detected objects
            scene_type: Identified scene type

        Returns:
            str: Filled template
        """
        # Find placeholders in the template using simple {placeholder} syntax
        import re
        placeholders = re.findall(r'\{([^}]+)\}', template)

        filled_template = template

        # Get object template fillers
        fillers = self.templates.get("object_template_fillers", {})

        # 為所有可能的變數設置默認值
        default_replacements = {
            # 室內相關
            "furniture": "various furniture pieces",
            "seating": "comfortable seating",
            "electronics": "entertainment devices",
            "bed_type": "a bed",
            "bed_location": "room",
            "bed_description": "sleeping arrangements",
            "extras": "personal items",
            "table_setup": "a dining table and chairs",
            "table_description": "a dining surface",
            "dining_items": "dining furniture and tableware",
            "appliances": "kitchen appliances",
            "kitchen_items": "cooking utensils and dishware",
            "cooking_equipment": "cooking equipment",
            "office_equipment": "work-related furniture and devices",
            "desk_setup": "a desk and chair",
            "computer_equipment": "electronic devices",

            # 室外/城市相關
            "traffic_description": "vehicles and pedestrians",
            "people_and_vehicles": "people and various vehicles",
            "street_elements": "urban infrastructure",
            "park_features": "benches and greenery",
            "outdoor_elements": "natural features",
            "park_description": "outdoor amenities",
            "store_elements": "merchandise displays",
            "shopping_activity": "customers browse and shop",
            "store_items": "products for sale",

            # 高級餐廳相關
            "design_elements": "elegant decor",
            "lighting": "stylish lighting fixtures",

            # 亞洲商業街相關
            "storefront_features": "compact shops",
            "pedestrian_flow": "people walking",
            "asian_elements": "distinctive cultural elements",
            "cultural_elements": "traditional design features",
            "signage": "colorful signs",
            "street_activities": "busy urban activity",

            # 金融區相關
            "buildings": "tall buildings",
            "traffic_elements": "vehicles",
            "skyscrapers": "high-rise buildings",
            "road_features": "wide streets",
            "architectural_elements": "modern architecture",
            "city_landmarks": "prominent structures",

            # 十字路口相關
            "crossing_pattern": "marked pedestrian crossings",
            "pedestrian_behavior": "careful walking",
            "pedestrian_density": "groups of pedestrians",
            "traffic_pattern": "regulated traffic flow",

            # 交通樞紐相關
            "transit_vehicles": "public transportation vehicles",
            "passenger_activity": "commuter movement",
            "transportation_modes": "various transit options",
            "passenger_needs": "waiting areas",
            "transit_infrastructure": "transit facilities",
            "passenger_movement": "commuter flow",

            # 購物區相關
            "retail_elements": "shops and displays",
            "store_types": "various retail establishments",
            "walkway_features": "pedestrian pathways",
            "commercial_signage": "store signs",
            "consumer_behavior": "shopping activities",

            # 空中視角相關
            "commercial_layout": "organized retail areas",
            "pedestrian_pattern": "people movement patterns",
            "gathering_features": "public gathering spaces",
            "movement_pattern": "crowd flow patterns",
            "urban_elements": "city infrastructure",
            "public_activity": "social interaction",

            # 文化特定元素
            "stall_elements": "vendor booths",
            "lighting_features": "decorative lights",
            "food_elements": "food offerings",
            "vendor_stalls": "market stalls",
            "nighttime_activity": "evening commerce",
            "cultural_lighting": "traditional lighting",
            "night_market_sounds": "lively market sounds",
            "evening_crowd_behavior": "nighttime social activity",
            "architectural_elements": "cultural buildings",
            "religious_structures": "sacred buildings",
            "decorative_features": "ornamental designs",
            "cultural_practices": "traditional activities",
            "temple_architecture": "religious structures",
            "sensory_elements": "atmospheric elements",
            "visitor_activities": "cultural experiences",
            "ritual_activities": "ceremonial practices",
            "cultural_symbols": "meaningful symbols",
            "architectural_style": "historical buildings",
            "historic_elements": "traditional architecture",
            "urban_design": "city planning elements",
            "social_behaviors": "public interactions",
            "european_features": "European architectural details",
            "tourist_activities": "visitor activities",
            "local_customs": "regional practices",

            # 時間特定元素
            "lighting_effects": "artificial lighting",
            "shadow_patterns": "light and shadow",
            "urban_features": "city elements",
            "illuminated_elements": "lit structures",
            "evening_activities": "nighttime activities",
            "light_sources": "lighting points",
            "lit_areas": "illuminated spaces",
            "shadowed_zones": "darker areas",
            "illuminated_signage": "bright signs",
            "colorful_lighting": "multicolored lights",
            "neon_elements": "neon signs",
            "night_crowd_behavior": "evening social patterns",
            "light_displays": "lighting installations",
            "building_features": "architectural elements",
            "nightlife_activities": "evening entertainment",
            "lighting_modifier": "bright",

            # 混合環境元素
            "transitional_elements": "connecting features",
            "indoor_features": "interior elements",
            "outdoor_setting": "exterior spaces",
            "interior_amenities": "inside comforts",
            "exterior_features": "outside elements",
            "inside_elements": "interior design",
            "outside_spaces": "outdoor areas",
            "dual_environment_benefits": "combined settings",
            "passenger_activities": "waiting behaviors",
            "transportation_types": "transit vehicles",
            "sheltered_elements": "covered areas",
            "exposed_areas": "open sections",
            "waiting_behaviors": "passenger activities",
            "indoor_facilities": "inside services",
            "platform_features": "transit platform elements",
            "transit_routines": "transportation procedures",

            # 專門場所元素
            "seating_arrangement": "spectator seating",
            "playing_surface": "athletic field",
            "sporting_activities": "sports events",
            "spectator_facilities": "viewer accommodations",
            "competition_space": "sports arena",
            "sports_events": "athletic competitions",
            "viewing_areas": "audience sections",
            "field_elements": "field markings and equipment",
            "game_activities": "competitive play",
            "construction_equipment": "building machinery",
            "building_materials": "construction supplies",
            "construction_activities": "building work",
            "work_elements": "construction tools",
            "structural_components": "building structures",
            "site_equipment": "construction gear",
            "raw_materials": "building supplies",
            "construction_process": "building phases",
            "medical_elements": "healthcare equipment",
            "clinical_activities": "medical procedures",
            "facility_design": "healthcare layout",
            "healthcare_features": "medical facilities",
            "patient_interactions": "care activities",
            "equipment_types": "medical devices",
            "care_procedures": "health services",
            "treatment_spaces": "clinical areas",
            "educational_furniture": "learning furniture",
            "learning_activities": "educational practices",
            "instructional_design": "teaching layout",
            "classroom_elements": "school equipment",
            "teaching_methods": "educational approaches",
            "student_engagement": "learning participation",
            "learning_spaces": "educational areas",
            "educational_tools": "teaching resources",
            "knowledge_transfer": "learning exchanges"
        }

        # For each placeholder, try to fill with appropriate content
        for placeholder in placeholders:
            if placeholder in fillers:
                # Get random filler for this placeholder
                options = fillers[placeholder]
                if options:
                    # Select 1-3 items from the options list
                    num_items = min(len(options), random.randint(1, 3))
                    selected_items = random.sample(options, num_items)

                    # Create a formatted list
                    if len(selected_items) == 1:
                        replacement = selected_items[0]
                    elif len(selected_items) == 2:
                        replacement = f"{selected_items[0]} and {selected_items[1]}"
                    else:
                        replacement = ", ".join(selected_items[:-1]) + f", and {selected_items[-1]}"

                    # Replace the placeholder
                    filled_template = filled_template.replace(f"{{{placeholder}}}", replacement)
            else:
                # Try to fill with scene-specific logic
                replacement = self._generate_placeholder_content(placeholder, detected_objects, scene_type)
                if replacement:
                    filled_template = filled_template.replace(f"{{{placeholder}}}", replacement)
                elif placeholder in default_replacements:
                    # Use default replacement if available
                    filled_template = filled_template.replace(f"{{{placeholder}}}", default_replacements[placeholder])
                else:
                    # Last resort default
                    filled_template = filled_template.replace(f"{{{placeholder}}}", "various items")

        return filled_template

    def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str:
        """
        Generate content for a template placeholder based on scene-specific logic.

        Args:
            placeholder: Template placeholder
            detected_objects: List of detected objects
            scene_type: Identified scene type

        Returns:
            str: Content for the placeholder
        """
        # Handle different types of placeholders with custom logic
        if placeholder == "furniture":
            # Extract furniture items
            furniture_ids = [56, 57, 58, 59, 60, 61]  # Example furniture IDs
            furniture_objects = [obj for obj in detected_objects if obj["class_id"] in furniture_ids]

            if furniture_objects:
                furniture_names = [obj["class_name"] for obj in furniture_objects[:3]]
                return ", ".join(set(furniture_names))
            return "various furniture items"

        elif placeholder == "electronics":
            # Extract electronic items
            electronics_ids = [62, 63, 64, 65, 66, 67, 68, 69, 70]  # Example electronics IDs
            electronics_objects = [obj for obj in detected_objects if obj["class_id"] in electronics_ids]

            if electronics_objects:
                electronics_names = [obj["class_name"] for obj in electronics_objects[:3]]
                return ", ".join(set(electronics_names))
            return "electronic devices"

        elif placeholder == "people_count":
            # Count people
            people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])

            if people_count == 0:
                return "no people"
            elif people_count == 1:
                return "one person"
            elif people_count < 5:
                return f"{people_count} people"
            else:
                return "several people"

        elif placeholder == "seating":
            # Extract seating items
            seating_ids = [56, 57]  # chair, sofa
            seating_objects = [obj for obj in detected_objects if obj["class_id"] in seating_ids]

            if seating_objects:
                seating_names = [obj["class_name"] for obj in seating_objects[:2]]
                return ", ".join(set(seating_names))
            return "seating arrangements"

        # Default case - empty string
        return ""

    def _generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str:
        """
        Generate basic details when templates aren't available.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects

        Returns:
            str: Basic scene details
        """
        # Handle specific scene types with custom logic
        if scene_type == "living_room":
            tv_objs = [obj for obj in detected_objects if obj["class_id"] == 62]  # TV
            sofa_objs = [obj for obj in detected_objects if obj["class_id"] == 57]  # Sofa

            if tv_objs and sofa_objs:
                tv_region = tv_objs[0]["region"]
                sofa_region = sofa_objs[0]["region"]

                arrangement = f"The TV is in the {tv_region.replace('_', ' ')} of the image, "
                arrangement += f"while the sofa is in the {sofa_region.replace('_', ' ')}. "

                return f"{arrangement}This appears to be a space designed for relaxation and entertainment."

        elif scene_type == "bedroom":
            bed_objs = [obj for obj in detected_objects if obj["class_id"] == 59]  # Bed

            if bed_objs:
                bed_region = bed_objs[0]["region"]
                extra_items = []

                for obj in detected_objects:
                    if obj["class_id"] == 74:  # Clock
                        extra_items.append("clock")
                    elif obj["class_id"] == 73:  # Book
                        extra_items.append("book")

                extras = ""
                if extra_items:
                    extras = f" There is also a {' and a '.join(extra_items)} visible."

                return f"The bed is located in the {bed_region.replace('_', ' ')} of the image.{extras}"

        elif scene_type in ["dining_area", "kitchen"]:
            # Count food and dining-related items
            food_items = []
            for obj in detected_objects:
                if obj["class_id"] in [39, 41, 42, 43, 44, 45]:  # Kitchen items
                    food_items.append(obj["class_name"])

            food_str = ""
            if food_items:
                unique_items = list(set(food_items))
                if len(unique_items) <= 3:
                    food_str = f" with {', '.join(unique_items)}"
                else:
                    food_str = f" with {', '.join(unique_items[:3])} and other items"

            return f"{food_str}."

        elif scene_type == "city_street":
            # Count people and vehicles
            people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
            vehicle_count = len([obj for obj in detected_objects
                               if obj["class_id"] in [1, 2, 3, 5, 7]])  # Bicycle, car, motorbike, bus, truck

            traffic_desc = ""
            if people_count > 0 and vehicle_count > 0:
                traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'} and "
                traffic_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
            elif people_count > 0:
                traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'}"
            elif vehicle_count > 0:
                traffic_desc = f" with {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"

            return f"{traffic_desc}."

        # Handle more specialized scenes
        elif scene_type == "asian_commercial_street":
            # Look for key urban elements
            people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
            vehicle_count = len([obj for obj in detected_objects if obj["class_id"] in [1, 2, 3]])

            # Analyze pedestrian distribution
            people_positions = []
            for obj in detected_objects:
                if obj["class_id"] == 0:  # Person
                    people_positions.append(obj["normalized_center"])

            # Check if people are distributed along a line (indicating a walking path)
            structured_path = False
            if len(people_positions) >= 3:
                # Simplified check - see if y-coordinates are similar for multiple people
                y_coords = [pos[1] for pos in people_positions]
                y_mean = sum(y_coords) / len(y_coords)
                y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
                if y_variance < 0.05:  # Low variance indicates linear arrangement
                    structured_path = True

            street_desc = "A commercial street with "
            if people_count > 0:
                street_desc += f"{people_count} {'pedestrians' if people_count > 1 else 'pedestrian'}"
                if vehicle_count > 0:
                    street_desc += f" and {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
            elif vehicle_count > 0:
                street_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
            else:
                street_desc += "various commercial elements"

            if structured_path:
                street_desc += ". The pedestrians appear to be following a defined walking path"

            # Add cultural elements
            street_desc += ". The signage and architectural elements suggest an Asian urban setting."

            return street_desc

        # Default general description
        return "The scene contains various elements characteristic of this environment."

    def _detect_viewpoint(self, detected_objects: List[Dict]) -> str:
        """
        改進視角檢測，特別加強對空中俯視視角的識別。

        Args:
            detected_objects: 檢測到的物體列表

        Returns:
            str: 檢測到的視角類型
        """
        if not detected_objects:
            return "eye_level"  # default

        # 提取物體位置和大小
        top_region_count = 0
        bottom_region_count = 0
        total_objects = len(detected_objects)

        # 追蹤大小分布以檢測空中視角
        sizes = []

        # 垂直大小比例用於低角度檢測
        height_width_ratios = []

        # 用於檢測規則圖案的變數
        people_positions = []
        crosswalk_pattern_detected = False

        for obj in detected_objects:
            # 計算頂部/底部區域中的物體
            region = obj["region"]
            if "top" in region:
                top_region_count += 1
            elif "bottom" in region:
                bottom_region_count += 1

            # 計算標準化大小（面積）
            if "normalized_area" in obj:
                sizes.append(obj["normalized_area"])

            # 計算高度/寬度比例
            if "normalized_size" in obj:
                width, height = obj["normalized_size"]
                if width > 0:
                    height_width_ratios.append(height / width)

            # 收集人的位置用於圖案檢測
            if obj["class_id"] == 0:  # 人
                if "normalized_center" in obj:
                    people_positions.append(obj["normalized_center"])

        # 專門為斑馬線十字路口添加檢測邏輯
        # 檢查是否有明顯的垂直和水平行人分布
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]  # 人

        if len(people_objs) >= 8:  # 需要足夠多的人才能形成十字路口模式
            # 檢查是否有斑馬線模式 - 新增功能
            if len(people_positions) >= 4:
                # 對位置進行聚類分析，尋找線性分布
                x_coords = [pos[0] for pos in people_positions]
                y_coords = [pos[1] for pos in people_positions]

                # 計算 x 和 y 坐標的變異數和範圍
                x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
                y_variance = np.var(y_coords) if len(y_coords) > 1 else 0

                x_range = max(x_coords) - min(x_coords)
                y_range = max(y_coords) - min(y_coords)

                # 嘗試檢測十字形分布
                # 如果 x 和 y 方向都有較大範圍，且範圍相似，可能是十字路口
                if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:

                    # 計算到中心點的距離
                    center_x = np.mean(x_coords)
                    center_y = np.mean(y_coords)

                    # 將點映射到十字架的軸上（水平和垂直）
                    x_axis_distance = [abs(x - center_x) for x in x_coords]
                    y_axis_distance = [abs(y - center_y) for y in y_coords]

                    # 點應該接近軸線（水平或垂直）
                    # 對於每個點，檢查它是否接近水平或垂直軸線
                    close_to_axis_count = 0
                    for i in range(len(x_coords)):
                        if x_axis_distance[i] < 0.1 or y_axis_distance[i] < 0.1:
                            close_to_axis_count += 1

                    # 如果足夠多的點接近軸線，認為是十字路口
                    if close_to_axis_count >= len(x_coords) * 0.6:
                        crosswalk_pattern_detected = True

                # 如果沒有檢測到十字形，嘗試檢測線性聚類分布
                if not crosswalk_pattern_detected:
                    # 檢查 x 和 y 方向的聚類
                    x_clusters = self._detect_linear_clusters(x_coords)
                    y_clusters = self._detect_linear_clusters(y_coords)

                    # 如果在 x 和 y 方向上都有多個聚類，可能是交叉的斑馬線
                    if len(x_clusters) >= 2 and len(y_clusters) >= 2:
                        crosswalk_pattern_detected = True

        # 檢測斑馬線模式 - 優先判斷
        if crosswalk_pattern_detected:
            return "aerial"

        # 檢測行人分布情況
        if len(people_objs) >= 10:
            people_region_counts = {}
            for obj in people_objs:
                region = obj["region"]
                if region not in people_region_counts:
                    people_region_counts[region] = 0
                people_region_counts[region] += 1

            # 計算不同區域中的行人數量
            region_count = len([r for r, c in people_region_counts.items() if c >= 2])

            # 如果行人分布在多個區域中，可能是空中視角
            if region_count >= 4:
                # 檢查行人分布的模式
                # 特別是檢查不同區域中行人數量的差異
                region_counts = list(people_region_counts.values())
                region_counts_variance = np.var(region_counts) if len(region_counts) > 1 else 0
                region_counts_mean = np.mean(region_counts) if region_counts else 0

                # 如果行人分布較為均勻（變異係數小），可能是空中視角
                if region_counts_mean > 0:
                    variation_coefficient = region_counts_variance / region_counts_mean
                    if variation_coefficient < 0.5:
                        return "aerial"

        # 計算指標
        top_ratio = top_region_count / total_objects if total_objects > 0 else 0
        bottom_ratio = bottom_region_count / total_objects if total_objects > 0 else 0

        # 大小變異數（標準化）
        size_variance = 0
        if sizes:
            mean_size = sum(sizes) / len(sizes)
            size_variance = sum((s - mean_size) ** 2 for s in sizes) / len(sizes)
            size_variance = size_variance / (mean_size ** 2)  # 標準化

        # 平均高度/寬度比例
        avg_height_width_ratio = sum(height_width_ratios) / len(height_width_ratios) if height_width_ratios else 1.0

        # 空中視角：低大小差異，物體均勻分布，底部很少或沒有物體
        if (size_variance < self.viewpoint_params["aerial_size_variance_threshold"] and
            bottom_ratio < 0.3 and top_ratio > self.viewpoint_params["aerial_threshold"]):
            return "aerial"

        # 低角度視角：物體傾向於比寬高，頂部較多物體
        elif (avg_height_width_ratio > self.viewpoint_params["vertical_size_ratio_threshold"] and
            top_ratio > self.viewpoint_params["low_angle_threshold"]):
            return "low_angle"

        # 高視角：底部較多物體，頂部較少
        elif (bottom_ratio > self.viewpoint_params["elevated_threshold"] and
            top_ratio < self.viewpoint_params["elevated_top_threshold"]):
            return "elevated"

        # 默認：平視角
        return "eye_level"

    def _detect_linear_clusters(self, coords, threshold=0.05):
        """
        檢測坐標中的線性聚類

        Args:
            coords: 一維坐標列表
            threshold: 聚類閾值

        Returns:
            list: 聚類列表
        """
        if not coords:
            return []

        # 排序坐標
        sorted_coords = sorted(coords)

        clusters = []
        current_cluster = [sorted_coords[0]]

        for i in range(1, len(sorted_coords)):
            # 如果當前坐標與前一個接近，添加到當前聚類
            if sorted_coords[i] - sorted_coords[i-1] < threshold:
                current_cluster.append(sorted_coords[i])
            else:
                # 否則開始新的聚類
                if len(current_cluster) >= 2:  # 至少需要2個點形成聚類
                    clusters.append(current_cluster)
                current_cluster = [sorted_coords[i]]

        # 添加最後一個cluster
        if len(current_cluster) >= 2:
            clusters.append(current_cluster)

        return clusters

    def _detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
        """
        Detect the likely cultural context of the scene.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects

        Returns:
            Optional[str]: Detected cultural context (asian, european, etc.) or None
        """
        # Scene types with explicit cultural contexts
        cultural_scene_mapping = {
            "asian_commercial_street": "asian",
            "asian_night_market": "asian",
            "asian_temple_area": "asian",
            "european_plaza": "european"
        }

        # Check if scene type directly indicates cultural context
        if scene_type in cultural_scene_mapping:
            return cultural_scene_mapping[scene_type]

        # No specific cultural context detected
        return None

    def _generate_cultural_elements(self, cultural_context: str) -> str:
        """
        Generate description of cultural elements for the detected context.

        Args:
            cultural_context: Detected cultural context

        Returns:
            str: Description of cultural elements
        """
        # Get template for this cultural context
        cultural_templates = self.templates.get("cultural_templates", {})

        if cultural_context in cultural_templates:
            template = cultural_templates[cultural_context]
            elements = template.get("elements", [])

            if elements:
                # Select 1-2 random elements
                num_elements = min(len(elements), random.randint(1, 2))
                selected_elements = random.sample(elements, num_elements)

                # Format elements list
                elements_text = " and ".join(selected_elements) if num_elements == 2 else selected_elements[0]

                # Fill template
                return template.get("description", "").format(elements=elements_text)

        return ""

    def _optimize_object_description(self, description: str) -> str:
        """
        優化物品描述，避免重複列舉相同物品
        """
        import re

        # 處理床鋪重複描述
        if "bed in the room" in description:
            description = description.replace("a bed in the room", "a bed")

        # 處理重複的物品列表
        # 尋找格式如 "item, item, item" 的模式
        object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)

        for obj_list in object_lists:
            # 計算每個物品出現次數
            items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
            item_counts = {}

            for item in items:
                item = item.strip()
                if item and item not in ["and", "with"]:
                    if item not in item_counts:
                        item_counts[item] = 0
                    item_counts[item] += 1

            # 生成優化後的物品列表
            if item_counts:
                new_items = []
                for item, count in item_counts.items():
                    if count > 1:
                        new_items.append(f"{count} {item}s")
                    else:
                        new_items.append(item)

                # 格式化新列表
                if len(new_items) == 1:
                    new_list = new_items[0]
                elif len(new_items) == 2:
                    new_list = f"{new_items[0]} and {new_items[1]}"
                else:
                    new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"

                # 替換原始列表
                description = description.replace(obj_list, new_list)

        return description

    def _describe_functional_zones(self, functional_zones: Dict) -> str:
        """
        生成場景功能區域的描述，優化處理行人區域、人數統計和物品重複問題。

        Args:
            functional_zones: 識別出的功能區域字典

        Returns:
            str: 功能區域描述
        """
        if not functional_zones:
            return ""

        # 計算場景中的總人數
        total_people_count = 0
        people_by_zone = {}

        # 計算每個區域的人數並累計總人數
        for zone_name, zone_info in functional_zones.items():
            if "objects" in zone_info:
                zone_people_count = zone_info["objects"].count("person")
                people_by_zone[zone_name] = zone_people_count
                total_people_count += zone_people_count

        # 分類區域為行人區域和其他區域
        pedestrian_zones = []
        other_zones = []

        for zone_name, zone_info in functional_zones.items():
            # 檢查是否是行人相關區域
            if any(keyword in zone_name.lower() for keyword in ["pedestrian", "crossing", "people"]):
                pedestrian_zones.append((zone_name, zone_info))
            else:
                other_zones.append((zone_name, zone_info))

        # 獲取最重要的行人區域和其他區域
        main_pedestrian_zones = sorted(pedestrian_zones,
                                    key=lambda z: people_by_zone.get(z[0], 0),
                                    reverse=True)[:1]  # 最多1個主要行人區域

        top_other_zones = sorted(other_zones,
                            key=lambda z: len(z[1].get("objects", [])),
                            reverse=True)[:2]  # 最多2個其他區域

        # 合併區域
        top_zones = main_pedestrian_zones + top_other_zones

        if not top_zones:
            return ""

        # 生成匯總描述
        summary = ""
        max_mentioned_people = 0  # 跟踪已經提到的最大人數

        # 如果總人數顯著且還沒在主描述中提到，添加總人數描述
        if total_people_count > 5:
            summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
            max_mentioned_people = total_people_count  # 更新已提到的最大人數

        # 處理每個區域的描述，確保人數信息的一致性
        processed_zones = []

        for zone_name, zone_info in top_zones:
            zone_desc = zone_info.get("description", "a functional zone")
            zone_people_count = people_by_zone.get(zone_name, 0)

            # 檢查描述中是否包含人數信息
            contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())

            # 如果描述包含人數信息，且人數較小（小於已提到的最大人數），則修改描述
            if contains_people_info and zone_people_count < max_mentioned_people:
                parts = zone_desc.split("with")
                if len(parts) > 1:
                    # 移除人數部分
                    zone_desc = parts[0].strip() + " area"

            processed_zones.append((zone_name, {"description": zone_desc}))

        # 根據處理後的區域數量生成最終描述
        final_desc = ""

        if len(processed_zones) == 1:
            _, zone_info = processed_zones[0]
            zone_desc = zone_info["description"]
            final_desc = summary + f"The scene includes {zone_desc}."
        elif len(processed_zones) == 2:
            _, zone1_info = processed_zones[0]
            _, zone2_info = processed_zones[1]
            zone1_desc = zone1_info["description"]
            zone2_desc = zone2_info["description"]
            final_desc = summary + f"The scene is divided into two main areas: {zone1_desc} and {zone2_desc}."
        else:
            zones_desc = ["The scene contains multiple functional areas including"]
            zone_descriptions = [z[1]["description"] for z in processed_zones]

            # 格式化最終的多區域描述
            if len(zone_descriptions) == 3:
                formatted_desc = f"{zone_descriptions[0]}, {zone_descriptions[1]}, and {zone_descriptions[2]}"
            else:
                formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}"

            final_desc = summary + f"{zones_desc[0]} {formatted_desc}."

        return self._optimize_object_description(final_desc)

In [None]:
# %%writefile lighting_analyzer.py
import numpy as np
import cv2
from typing import Dict, Any, Optional

class LightingAnalyzer:
    """
    分析圖像的光照條件，提供增強的室內or室外判斷和光照類型分類，並專注於光照分析。
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        初始化光照分析器。

        Args:
            config: 可選的配置字典，用於自定義分析參數
        """
        self.config = config or self._get_default_config()

    def analyze(self, image):
        """
        分析圖像的光照條件。

        主要分析入口點，計算基本特徵，判斷室內/室外，確定光照條件。

        Args:
            image: 輸入圖像 (numpy array 或 PIL Image)

        Returns:
            Dict: 包含光照分析結果的字典
        """
        try:
            # 轉換圖像格式
            if not isinstance(image, np.ndarray):
                image_np = np.array(image)
            else:
                image_np = image.copy()

            # 確保 RGB 格式
            if image_np.shape[2] == 3 and isinstance(image_np, np.ndarray):
                image_rgb = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
            else:
                image_rgb = image_np

            # 計算基本特徵
            features = self._compute_basic_features(image_rgb)

            # 分析室內or室外
            indoor_result = self._analyze_indoor_outdoor(features)
            is_indoor = indoor_result["is_indoor"]
            indoor_probability = indoor_result["indoor_probability"]

            # 確定光照條件
            lighting_conditions = self._determine_lighting_conditions(features, is_indoor)

            # 整合結果
            result = {
                "time_of_day": lighting_conditions["time_of_day"],
                "confidence": float(lighting_conditions["confidence"]),
                "is_indoor": is_indoor,
                "indoor_probability": float(indoor_probability),
                "brightness": {
                    "average": float(features["avg_brightness"]),
                    "std_dev": float(features["brightness_std"]),
                    "dark_ratio": float(features["dark_pixel_ratio"])
                },
                "color_info": {
                    "blue_ratio": float(features["blue_ratio"]),
                    "yellow_orange_ratio": float(features["yellow_orange_ratio"]),
                    "gray_ratio": float(features["gray_ratio"]),
                    "avg_saturation": float(features["avg_saturation"]),
                    "sky_brightness": float(features["sky_brightness"]),
                    "color_atmosphere": features["color_atmosphere"],
                    "warm_ratio": float(features["warm_ratio"]),
                    "cool_ratio": float(features["cool_ratio"])
                }
            }

            # 添加診斷信息
            if self.config["include_diagnostics"]:
                result["diagnostics"] = {
                    "feature_contributions": indoor_result.get("feature_contributions", {}),
                    "lighting_diagnostics": lighting_conditions.get("diagnostics", {})
                }

            return result

        except Exception as e:
            print(f"Error in lighting analysis: {str(e)}")
            import traceback
            traceback.print_exc()
            return {
                "time_of_day": "unknown",
                "confidence": 0,
                "error": str(e)
            }

    def _compute_basic_features(self, image_rgb):
        """
        Compute the brightness, colour and structural features of an image.

        Expensive gradient work runs on a down-scaled grayscale copy; the
        brightness and colour statistics use the full-resolution HSV image.

        Args:
            image_rgb: Image in RGB channel order (numpy array). The fixed
                thresholds below assume 8-bit data, for which OpenCV stores
                hue as degrees / 2 in the range [0, 179].

        Returns:
            Dict: Mapping of feature name to the computed value
        """
        # Image dimensions
        height, width = image_rgb.shape[:2]

        # Down-scaling factor grows with the image size (4 .. 12)
        base_scale = 4
        scale_factor = base_scale + min(8, max(0, int((height * width) / (1000 * 1000))))

        # Convert the colour spaces once and reuse them
        hsv_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2HSV)
        gray_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2GRAY)
        small_gray = cv2.resize(gray_img, (width//scale_factor, height//scale_factor))

        # Split the HSV channels
        h_channel = hsv_img[:,:,0]
        s_channel = hsv_img[:,:,1]
        v_channel = hsv_img[:,:,2]

        # Basic brightness features
        avg_brightness = np.mean(v_channel)
        brightness_std = np.std(v_channel)
        dark_pixel_ratio = np.sum(v_channel < 50) / (height * width)

        # Colour features (hue thresholds in OpenCV units, i.e. degrees / 2)
        yellow_orange_mask = ((h_channel >= 15) & (h_channel <= 40))
        yellow_orange_ratio = np.sum(yellow_orange_mask) / (height * width)

        blue_mask = ((h_channel >= 90) & (h_channel <= 130))
        blue_ratio = np.sum(blue_mask) / (height * width)

        # Inspect the top quarter of the image for blue-sky characteristics
        upper_region_h = h_channel[:height//4, :]
        upper_region_s = s_channel[:height//4, :]
        upper_region_v = v_channel[:height//4, :]

        # Blue sky is usually a bright, well saturated blue
        sky_blue_mask = ((upper_region_h >= 90) & (upper_region_h <= 130) &
                        (upper_region_s > 70) & (upper_region_v > 150))
        sky_blue_ratio = np.sum(sky_blue_mask) / max(1, upper_region_h.size)

        gray_mask = (s_channel < 50) & (v_channel > 100)
        gray_ratio = np.sum(gray_mask) / (height * width)

        avg_saturation = np.mean(s_channel)

        # "Sky" brightness: mean value of the upper half
        upper_half = v_channel[:height//2, :]
        sky_brightness = np.mean(upper_half)

        # Tone analysis. The intended ranges are 0-60 deg / 300-360 deg (warm)
        # and 180-270 deg (cool); they are halved here because 8-bit OpenCV hue
        # is degrees / 2. Comparing against the raw degree values made the cool
        # mask always empty and the warm mask cover 0-120 deg.
        warm_colors = (h_channel <= 30) | (h_channel >= 150)
        warm_ratio = np.sum(warm_colors) / (height * width)

        cool_colors = (h_channel >= 90) & (h_channel <= 135)
        cool_ratio = np.sum(cool_colors) / (height * width)

        # Overall colour atmosphere
        if warm_ratio > 0.4:
            color_atmosphere = "warm"
        elif cool_ratio > 0.4:
            color_atmosphere = "cool"
        else:
            color_atmosphere = "neutral"

        # Gradients are computed on the down-scaled image only
        gx = cv2.Sobel(small_gray, cv2.CV_32F, 1, 0, ksize=3)
        gy = cv2.Sobel(small_gray, cv2.CV_32F, 0, 1, ksize=3)

        # "vertical" = mean |d/dy|, "horizontal" = mean |d/dx|
        vertical_strength = np.mean(np.abs(gy))
        horizontal_strength = np.mean(np.abs(gx))
        gradient_ratio = vertical_strength / max(horizontal_strength, 1e-5)

        # -- Brightness uniformity --
        brightness_uniformity = 1 - min(1, brightness_std / max(avg_brightness, 1e-5))

        # -- Ceiling analysis --
        # The top quarter is sub-sampled with a coarser step
        top_scale = scale_factor * 2
        top_region = v_channel[:height//4:top_scale, ::top_scale]
        top_region_std = np.std(top_region)
        ceiling_uniformity = 1.0 - min(1, top_region_std / max(np.mean(top_region), 1e-5))

        # Strength of d/dy gradients in the top quarter, used as a proxy for
        # horizontal lines in the upper part of the image
        top_gradients = np.abs(gy[:small_gray.shape[0]//4, :])
        horizontal_lines_strength = np.mean(top_gradients)
        # Normalize to [0, 1]
        horizontal_line_ratio = min(1, horizontal_lines_strength / 40)

        # Minimal bright-spot detection on a sparse sample of the value channel
        sampled_v = v_channel[::scale_factor*2, ::scale_factor*2]
        light_threshold = min(220, avg_brightness + 2*brightness_std)
        is_bright = sampled_v > light_threshold
        bright_spot_count = np.sum(is_bright)

        # Simplified stand-in for a circular light source analysis
        circular_light_score = 0
        indoor_light_score = 0
        light_distribution_uniformity = 0.5

        # Only analyse light sources when a few bright samples exist; a large
        # number is more likely to be outdoor reflections
        if 1 < bright_spot_count < 20:
            # Simple statistics on the bright sample positions
            bright_y, bright_x = np.where(is_bright)
            if len(bright_y) > 1:
                # Are the bright samples grouped? Typical for indoor lighting
                mean_x = np.mean(bright_x)
                mean_y = np.mean(bright_y)
                dist_from_center = np.sqrt((bright_x - mean_x)**2 + (bright_y - mean_y)**2)

                # A fairly concentrated distribution may be a light fixture
                if np.std(dist_from_center) < np.mean(dist_from_center):
                    circular_light_score = min(3, len(bright_y) // 2)
                    light_distribution_uniformity = 0.7

                # Bright samples in the upper half are typical for ceiling lights
                if np.mean(bright_y) < sampled_v.shape[0] / 2:
                    indoor_light_score = 0.6
                else:
                    indoor_light_score = 0.3

        # Estimate enclosing boundaries from the gradients near the image edges;
        # only the left, right and top border strips are sampled
        left_edge = small_gray[:, :small_gray.shape[1]//6]
        right_edge = small_gray[:, 5*small_gray.shape[1]//6:]
        top_edge = small_gray[:small_gray.shape[0]//6, :]

        # Gradient strength of each border strip
        left_gradient = np.mean(np.abs(cv2.Sobel(left_edge, cv2.CV_32F, 1, 0, ksize=3)))
        right_gradient = np.mean(np.abs(cv2.Sobel(right_edge, cv2.CV_32F, 1, 0, ksize=3)))
        top_gradient = np.mean(np.abs(cv2.Sobel(top_edge, cv2.CV_32F, 0, 1, ksize=3)))

        # Normalize to [0, 1]
        left_edge_density = min(1, left_gradient / 50)
        right_edge_density = min(1, right_gradient / 50)
        top_edge_density = min(1, top_gradient / 50)

        # Enclosed environments tend to show stronger gradients near the borders
        boundary_edge_score = (left_edge_density + right_edge_density + top_edge_density) / 3

        # Rough overall edge density
        edges_density = min(1, (horizontal_strength + vertical_strength) / 100)

        street_line_score = 0

        # Look for strong d/dy responses in the lower half of the image
        bottom_half = small_gray[small_gray.shape[0]//2:, :]
        bottom_vert_gradient = cv2.Sobel(bottom_half, cv2.CV_32F, 0, 1, ksize=3)
        strong_vert_lines = np.abs(bottom_vert_gradient) > 50
        if np.sum(strong_vert_lines) > (bottom_half.size * 0.05):  # more than 5% strong responses
            street_line_score = 0.7

        # Collect all features
        features = {
            # Basic brightness and colour features
            "avg_brightness": avg_brightness,
            "brightness_std": brightness_std,
            "dark_pixel_ratio": dark_pixel_ratio,
            "yellow_orange_ratio": yellow_orange_ratio,
            "blue_ratio": blue_ratio,
            "sky_blue_ratio": sky_blue_ratio,
            "gray_ratio": gray_ratio,
            "avg_saturation": avg_saturation,
            "sky_brightness": sky_brightness,
            "color_atmosphere": color_atmosphere,
            "warm_ratio": warm_ratio,
            "cool_ratio": cool_ratio,

            # Structural features
            "gradient_ratio": gradient_ratio,
            "brightness_uniformity": brightness_uniformity,
            "bright_spot_count": bright_spot_count,
            "vertical_strength": vertical_strength,
            "horizontal_strength": horizontal_strength,

            # Indoor / outdoor features
            "ceiling_uniformity": ceiling_uniformity,
            "horizontal_line_ratio": horizontal_line_ratio,
            "indoor_light_score": indoor_light_score,
            "circular_light_count": circular_light_score,
            "light_distribution_uniformity": light_distribution_uniformity,
            "boundary_edge_score": boundary_edge_score,
            "top_region_std": top_region_std,
            "edges_density": edges_density,

            # Outdoor-specific features
            "street_line_score": street_line_score
        }

        return features

    def _analyze_indoor_outdoor(self, features):
        """
        使用多特徵融合進行室內/室外判斷

        Args:
            features: 特徵字典

        Returns:
            Dict: 室內/室外判斷結果
        """
        # 獲取配置中的特徵權重
        weights = self.config["indoor_outdoor_weights"]

        # 初始概率值 - 開始時中性評估
        indoor_score = 0
        feature_contributions = {}
        diagnostics = {}

        # 1. 藍色區域（天空）特徵 - 藍色區域多通常表示室外
        if features.get("blue_ratio", 0) > 0.2:
            # 檢查是否有室內指標，如果有明顯的室內特徵，則減少藍色的負面影響
            if (features.get("ceiling_uniformity", 0) > 0.5 or
                features.get("boundary_edge_score", 0) > 0.3 or
                features.get("indoor_light_score", 0) > 0.2 or
                features.get("bright_spot_count", 0) > 0):
                blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 8
            else:
                blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
        else:
            blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15

        indoor_score += blue_score
        feature_contributions["blue_ratio"] = blue_score

        # 判斷視角 - 如果上部有藍天而上下亮度差異大，可能是仰視室外建築
        if (features.get("sky_blue_ratio", 0) > 0.01 and
            features["sky_brightness"] > features["avg_brightness"] * 1.1):
            viewpoint_outdoor_score = -1.8  # 強烈的室外指標
            indoor_score += viewpoint_outdoor_score
            feature_contributions["outdoor_viewpoint"] = viewpoint_outdoor_score

        # 2. 亮度均勻性特徵 - 室內通常光照更均勻
        uniformity_score = weights["brightness_uniformity"] * features["brightness_uniformity"]
        indoor_score += uniformity_score
        feature_contributions["brightness_uniformity"] = uniformity_score

        # 3. 天花板特徵 - 強化天花板檢測的權重
        ceiling_contribution = 0
        if "ceiling_uniformity" in features:
            ceiling_uniformity = features["ceiling_uniformity"]
            horizontal_line_ratio = features.get("horizontal_line_ratio", 0)

            # 增強天花板檢測的影響
            if ceiling_uniformity > 0.5:
                ceiling_weight = 3
                ceiling_contribution = weights.get("ceiling_features", 1.5) * ceiling_weight
                if horizontal_line_ratio > 0.2:  # 如果有水平線條，進一步增強
                    ceiling_contribution *= 1.5
            elif ceiling_uniformity > 0.4:
                ceiling_contribution = weights.get("ceiling_features", 1.5) * 1.2

            indoor_score += ceiling_contribution
            feature_contributions["ceiling_features"] = ceiling_contribution

        # 4. 強化吊燈的檢測
        light_contribution = 0
        if "indoor_light_score" in features:
            indoor_light_score = features["indoor_light_score"]
            circular_light_count = features.get("circular_light_count", 0)

            # 加強對特定類型光源的檢測
            if circular_light_count >= 1:  # 即便只有一個圓形光源也很可能是室內
                light_contribution = weights.get("light_features", 1.2) * 2.0
            elif indoor_light_score > 0.3:
                light_contribution = weights.get("light_features", 1.2) * 1.0

            indoor_score += light_contribution
            feature_contributions["light_features"] = light_contribution

        # 5. 環境封閉度特徵
        boundary_contribution = 0
        if "boundary_edge_score" in features:
            boundary_edge_score = features["boundary_edge_score"]
            edges_density = features.get("edges_density", 0)

            # 高邊界評分暗示封閉環境（室內）
            if boundary_edge_score > 0.3:
                boundary_contribution = weights.get("boundary_features", 1.2) * 2
            elif boundary_edge_score > 0.2:
                boundary_contribution = weights.get("boundary_features", 1.2) * 1.2

            indoor_score += boundary_contribution
            feature_contributions["boundary_features"] = boundary_contribution

        if (features.get("edges_density", 0) > 0.2 and
            features.get("bright_spot_count", 0) > 5 and
            features.get("vertical_strength", 0) > features.get("horizontal_strength", 0) * 1.5):
            # 商業街道特徵：高邊緣密度 + 多亮點 + 強垂直特徵
            street_feature_score = -weights.get("street_features", 1.2) * 1.5
            indoor_score += street_feature_score
            feature_contributions["street_features"] = street_feature_score

        # 添加對亞洲商業街道的專門檢測
        if (features.get("edges_density", 0) > 0.25 and  # 高邊緣密度
            features.get("vertical_strength", 0) > features.get("horizontal_strength", 0) * 1.8 and  # 更強的垂直結構
            features.get("brightness_uniformity", 0) < 0.6):  # 較低的亮度均勻性（招牌、燈光等造成）
            asian_street_score = -2.2  # 非常強的室外代表性特徵
            indoor_score += asian_street_score
            feature_contributions["asian_commercial_street"] = asian_street_score


        # 6. 垂直/水平梯度比率
        gradient_contribution = 0
        if features["gradient_ratio"] > 2.0:
            combined_uniformity = (features["brightness_uniformity"] +
                                features.get("ceiling_uniformity", 0)) / 2

            if combined_uniformity > 0.5:
                gradient_contribution = weights["gradient_ratio"] * 0.7
            else:
                gradient_contribution = -weights["gradient_ratio"] * 0.3

            indoor_score += gradient_contribution
            feature_contributions["gradient_ratio"] = gradient_contribution

        # 7. 亮點檢測（光源）
        bright_spot_contribution = 0
        bright_spot_count = features["bright_spot_count"]
        circular_light_count = features.get("circular_light_count", 0)

        # 調整亮點分析邏輯
        if circular_light_count >= 1:  # 即使只有一個圓形光源
            bright_spot_contribution = weights["bright_spots"] * 1.5
        elif bright_spot_count < 5:  # 適當放寬閾值
            bright_spot_contribution = weights["bright_spots"] * 0.5
        elif bright_spot_count > 15:  # 大量亮點比較有可能為室外
            bright_spot_contribution = -weights["bright_spots"] * 0.4

        indoor_score += bright_spot_contribution
        feature_contributions["bright_spots"] = bright_spot_contribution

        # 8. 色調分析
        yellow_contribution = 0
        if features["avg_brightness"] < 150 and features["yellow_orange_ratio"] > 0.15:
            if features.get("indoor_light_score", 0) > 0.2:
                yellow_contribution = weights["color_tone"] * 0.8
            else:
                yellow_contribution = weights["color_tone"] * 0.5

            indoor_score += yellow_contribution
            feature_contributions["yellow_tone"] = yellow_contribution

        if features.get("blue_ratio", 0) > 0.7:
            # 檢查是否有室內指標，如果有明顯的室內特徵，則減少藍色的負面影響
            if (features.get("ceiling_uniformity", 0) > 0.6 or
                features.get("boundary_edge_score", 0) > 0.3 or
                features.get("indoor_light_score", 0) > 0):
                blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 10
            else:
                blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 18
        else:
            blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 18
        # 9. 上半部與下半部亮度對比
        sky_contribution = 0
        if features["sky_brightness"] > features["avg_brightness"] * 1.3:
            if features["blue_ratio"] > 0.15:
                sky_contribution = -weights["sky_brightness"] * 0.9
            else:
                sky_contribution = -weights["sky_brightness"] * 0.6

            indoor_score += sky_contribution
            feature_contributions["sky_brightness"] = sky_contribution

        # 加入額外的餐廳特徵檢測邏輯
        dining_feature_contribution = 0

        # 檢測中央懸掛式燈具，有懸掛燈代表有天花板，就代表是室內
        if circular_light_count >= 1 and features.get("light_distribution_uniformity", 0) > 0.4:
            dining_feature_contribution = 1.5
            indoor_score += dining_feature_contribution
            feature_contributions["dining_features"] = dining_feature_contribution

        # 10. 增強的藍天的檢測，即便是小面積的藍天也是很強的室外指標
        sky_contribution = 0
        if "sky_blue_ratio" in features:
            # 只有當藍色區域集中在上部且亮度高時，才認為是藍天
            if features["sky_blue_ratio"] > 0.01 and features["sky_brightness"] > features.get("avg_brightness", 0) * 1.2:
                sky_outdoor_score = -2.5 * features["sky_blue_ratio"] * weights.get("blue_ratio", 1.2)
                indoor_score += sky_outdoor_score
                feature_contributions["sky_blue_detection"] = sky_outdoor_score

        asian_street_indicators = 0

        # 1: 高垂直結構強度
        vertical_ratio = features.get("vertical_strength", 0) / max(features.get("horizontal_strength", 1e-5), 1e-5)
        if vertical_ratio > 1.8:
            asian_street_indicators += 1

        # 2: 高邊緣密度 + 路面標記特徵
        if features.get("edges_density", 0) > 0.25 and features.get("street_line_score", 0) > 0.2:
            asian_street_indicators += 2

        # 3: 多個亮點 + 亮度不均勻
        if features.get("bright_spot_count", 0) > 5 and features.get("brightness_uniformity", 0) < 0.6:
            asian_street_indicators += 1

        # 4: 藍色區域小（天空被高樓遮擋）但亮度高
        if features.get("blue_ratio", 0) < 0.1 and features.get("sky_brightness", 0) > features.get("avg_brightness", 0) * 1.1:
            asian_street_indicators += 1

        # 如果滿足至少 3 個指標，調整權重變成偏向室外的判斷
        if asian_street_indicators >= 3:
            # 記錄檢測到的模式
            feature_contributions["asian_street_pattern"] = -2.5
            indoor_score += -2.5  # 明顯向室外傾斜

            # 降低室內指標的權重
            if "boundary_features" in feature_contributions:
                adjusted_contribution = feature_contributions["boundary_features"] * 0.4
                indoor_score -= (feature_contributions["boundary_features"] - adjusted_contribution)
                feature_contributions["boundary_features"] = adjusted_contribution

            if "ceiling_features" in feature_contributions:
                adjusted_contribution = feature_contributions["ceiling_features"] * 0.3
                indoor_score -= (feature_contributions["ceiling_features"] - adjusted_contribution)
                feature_contributions["ceiling_features"] = adjusted_contribution

            # 添加信息到診斷數據
            diagnostics["asian_street_detected"] = True
            diagnostics["asian_street_indicators"] = asian_street_indicators

        bedroom_indicators = 0

        # 1: 窗戶和牆壁形成的直角
        if features.get("brightness_uniformity", 0) > 0.6 and features.get("boundary_edge_score", 0) > 0.3:
            bedroom_indicators += 1.5  # 增加權重

        # 2: 天花板和光源
        if features.get("ceiling_uniformity", 0) > 0.5 and features.get("bright_spot_count", 0) > 0:
            bedroom_indicators += 2.5

        # 3: 良好對比度的牆壁顏色，適合臥房還有客廳
        if features.get("brightness_uniformity", 0) > 0.6 and features.get("avg_saturation", 0) < 100:
            bedroom_indicators += 1.5

        # 特殊的檢測 4: 檢測窗戶
        if features.get("boundary_edge_score", 0) > 0.25 and features.get("brightness_std", 0) > 40:
            bedroom_indicators += 1.5

        # 如果滿足足夠的家居指標，提高多點室內判斷分數
        if bedroom_indicators >= 3:
            # 增加家居環境評分
            home_env_score = 3
            indoor_score += home_env_score
            feature_contributions["home_environment_pattern"] = home_env_score
        elif bedroom_indicators >= 2:
            # 適度增加家居環境評分
            home_env_score = 2
            indoor_score += home_env_score
            feature_contributions["home_environment_pattern"] = home_env_score

        # 根據總分轉換為概率（使用sigmoid函數）
        indoor_probability = 1 / (1 + np.exp(-indoor_score * 0.22))

        # 判斷結果
        is_indoor = indoor_probability > 0.5

        return {
            "is_indoor": is_indoor,
            "indoor_probability": indoor_probability,
            "indoor_score": indoor_score,
            "feature_contributions": feature_contributions,
            "diagnostics": diagnostics
        }

    def _determine_lighting_conditions(self, features, is_indoor):
        """
        基於特徵和室內/室外判斷確定光照條件。

        Args:
            features: 特徵字典
            is_indoor: 是否是室內環境

        Returns:
            Dict: 光照條件分析結果
        """
        # 初始化
        time_of_day = "unknown"
        confidence = 0.5
        diagnostics = {}

        avg_brightness = features["avg_brightness"]
        dark_pixel_ratio = features["dark_pixel_ratio"]
        yellow_orange_ratio = features["yellow_orange_ratio"]
        blue_ratio = features["blue_ratio"]
        gray_ratio = features["gray_ratio"]

        # 基於室內/室外分別判斷
        if is_indoor:
            # 計算室內住宅自然光指標
            natural_window_light = 0

            # 檢查窗戶特徵和光線特性
            if (features.get("blue_ratio", 0) > 0.1 and
                features.get("sky_brightness", 0) > avg_brightness * 1.1):
                natural_window_light += 1

            # 檢查均勻柔和的光線分布
            if (features.get("brightness_uniformity", 0) > 0.65 and
                features.get("brightness_std", 0) < 70):
                natural_window_light += 1

            # 檢查暖色調比例
            if features.get("warm_ratio", 0) > 0.2:
                natural_window_light += 1

            # 家居環境指標
            home_env_score = features.get("home_environment_pattern", 0)
            if home_env_score > 1.5:
                natural_window_light += 1

            # 1. 室內明亮環境，可能有窗戶自然光
            if avg_brightness > 130:
                # 檢測自然光住宅空間 - 新增類型!
                if natural_window_light >= 2 and home_env_score > 1.5:
                    time_of_day = "indoor_residential_natural"  # 家裡的自然光類型
                    confidence = 0.8
                    diagnostics["reason"] = "Bright residential space with natural window lighting"
                # 檢查窗戶特徵 - 如果有明亮的窗戶且色調為藍
                elif features.get("blue_ratio", 0) > 0.1 and features.get("sky_brightness", 0) > 150:
                    time_of_day = "indoor_bright"
                    confidence = 0.8
                    diagnostics["reason"] = "Bright indoor scene with window light"
                else:
                    time_of_day = "indoor_bright"
                    confidence = 0.75
                    diagnostics["reason"] = "High brightness in indoor environment"
            # 2. 室內中等亮度環境
            elif avg_brightness > 100:
                time_of_day = "indoor_moderate"
                confidence = 0.7
                diagnostics["reason"] = "Moderate brightness in indoor environment"
            # 3. 室內低光照環境
            else:
                time_of_day = "indoor_dim"
                confidence = 0.65 + dark_pixel_ratio / 3
                diagnostics["reason"] = "Low brightness in indoor environment"

            # 1. 檢測設計師風格住宅，可以偵測到比較多種類的狀況
            designer_residential_score = 0
            # 檢測特色燈具
            if (features.get("circular_light_count", 0) > 0 or features.get("bright_spot_count", 0) > 2):
                designer_residential_score += 1
            # 檢測高品質均勻照明
            if features.get("brightness_uniformity", 0) > 0.7:
                designer_residential_score += 1
            # 檢測溫暖色調
            if features.get("warm_ratio", 0) > 0.3:
                designer_residential_score += 1
            # 檢測家居環境特徵
            if home_env_score > 1.5:
                designer_residential_score += 1

            if designer_residential_score >= 3 and home_env_score > 1.5:
                time_of_day = "indoor_designer_residential"
                confidence = 0.85
                diagnostics["special_case"] = "Designer residential lighting with decorative elements"

            # 2. 檢測餐廳/酒吧場景
            elif avg_brightness < 150 and yellow_orange_ratio > 0.2:
                if features["warm_ratio"] > 0.4:
                    time_of_day = "indoor_restaurant"
                    confidence = 0.65 + yellow_orange_ratio / 4
                    diagnostics["special_case"] = "Warm, yellow-orange lighting suggests restaurant/bar setting"

            # 3. 檢測商業照明空間
            elif avg_brightness > 120 and features["bright_spot_count"] > 4:
                # 增加商業照明判別的精確度
                commercial_score = 0
                # 多個亮點
                commercial_score += min(1.0, features["bright_spot_count"] * 0.05)
                # 不太可能是住宅的指標
                if features.get("home_environment_pattern", 0) < 1.5:
                    commercial_score += 0.5
                # 整體照明結構化布局
                if features.get("light_distribution_uniformity", 0) > 0.6:
                    commercial_score += 0.5

                if commercial_score > 0.6 and designer_residential_score < 3:
                    time_of_day = "indoor_commercial"
                    confidence = 0.7 + commercial_score / 5
                    diagnostics["special_case"] = "Multiple structured light sources suggest commercial lighting"
        else:
            # 室外場景判斷保持不變
            if avg_brightness < 90:  # 降低夜間判斷的亮度閾值
                # 檢測是否有車燈/街燈
                has_lights = features["bright_spot_count"] > 3

                if has_lights:
                    time_of_day = "night"
                    confidence = 0.8 + dark_pixel_ratio / 5
                    diagnostics["reason"] = "Low brightness with light sources detected"

                    # 檢查是否是霓虹燈場景
                    if yellow_orange_ratio > 0.15 and features["bright_spot_count"] > 5:
                        time_of_day = "neon_night"
                        confidence = 0.75 + yellow_orange_ratio / 3
                        diagnostics["special_case"] = "Multiple colorful light sources suggest neon lighting"
                else:
                    time_of_day = "night"
                    confidence = 0.7 + dark_pixel_ratio / 3
                    diagnostics["reason"] = "Low brightness outdoor scene"
            elif avg_brightness < 130 and yellow_orange_ratio > 0.2:
                time_of_day = "sunset/sunrise"
                confidence = 0.7 + yellow_orange_ratio / 3
                diagnostics["reason"] = "Moderate brightness with yellow-orange tones"
            elif avg_brightness > 150 and blue_ratio > 0.15:
                time_of_day = "day_clear"
                confidence = 0.7 + blue_ratio / 3
                diagnostics["reason"] = "High brightness with blue tones (likely sky)"
            elif avg_brightness > 130:
                time_of_day = "day_cloudy"
                confidence = 0.7 + gray_ratio / 3
                diagnostics["reason"] = "Good brightness with higher gray tones"
            else:
                # 默認判斷
                if yellow_orange_ratio > gray_ratio:
                    time_of_day = "sunset/sunrise"
                    confidence = 0.6 + yellow_orange_ratio / 3
                    diagnostics["reason"] = "Yellow-orange tones dominant"
                else:
                    time_of_day = "day_cloudy"
                    confidence = 0.6 + gray_ratio / 3
                    diagnostics["reason"] = "Gray tones dominant"

            # 檢查是否是特殊室外場景（如體育場）
            if avg_brightness > 120 and features["brightness_uniformity"] > 0.8:
                # 高亮度且非常均勻的光照可能是體育場燈光
                time_of_day = "stadium_lighting"
                confidence = 0.7
                diagnostics["special_case"] = "Uniform bright lighting suggests stadium/sports lighting"

            # 檢查是否是混合光照（如室內/室外過渡區）
            if 100 < avg_brightness < 150 and 0.1 < blue_ratio < 0.2:
                if features["gradient_ratio"] > 1.5:
                    time_of_day = "mixed_lighting"
                    confidence = 0.65
                    diagnostics["special_case"] = "Features suggest indoor-outdoor transition area"

        # 確保信心值在 0-1 範圍內
        confidence = min(0.95, max(0.5, confidence))

        if time_of_day in ["indoor_residential_natural", "indoor_designer_residential"] and hasattr(self, "config"):
            # 確保 LIGHTING_CONDITIONS 中有這些新類型的描述
            if time_of_day == "indoor_residential_natural":
                lightingType = {
                    "template_modifiers": {
                        "indoor_residential_natural": "naturally-lit residential"
                    },
                    "time_descriptions": {
                        "indoor_residential_natural": {
                            "general": "The scene is captured in a residential space with ample natural light from windows.",
                            "bright": "The residential space is brightly lit with natural daylight streaming through windows.",
                            "medium": "The home environment has good natural lighting providing a warm, inviting atmosphere.",
                            "dim": "The living space has soft natural light filtering through windows or openings."
                        }
                    }
                }
            elif time_of_day == "indoor_designer_residential":
                lightingType = {
                    "template_modifiers": {
                        "indoor_designer_residential": "designer-lit residential"
                    },
                    "time_descriptions": {
                        "indoor_designer_residential": {
                            "general": "The scene is captured in a residential space with carefully designed lighting elements.",
                            "bright": "The home features professionally designed lighting with decorative fixtures creating a bright atmosphere.",
                            "medium": "The residential interior showcases curated lighting design balancing form and function.",
                            "dim": "The living space has thoughtfully placed designer lighting creating an intimate ambiance."
                        }
                    }
                }

        return {
            "time_of_day": time_of_day,
            "confidence": confidence,
            "diagnostics": diagnostics
        }


    def _get_default_config(self):
        """
        返回優化版本的默認配置參數。
        """
        return {
            "indoor_outdoor_weights": {
                "blue_ratio": 0.6,
                "brightness_uniformity": 1.2,
                "gradient_ratio": 0.7,
                "bright_spots": 0.8,
                "color_tone": 0.5,
                "sky_brightness": 0.9,
                "brightness_variation": 0.7,
                "ceiling_features": 1.5,
                "light_features": 1.1,
                "boundary_features": 2.8,
                "street_features": 2,
                "building_features": 1.6
            },
            "include_diagnostics": True
        }

In [None]:
# %%writefile scene_description.py
import os
import json
from typing import Dict, List, Tuple, Any, Optional

# from scene_type import SCENE_TYPES
# from scene_detail_templates import SCENE_DETAIL_TEMPLATES
# from object_template_fillers import OBJECT_TEMPLATE_FILLERS
# from activity_templates import ACTIVITY_TEMPLATES
# from safety_templates import SAFETY_TEMPLATES
# from confifence_templates import CONFIDENCE_TEMPLATES

class SceneDescriptor:
    """
    Generates natural language descriptions of scenes.
    Handles scene descriptions, activity inference, and safety concerns identification.
    """

    def __init__(self, scene_types=None, object_categories=None):
        """
        Initialize the scene descriptor

        Args:
            scene_types: Dictionary of scene type definitions
        """
        self.scene_types = scene_types or {}
        self.SCENE_TYPES = scene_types or {}

        if object_categories:
            self.OBJECT_CATEGORIES = object_categories
        else:
            # 從 JSON 加載或使用默認值
            self.OBJECT_CATEGORIES = self._load_json_data("object_categories") or {
                "furniture": [56, 57, 58, 59, 60, 61],
                "electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
                "kitchen_items": [39, 40, 41, 42, 43, 44, 45],
                "food": [46, 47, 48, 49, 50, 51, 52, 53, 54, 55],
                "vehicles": [1, 2, 3, 4, 5, 6, 7, 8],
                "personal_items": [24, 25, 26, 27, 28, 73, 78, 79]
            }

        # 加載所有模板數據
        self._load_templates()

    def _load_templates(self):
        """Load all template data from script or fallback to imported defaults"""
        self.confidence_templates = CONFIDENCE_TEMPLATES
        self.scene_detail_templates = SCENE_DETAIL_TEMPLATES
        self.object_template_fillers = OBJECT_TEMPLATE_FILLERS
        self.safety_templates = SAFETY_TEMPLATES
        self.activity_templates = ACTIVITY_TEMPLATES


    def _initialize_fallback_templates(self):
        """Initialize fallback templates when no external data is available"""
        # 只在無法從文件或導入加載時使用
        self.confidence_templates = {
            "high": "{description} {details}",
            "medium": "This appears to be {description} {details}",
            "low": "This might be {description}, but the confidence is low. {details}"
        }

        # 僅提供最基本的模板作為後備
        self.scene_detail_templates = {
            "default": ["A space with various objects."]
        }

        self.object_template_fillers = {
            "default": ["various items"]
        }

        self.safety_templates = {
            "general": "Pay attention to {safety_element}."
        }

        self.activity_templates = {
            "default": ["General activity"]
        }

    def _get_alternative_scenes(self, scene_scores: Dict[str, float],
                            threshold: float, top_k: int = 2) -> List[Dict]:
        """
        Get alternative scene interpretations with their scores.

        Args:
            scene_scores: Dictionary of scene type scores
            threshold: Minimum confidence threshold
            top_k: Number of alternatives to return

        Returns:
            List of dictionaries with alternative scenes
        """
        # Sort scenes by score in descending order
        sorted_scenes = sorted(scene_scores.items(), key=lambda x: x[1], reverse=True)

        # Skip the first one (best match) and take the next top_k
        alternatives = []
        for scene_type, score in sorted_scenes[1:1+top_k]:
            if score >= threshold:
                alternatives.append({
                    "type": scene_type,
                    "name": self.SCENE_TYPES.get(scene_type, {}).get("name", "Unknown"),
                    "confidence": score
                })

        return alternatives


    def _infer_possible_activities(self, scene_type: str, detected_objects: List[Dict]) -> List[str]:
        """
        Infer possible activities based on scene type and detected objects.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects

        Returns:
            List of possible activities
        """
        activities = []

        if scene_type.startswith("aerial_view_"):
            if scene_type == "aerial_view_intersection":
                # 使用預定義的十字路口活動
                activities.extend(self.activity_templates.get("aerial_view_intersection", []))

                # 添加與行人和車輛相關的特定活動
                pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
                vehicles = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]]  # Car, bus, truck

                if pedestrians and vehicles:
                    activities.append("Waiting for an opportunity to cross the street")
                    activities.append("Obeying traffic signals")

            elif scene_type == "aerial_view_commercial_area":
                activities.extend(self.activity_templates.get("aerial_view_commercial_area", []))

            elif scene_type == "aerial_view_plaza":
                activities.extend(self.activity_templates.get("aerial_view_plaza", []))

            else:
                # 處理其他未明確定義的空中視角場景
                aerial_activities = [
                    "Street crossing",
                    "Waiting for signals",
                    "Following traffic rules",
                    "Pedestrian movement"
                ]
                activities.extend(aerial_activities)

        if scene_type in self.activity_templates:
            activities.extend(self.activity_templates[scene_type])
        elif "default" in self.activity_templates:
            activities.extend(self.activity_templates["default"])

        detected_class_ids = [obj["class_id"] for obj in detected_objects]

        # Add activities based on specific object combinations
        if 62 in detected_class_ids and 57 in detected_class_ids:  # TV and sofa
            activities.append("Watching shows or movies")

        if 63 in detected_class_ids:  # laptop
            activities.append("Using a computer/laptop")

        if 67 in detected_class_ids:  # cell phone
            activities.append("Using a mobile phone")

        if 73 in detected_class_ids:  # book
            activities.append("Reading")

        if any(food_id in detected_class_ids for food_id in [46, 47, 48, 49, 50, 51, 52, 53, 54, 55]):
            activities.append("Eating or preparing food")

        # Person-specific activities
        if 0 in detected_class_ids:  # Person
            if any(vehicle in detected_class_ids for vehicle in [1, 2, 3, 5, 7]):  # Vehicles
                activities.append("Commuting or traveling")

            if 16 in detected_class_ids:  # Dog
                activities.append("Walking a dog")

            if 24 in detected_class_ids or 26 in detected_class_ids:  # Backpack or handbag
                activities.append("Carrying personal items")

        # Remove duplicates
        return list(set(activities))

    def _identify_safety_concerns(self, detected_objects: List[Dict], scene_type: str) -> List[str]:
        """
        Identify potential safety concerns based on objects and scene type.

        Args:
            detected_objects: List of detected objects
            scene_type: Identified scene type

        Returns:
            List of potential safety concerns
        """
        concerns = []
        detected_class_ids = [obj["class_id"] for obj in detected_objects]

        # ORIGINAL SAFETY CONCERNS LOGIC

        # General safety concerns
        if 42 in detected_class_ids or 43 in detected_class_ids:  # Fork or knife
            concerns.append("Sharp utensils present")

        if 76 in detected_class_ids:  # Scissors
            concerns.append("Cutting tools present")

        # Traffic-related concerns
        if scene_type in ["city_street", "parking_lot"]:
            if 0 in detected_class_ids:  # Person
                if any(vehicle in detected_class_ids for vehicle in [2, 3, 5, 7, 8]):  # Vehicles
                    concerns.append("Pedestrians near vehicles")

            if 9 in detected_class_ids:  # Traffic light
                concerns.append("Monitor traffic signals")

        # Identify crowded scenes
        person_count = detected_class_ids.count(0)
        if person_count > 5:
            concerns.append(f"Crowded area with multiple people ({person_count})")

        # Scene-specific concerns
        if scene_type == "kitchen":
            if 68 in detected_class_ids or 69 in detected_class_ids:  # Microwave or oven
                concerns.append("Hot cooking equipment")

        # Potentially unstable objects
        for obj in detected_objects:
            if obj["class_id"] in [39, 40, 41, 45]:  # Bottle, wine glass, cup, bowl
                if obj["region"] in ["top_left", "top_center", "top_right"] and obj["normalized_area"] > 0.05:
                    concerns.append(f"Elevated {obj['class_name']} might be unstable")

        # NEW SAFETY CONCERNS LOGIC FOR ADDITIONAL SCENE TYPES

        # Upscale dining safety concerns
        if scene_type == "upscale_dining":
            # Check for fragile items
            if 40 in detected_class_ids:  # Wine glass
                concerns.append("Fragile glassware present")

            # Check for lit candles (can't directly detect but can infer from context)
            # Look for small bright spots that might be candles
            if any(obj["class_id"] == 41 for obj in detected_objects):  # Cup (which might include candle holders)
                # We can't reliably detect candles, but if the scene appears to be formal dining,
                # we can suggest this as a possibility
                concerns.append("Possible lit candles or decorative items requiring care")

            # Check for overcrowded table
            table_objs = [obj for obj in detected_objects if obj["class_id"] == 60]  # Dining table
            if table_objs:
                table_region = table_objs[0]["region"]
                items_on_table = 0

                for obj in detected_objects:
                    if obj["class_id"] in [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]:
                        if obj["region"] == table_region:
                            items_on_table += 1

                if items_on_table > 8:
                    concerns.append("Dining table has multiple items which should be handled with care")

        # Asian commercial street safety concerns
        elif scene_type == "asian_commercial_street":
            # Check for crowded walkways
            if 0 in detected_class_ids:  # Person
                person_count = detected_class_ids.count(0)
                if person_count > 3:
                    # Calculate person density (simplified)
                    person_positions = []
                    for obj in detected_objects:
                        if obj["class_id"] == 0:
                            person_positions.append(obj["normalized_center"])

                    if len(person_positions) >= 2:
                        # Calculate average distance between people
                        total_distance = 0
                        count = 0
                        for i in range(len(person_positions)):
                            for j in range(i+1, len(person_positions)):
                                p1 = person_positions[i]
                                p2 = person_positions[j]
                                distance = ((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)**0.5
                                total_distance += distance
                                count += 1

                        if count > 0:
                            avg_distance = total_distance / count
                            if avg_distance < 0.1:  # Close proximity
                                concerns.append("Crowded walkway with limited personal space")

            # Check for motorcycles/bicycles near pedestrians
            if (1 in detected_class_ids or 3 in detected_class_ids) and 0 in detected_class_ids:  # Bicycle/motorcycle and person
                concerns.append("Two-wheeled vehicles in pedestrian areas")

            # Check for potential trip hazards
            # We can't directly detect this, but can infer from context
            if scene_type == "asian_commercial_street" and "bottom" in " ".join([obj["region"] for obj in detected_objects if obj["class_id"] == 0]):
                # If people are in bottom regions, they might be walking on uneven surfaces
                concerns.append("Potential uneven walking surfaces in commercial area")

        # Financial district safety concerns
        elif scene_type == "financial_district":
            # Check for heavy traffic conditions
            vehicle_count = sum(1 for obj_id in detected_class_ids if obj_id in [2, 5, 7])  # Car, bus, truck
            if vehicle_count > 5:
                concerns.append("Heavy vehicle traffic in urban area")

            # Check for pedestrians crossing busy streets
            if 0 in detected_class_ids:  # Person
                person_count = detected_class_ids.count(0)
                vehicle_nearby = any(vehicle in detected_class_ids for vehicle in [2, 3, 5, 7])

                if person_count > 0 and vehicle_nearby:
                    concerns.append("Pedestrians navigating busy urban traffic")

            # Check for traffic signals
            if 9 in detected_class_ids:  # Traffic light
                concerns.append("Observe traffic signals when navigating this area")
            else:
                # If no traffic lights detected but it's a busy area, it's worth noting
                if vehicle_count > 3:
                    concerns.append("Busy traffic area potentially without visible traffic signals in view")

            # Time of day considerations
            # We don't have direct time data, but can infer from vehicle lights
            vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]]
            if vehicle_objs and any("lighting_conditions" in obj for obj in detected_objects):
                # If vehicles are present and it might be evening/night
                concerns.append("Reduced visibility conditions during evening commute")

        # Urban intersection safety concerns
        elif scene_type == "urban_intersection":
            # Check for pedestrians in crosswalks
            pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
            vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 3, 5, 7]]

            if pedestrian_objs:
                # Calculate distribution of pedestrians to see if they're crossing
                pedestrian_positions = [obj["normalized_center"] for obj in pedestrian_objs]

                # Simplified check for pedestrians in crossing pattern
                if len(pedestrian_positions) >= 3:
                    # Check if pedestrians are distributed across different regions
                    pedestrian_regions = set(obj["region"] for obj in pedestrian_objs)
                    if len(pedestrian_regions) >= 2:
                        concerns.append("Multiple pedestrians crossing the intersection")

            # Check for traffic signal observation
            if 9 in detected_class_ids:  # Traffic light
                concerns.append("Observe traffic signals when crossing")

            # Check for busy intersection
            if len(vehicle_objs) > 3:
                concerns.append("Busy intersection with multiple vehicles")

            # Check for pedestrians potentially jay-walking
            if pedestrian_objs and not 9 in detected_class_ids:  # People but no traffic lights
                concerns.append("Pedestrians should use designated crosswalks")

            # Visibility concerns based on lighting
            # This would be better with actual lighting data
            pedestrian_count = len(pedestrian_objs)
            if pedestrian_count > 5:
                concerns.append("High pedestrian density at crossing points")

        # Transit hub safety concerns
        elif scene_type == "transit_hub":
            # These would be for transit areas like train stations or bus terminals
            if 0 in detected_class_ids:  # Person
                person_count = detected_class_ids.count(0)
                if person_count > 8:
                    concerns.append("Crowded transit area requiring careful navigation")

            # Check for luggage/bags that could be trip hazards
            if 24 in detected_class_ids or 28 in detected_class_ids:  # Backpack or suitcase
                concerns.append("Luggage and personal items may create obstacles")

            # Public transportation vehicles
            if any(vehicle in detected_class_ids for vehicle in [5, 6, 7]):  # Bus, train, truck
                concerns.append("Stay clear of arriving and departing transit vehicles")

        # Shopping district safety concerns
        elif scene_type == "shopping_district":
            # Check for crowded shopping areas
            if 0 in detected_class_ids:  # Person
                person_count = detected_class_ids.count(0)
                if person_count > 5:
                    concerns.append("Crowded shopping area with multiple people")

            # Check for shopping bags and personal items
            if 24 in detected_class_ids or 26 in detected_class_ids:  # Backpack or handbag
                concerns.append("Mind personal belongings in busy retail environment")

            # Check for store entrances/exits which might have automatic doors
            # We can't directly detect this, but can infer from context
            if scene_type == "shopping_district" and 0 in detected_class_ids:
                concerns.append("Be aware of store entrances and exits with potential automatic doors")

        return concerns

In [None]:
# %%writefile clip_prompts.py

# Scene type prompts: one CLIP text prompt per scene-type key.
# CLIPAnalyzer tokenizes the values in dict iteration order and maps the
# resulting scores back onto the keys in that same order, so the order of
# entries here determines the layout of the scene score vector.
SCENE_TYPE_PROMPTS = {
    # Basic indoor scenes
    "living_room": "A photo of a living room with furniture and entertainment systems.",
    "bedroom": "A photo of a bedroom with a bed and personal items.",
    "dining_area": "A photo of a dining area with a table and chairs for meals.",
    "kitchen": "A photo of a kitchen with cooking appliances and food preparation areas.",
    "office_workspace": "A photo of an office workspace with desk, computer and work equipment.",
    "meeting_room": "A photo of a meeting room with a conference table and multiple chairs.",

    # Basic outdoor / urban scenes
    "city_street": "A photo of a city street with traffic, pedestrians and urban buildings.",
    "parking_lot": "A photo of a parking lot with multiple parked vehicles.",
    "park_area": "A photo of a park or recreational area with greenery and outdoor facilities.",
    "retail_store": "A photo of a retail store with merchandise displays and shopping areas.",
    "supermarket": "A photo of a supermarket with food items, aisles and shopping carts.",

    # Specialized indoor scenes
    "upscale_dining": "A photo of an upscale dining area with elegant furniture and refined decor.",
    "conference_room": "A photo of a professional conference room with presentation equipment and seating.",
    "classroom": "A photo of a classroom with desks, chairs and educational equipment.",
    "library": "A photo of a library with bookshelves, reading areas and study spaces.",

    # Asian-themed scenes
    "asian_commercial_street": "A photo of an Asian commercial street with dense signage, shops and pedestrians.",
    "asian_night_market": "A photo of an Asian night market with food stalls, crowds and colorful lights.",
    "asian_temple_area": "A photo of an Asian temple with traditional architecture and cultural elements.",

    # Traffic / transit related scenes
    "financial_district": "A photo of a financial district with tall office buildings and business activity.",
    "urban_intersection": "A photo of an urban intersection with crosswalks, traffic lights and pedestrians crossing.",
    "transit_hub": "A photo of a transportation hub with multiple modes of public transit and passengers.",
    "bus_stop": "A photo of a bus stop with people waiting and buses arriving or departing.",
    "bus_station": "A photo of a bus terminal with multiple buses and traveler facilities.",
    "train_station": "A photo of a train station with platforms, trains and passenger activity.",
    "airport": "A photo of an airport with planes, terminals and traveler activity.",

    # Commercial scenes
    "shopping_district": "A photo of a shopping district with multiple retail stores and consumer activity.",
    "cafe": "A photo of a cafe with coffee service, seating and casual dining.",
    "restaurant": "A photo of a restaurant with dining tables, food service and eating areas.",

    # Aerial-view scenes
    "aerial_view_intersection": "An aerial view of an intersection showing crosswalks and traffic patterns from above.",
    "aerial_view_commercial_area": "An aerial view of a commercial area showing shopping districts from above.",
    "aerial_view_plaza": "An aerial view of a public plaza or square showing patterns of people movement from above.",

    # Recreation / entertainment scenes
    "zoo": "A photo of a zoo with animal enclosures, exhibits and visitors.",
    "playground": "A photo of a playground with recreational equipment and children playing.",
    "sports_field": "A photo of a sports field with playing surfaces and athletic equipment.",
    "sports_stadium": "A photo of a sports stadium with spectator seating and athletic facilities.",

    # Water-related scenes
    "harbor": "A photo of a harbor with boats, docks and waterfront activity.",
    "beach_water_recreation": "A photo of a beach area with water activities, sand and recreational equipment like surfboards.",

    # Culture- and time-specific scenes
    "nighttime_street": "A photo of a street at night with artificial lighting and evening activity.",
    "nighttime_commercial_district": "A photo of a commercial district at night with illuminated signs and evening shopping.",
    "european_plaza": "A photo of a European-style plaza with historic architecture and public gathering spaces.",

    # Mixed-environment scenes
    "indoor_outdoor_cafe": "A photo of a cafe with both indoor seating and outdoor patio areas.",
    "transit_station_platform": "A photo of a transit station platform with waiting areas and arriving vehicles.",

    # Workplace scenes
    "construction_site": "A photo of a construction site with building materials, equipment and workers.",
    "medical_facility": "A photo of a medical facility with healthcare equipment and professional staff.",
    "educational_setting": "A photo of an educational setting with learning spaces and academic resources.",
    "professional_kitchen": "A photo of a professional commercial kitchen with industrial cooking equipment and food preparation stations."
}

# Culture-specific scene prompts: several alternative descriptions per scene key.
# Every key here also exists in SCENE_TYPE_PROMPTS. CLIPAnalyzer scores an image
# against the list for a key and reports the best-matching description.
CULTURAL_SCENE_PROMPTS = {
    "asian_commercial_street": [
        "A busy Asian shopping street with neon signs and dense storefronts.",
        "A commercial street in Asia with multi-level signage and narrow walkways.",
        "A street scene in Taiwan or Hong Kong with vertical signage and compact shops.",
        "A crowded commercial alley in an Asian city with signs in Chinese characters.",
        "A narrow shopping street in Asia with small shops on both sides.",
        "An outdoor shopping district in an East Asian city with electronic billboards.",
        "A bustling commercial street in Taiwan with food vendors and retail shops.",
        "A pedestrian shopping area with Korean or Chinese signs and storefronts.",
        "A daytime shopping street in an Asian urban center with vertical development."
    ],
    "asian_night_market": [
        "A vibrant night market in Asia with food stalls and large crowds.",
        "An evening street market in Taiwan with street food vendors and bright lights.",
        "A busy night bazaar in Asia with illuminated stalls and local food.",
        "A crowded night street food market in an Asian city with vendor carts.",
        "An Asian night market with steam from cooking food and hanging lanterns.",
        "A nocturnal food street in East Asia with vendor canopies and neon lights.",
        "A bustling evening market with rows of food stalls and plastic stools.",
        "A lively Asian street food scene at night with cooking stations and crowds."
    ],
    "asian_temple_area": [
        "A traditional Asian temple with ornate roof details and religious symbols.",
        "A Buddhist temple complex in East Asia with multiple pavilions and prayer areas.",
        "A sacred site in Asia with incense burners and ceremonial elements.",
        "A temple courtyard with stone statues and traditional Asian architecture.",
        "A spiritual center in East Asia with pagoda-style structures and visitors.",
        "An ancient temple site with Asian architectural elements and cultural symbols.",
        "A religious compound with characteristic Asian roof curves and decorative features."
    ],
    "european_plaza": [
        "A historic European city square with classical architecture and cafes.",
        "An old-world plaza in Europe with cobblestone paving and historic buildings.",
        "A public square in a European city with fountains and surrounding architecture.",
        "A central plaza in Europe with outdoor seating areas and historic monuments.",
        "A traditional European town square with surrounding shops and restaurants.",
        "A historic gathering space in Europe with distinctive architecture and pedestrians."
    ]
}

# Comparative category prompts: each key names two contrasting sides, and the
# entries in its list alternate between those two sides (first side, second
# side, first side, second side).
# NOTE(review): CLIPAnalyzer stores this dict as `comparative_prompts`, but the
# class as visible in this notebook never tokenizes or scores it -- confirm
# whether it is consumed elsewhere.
COMPARATIVE_PROMPTS = {
    "indoor_vs_outdoor": [
        "An indoor shopping mall corridor with controlled lighting and storefronts.",
        "An outdoor commercial street with natural lighting and urban storefronts.",
        "An enclosed shopping gallery with artificial lighting and climate control.",
        "An open-air market street with natural light and weather exposure."
    ],
    "professional_vs_home": [
        "A professional commercial kitchen with stainless steel equipment and workstations.",
        "A home kitchen with residential appliances and family cooking space.",
        "A restaurant kitchen with multiple cooking stations and chef activity.",
        "A family kitchen with standard household equipment and personal touches."
    ],
    "sports_venue_vs_park": [
        "A professional sports stadium with designated playing areas and audience seating.",
        "A public park with casual recreation space and community greenery.",
        "An athletic venue with specialized sports equipment and competitive playing surfaces.",
        "An outdoor community space with general purpose areas and natural elements."
    ],
    "asian_vs_western_commercial": [
        "An Asian shopping street with vertical signage and compact multi-level shops.",
        "A Western commercial street with horizontal storefronts and wider sidewalks.",
        "An East Asian retail area with dense signage in Asian scripts and narrow walkways.",
        "A Western shopping district with uniform building heights and Latin alphabetic signs."
    ],
    "daytime_vs_nighttime": [
        "A daytime urban scene with natural sunlight illuminating streets and buildings.",
        "A nighttime city scene with artificial lighting from stores, signs and streetlights.",
        "A commercial district during daylight hours with natural shadows and visibility.",
        "An evening urban setting with illuminated storefronts and light patterns on streets."
    ],
    "aerial_vs_street_level": [
        "An aerial view showing urban patterns and layouts from above.",
        "A street-level view showing pedestrian perspective and immediate surroundings.",
        "A bird's-eye view of city organization and movement patterns from high above.",
        "An eye-level perspective showing direct human interaction with urban elements."
    ]
}

# Lighting / environmental condition prompts: one CLIP text prompt per
# lighting-condition key. CLIPAnalyzer softmaxes over all entries (in dict
# order) and reports the highest-scoring key as the lighting condition.
LIGHTING_CONDITION_PROMPTS = {
    "day_clear": "A photo taken during daytime with clear skies and direct sunlight.",
    "day_cloudy": "A photo taken during daytime with overcast conditions and diffused light.",
    "sunset/sunrise": "A photo taken during sunset or sunrise with warm golden lighting and long shadows.",
    "night": "A photo taken at night with minimal natural light and artificial illumination.",
    "indoor_bright": "An indoor photo with bright, even artificial lighting throughout the space.",
    "indoor_moderate": "An indoor photo with moderate lighting creating a balanced indoor atmosphere.",
    "indoor_dim": "An indoor photo with low lighting levels creating a subdued environment.",
    "neon_night": "A night scene with colorful neon lighting creating vibrant illumination patterns.",
    "indoor_commercial": "An indoor retail environment with directed display lighting highlighting products.",
    "indoor_restaurant": "An indoor dining space with ambient mood lighting for atmosphere.",
    "stadium_lighting": "A sports venue with powerful floodlights creating intense, even illumination.",
    "mixed_lighting": "A scene with combined natural and artificial light sources creating transition zones.",
    "beach_daylight": "A photo taken at a beach with bright natural sunlight and reflections from water.",
    "sports_arena_lighting": "A photo of a sports venue illuminated by powerful overhead lighting systems.",
    "kitchen_task_lighting": "A photo of a kitchen with focused lighting concentrated on work surfaces."
}

# Specialized prompts for the newer scene types: several alternative
# descriptions per scene key. CLIPAnalyzer.analyze_image only evaluates a key's
# list when that same key appears in its scene scores (built from
# SCENE_TYPE_PROMPTS) with a score above its threshold.
SPECIALIZED_SCENE_PROMPTS = {
    "beach_water_recreation": [
        "A coastal beach scene with people surfing and sunbathing on sandy shores.",
        "Active water sports participants at a beach with surfboards and swimming areas.",
        "A sunny beach destination with recreational water equipment and beachgoers.",
        "A shoreline recreation area with surf gear and coastal activities.",
        "An oceanfront scene with people engaging in water sports and beach leisure.",
        "A popular beach spot with swimming areas and surfing zones.",
        "A coastal recreation setting with beach umbrellas and water activities."
    ],
    # NOTE(review): "sports_venue" has no matching key in SCENE_TYPE_PROMPTS
    # (which defines "sports_field" and "sports_stadium"), so the
    # analyze_image path described above never reaches this entry. Confirm
    # whether the key should be renamed or a "sports_venue" scene prompt added.
    "sports_venue": [
        "An indoor sports arena with professional equipment and competition spaces.",
        "A sports stadium with marked playing areas and spectator seating arrangement.",
        "A specialized athletic venue with competition equipment and performance areas.",
        "A professional sports facility with game-related apparatus and audience zones.",
        "An organized sports center with competitive play areas and athletic equipment.",
        "A competition venue with sport-specific markings and professional setup.",
        "A formal athletic facility with standardized equipment and playing surfaces."
    ],
    "professional_kitchen": [
        "A commercial restaurant kitchen with multiple cooking stations and food prep areas.",
        "A professional culinary workspace with industrial appliances and chef activity.",
        "A busy restaurant back-of-house with stainless steel equipment and meal preparation.",
        "A commercial food service kitchen with chef workstations and specialized zones.",
        "An industrial kitchen facility with specialized cooking equipment and prep surfaces.",
        "A high-volume food production kitchen with professional-grade appliances.",
        "A restaurant kitchen with distinct cooking areas and culinary workflow design."
    ],
    "urban_intersection": [
        "A city intersection with crosswalks and traffic signals controlling movement.",
        "A busy urban crossroad with pedestrian crossings and vehicle traffic.",
        "A regulated street intersection with crosswalk markings and waiting pedestrians.",
        "A metropolitan junction with traffic lights and pedestrian crossing zones.",
        "A city street crossing with safety features for pedestrians and traffic flow.",
        "A controlled urban intersection with movement patterns for vehicles and people.",
        "A city center crossroad with traffic management features and pedestrian areas."
    ],
    "financial_district": [
        "A downtown business area with tall office buildings and commercial activity.",
        "An urban financial center with skyscrapers and professional environment.",
        "A city's business district with corporate headquarters and office towers.",
        "A metropolitan financial zone with high-rise buildings and business traffic.",
        "A corporate district in a city center with professional architecture.",
        "An urban area dominated by office buildings and business establishments.",
        "A city's economic center with banking institutions and corporate offices."
    ],
    "aerial_view_intersection": [
        "A bird's-eye view of a city intersection showing crossing patterns from above.",
        "An overhead perspective of an urban crossroad showing traffic organization.",
        "A top-down view of a street intersection revealing pedestrian crosswalks.",
        "An aerial shot of a city junction showing the layout of roads and crossings.",
        "A high-angle view of an intersection showing traffic and pedestrian flow patterns.",
        "A drone perspective of urban crossing design viewed from directly above.",
        "A vertical view of a street intersection showing crossing infrastructure."
    ]
}

# Camera viewpoint prompts: one CLIP text prompt per viewpoint key.
# CLIPAnalyzer softmaxes over all entries (in dict order) and reports the
# highest-scoring key as the image's viewpoint.
VIEWPOINT_PROMPTS = {
    "eye_level": "A photo taken from normal human eye level showing a direct frontal perspective.",
    "aerial": "A photo taken from high above looking directly down at the scene below.",
    "elevated": "A photo taken from a higher than normal position looking down at an angle.",
    "low_angle": "A photo taken from a low position looking upward at the scene.",
    "bird_eye": "A photo taken from very high above showing a complete overhead perspective.",
    "street_level": "A photo taken from the perspective of someone standing on the street.",
    "interior": "A photo taken from inside a building showing the internal environment.",
    "vehicular": "A photo taken from inside or mounted on a moving vehicle."
}

# Object-combination prompts: one CLIP text prompt per typical grouping of
# objects. CLIPAnalyzer softmaxes over all entries (in dict order) and reports
# the three highest-scoring keys.
OBJECT_COMBINATION_PROMPTS = {
    "dining_setting": "A scene with tables, chairs, plates, and eating utensils arranged for meals.",
    "office_setup": "A scene with desks, chairs, computers, and office supplies for work.",
    "living_space": "A scene with sofas, coffee tables, TVs, and comfortable seating arrangements.",
    "transportation_hub": "A scene with vehicles, waiting areas, passengers, and transit information.",
    "retail_environment": "A scene with merchandise displays, shoppers, and store fixtures.",
    "crosswalk_scene": "A scene with street markings, pedestrians crossing, and traffic signals.",
    "cooking_area": "A scene with stoves, prep surfaces, cooking utensils, and food items.",
    "recreational_space": "A scene with sports equipment, play areas, and activity participants."
}

# Activity prompts: one CLIP text prompt per human activity key.
# CLIPAnalyzer softmaxes over all entries (in dict order) and reports the
# three highest-scoring keys.
ACTIVITY_PROMPTS = {
    "shopping": "People looking at merchandise, carrying shopping bags, and browsing stores.",
    "dining": "People eating food, sitting at tables, and using dining utensils.",
    "commuting": "People waiting for transportation, boarding vehicles, and traveling.",
    "working": "People using computers, attending meetings, and engaged in professional tasks.",
    "exercising": "People engaged in physical activities, using sports equipment, and training.",
    "cooking": "People preparing food, using kitchen equipment, and creating meals.",
    "crossing_street": "People walking across designated crosswalks and navigating intersections.",
    "recreational_activity": "People engaged in leisure activities, games, and social recreation."
}

In [None]:
# %%writefile clip_analyzer.py
import torch
import clip
import numpy as np
from PIL import Image
from typing import Dict, List, Tuple, Any, Optional, Union

# from clip_prompts import (
#     SCENE_TYPE_PROMPTS,
#     CULTURAL_SCENE_PROMPTS,
#     COMPARATIVE_PROMPTS,
#     LIGHTING_CONDITION_PROMPTS,
#     SPECIALIZED_SCENE_PROMPTS,
#     VIEWPOINT_PROMPTS,
#     OBJECT_COMBINATION_PROMPTS,
#     ACTIVITY_PROMPTS
# )

class CLIPAnalyzer:
    """
    Scene-understanding helper built on CLIP.

    Loads a CLIP model, tokenizes the prompt dictionaries from the
    clip_prompts cell, and scores images against them to estimate scene type,
    lighting condition, viewpoint, object combinations and activities.
    """

    # Cosine similarities are multiplied by this factor before softmax / ranking.
    LOGIT_SCALE = 100.0

    # A scene type must exceed this softmax score in analyze_image() before its
    # cultural / specialized prompt list is evaluated.
    DETAIL_SCORE_THRESHOLD = 0.2

    def __init__(self, model_name: str = "ViT-B/32", device: Optional[str] = None):
        """
        Initialize the CLIP analyzer.

        Args:
            model_name: CLIP model name, e.g. "ViT-B/32", "ViT-B/16", "ViT-L/14".
            device: Device to load the model on. When None, "cuda" is used if
                available, otherwise "cpu".

        Raises:
            Exception: Whatever clip.load raises is logged and re-raised.
        """
        # Pick the device automatically when the caller did not specify one.
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        print(f"Loading CLIP model {model_name} on {self.device}...")
        try:
            self.model, self.preprocess = clip.load(model_name, device=self.device)
            print("CLIP model loaded successfully.")
        except Exception as e:
            print(f"Error loading CLIP model: {e}")
            raise

        self.scene_type_prompts = SCENE_TYPE_PROMPTS
        self.cultural_scene_prompts = CULTURAL_SCENE_PROMPTS
        self.comparative_prompts = COMPARATIVE_PROMPTS
        self.lighting_condition_prompts = LIGHTING_CONDITION_PROMPTS
        self.specialized_scene_prompts = SPECIALIZED_SCENE_PROMPTS
        self.viewpoint_prompts = VIEWPOINT_PROMPTS
        self.object_combination_prompts = OBJECT_COMBINATION_PROMPTS
        self.activity_prompts = ACTIVITY_PROMPTS

        # Convert the prompt text into CLIP token tensors.
        self._prepare_text_prompts()

    def _prepare_text_prompts(self):
        """
        Tokenize every prompt set and move the tokens to the target device.

        Also clears the cached text features, so calling this again after
        changing a prompt dictionary re-encodes the prompts on next use.
        """
        self._text_feature_cache = {}

        # Single-prompt-per-key sets: token rows follow dict iteration order.
        self.scene_type_tokens = clip.tokenize(list(self.scene_type_prompts.values())).to(self.device)
        self.lighting_tokens = clip.tokenize(list(self.lighting_condition_prompts.values())).to(self.device)
        self.viewpoint_tokens = clip.tokenize(list(self.viewpoint_prompts.values())).to(self.device)
        self.object_combination_tokens = clip.tokenize(list(self.object_combination_prompts.values())).to(self.device)
        self.activity_tokens = clip.tokenize(list(self.activity_prompts.values())).to(self.device)

        # Multi-prompt-per-key sets: one token tensor per scene type.
        self.cultural_tokens_dict = {
            scene_type: clip.tokenize(prompts).to(self.device)
            for scene_type, prompts in self.cultural_scene_prompts.items()
        }
        self.specialized_tokens_dict = {
            scene_type: clip.tokenize(prompts).to(self.device)
            for scene_type, prompts in self.specialized_scene_prompts.items()
        }

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _to_numpy(tensor: torch.Tensor) -> np.ndarray:
        """Return the tensor as a numpy array, whatever device it lives on."""
        # .cpu() is a no-op for CPU tensors, so there is no need to branch on
        # self.device (comparing it to "cuda" breaks for "cuda:0" or
        # torch.device values).
        return tensor.detach().cpu().numpy()

    @staticmethod
    def _to_pil(image):
        """Return `image` as a PIL image; accepts a PIL image or a numpy array."""
        if isinstance(image, Image.Image):
            return image
        if isinstance(image, np.ndarray):
            return Image.fromarray(image)
        raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

    def _encode_image(self, image) -> torch.Tensor:
        """Preprocess and encode one image into an L2-normalized (1, dim) feature tensor."""
        image_input = self.preprocess(self._to_pil(image)).unsqueeze(0).to(self.device)
        with torch.no_grad():
            features = self.model.encode_image(image_input)
            return features / features.norm(dim=-1, keepdim=True)

    def _text_features(self, tokens: torch.Tensor, cache_key: Any = None) -> torch.Tensor:
        """
        Encode tokens into L2-normalized text features.

        When `cache_key` is given the result is memoized, because the fixed
        prompt sets do not change between images and re-encoding them for
        every image is wasted work.
        """
        cache = self.__dict__.setdefault("_text_feature_cache", {})
        if cache_key is not None and cache_key in cache:
            return cache[cache_key]

        with torch.no_grad():
            features = self.model.encode_text(tokens)
            features = features / features.norm(dim=-1, keepdim=True)

        if cache_key is not None:
            cache[cache_key] = features
        return features

    def _softmax_scores(self, image_features: torch.Tensor, tokens: torch.Tensor,
                        labels, cache_key: Any = None) -> Dict[str, float]:
        """Softmax the scaled image/text similarities and map them onto `labels` in order."""
        text_features = self._text_features(tokens, cache_key)
        with torch.no_grad():
            probs = (self.LOGIT_SCALE * image_features @ text_features.T).softmax(dim=-1)
        probs = self._to_numpy(probs)[0]
        return {label: float(probs[i]) for i, label in enumerate(labels)}

    def _best_prompt_match(self, image_features: torch.Tensor, tokens: torch.Tensor,
                           prompts: List[str], cache_key: Any = None) -> Dict[str, Any]:
        """Rank `prompts` by scaled similarity to the image (raw logits, no softmax)."""
        text_features = self._text_features(tokens, cache_key)
        with torch.no_grad():
            logits = self.LOGIT_SCALE * image_features @ text_features.T
        logits = self._to_numpy(logits)[0]

        scores = [(prompt, float(logits[i])) for i, prompt in enumerate(prompts)]
        scores.sort(key=lambda x: x[1], reverse=True)

        return {
            "best_description": scores[0][0],
            # Note: this is the scaled cosine similarity, not a probability.
            "confidence": scores[0][1],
            "all_matches": scores
        }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def analyze_image(self, image, include_cultural_analysis: bool = True) -> Dict[str, Any]:
        """
        Analyze an image and predict scene type, lighting and related attributes.

        Args:
            image: Input image (PIL Image or numpy array).
            include_cultural_analysis: Whether to run the detailed cultural
                scene analysis for sufficiently likely cultural scene types.

        Returns:
            Dict with "scene_scores", "top_scene", "lighting_condition",
            "embedding", "viewpoint", "object_combinations" and "activities",
            plus "cultural_analysis" / "specialized_analysis" when non-empty.
            On any failure the error is printed and {"error": <message>} is
            returned instead of raising.
        """
        try:
            image_features = self._encode_image(image)

            scene_scores = self._analyze_scene_type(image_features)
            lighting_scores = self._analyze_lighting_condition(image_features)

            # Detailed analysis only for scene types CLIP already finds plausible.
            cultural_analysis = {}
            if include_cultural_analysis:
                for scene_type in self.cultural_scene_prompts:
                    if scene_scores.get(scene_type, 0.0) > self.DETAIL_SCORE_THRESHOLD:
                        cultural_analysis[scene_type] = self._analyze_cultural_scene(
                            image_features, scene_type
                        )

            specialized_analysis = {}
            for scene_type in self.specialized_scene_prompts:
                if scene_scores.get(scene_type, 0.0) > self.DETAIL_SCORE_THRESHOLD:
                    specialized_analysis[scene_type] = self._analyze_specialized_scene(
                        image_features, scene_type
                    )

            viewpoint_scores = self._analyze_viewpoint(image_features)
            object_combination_scores = self._analyze_object_combinations(image_features)
            activity_scores = self._analyze_activities(image_features)

            result = {
                "scene_scores": scene_scores,
                "top_scene": max(scene_scores.items(), key=lambda x: x[1]),
                "lighting_condition": max(lighting_scores.items(), key=lambda x: x[1]),
                "embedding": self._to_numpy(image_features).tolist()[0],
                "viewpoint": max(viewpoint_scores.items(), key=lambda x: x[1]),
                "object_combinations": sorted(object_combination_scores.items(), key=lambda x: x[1], reverse=True)[:3],
                "activities": sorted(activity_scores.items(), key=lambda x: x[1], reverse=True)[:3]
            }

            if cultural_analysis:
                result["cultural_analysis"] = cultural_analysis

            if specialized_analysis:
                result["specialized_analysis"] = specialized_analysis

            return result

        except Exception as e:
            print(f"Error analyzing image with CLIP: {e}")
            import traceback
            traceback.print_exc()
            return {"error": str(e)}

    def _analyze_scene_type(self, image_features: torch.Tensor) -> Dict[str, float]:
        """Softmax similarity of the image features to every scene type prompt."""
        return self._softmax_scores(
            image_features, self.scene_type_tokens, self.scene_type_prompts.keys(), "scene_type"
        )

    def _analyze_lighting_condition(self, image_features: torch.Tensor) -> Dict[str, float]:
        """Softmax similarity of the image features to every lighting condition prompt."""
        return self._softmax_scores(
            image_features, self.lighting_tokens, self.lighting_condition_prompts.keys(), "lighting"
        )

    def _analyze_cultural_scene(self, image_features: torch.Tensor, scene_type: str) -> Dict[str, Any]:
        """
        Rank the cultural descriptions of `scene_type` against the image.

        Returns {"error": ...} when no cultural prompts exist for `scene_type`.
        """
        if scene_type not in self.cultural_tokens_dict:
            return {"error": f"No cultural analysis available for {scene_type}"}

        return self._best_prompt_match(
            image_features,
            self.cultural_tokens_dict[scene_type],
            self.cultural_scene_prompts[scene_type],
            ("cultural", scene_type),
        )

    def _analyze_specialized_scene(self, image_features: torch.Tensor, scene_type: str) -> Dict[str, Any]:
        """
        Rank the specialized descriptions of `scene_type` against the image.

        Returns {"error": ...} when no specialized prompts exist for `scene_type`.
        """
        if scene_type not in self.specialized_tokens_dict:
            return {"error": f"No specialized analysis available for {scene_type}"}

        return self._best_prompt_match(
            image_features,
            self.specialized_tokens_dict[scene_type],
            self.specialized_scene_prompts[scene_type],
            ("specialized", scene_type),
        )

    def _analyze_viewpoint(self, image_features: torch.Tensor) -> Dict[str, float]:
        """Softmax similarity of the image features to every viewpoint prompt."""
        return self._softmax_scores(
            image_features, self.viewpoint_tokens, self.viewpoint_prompts.keys(), "viewpoint"
        )

    def _analyze_object_combinations(self, image_features: torch.Tensor) -> Dict[str, float]:
        """Softmax similarity of the image features to every object-combination prompt."""
        return self._softmax_scores(
            image_features, self.object_combination_tokens,
            self.object_combination_prompts.keys(), "object_combination"
        )

    def _analyze_activities(self, image_features: torch.Tensor) -> Dict[str, float]:
        """Softmax similarity of the image features to every activity prompt."""
        return self._softmax_scores(
            image_features, self.activity_tokens, self.activity_prompts.keys(), "activity"
        )

    def get_image_embedding(self, image) -> np.ndarray:
        """
        Get the CLIP embedding of an image.

        Args:
            image: PIL Image or numpy array.

        Returns:
            np.ndarray: L2-normalized CLIP feature vector of the image.

        Raises:
            ValueError: If `image` is neither a PIL Image nor a numpy array.
        """
        return self._to_numpy(self._encode_image(image))[0]

    def text_to_embedding(self, text: str) -> np.ndarray:
        """
        Convert text into its CLIP embedding.

        Args:
            text: Input text.

        Returns:
            np.ndarray: L2-normalized CLIP feature vector of the text.
        """
        text_token = clip.tokenize([text]).to(self.device)
        return self._to_numpy(self._text_features(text_token))[0]

    def calculate_similarity(self, image, text_queries: List[str]) -> Dict[str, float]:
        """
        Compute the softmax similarity between an image and several text queries.

        Args:
            image: PIL Image or numpy array. A 1-D numpy array is treated as an
                already-computed embedding vector rather than as pixel data.
            text_queries: List of text queries (a single string is accepted
                and treated as a one-element list).

        Returns:
            Dict mapping each query to its similarity score; empty when no
            queries are given.
        """
        if isinstance(text_queries, str):
            text_queries = [text_queries]
        if not text_queries:
            return {}

        if isinstance(image, np.ndarray) and len(image.shape) == 1:
            # Already an embedding vector.
            image_features = torch.tensor(image).unsqueeze(0).to(self.device)
        else:
            # Raw image: extract its embedding first.
            image_features = self._encode_image(image)

        text_tokens = clip.tokenize(text_queries).to(self.device)
        # Ad-hoc queries are not cached (no cache key).
        text_features = self._text_features(text_tokens)

        # A caller-supplied embedding may be float64 (e.g. rebuilt from a list)
        # while the model emits float32/float16; matmul needs matching dtypes.
        image_features = image_features.to(dtype=text_features.dtype)

        with torch.no_grad():
            similarity = (self.LOGIT_SCALE * image_features @ text_features.T).softmax(dim=-1)
        similarity = self._to_numpy(similarity)[0]

        return {query: float(similarity[i]) for i, query in enumerate(text_queries)}

In [None]:
# %%writefile scene_analyzer.py
import os
import numpy as np
from typing import Dict, List, Tuple, Any, Optional

# from spatial_analyzer import SpatialAnalyzer
# from scene_description import SceneDescriptor
# from enhance_scene_describer import EnhancedSceneDescriber
# from clip_analyzer import CLIPAnalyzer
# from llm_enhancer import LLMEnhancer
# from scene_type import SCENE_TYPES
# from object_categories import OBJECT_CATEGORIES

class SceneAnalyzer:
    """
    Core class for scene analysis and understanding based on object detection results.
    Analyzes detected objects, their relationships, and infers the scene type.
    """
    def __init__(self, class_names: Dict[int, str] = None, use_llm: bool = True, llm_model_path: str = None):
        """
        Initialize the scene analyzer with optional class name mappings.
        Args:
            class_names: Dictionary mapping class IDs to class names (optional)
        """
        self.class_names = class_names

        # 加載場景類型和物體類別
        self.SCENE_TYPES = SCENE_TYPES
        self.OBJECT_CATEGORIES = OBJECT_CATEGORIES

        # 初始化其他組件，將數據傳遞給 SceneDescriptor
        self.spatial_analyzer = SpatialAnalyzer(class_names=class_names, object_categories=self.OBJECT_CATEGORIES)
        self.descriptor = SceneDescriptor(scene_types=self.SCENE_TYPES, object_categories=self.OBJECT_CATEGORIES)
        self.scene_describer = EnhancedSceneDescriber(scene_types=self.SCENE_TYPES)

        # 初始化 CLIP 分析器
        try:
            self.clip_analyzer = CLIPAnalyzer()
            self.use_clip = True
        except Exception as e:
            print(f"Warning: Could not initialize CLIP analyzer: {e}")
            print("Scene analysis will proceed without CLIP. Install CLIP with 'pip install clip' for enhanced scene understanding.")
            self.use_clip = False

        # 初始化LLM Model
        self.use_llm = use_llm
        if use_llm:
            try:
                # from llm_enhancer import LLMEnhancer
                self.llm_enhancer = LLMEnhancer(model_path=llm_model_path)
                print(f"LLM enhancer initialized successfully.")
            except Exception as e:
                print(f"Warning: Could not initialize LLM enhancer: {e}")
                print("Scene analysis will proceed without LLM. Make sure required packages are installed.")
                self.use_llm = False

    def generate_scene_description(self,
                             scene_type,
                             detected_objects,
                             confidence,
                             lighting_info=None,
                             functional_zones=None):
        """
        生成場景描述。
        Args:
            scene_type: 識別的場景類型
            detected_objects: 檢測到的物體列表
            confidence: 場景分類置信度
            lighting_info: 照明條件信息（可選）
            functional_zones: 功能區域信息（可選）
        Returns:
            str: 生成的場景描述
        """
        return self.scene_describer.generate_description(
            scene_type,
            detected_objects,
            confidence,
            lighting_info,
            functional_zones
        )

    def _generate_scene_description(self, scene_type, detected_objects, confidence, lighting_info=None):
        """
        Use new implement
        """
        # get the functional zones info
        functional_zones = self.spatial_analyzer._identify_functional_zones(detected_objects, scene_type)

        return self.generate_scene_description(
            scene_type,
            detected_objects,
            confidence,
            lighting_info,
            functional_zones
        )

    def _define_image_regions(self):
        """Define regions of the image for spatial analysis (3x3 grid)"""
        self.regions = {
            "top_left": (0, 0, 1/3, 1/3),
            "top_center": (1/3, 0, 2/3, 1/3),
            "top_right": (2/3, 0, 1, 1/3),
            "middle_left": (0, 1/3, 1/3, 2/3),
            "middle_center": (1/3, 1/3, 2/3, 2/3),
            "middle_right": (2/3, 1/3, 1, 2/3),
            "bottom_left": (0, 2/3, 1/3, 1),
            "bottom_center": (1/3, 2/3, 2/3, 1),
            "bottom_right": (2/3, 2/3, 1, 1)
        }


    def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None, class_confidence_threshold: float = 0.35, scene_confidence_threshold: float = 0.6) -> Dict:
        """
        Analyze detection results to determine scene type and provide understanding.

        Flow: handle the "nothing detected" cases, extract objects above the
        class threshold, score scenes from the detections, optionally refine
        with CLIP, fuse both score sets, pick the best scene, derive
        activities / safety concerns / functional zones, build a description
        and optionally let the LLM enhance or verify it.

        Args:
            detection_result: Detection result from YOLOv8. The code reads its
                ``boxes``, ``names`` and ``orig_img`` attributes.
            lighting_info: Optional lighting condition analysis results.
                NOTE: when the CLIP outdoor override triggers, this dict is
                modified in place ("is_indoor", "indoor_probability",
                "diagnostics"), so the caller's object changes too.
            class_confidence_threshold: Minimum confidence to consider an object
            scene_confidence_threshold: Minimum confidence to determine a scene;
                below it the result reports "unknown" / "Unknown Scene" while
                still carrying the raw confidence and description.
        Returns:
            Dictionary with scene analysis results. The early-exit results
            (no detections / nothing above threshold) contain a smaller key set
            than the full result built at the end of this method.
        """
        # If no result or no detections, handle with LLM if possible
        if detection_result is None or len(detection_result.boxes) == 0:
            if self.use_llm and self.use_clip and detection_result is not None:
                # Use CLIP and the LLM to describe an image in which nothing was detected
                try:
                    original_image = detection_result.orig_img
                    clip_analysis = self.clip_analyzer.analyze_image(original_image)
                    llm_description = self.llm_enhancer.handle_no_detection(clip_analysis)

                    return {
                        "scene_type": "llm_inferred",
                        "confidence": clip_analysis.get("top_scene", ("unknown", 0))[1],
                        "description": "No objects detected by standard detection.",
                        "enhanced_description": llm_description,
                        "objects_present": [],
                        "object_count": 0,
                        "regions": {},
                        "possible_activities": [],
                        "safety_concerns": [],
                        "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
                    }
                except Exception as e:
                    # Best effort: fall through to the plain "unknown" result below
                    print(f"Error in LLM no-detection handling: {e}")

            # LLM/CLIP unavailable or the fallback failed: return the plain no-detection result
            return {
                "scene_type": "unknown",
                "confidence": 0,
                "description": "No objects detected in the image.",
                "objects_present": [],
                "object_count": 0,
                "regions": {},
                "possible_activities": [],
                "safety_concerns": [],
                "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
            }

        # Get class names from detection result if not already set
        if self.class_names is None:
            self.class_names = detection_result.names
            # Also update class names in spatial analyzer
            self.spatial_analyzer.class_names = self.class_names

        # Extract detected objects with confidence above threshold
        detected_objects = self.spatial_analyzer._extract_detected_objects(
            detection_result,
            confidence_threshold=class_confidence_threshold
        )

        # No objects above confidence threshold
        if not detected_objects:
            return {
                "scene_type": "unknown",
                "confidence": 0,
                "description": "No objects with sufficient confidence detected.",
                "objects_present": [],
                "object_count": 0,
                "regions": {},
                "possible_activities": [],
                "safety_concerns": [],
                "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
            }

        # Analyze object distribution in regions
        region_analysis = self.spatial_analyzer._analyze_regions(detected_objects)

        # Compute scene type scores based on object detection
        yolo_scene_scores = self._compute_scene_scores(detected_objects)

        # Analyze the image with CLIP (optional; failures only print a message)
        clip_scene_scores = {}
        clip_analysis = None
        if self.use_clip:
            try:
                # Fetch the original image from the detection result
                original_image = detection_result.orig_img

                # Use CLIP analyze image
                clip_analysis = self.clip_analyzer.analyze_image(original_image)

                # get CLIP's score
                clip_scene_scores = clip_analysis.get("scene_scores", {})

                if "asian_commercial_street" in clip_scene_scores and clip_scene_scores["asian_commercial_street"] > 0.2:
                    # Use the comparative prompts to further separate indoor from outdoor
                    comparative_results = self.clip_analyzer.calculate_similarity(
                        original_image,
                        self.clip_analyzer.comparative_prompts["indoor_vs_outdoor"]
                    )

                    # Sum the scores of prompts by the keywords they contain
                    indoor_score = sum(s for p, s in comparative_results.items() if "indoor" in p or "enclosed" in p)
                    outdoor_score = sum(s for p, s in comparative_results.items() if "outdoor" in p or "open-air" in p)

                    # CLIP says outdoor while the lighting analysis says indoor
                    if outdoor_score > indoor_score and lighting_info and lighting_info.get("is_indoor", False):
                        # Override the lighting analysis result (mutates the caller's dict)
                        print(f"CLIP indicates outdoor commercial street (score: {outdoor_score:.2f} vs {indoor_score:.2f}), adjusting lighting analysis")
                        lighting_info["is_indoor"] = False
                        lighting_info["indoor_probability"] = 0.3
                        # Record the CLIP override in the lighting diagnostics
                        if "diagnostics" not in lighting_info:
                            lighting_info["diagnostics"] = {}
                        lighting_info["diagnostics"]["clip_override"] = {
                            "reason": "CLIP detected outdoor commercial street",
                            "outdoor_score": float(outdoor_score),
                            "indoor_score": float(indoor_score)
                        }

                # CLIP produced a lighting condition but no lighting_info was supplied
                if not lighting_info and "lighting_condition" in clip_analysis:
                    lighting_type, lighting_conf = clip_analysis["lighting_condition"]
                    lighting_info = {
                        "time_of_day": lighting_type,
                        "confidence": lighting_conf
                    }
            except Exception as e:
                print(f"Error in CLIP analysis: {e}")

        # Fuse the YOLO-based and CLIP-based scene scores
        scene_scores = self._fuse_scene_scores(yolo_scene_scores, clip_scene_scores)

        # Determine best matching scene type
        best_scene, scene_confidence = self._determine_scene_type(scene_scores)

        # Generate possible activities based on scene
        activities = self.descriptor._infer_possible_activities(best_scene, detected_objects)

        # Identify potential safety concerns
        safety_concerns = self.descriptor._identify_safety_concerns(detected_objects, best_scene)

        # Calculate functional zones
        functional_zones = self.spatial_analyzer._identify_functional_zones(detected_objects, best_scene)

        # Generate scene description
        scene_description = self.generate_scene_description(
            best_scene,
            detected_objects,
            scene_confidence,
            lighting_info=lighting_info,
            functional_zones=functional_zones
        )

        # Optional LLM enhancement step
        enhanced_description = None
        llm_verification = None

        if self.use_llm:
            try:
                # Assemble the scene data handed to the LLM
                scene_data = {
                    "original_description": scene_description,
                    "scene_type": best_scene,
                    "scene_name": self.SCENE_TYPES.get(best_scene, {}).get("name", "Unknown"),
                    "detected_objects": detected_objects,
                    "confidence": scene_confidence,
                    "lighting_info": lighting_info,
                    "functional_zones": functional_zones,
                    "activities": activities,
                    "safety_concerns": safety_concerns,
                    "clip_analysis": clip_analysis
                }

                # When CLIP and the fused result disagree noticeably, ask the LLM to verify
                if self.use_clip and clip_analysis and "top_scene" in clip_analysis:
                    clip_top_scene = clip_analysis["top_scene"][0]
                    clip_confidence = clip_analysis["top_scene"][1]

                    # Verify only if the scenes differ and both confidences exceed 0.4
                    if clip_top_scene != best_scene and clip_confidence > 0.4 and scene_confidence > 0.4:
                        llm_verification = self.llm_enhancer.verify_detection(
                            detected_objects,
                            clip_analysis,
                            best_scene,
                            self.SCENE_TYPES.get(best_scene, {}).get("name", "Unknown"),
                            scene_confidence
                        )

                        # Attach the verification text to the scene data
                        scene_data["verification_result"] = llm_verification.get("verification_text", "")

                # Ask the LLM for the enhanced description
                enhanced_description = self.llm_enhancer.enhance_description(scene_data)

            except Exception as e:
                # Enhancement is best effort: log, print the traceback and continue without it
                print(f"Error in LLM enhancement: {e}")
                import traceback
                traceback.print_exc()
                enhanced_description = None

        # Return comprehensive analysis
        result = {
            "scene_type": best_scene if scene_confidence >= scene_confidence_threshold else "unknown",
            "scene_name": self.SCENE_TYPES.get(best_scene, {}).get("name", "Unknown")
                        if scene_confidence >= scene_confidence_threshold else "Unknown Scene",
            "confidence": scene_confidence,
            "description": scene_description,
            "enhanced_description": enhanced_description,  # LLM-enhanced description (None when unavailable)
            "objects_present": [
                {"class_id": obj["class_id"],
                "class_name": obj["class_name"],
                "confidence": obj["confidence"]}
                for obj in detected_objects
            ],
            "object_count": len(detected_objects),
            "regions": region_analysis,
            "possible_activities": activities,
            "safety_concerns": safety_concerns,
            "functional_zones": functional_zones,
            "alternative_scenes": self.descriptor._get_alternative_scenes(scene_scores, scene_confidence_threshold, top_k=2),
            "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
        }

        # Add the LLM verification outcome to the output when there is one
        if llm_verification:
            result["llm_verification"] = llm_verification.get("verification_text")
            if llm_verification.get("has_errors", False):
                result["detection_warnings"] = "LLM detected potential issues with object recognition"

        # Add CLIP-specific results
        if clip_analysis and "error" not in clip_analysis:
            result["clip_analysis"] = {
                "top_scene": clip_analysis.get("top_scene", ("unknown", 0)),
                "cultural_analysis": clip_analysis.get("cultural_analysis", {})
            }

        return result

    def _compute_scene_scores(self, detected_objects: List[Dict]) -> Dict[str, float]:
        """
        Compute confidence scores for each scene type based on detected objects.
        Args:
            detected_objects: List of detected objects
        Returns:
            Dictionary mapping scene types to confidence scores
        """
        scene_scores = {}
        detected_class_ids = [obj["class_id"] for obj in detected_objects]
        detected_classes_set = set(detected_class_ids)

        # Count occurrence of each class
        class_counts = {}
        for obj in detected_objects:
            class_id = obj["class_id"]
            if class_id not in class_counts:
                class_counts[class_id] = 0
            class_counts[class_id] += 1

        # Evaluate each scene type
        for scene_type, scene_def in self.SCENE_TYPES.items():
            # Count required objects present
            required_objects = set(scene_def["required_objects"])
            required_present = required_objects.intersection(detected_classes_set)

            # Count optional objects present
            optional_objects = set(scene_def["optional_objects"])
            optional_present = optional_objects.intersection(detected_classes_set)

            # Skip if minimum required objects aren't present
            if len(required_present) < scene_def["minimum_required"]:
                scene_scores[scene_type] = 0
                continue

            # Base score from required objects
            required_ratio = len(required_present) / max(1, len(required_objects))
            required_score = required_ratio * 0.7  # 70% of score from required objects

            # Additional score from optional objects
            optional_ratio = len(optional_present) / max(1, len(optional_objects))
            optional_score = optional_ratio * 0.3  # 30% of score from optional objects

            # Bonus for having multiple instances of key objects
            multiple_bonus = 0
            for class_id in required_present:
                if class_counts.get(class_id, 0) > 1:
                    multiple_bonus += 0.05  # 5% bonus per additional key object type

            # Cap the bonus at 15%
            multiple_bonus = min(0.15, multiple_bonus)

            # Calculate final score
            final_score = required_score + optional_score + multiple_bonus

            if "priority" in scene_def:
                final_score *= scene_def["priority"]

            # Normalize to 0-1 range
            scene_scores[scene_type] = min(1.0, final_score)

        return scene_scores

    def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
        """
        Determine the most likely scene type based on scores.
        Args:
            scene_scores: Dictionary mapping scene types to confidence scores
        Returns:
            Tuple of (best_scene_type, confidence)
        """
        if not scene_scores:
            return "unknown", 0

        # Find scene with highest score
        best_scene = max(scene_scores, key=scene_scores.get)
        best_score = scene_scores[best_scene]

        return best_scene, best_score


    def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float], clip_scene_scores: Dict[str, float]) -> Dict[str, float]:
        """
        融合基於 YOLO 物體檢測和 CLIP 分析的場景分數。
        Args:
            yolo_scene_scores: 基於 YOLO 物體檢測的場景分數
            clip_scene_scores: 基於 CLIP 分析的場景分數
        Returns:
            Dict: 融合後的場景分數
        """
        # 如果沒有 CLIP 分數，直接返回 YOLO 分數
        if not clip_scene_scores:
            return yolo_scene_scores

        # 如果沒有 YOLO 分數，直接返回 CLIP 分數
        if not yolo_scene_scores:
            return clip_scene_scores

        # 融合分數
        fused_scores = {}

        # 獲取所有場景類型
        all_scene_types = set(list(yolo_scene_scores.keys()) + list(clip_scene_scores.keys()))

        for scene_type in all_scene_types:
            # 獲取兩個模型的分數
            yolo_score = yolo_scene_scores.get(scene_type, 0)
            clip_score = clip_scene_scores.get(scene_type, 0)

            # 設置基本權重
            yolo_weight = 0.7  # YOLO 可提供比較好的物體資訊
            clip_weight = 0.3  # CLIP 強項是理解整體的場景關係

            # 對特定類型場景調整權重
            # 文化特定場景或具有特殊布局的場景，CLIP可能比較能理解
            if any(keyword in scene_type for keyword in ["asian", "cultural", "aerial"]):
                yolo_weight = 0.3
                clip_weight = 0.7

            # 對室內家居場景，物體檢測通常更準確
            elif any(keyword in scene_type for keyword in ["room", "kitchen", "office", "bedroom"]):
                yolo_weight = 0.8
                clip_weight = 0.2
            elif scene_type == "beach_water_recreation":
                yolo_weight = 0.8  # 衝浪板等特定物品的檢測
                clip_weight = 0.2
            elif scene_type == "sports_venue":
                yolo_weight = 0.7
                clip_weight = 0.3
            elif scene_type == "professional_kitchen":
                yolo_weight = 0.8  # 廚房用具的檢測非常重要
                clip_weight = 0.2

            # 計算加權分數
            fused_scores[scene_type] = (yolo_score * yolo_weight) + (clip_score * clip_weight)

        return fused_scores

In [None]:
# %%writefile image_processor.py
import os
import numpy as np
import torch
import cv2
from PIL import Image
import tempfile
import uuid
from typing import Dict, List, Any, Optional, Tuple

# from detection_model import DetectionModel
# from color_mapper import ColorMapper
# from visualization_helper import VisualizationHelper
# from evaluation_metrics import EvaluationMetrics
# from lighting_analyzer import LightingAnalyzer
# from scene_analyzer import SceneAnalyzer

class ImageProcessor:
    """
    Class for handling image processing and object detection operations
    Separates processing logic from UI components
    """

    def __init__(self, use_llm=True, llm_model_path=None):
        """Initialize the image processor with required components"""
        self.color_mapper = ColorMapper()
        self.model_instances = {}
        self.lighting_analyzer = LightingAnalyzer()
        self.use_llm = use_llm
        self.llm_model_path = llm_model_path

    def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
        """
        Get or create a model instance based on model name

        Args:
            model_name: Name of the model to use
            confidence: Confidence threshold for detection
            iou: IoU threshold for non-maximum suppression

        Returns:
            DetectionModel instance
        """
        if model_name not in self.model_instances:
            print(f"Creating new model instance for {model_name}")
            self.model_instances[model_name] = DetectionModel(
                model_name=model_name,
                confidence=confidence,
                iou=iou
            )
        else:
            print(f"Using existing model instance for {model_name}")
            self.model_instances[model_name].confidence = confidence

        return self.model_instances[model_name]

    def analyze_scene(self, detection_result: Any, lighting_info: Optional[Dict] = None) -> Dict:
        """
        Perform scene analysis on detection results

        Args:
            detection_result: Object detection result from YOLOv8
            lighting_info: Lighting condition analysis results (optional)

        Returns:
            Dictionary containing scene analysis results
        """
        try:
            # Initialize scene analyzer if not already done
            if not hasattr(self, 'scene_analyzer'):
                self.scene_analyzer = SceneAnalyzer(
                    class_names=detection_result.names,
                    use_llm=self.use_llm,
                    llm_model_path=self.llm_model_path
                )

            # 確保類名正確更新
            if self.scene_analyzer.class_names is None:
                self.scene_analyzer.class_names = detection_result.names
                self.scene_analyzer.spatial_analyzer.class_names = detection_result.names

            # Perform scene analysis with lighting info
            scene_analysis = self.scene_analyzer.analyze(
                detection_result=detection_result,
                lighting_info=lighting_info,
                class_confidence_threshold=0.35,
                scene_confidence_threshold=0.6
            )

            return scene_analysis
        except Exception as e:
            print(f"Error in scene analysis: {str(e)}")
            import traceback
            traceback.print_exc()
            return {
                "scene_type": "unknown",
                "confidence": 0.0,
                "description": f"Error during scene analysis: {str(e)}",
                "objects_present": [],
                "object_count": 0,
                "regions": {},
                "possible_activities": [],
                "safety_concerns": [],
                "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
            }

    def analyze_lighting_conditions(self, image):
        """
        分析光照條件。

        Args:
            image: 輸入圖像

        Returns:
            Dict: 光照分析結果
        """
        return self.lighting_analyzer.analyze(image)

    def process_image(self, image, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None) -> Tuple[Any, str, Dict]:
        """
        Process an image for object detection

        Args:
            image: Input image (numpy array or PIL Image)
            model_name: Name of the model to use
            confidence_threshold: Confidence threshold for detection
            filter_classes: Optional list of classes to filter results

        Returns:
            Tuple of (result_image, result_text, stats_data)
        """
        # Get model instance
        model_instance = self.get_model_instance(model_name, confidence_threshold)

        # Initialize key variables
        result = None
        stats = {}
        temp_path = None

        try:
            # Processing input image
            if isinstance(image, np.ndarray):
                # Convert BGR to RGB if needed
                if image.shape[2] == 3:
                    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                else:
                    image_rgb = image
                pil_image = Image.fromarray(image_rgb)
            elif image is None:
                return None, "No image provided. Please upload an image.", {}
            else:
                pil_image = image

            # Analyze lighting conditions
            lighting_info = self.analyze_lighting_conditions(pil_image)

            # Store temp files
            temp_dir = tempfile.gettempdir()  # Use system temp directory
            temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
            temp_path = os.path.join(temp_dir, temp_filename)
            pil_image.save(temp_path)

            # Object detection
            result = model_instance.detect(temp_path)

            if result is None:
                return None, "Detection failed. Please try again with a different image.", {}

            # Calculate stats
            stats = EvaluationMetrics.calculate_basic_stats(result)

            # Add space calculation
            spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
            stats["spatial_metrics"] = spatial_metrics

            # Add lighting information
            stats["lighting_conditions"] = lighting_info

            # Apply filter if specified
            if filter_classes and len(filter_classes) > 0:
                # Get classes, boxes, confidence
                classes = result.boxes.cls.cpu().numpy().astype(int)
                confs = result.boxes.conf.cpu().numpy()
                boxes = result.boxes.xyxy.cpu().numpy()

                mask = np.zeros_like(classes, dtype=bool)
                for cls_id in filter_classes:
                    mask = np.logical_or(mask, classes == cls_id)

                filtered_stats = {
                    "total_objects": int(np.sum(mask)),
                    "class_statistics": {},
                    "average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0,
                    "spatial_metrics": stats["spatial_metrics"],
                    "lighting_conditions": lighting_info
                }

                # Update stats
                names = result.names
                for cls, conf in zip(classes[mask], confs[mask]):
                    cls_name = names[int(cls)]
                    if cls_name not in filtered_stats["class_statistics"]:
                        filtered_stats["class_statistics"][cls_name] = {
                            "count": 0,
                            "average_confidence": 0
                        }

                    filtered_stats["class_statistics"][cls_name]["count"] += 1
                    filtered_stats["class_statistics"][cls_name]["average_confidence"] = conf

                stats = filtered_stats

            viz_data = EvaluationMetrics.generate_visualization_data(
                result,
                self.color_mapper.get_all_colors()
            )

            result_image = VisualizationHelper.visualize_detection(
                temp_path, result, color_mapper=self.color_mapper, figsize=(12, 12), return_pil=True, filter_classes=filter_classes
            )

            result_text = EvaluationMetrics.format_detection_summary(viz_data)

            if result is not None:
                # Perform scene analysis with lighting info
                scene_analysis = self.analyze_scene(result, lighting_info)

                # Add scene analysis to stats
                stats["scene_analysis"] = scene_analysis

            return result_image, result_text, stats

        except Exception as e:
            error_message = f"Error Occurs: {str(e)}"
            import traceback
            traceback.print_exc()
            print(error_message)
            return None, error_message, {}

        finally:
            if temp_path and os.path.exists(temp_path):
                try:
                    os.remove(temp_path)
                except Exception as e:
                    print(f"Cannot delete temp files {temp_path}: {str(e)}")


    def format_result_text(self, stats: Dict) -> str:
        """
        Render detection statistics as a short multi-line text summary.

        Args:
            stats: Dictionary containing detection statistics

        Returns:
            Newline-joined summary, or "No objects detected." when ``stats`` is
            empty or has no "total_objects" entry.
        """
        if not stats or "total_objects" not in stats:
            return "No objects detected."

        def pick_label(value, labels):
            """Choose one of three labels using the 0.33 / 0.67 cut-offs."""
            if value < 0.33:
                return labels[0]
            if value < 0.67:
                return labels[1]
            return labels[2]

        report = [
            f"Detected {stats['total_objects']} objects.",
            f"Average confidence: {stats.get('average_confidence', 0):.2f}",
            "Objects by class:",
        ]

        per_class = stats.get("class_statistics")
        if per_class:
            # Most frequent classes first
            ranked = sorted(per_class.items(), key=lambda entry: entry[1]["count"], reverse=True)
            for label, info in ranked:
                occurrences = info["count"]
                mean_conf = info.get("average_confidence", 0)
                noun = "items" if occurrences != 1 else "item"
                report.append(f"• {label}: {occurrences} {noun} (avg conf: {mean_conf:.2f})")
        else:
            report.append("No class information available.")

        # Spatial information: where the objects sit on average
        spatial = stats.get("spatial_metrics", {})
        if "spatial_distribution" in spatial:
            report.append("Object Distribution:")

            distribution = spatial["spatial_distribution"]
            mean_x = distribution.get("x_mean", 0)
            mean_y = distribution.get("y_mean", 0)

            h_pos = pick_label(mean_x, ("on the left side", "in the center", "on the right side"))
            v_pos = pick_label(mean_y, ("in the upper part", "in the middle", "in the lower part"))

            report.append(f"• Most objects appear {h_pos} {v_pos} of the image")

        return "\n".join(report)

    def format_json_for_display(self, stats: Dict) -> Dict:
        """
        Format statistics JSON for better display

        Builds a trimmed copy of ``stats`` with values rounded to 3 decimals:
        a "summary" section, "detected_objects" sorted by count (descending)
        and, when spatial metrics exist, a "spatial" section with
        "distribution" and/or "size".

        Args:
            stats: Raw statistics dictionary

        Returns:
            Formatted statistics structure for display
        """
        display_stats = {
            "summary": {
                "total_objects": stats.get("total_objects", 0),
                "average_confidence": round(stats.get("average_confidence", 0), 3)
            }
        }

        # Add class statistics, most frequent class first
        class_statistics = stats.get("class_statistics")
        if class_statistics:
            sorted_classes = sorted(
                class_statistics.items(),
                key=lambda item: item[1].get("count", 0),
                reverse=True
            )
            display_stats["detected_objects"] = {
                cls_name: {
                    "count": cls_data.get("count", 0),
                    "average_confidence": round(cls_data.get("average_confidence", 0), 3)
                }
                for cls_name, cls_data in sorted_classes
            }

        # Simplify spatial metrics. The section is assembled separately so that
        # "size" can be reported even when "spatial_distribution" is absent;
        # writing into display_stats["spatial"] directly raised KeyError then.
        spatial = stats.get("spatial_metrics") or {}
        spatial_display = {}

        if "spatial_distribution" in spatial:
            dist = spatial["spatial_distribution"]
            spatial_display["distribution"] = {
                key: round(dist.get(key, 0), 3)
                for key in ("x_mean", "y_mean", "x_std", "y_std")
            }

        if "size_distribution" in spatial:
            size = spatial["size_distribution"]
            spatial_display["size"] = {
                key: round(size.get(key, 0), 3)
                for key in ("mean_area", "min_area", "max_area")
            }

        if spatial_display:
            display_stats["spatial"] = spatial_display

        return display_stats

    def prepare_visualization_data(self, stats: Dict, available_classes: Dict[int, str]) -> Dict:
        """
        Prepare data for visualization based on detection statistics

        Args:
            stats: Detection statistics
            available_classes: Dictionary of available class IDs and names

        Returns:
            Visualization data dictionary
        """
        if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
            return {"error": "No detection data available"}

        # Prepare visualization data
        viz_data = {
            "total_objects": stats.get("total_objects", 0),
            "average_confidence": stats.get("average_confidence", 0),
            "class_data": []
        }

        # Class data
        for cls_name, cls_stats in stats.get("class_statistics", {}).items():
            # Search class ID
            class_id = -1
            for id, name in available_classes.items():
                if name == cls_name:
                    class_id = id
                    break

            cls_data = {
                "name": cls_name,
                "class_id": class_id,
                "count": cls_stats.get("count", 0),
                "average_confidence": cls_stats.get("average_confidence", 0),
                "color": self.color_mapper.get_color(class_id if class_id >= 0 else cls_name)
            }

            viz_data["class_data"].append(cls_data)

        # Descending order
        viz_data["class_data"].sort(key=lambda x: x["count"], reverse=True)

        return viz_data

In [None]:
# %%writefile video_processor.py
import cv2
import os
import tempfile
import uuid
from PIL import Image
import numpy as np
from typing import Dict, List, Tuple, Any, Optional
import time
from collections import defaultdict

# from image_processor import ImageProcessor
# from evaluation_metrics import EvaluationMetrics
# from scene_analyzer import SceneAnalyzer
# from detection_model import DetectionModel

class VideoProcessor:
    """
    Handles the processing of video files, including object detection
    and scene analysis on selected frames.

    Every frame read from the input is written to the output video. Every
    ``process_interval``-th frame is additionally run through the detector
    and annotated with boxes and simple IoU-based track IDs. All frames
    carry the most recent scene description as a caption.
    """

    # Minimum IoU with a box from the previous processed frame for a new
    # detection of the same class to inherit that box's track ID.
    _TRACKING_IOU_THRESHOLD = 0.6

    # Weights used to blend a matched box with its previous position so the
    # drawn rectangle jitters less between processed frames.
    _SMOOTHING_CURRENT_WEIGHT = 0.7
    _SMOOTHING_HISTORY_WEIGHT = 0.3

    # Fallback box colour when a track has no colour assigned.
    _DEFAULT_BOX_COLOR = (0, 255, 0)

    # Colours cycled through as new track IDs are handed out. Values are in
    # (B, G, R) order because they are drawn on OpenCV frames.
    _TRACK_COLORS = (
        (0, 0, 255),    # red
        (0, 255, 0),    # green
        (255, 0, 0),    # blue
        (0, 255, 255),  # yellow
        (255, 0, 255),  # magenta
        (255, 128, 0),  # azure
        (128, 0, 255)   # pink
    )

    def __init__(self, image_processor: "ImageProcessor"):
        """
        Initializes the VideoProcessor.

        Args:
            image_processor (ImageProcessor): An initialized ImageProcessor instance.
        """
        self.image_processor = image_processor

    @staticmethod
    def _calculate_iou(box1, box2):
        """Return the intersection-over-union of two (x1, y1, x2, y2) boxes (0 if the union is empty)."""
        x1_1, y1_1, x2_1, y2_1 = box1
        x1_2, y1_2, x2_2, y2_2 = box2

        xi1 = max(x1_1, x1_2)
        yi1 = max(y1_1, y1_2)
        xi2 = min(x2_1, x2_2)
        yi2 = min(y2_1, y2_2)

        inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
        box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
        box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)

        union_area = box1_area + box2_area - inter_area

        return inter_area / union_area if union_area > 0 else 0

    @staticmethod
    def _draw_scene_caption(frame, description: str) -> None:
        """Draw the (truncated) scene description at the top-left of the frame, white text on a black outline."""
        caption = f"Scene: {description[:80]}..."
        cv2.putText(frame, caption, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)  # Black outline
        cv2.putText(frame, caption, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)  # White text

    def _ensure_scene_analyzer(self, class_names) -> None:
        """Make sure the ImageProcessor has a SceneAnalyzer that knows the detector's class names."""
        analyzer = getattr(self.image_processor, 'scene_analyzer', None)
        if analyzer is None:
            print("Initializing SceneAnalyzer...")
            # Pass class names from the current detection result
            self.image_processor.scene_analyzer = SceneAnalyzer(class_names=class_names)
        elif analyzer.class_names is None:
            # Update class names if they were missing
            analyzer.class_names = class_names
            if hasattr(analyzer, 'spatial_analyzer'):
                analyzer.spatial_analyzer.class_names = class_names

    @staticmethod
    def _aggregate_statistics(all_stats: List[Dict], frame_count: int, processed_frame_count: int) -> Dict:
        """
        Fold the per-frame statistics into one summary dictionary.

        Args:
            all_stats: Per-frame stats dicts (each with "total_objects" and
                a "class_statistics" mapping of class name -> {"count": ...}).
            frame_count: Number of frames read from the video.
            processed_frame_count: Number of frames selected for detection.

        Returns:
            Dict with the frame counters, the average number of objects per
            processed frame, and per-class cumulative / max-concurrent
            counts, both sorted by count in descending order.
        """
        cumulative_counts = {}
        max_concurrent_counts = {}
        total_detected = 0

        for frame_stats in all_stats:
            total_detected += frame_stats.get("total_objects", 0)

            for obj_name, obj_data in frame_stats.get("class_statistics", {}).items():
                count_in_frame = obj_data.get("count", 0)
                cumulative_counts[obj_name] = cumulative_counts.get(obj_name, 0) + count_in_frame
                max_concurrent_counts[obj_name] = max(max_concurrent_counts.get(obj_name, 0), count_in_frame)

        average = 0
        if processed_frame_count > 0:
            average = round(total_detected / processed_frame_count, 2)

        return {
            "total_frames_read": frame_count,
            "total_frames_processed": processed_frame_count,
            "avg_objects_per_processed_frame": average,
            # Total times each class was detected
            "cumulative_detections": dict(sorted(cumulative_counts.items(), key=lambda item: item[1], reverse=True)),
            # Max count of each class in a single processed frame
            "max_concurrent_detections": dict(sorted(max_concurrent_counts.items(), key=lambda item: item[1], reverse=True))
        }

    def process_video_file(self,
                           video_path: str,
                           model_name: str,
                           confidence_threshold: float,
                           process_interval: int = 5,
                           scene_desc_interval_sec: int = 3) -> Tuple[Optional[str], str, Dict]:
        """
        Processes an uploaded video file, performs detection and periodic scene analysis,
        and returns the path to the annotated output video file along with a summary.

        Args:
            video_path (str): Path to the input video file.
            model_name (str): Name of the YOLO model to use.
            confidence_threshold (float): Confidence threshold for object detection.
            process_interval (int): Process every Nth frame. Defaults to 5.
                Values below 1 are treated as 1 (process every frame).
            scene_desc_interval_sec (int): Update scene description every N seconds. Defaults to 3.

        Returns:
            Tuple[Optional[str], str, Dict]: (Path to output video or None, Summary text, Statistics dictionary)
        """
        if not video_path or not os.path.exists(video_path):
            print(f"Error: Video file not found at {video_path}")
            return None, "Error: Video file not found.", {}

        # A zero interval would raise ZeroDivisionError in the modulo below.
        if not process_interval or process_interval < 1:
            print(f"Warning: Invalid process_interval {process_interval}. Processing every frame instead.")
            process_interval = 1

        print(f"Starting video processing for: {video_path}")
        start_time = time.time()

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"Error: Could not open video file {video_path}")
            cap.release()
            return None, "Error opening video file.", {}

        # Get video properties
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps <= 0: # Handle case where fps is not available or invalid
            fps = 30 # Assume a default fps
            print(f"Warning: Could not get valid FPS for video. Assuming {fps} FPS.")
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames_video = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"Video properties: {width}x{height} @ {fps:.2f} FPS, Total Frames: {total_frames_video}")

        # Calculate description update interval in frames
        description_update_interval_frames = int(fps * scene_desc_interval_sec)
        if description_update_interval_frames < 1:
            # Update at least once per second if interval is too short;
            # never let the interval collapse to 0 for sub-1-FPS sources.
            description_update_interval_frames = max(1, int(fps))

        # Tracking state carried from one processed frame to the next.
        last_detected_objects = {}  # track ID -> (box, class ID, confidence)
        next_object_id = 0          # next unused track ID
        object_colors = {}          # track ID -> fixed drawing colour

        # Setup Output Video
        output_filename = f"processed_{uuid.uuid4().hex}_{os.path.basename(video_path)}"
        temp_dir = tempfile.gettempdir() # Use system's temp directory
        output_path = os.path.join(temp_dir, output_filename)
        # Ensure the output path has a compatible extension (like .mp4)
        if not output_path.lower().endswith(('.mp4', '.avi', '.mov')):
            output_path += ".mp4"

        # Use 'mp4v' for MP4, common and well-supported
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            print(f"Error: Could not open VideoWriter for path: {output_path}")
            cap.release()
            return None, f"Error creating output video file at {output_path}.", {}
        print(f"Output video will be saved to: {output_path}")

        frame_count = 0
        processed_frame_count = 0
        all_stats = [] # Store stats for each processed frame
        summary_lines = []
        last_description = "Analyzing scene..." # Initial description
        frame_since_last_desc = description_update_interval_frames # Trigger analysis on first processed frame

        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break # End of video

                frame_count += 1
                frame_since_last_desc += 1

                # Process frame based on interval
                if frame_count % process_interval == 0:
                    processed_frame_count += 1
                    print(f"Processing frame {frame_count}...")

                    # 1. Convert frame format BGR -> RGB -> PIL
                    pil_image = None
                    try:
                        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        pil_image = Image.fromarray(frame_rgb)
                    except Exception as e:
                        # Keep going: the frame is still written below
                        # (un-annotated) so the output keeps its length.
                        print(f"Error converting frame {frame_count}: {e}")

                    # 2. Get appropriate model instance
                    # Confidence is passed from UI, model_name too
                    model_instance = None
                    if pil_image is not None:
                        model_instance = self.image_processor.get_model_instance(model_name, confidence_threshold)
                        if not model_instance or not model_instance.is_model_loaded:
                            print(f"Error: Model {model_name} not loaded. Skipping frame {frame_count}.")
                            model_instance = None

                    # 3. Perform detection
                    detection_result = None
                    if model_instance is not None:
                        detection_result = model_instance.detect(pil_image) # Use PIL image

                    if detection_result and hasattr(detection_result, 'boxes') and len(detection_result.boxes) > 0:
                        # Ensure SceneAnalyzer is ready within ImageProcessor
                        self._ensure_scene_analyzer(detection_result.names)

                        # 4. Perform Scene Analysis (periodically)
                        if frame_since_last_desc >= description_update_interval_frames:
                            print(f"Analyzing scene at frame {frame_count} (threshold: {description_update_interval_frames} frames)...")
                            # Pass lighting_info=None for now, as it's disabled for performance
                            scene_analysis_result = self.image_processor.analyze_scene(detection_result, lighting_info=None)
                            # An empty/None analysis keeps the previous caption
                            # instead of aborting the whole video.
                            last_description = (scene_analysis_result or {}).get("description", last_description)
                            frame_since_last_desc = 0 # Reset counter

                        # 5. Calculate Statistics for this frame
                        stats = EvaluationMetrics.calculate_basic_stats(detection_result)
                        stats['frame_number'] = frame_count # Add frame number to stats
                        all_stats.append(stats)

                        # 6. Draw annotations
                        names = detection_result.names
                        boxes = detection_result.boxes.xyxy.cpu().numpy()
                        classes = detection_result.boxes.cls.cpu().numpy().astype(int)
                        confs = detection_result.boxes.conf.cpu().numpy()

                        # Tracks assigned so far in the current frame
                        current_detected_objects = {}

                        for box, cls_id, conf in zip(boxes, classes, confs):
                            x1, y1, x2, y2 = map(int, box)

                            # Find the best-overlapping track of the same class
                            best_match_id = None
                            best_match_iou = 0

                            for candidate_id, (old_box, old_cls_id, _) in last_detected_objects.items():
                                # A track can be claimed by only one detection
                                # per frame; otherwise two boxes would share an
                                # ID and overwrite each other's tracking entry.
                                if candidate_id in current_detected_objects:
                                    continue
                                if old_cls_id != cls_id:  # only compare within a class
                                    continue
                                iou = self._calculate_iou(box, old_box)
                                if iou > self._TRACKING_IOU_THRESHOLD and iou > best_match_iou:
                                    best_match_id = candidate_id
                                    best_match_iou = iou

                            # Reuse the matched ID, otherwise allocate a new one
                            if best_match_id is not None:
                                obj_id = best_match_id
                            else:
                                obj_id = next_object_id
                                next_object_id += 1
                                object_colors[obj_id] = self._TRACK_COLORS[obj_id % len(self._TRACK_COLORS)]

                            # update tracking info (raw, un-smoothed box)
                            current_detected_objects[obj_id] = (box, cls_id, conf)

                            color = object_colors.get(obj_id, self._DEFAULT_BOX_COLOR)
                            label = f"{names.get(cls_id, 'Unknown')}-{obj_id}: {conf:.2f}"

                            # Smooth the drawn box for known tracks by blending
                            # with the position from the previous processed frame
                            if obj_id in last_detected_objects:
                                old_box, _, _ = last_detected_objects[obj_id]
                                old_x1, old_y1, old_x2, old_y2 = map(int, old_box)
                                alpha = self._SMOOTHING_CURRENT_WEIGHT
                                beta = self._SMOOTHING_HISTORY_WEIGHT

                                x1 = int(alpha * x1 + beta * old_x1)
                                y1 = int(alpha * y1 + beta * old_y1)
                                x2 = int(alpha * x2 + beta * old_x2)
                                y2 = int(alpha * y2 + beta * old_y2)

                            # draw box and label
                            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                            # add text on a filled background strip
                            (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                            cv2.rectangle(frame, (x1, y1 - h - 10), (x1 + w, y1 - 10), color, -1)
                            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)

                        # update tracking info
                        last_detected_objects = current_detected_objects

                # Every frame (annotated or not) gets the latest known scene
                # description and is written to the output video.
                self._draw_scene_caption(frame, last_description)
                out.write(frame)

        except Exception as e:
            print(f"Error during video processing loop for {video_path}: {e}")
            import traceback
            traceback.print_exc()
            summary_lines.append(f"An error occurred during processing: {e}")
        finally:
            # Release resources
            cap.release()
            out.release()
            print(f"Video processing finished. Resources released. Output path: {output_path}")
            if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
                print(f"Error: Output video file was not created or is empty at {output_path}")
                summary_lines.append("Error: Failed to create output video.")
                output_path = None

        end_time = time.time()
        processing_time = end_time - start_time
        summary_lines.insert(0, f"Finished processing in {processing_time:.2f} seconds.")
        summary_lines.insert(1, f"Processed {processed_frame_count} frames out of {frame_count} (interval: {process_interval} frames).")
        summary_lines.insert(2, f"Scene description updated approximately every {scene_desc_interval_sec} seconds.")

        # Generate Aggregate Statistics
        aggregated_stats = self._aggregate_statistics(all_stats, frame_count, processed_frame_count)

        summary_text = "\n".join(summary_lines)
        print("Generated Summary:\n", summary_text)
        print("Aggregated Stats (Revised):\n", aggregated_stats) # Print the revised stats

        # Return the potentially updated output_path
        return output_path, summary_text, aggregated_stats

In [1]:
# %%writefile llm_enhancer.py
import re
import os
import torch
from typing import Dict, List, Tuple, Any, Optional
import logging

class LLMEnhancer:
    """
    負責使用LLM (Large Language Model) 增強場景理解和描述。
    未來可以再整合Llama或其他LLM模型進行場景描述的生成和豐富化。
    """

    def __init__(self,
                model_path: Optional[str] = None,
                tokenizer_path: Optional[str] = None,
                device: Optional[str] = None,
                max_length: int = 2048,
                temperature: float = 0.3,
                top_p: float = 0.85):
        """
        初始化LLM增強器

        Args:
            model_path: LLM模型的路徑或HuggingFace log in，默認使用Llama 3.2
            tokenizer_path: token處理器的路徑，通常與model_path相同
            device: 設備檢查 ('cpu'或'cuda')
            max_length: 生成文本的最大長度
            temperature: 生成文本的溫度（較高比較有創意，較低會偏保守）
            top_p: 生成文本時的核心採樣機率閾值
        """
        self.logger = logging.getLogger("LLMEnhancer")
        self.logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        self.logger.addHandler(handler)

        # 默認用 Llama3.2
        self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
        self.tokenizer_path = tokenizer_path or self.model_path

        # 確定運行設備
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.logger.info(f"Using device: {self.device}")

        # create parameters
        self.max_length = max_length
        self.temperature = temperature
        self.top_p = top_p

        self.model = None
        self.tokenizer = None

        # 計數器，用來追蹤模型調用次數
        self.call_count = 0

        self._initialize_prompts()

        # only if need to load the model
        self._model_loaded = False

        try:
            self.hf_token = os.environ.get("HF_TOKEN")
            if self.hf_token:
                self.logger.info("Logging in to Hugging Face with token")
                from huggingface_hub import login
                login(token=self.hf_token)
            else:
                self.logger.warning("HF_TOKEN not found in environment variables. Access to gated models may be limited.")
        except Exception as e:
            self.logger.error(f"Error during Hugging Face login: {e}")

    def _load_model(self):
        """只在首次需要時加載，使用 8 位量化以節省記憶體"""
        if self._model_loaded:
            return

        try:
            self.logger.info(f"Loading LLM model from {self.model_path} with 8-bit quantization")
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
            torch.cuda.empty_cache()

            if torch.cuda.is_available():
                free_in_GB = torch.cuda.get_device_properties(0).total_memory / 1024**3
                print(f"Total GPU memory: {free_in_GB:.2f} GB")

            # 設置 8 位元配置(節省記憶體空間)
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_enable_fp32_cpu_offload=True
            )

            self.tokenizer = AutoTokenizer.from_pretrained(
                self.tokenizer_path,
                padding_side="left",
                use_fast=False,
                token=self.hf_token
            )

            # 特殊標記
            self.tokenizer.pad_token = self.tokenizer.eos_token

            # 加載 8 位量化模型
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                quantization_config=quantization_config,
                device_map="auto",
                low_cpu_mem_usage=True,
                token=self.hf_token
            )

            self.logger.info("Model loaded successfully with 8-bit quantization")
            self._model_loaded = True

        except Exception as e:
            self.logger.error(f"Error loading LLM model: {e}")
            import traceback
            traceback.print_exc()
            raise

    def _initialize_prompts(self):
        """
        Define the prompt templates used by this class as instance attributes.

        Nothing is returned. Three ``str.format`` templates are stored:

        - ``enhance_description_template``: rewrites a scene description;
          placeholders ``{original_description}`` and ``{object_list}``.
        - ``verify_detection_template``: asks the model to check detections;
          placeholders ``{scene_type}``, ``{scene_name}``, ``{confidence}``,
          ``{detected_objects}`` and ``{clip_analysis}``.
        - ``no_detection_template``: describes a scene when no objects were
          detected; placeholders ``{top_scene}``, ``{top_confidence}``,
          ``{viewpoint}``, ``{lighting_condition}`` and ``{cultural_analysis}``.

        NOTE(review): the enhance template uses ``<|system|>`` / ``<|user|>`` /
        ``<|assistant|>`` tag markup; confirm this matches the chat format
        expected by the model that is actually loaded.
        """
        # Main prompt: rewrite the description without adding unseen facts
        self.enhance_description_template = """
            <|system|>
            You are an expert visual analyst. Your task is to improve the readability and fluency of scene descriptions using STRICT factual accuracy.

            Your **top priority is to avoid hallucination** or fabrication. You are working in a computer vision pipeline using object detection (YOLO) and image embeddings. You MUST treat the input object list as a whitelist. Do not speculate beyond this list.

            </|system|>

            <|user|>
            Rewrite the following scene description to be fluent and clear. DO NOT add any objects, events, or spatial relationships that are not explicitly present in the original or object list.

            ORIGINAL:
            {original_description}

            CRITICAL RULES:
            1. NEVER assume room type, object function, or scene purpose unless directly stated.
            2. NEVER invent object types. You are limited to: {object_list}
            3. NEVER speculate on object quantity. If the description says "10 people" , DO NOT say "dozens" or "many". Maintain the original quantity unless specified.
            4. Use terms like "in the scene", "visible in the background", or "positioned in the lower left" instead of assuming direction or layout logic.
            5. You MAY describe confirmed materials, colors, and composition style if visually obvious and non-speculative.
            6. Write 2–4 complete, well-structured sentences with punctuation.
            7. Final output MUST be a single fluent paragraph of 60–200 words (not longer).
            8. NEVER include explanations, reasoning, or tags. ONLY provide the enhanced description.
            9. Do not repeat any sentence structure or phrase more than once.
            </|user|>

            <|assistant|>
            """


        # Prompt for verifying detections (error checking)
        self.verify_detection_template = """
            Task: You are an advanced vision system that verifies computer vision detections for accuracy.

            Analyze the following detection results and identify any potential errors or inconsistencies:

            SCENE TYPE: {scene_type}
            SCENE NAME: {scene_name}
            CONFIDENCE: {confidence:.2f}

            DETECTED OBJECTS: {detected_objects}

            CLIP ANALYSIS RESULTS:
            {clip_analysis}

            Possible Errors to Check:
            1. Objects misidentified (e.g., architectural elements labeled as vehicles)
            2. Cultural elements misunderstood (e.g., Asian temple structures labeled as boats)
            3. Objects that seem out of place for this type of scene
            4. Inconsistencies between different detection systems

            If you find potential errors, list them clearly with explanations. If the detections seem reasonable, state that they appear accurate.

            Verification Results:
            """

        # Prompt for scenes where object detection found nothing
        self.no_detection_template = """
            Task: You are an advanced scene understanding system analyzing an image where standard object detection failed to identify specific objects.

            Based on advanced image embeddings (CLIP analysis), we have the following information:

            MOST LIKELY SCENE: {top_scene} (confidence: {top_confidence:.2f})
            VIEWPOINT: {viewpoint}
            LIGHTING: {lighting_condition}

            CULTURAL ANALYSIS: {cultural_analysis}

            Create a detailed description of what might be in this scene, considering:
            1. The most likely type of location or setting
            2. Possible architectural or natural elements present
            3. The lighting and atmosphere
            4. Potential cultural or regional characteristics

            Your description should be natural, flowing, and offer insights into what the image likely contains despite the lack of specific object detection.

            Scene Description:
            """

    def _clean_llama_response(self, response: str) -> str:
        """處理 Llama 模型特有的輸出格式問題"""
        # 首先應用通用清理
        response = self._clean_model_response(response)

        # 移除 Llama 常見的前綴短語
        prefixes_to_remove = [
            "Here's the enhanced description:",
            "Enhanced description:",
            "Here is the enhanced scene description:",
            "I've enhanced the description while preserving all factual details:"
        ]

        for prefix in prefixes_to_remove:
            if response.lower().startswith(prefix.lower()):
                response = response[len(prefix):].strip()

        # 移除可能的後綴說明
        suffixes_to_remove = [
            "I've maintained all the key factual elements",
            "I've preserved all the factual details",
            "All factual elements have been maintained"
        ]

        for suffix in suffixes_to_remove:
            if response.lower().endswith(suffix.lower()):
                response = response[:response.rfind(suffix)].strip()

        return response

    # For Future Usage
    def _detect_scene_type(self, detected_objects: List[Dict]) -> str:
        """
        Detect scene type based on object distribution and patterns
        """
        # Default scene type
        scene_type = "intersection"

        # Count objects by class
        object_counts = {}
        for obj in detected_objects:
            class_name = obj.get("class_name", "")
            if class_name not in object_counts:
                object_counts[class_name] = 0
            object_counts[class_name] += 1

        # 辨識人
        people_count = object_counts.get("person", 0)

        # 交通工具的
        car_count = object_counts.get("car", 0)
        bus_count = object_counts.get("bus", 0)
        truck_count = object_counts.get("truck", 0)
        total_vehicles = car_count + bus_count + truck_count

        # Simple scene type detection logic
        if people_count > 8 and total_vehicles < 2:
            scene_type = "pedestrian_crossing"
        elif people_count > 5 and total_vehicles > 2:
            scene_type = "busy_intersection"
        elif people_count < 3 and total_vehicles > 3:
            scene_type = "traffic_junction"

        return scene_type

    def _clean_scene_type(self, scene_type: str) -> str:
        """清理場景類型，使其更適合用於提示詞"""
        if not scene_type:
            return "scene"

        # replace underline to space or sometime capital letter
        if '_' in scene_type:
            return ' '.join(word.capitalize() for word in scene_type.split('_'))

        return scene_type

    def _clean_model_response(self, response: str) -> str:
        """清理模型回應以移除常見的標記和前綴"""
        # 移除任何可能殘留的系統樣式標記
        response = re.sub(r'<\|.*?\|>', '', response)

        # 移除任何 "This european_plaza" 或類似前綴
        response = re.sub(r'^This [a-z_]+\s+', '', response)

        # 確保響應以大寫字母開頭
        if response and not response[0].isupper():
            response = response[0].upper() + response[1:]

        return response.strip()

    def reset_context(self):
        """在處理新圖像前重置模型上下文"""
        if self._model_loaded:
            # 清除 GPU 緩存
            torch.cuda.empty_cache()
            self.logger.info("Model context reset")
        else:
            self.logger.info("Model not loaded, no context to reset")

    def _remove_introduction_sentences(self, response: str) -> str:
        """移除生成文本中可能的介紹性句子"""
        # 識別常見的介紹性模式
        intro_patterns = [
            r'^Here is the (?:rewritten|enhanced) .*?description:',
            r'^The (?:rewritten|enhanced) description:',
            r'^Here\'s the (?:rewritten|enhanced) description of .*?:'
        ]

        for pattern in intro_patterns:
            if re.match(pattern, response, re.IGNORECASE):
                # 找到冒號後的內容
                parts = re.split(r':', response, 1)
                if len(parts) > 1:
                    return parts[1].strip()

        return response

    def enhance_description(self, scene_data: Dict[str, Any]) -> str:
        """
        Enhance a scene description with the LLM while keeping perspective and lighting facts.

        This is the main entry point other classes use to obtain an enhanced description.

        Args:
            scene_data: Dictionary read with the keys "original_description",
                "scene_type", "detected_objects" (a list of dicts carrying
                "class_name" and "confidence") and, optionally, "lighting_info"
                (a dict carrying "time_of_day" and "is_indoor").

        Returns:
            The enhanced description. The original description is returned when the
            LLM produces nothing usable or when any error occurs; a fixed message is
            returned when no original description was supplied.
        """
        # Resolve the fallback text before entering the try block: the except handler
        # returns it, so it must be bound even if context reset or model loading fails.
        original_desc = scene_data.get("original_description", "") if isinstance(scene_data, dict) else ""

        # Fixed messages _generate_llm_response returns when generation failed; they
        # are not descriptions and must never be post-processed as one.
        failure_responses = (
            "No detailed description could be generated.",
            "Unable to generate enhanced description.",
        )

        def _looks_incomplete(text: str) -> bool:
            """Heuristic check for a truncated or too-short LLM answer."""
            return (
                len(text) < 100  # too short
                or (len(text) < 200 and "." not in text[-30:])  # no sentence end near the tail
                or any(text.endswith(phrase) for phrase in ("in the", "with the", "and the"))  # dangling phrase
            )

        try:
            # Reset conversation context
            self.reset_context()

            # Make sure the model is loaded
            if not self._model_loaded:
                self._load_model()

            if not original_desc:
                return "No original description provided."

            # Read and normalise the scene type
            scene_type = scene_data.get("scene_type", "unknown scene")
            scene_type = self._clean_scene_type(scene_type)

            # Keep only high-confidence detections
            detected_objects = scene_data.get("detected_objects", []) or []
            high_confidence_threshold = 0.65
            # Classes that need a stricter threshold before they are trusted
            special_classes = ("airplane", "helicopter", "boat")

            filtered_objects = []
            for obj in detected_objects:
                confidence = obj.get("confidence", 0)
                class_name = obj.get("class_name", "")

                if class_name in special_classes and confidence < 0.75:
                    continue

                if confidence >= high_confidence_threshold:
                    filtered_objects.append(obj)

            # Count the surviving objects per class (insertion order is kept)
            object_counts = {}
            for obj in filtered_objects:
                class_name = obj.get("class_name", "")
                object_counts[class_name] = object_counts.get(class_name, 0) + 1

            high_confidence_objects = ", ".join(f"{count} {name}" for name, count in object_counts.items())

            # Without trusted detections, fall back to objects mentioned in the original text
            if not high_confidence_objects:
                object_keywords = self._extract_objects_from_description(original_desc)
                high_confidence_objects = ", ".join(object_keywords) if object_keywords else "objects visible in the scene"

            # Keep the viewpoint wording used by the original description
            perspective = self._extract_perspective_from_description(original_desc)

            # Lighting information
            lighting_description = "unknown lighting"
            if "lighting_info" in scene_data:
                lighting_info = scene_data.get("lighting_info") or {}
                time_of_day = lighting_info.get("time_of_day", "unknown")
                is_indoor = lighting_info.get("is_indoor", False)
                lighting_description = f"{'indoor' if is_indoor else 'outdoor'} {time_of_day} lighting"

            # Build the prompt from all collected facts
            prompt = self.enhance_description_template.format(
                scene_type=scene_type,
                object_list=high_confidence_objects,
                original_description=original_desc,
                perspective=perspective,
                lighting_description=lighting_description
            )

            self.logger.info("Generating LLM response...")
            response = self._generate_llm_response(prompt)

            # Retry while the answer looks truncated.
            # NOTE(review): _generate_llm_response reseeds torch with a fixed value on
            # every call, so a retry with the same prompt may reproduce the same text;
            # confirm whether retries should vary the seed.
            max_retries = 3
            attempts = 0
            while attempts < max_retries and _looks_incomplete(response):
                self.logger.warning(f"Generated incomplete response, retrying... Attempt {attempts+1}/{max_retries}")
                response = self._generate_llm_response(prompt)
                attempts += 1

            if not response or len(response.strip()) < 10:
                self.logger.warning("Generated response was empty or too short, returning original description")
                return original_desc

            if response.strip() in failure_responses:
                self.logger.warning("LLM generation failed, returning original description")
                return original_desc

            # Use the cleaning routine that matches the model family
            if "llama" in self.model_path.lower():
                result = self._clean_llama_response(response)
            else:
                result = self._clean_model_response(response)

            # Strip introduction sentences and explanatory notes
            result = self._remove_introduction_sentences(result)
            result = self._remove_explanatory_notes(result)

            # Fact check against the original text and the trusted object list
            result = self._verify_factual_accuracy(original_desc, result, high_confidence_objects)

            # Keep scene type and perspective consistent with the input
            result = self._ensure_scene_type_consistency(result, scene_type, original_desc)

            # Cleaning may leave nothing behind; the original text is the better answer then
            if not result or not str(result).strip():
                return original_desc

            if perspective and perspective.lower() not in result.lower():
                result = f"{perspective}, {result[0].lower()}{result[1:]}"

            return str(result)

        except Exception as e:
            self.logger.error(f"Enhancement failed: {str(e)}")
            import traceback
            self.logger.error(traceback.format_exc())
            return original_desc  # any error falls back to the original description

    def _verify_factual_accuracy(self, original: str, generated: str, object_list: str) -> str:
        """驗證生成的描述不包含原始描述或物體列表中沒有的信息"""

        # 將原始描述和物體列表合併為授權詞彙源
        authorized_content = original.lower() + " " + object_list.lower()

        # 提取生成描述中具有實質意義的名詞
        # 創建常見地點、文化和地域詞彙的列表
        location_terms = ["plaza", "square", "market", "mall", "avenue", "boulevard"]
        cultural_terms = ["european", "asian", "american", "african", "western", "eastern"]

        # 檢查生成文本中的每個詞
        for term in location_terms + cultural_terms:
            # 僅當該詞出現在生成文本但不在授權內容中時進行替換
            if term in generated.lower() and term not in authorized_content:
                # 根據詞語類型選擇適當的替換詞
                if term in location_terms:
                    replacement = "area"
                else:
                    replacement = "scene"

                # 使用正則表達式進行完整詞匹配替換
                pattern = re.compile(r'\b' + term + r'\b', re.IGNORECASE)
                generated = pattern.sub(replacement, generated)

        return generated


    def verify_detection(self,
                       detected_objects: List[Dict],
                       clip_analysis: Dict[str, Any],
                       scene_type: str,
                       scene_name: str,
                       confidence: float) -> Dict[str, Any]:
        """
        Ask the LLM to review the YOLO detections against the CLIP analysis.

        Args:
            detected_objects: Objects detected by YOLO.
            clip_analysis: CLIP analysis results.
            scene_type: Identified scene type.
            scene_name: Scene name.
            confidence: Confidence of the scene classification.

        Returns:
            Dict with "verification_text" (the raw LLM answer), "has_errors"
            (True unless the answer contains "appear accurate") and
            "corrected_objects" (always None for now).
        """
        # Make sure the model is loaded
        self._load_model()

        # Fill the verification prompt; both inputs are rendered as text first
        prompt = self.verify_detection_template.format(
            scene_type=scene_type,
            scene_name=scene_name,
            confidence=confidence,
            detected_objects=self._format_objects_for_prompt(detected_objects),
            clip_analysis=self._format_clip_results(clip_analysis)
        )

        llm_verdict = self._generate_llm_response(prompt)

        # The answer counts as clean only when it explicitly says so
        reported_accurate = "appear accurate" in llm_verdict.lower()

        return {
            "verification_text": llm_verdict,
            "has_errors": not reported_accurate,
            "corrected_objects": None  # detailed corrections may come in a future version
        }

    def _remove_explanatory_notes(self, response: str) -> str:
        """移除解釋性注釋、說明和其他非描述性內容"""

        # 識別常見的注釋和解釋模式
        note_patterns = [
            r'(?:^|\n)Note:.*?(?:\n|$)',
            r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
            r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?:\n|$)',
            r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
        ]

        # 尋找第一段完整的描述內容
        paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]

        # 如果只有一個段落，檢查並清理它
        if len(paragraphs) == 1:
            for pattern in note_patterns:
                paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
            return paragraphs[0].strip()

        # 如果有多個段落，識別並移除注釋段落
        content_paragraphs = []
        for paragraph in paragraphs:
            is_note = False
            for pattern in note_patterns:
                if re.search(pattern, paragraph, flags=re.IGNORECASE):
                    is_note = True
                    break

            # 檢查段落是否以常見的注釋詞開頭
            if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
                is_note = True

            if not is_note:
                content_paragraphs.append(paragraph)

        # 返回清理後的內容
        return '\n\n'.join(content_paragraphs).strip()

    def handle_no_detection(self, clip_analysis: Dict[str, Any]) -> str:
        """
        Generate a scene description when YOLO detected no objects.

        Args:
            clip_analysis: CLIP analysis results; read with the keys "top_scene",
                "viewpoint", "lighting_condition" and "cultural_analysis".

        Returns:
            str: The generated scene description after cleaning.
        """
        # Make sure the model is loaded
        self._load_model()

        # Pull the CLIP findings, falling back to neutral defaults
        scene_label, scene_score = clip_analysis.get("top_scene", ("unknown", 0))
        viewpoint_entry = clip_analysis.get("viewpoint", ("standard", 0))
        lighting_entry = clip_analysis.get("lighting_condition", ("unknown", 0))

        # Render the cultural analysis as text for the prompt
        cultural_text = self._format_cultural_analysis(clip_analysis.get("cultural_analysis", {}))

        prompt_fields = {
            "top_scene": scene_label,
            "top_confidence": scene_score,
            "viewpoint": viewpoint_entry[0],
            "lighting_condition": lighting_entry[0],
            "cultural_analysis": cultural_text,
        }
        prompt = self.no_detection_template.format(**prompt_fields)

        # Generate, then tidy the raw LLM output
        raw_description = self._generate_llm_response(prompt)
        return self._clean_llm_response(raw_description)

    def _clean_input_text(self, text: str) -> str:
        """
        對輸入文本進行通用的格式清理，處理常見的格式問題。

        Args:
            text: 輸入文本

        Returns:
            清理後的文本
        """
        if not text:
            return ""

        # 清理格式的問題
        # 1. 處理連續標點符號問題
        text = re.sub(r'([.,;:!?])\1+', r'\1', text)

        # 2. 修復不完整句子的標點（如 "Something," 後沒有繼續接續下去）
        text = re.sub(r',\s*$', '.', text)

        # 3. 修復如 "word." 後未加空格即接下一句的問題
        text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)

        # 4. 移除多餘空格
        text = re.sub(r'\s+', ' ', text).strip()

        # 5. 確保句子正確結束（句尾加句號）
        if text and not text[-1] in '.!?':
            text += '.'

        return text

    def _fact_check_description(self, original_desc: str, enhanced_desc: str, scene_type: str, detected_objects: List[str]) -> str:
        """
        驗證並可能修正增強後的描述，確保有保持事實準確性。

        Args:
            original_desc: 原始場景描述
            enhanced_desc: 增強後的描述待驗證
            scene_type: 場景類型
            detected_objects: 檢測到的物體名稱列表

        Returns:
            經過事實檢查的描述
        """
        # 如果增強描述為空或太短，返回原始描述
        if not enhanced_desc or len(enhanced_desc) < 30:
            return original_desc

        # 1. 檢查數值一致性（如人數、物體數量等）
        # 從原始描述中提取數字和相關名詞
        number_patterns = [
            (r'(\d+)\s+(people|person|pedestrians|individuals)', r'\1', r'\2'), # 人數
            (r'(\d+)\s+(cars|vehicles|automobiles)', r'\1', r'\2'),            # 車輛數
            (r'(\d+)\s+(buildings|structures)', r'\1', r'\2')                  # 建築數
        ]

        # 檢查原始描述中的每個數字
        for pattern, num_group, word_group in number_patterns:
            original_matches = re.finditer(pattern, original_desc, re.IGNORECASE)
            for match in original_matches:
                number = match.group(1)
                noun = match.group(2)

                # 檢查增強描述中是否保留了這個數字
                # 創建一個更通用的模式來檢查增強描述中是否包含此數字和對象類別
                enhanced_pattern = r'(\d+)\s+(' + re.escape(noun) + r'|' + re.escape(noun.rstrip('s')) + r'|' + re.escape(noun + 's') + r')'
                enhanced_matches = list(re.finditer(enhanced_pattern, enhanced_desc, re.IGNORECASE))

                if not enhanced_matches:
                    # 數字+名詞未在增強描述中找到
                    plural_form = noun if noun.endswith('s') or number == '1' else noun + 's'
                    if enhanced_desc.startswith("This") or enhanced_desc.startswith("The"):
                        enhanced_desc = enhanced_desc.replace("This ", f"This scene with {number} {plural_form} ", 1)
                        enhanced_desc = enhanced_desc.replace("The ", f"The scene with {number} {plural_form} ", 1)
                    else:
                        enhanced_desc = f"The scene includes {number} {plural_form}. " + enhanced_desc
                elif enhanced_matches and match.group(1) != number:
                    # 存在但數字不一致，就要更正數字
                    for ematch in enhanced_matches:
                        wrong_number = ematch.group(1)
                        enhanced_desc = enhanced_desc.replace(f"{wrong_number} {ematch.group(2)}", f"{number} {ematch.group(2)}")

        # 2. 檢查視角的一致性
        perspective_terms = {
            "aerial": ["aerial", "bird's-eye", "overhead", "top-down", "above", "looking down"],
            "ground": ["street-level", "ground level", "eye-level", "standing"],
            "indoor": ["inside", "interior", "indoor", "within"],
            "close-up": ["close-up", "detailed view", "close shot"]
        }

        # 確定原始視角
        original_perspective = None
        for persp, terms in perspective_terms.items():
            if any(term in original_desc.lower() for term in terms):
                original_perspective = persp
                break

        # 檢查是否保留了視角方面
        if original_perspective:
            enhanced_has_perspective = any(term in enhanced_desc.lower() for term in perspective_terms[original_perspective])

            if not enhanced_has_perspective:
                # 添加之前缺的視角方面
                perspective_prefixes = {
                    "aerial": "From an aerial perspective, ",
                    "ground": "From street level, ",
                    "indoor": "In this indoor setting, ",
                    "close-up": "In this close-up view, "
                }

                prefix = perspective_prefixes.get(original_perspective, "")
                if prefix:
                    if enhanced_desc[0].isupper():
                        enhanced_desc = prefix + enhanced_desc[0].lower() + enhanced_desc[1:]
                    else:
                        enhanced_desc = prefix + enhanced_desc

        # 3. 檢查場景類型一致性
        if scene_type and scene_type.lower() != "unknown" and scene_type.lower() not in enhanced_desc.lower():
            # 添加場景類型
            if enhanced_desc.startswith("This ") or enhanced_desc.startswith("The "):
                # 避免產生 "This scene" 和 "This intersection" 的重複
                if "scene" in enhanced_desc[:15].lower():
                    fixed_type = scene_type.lower()
                    enhanced_desc = enhanced_desc.replace("scene", fixed_type, 1)
                else:
                    enhanced_desc = enhanced_desc.replace("This ", f"This {scene_type} ", 1)
                    enhanced_desc = enhanced_desc.replace("The ", f"The {scene_type} ", 1)
            else:
                enhanced_desc = f"This {scene_type} " + enhanced_desc

        # 4. 確保文字長度適當，這邊的限制要與prompt相同,否則會產生矛盾
        words = enhanced_desc.split()
        if len(words) > 200:
            # 找尋接近字數限制的句子結束處
            truncated = ' '.join(words[:200])
            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))

            if last_period > 0:
                enhanced_desc = truncated[:last_period+1]
            else:
                enhanced_desc = truncated + '.'

        return enhanced_desc

    def _extract_perspective_from_description(self, description: str) -> str:
        """從原始描述中提取視角/透視信息"""
        perspective_terms = {
            "aerial": ["aerial perspective", "aerial view", "bird's-eye view", "overhead view", "from above"],
            "ground": ["ground level", "eye level", "street level"],
            "indoor": ["indoor setting", "inside", "interior"]
        }

        for persp_type, terms in perspective_terms.items():
            for term in terms:
                if term.lower() in description.lower():
                    return term

        return ""

    def _extract_objects_from_description(self, description: str) -> List[str]:
        """從原始描述中提取物件提及"""
        # 常見物件正則表達式模式
        object_patterns = [
            r'(\d+)\s+(people|persons|pedestrians|individuals)',
            r'(\d+)\s+(cars|vehicles|automobiles)',
            r'(\d+)\s+(buildings|structures)',
            r'(\d+)\s+(plants|potted plants|flowers)',
            r'(\d+)\s+(beds|furniture|tables|chairs)'
        ]

        extracted_objects = []

        for pattern in object_patterns:
            matches = re.finditer(pattern, description, re.IGNORECASE)
            for match in matches:
                number = match.group(1)
                object_type = match.group(2)
                extracted_objects.append(f"{number} {object_type}")

        return extracted_objects

    def _ensure_scene_type_consistency(self, description: str, scene_type: str, original_desc: str) -> str:
        """確保描述中的場景類型與指定的場景類型一致"""
        # 禁止使用的錯誤場景詞列表
        prohibited_scene_words = ["plaza", "square", "european", "asian", "american"]

        # 檢查是否包含禁止的場景詞
        for word in prohibited_scene_words:
            if word in description.lower() and word not in original_desc.lower() and word not in scene_type.lower():
                # 替換錯誤場景詞為正確場景類型
                pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
                description = pattern.sub(scene_type, description)

        # 確保場景類型在描述中被提及
        if scene_type.lower() not in description.lower():
            # 尋找通用場景詞並替換
            for general_term in ["scene", "area", "place", "location"]:
                if general_term in description.lower():
                    pattern = re.compile(r'\b' + general_term + r'\b', re.IGNORECASE)
                    description = pattern.sub(scene_type, description, count=1)
                    break
            else:
                # 如果沒有找到通用詞，在開頭添加場景類型
                if description.startswith("The "):
                    description = description.replace("The ", f"The {scene_type} ", 1)
                elif description.startswith("This "):
                    description = description.replace("This ", f"This {scene_type} ", 1)
                else:
                    description = f"This {scene_type} " + description

        return description

    def _generate_llm_response(self, prompt: str) -> str:
        """生成 LLM 的回應"""
        self._load_model()

        try:
            self.call_count += 1
            self.logger.info(f"LLM call #{self.call_count}")

            # 清除 GPU 緩存
            torch.cuda.empty_cache()

            # 設置固定種子以提高一致性
            torch.manual_seed(42)

            # 準備輸入
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=self.max_length).to(self.device)

            # 根據模型類型調整參數
            generation_params = {
                "max_new_tokens": 120,
                "pad_token_id": self.tokenizer.eos_token_id,
                "attention_mask": inputs.attention_mask,
                "use_cache": True,
            }

            # 為 Llama 模型設置特定參數
            if "llama" in self.model_path.lower():
                generation_params.update({
                    "temperature": 0.4,        # 不要太高, 否則模型可能會太有主觀意見
                    "max_new_tokens": 600,
                    "do_sample": True,
                    "top_p": 0.8,
                    "repetition_penalty": 1.2,  # 重複的懲罰權重,可避免掉重複字
                    "num_beams": 4 ,
                    "length_penalty": 1.2,
                })

            else:
                # 如果用其他模型的參數
                generation_params.update({
                    "temperature": 0.6,
                    "max_new_tokens": 300,
                    "top_p": 0.9,
                    "do_sample": True,
                    "num_beams": 1,
                    "repetition_penalty": 1.05
                })

            # 生成回應
            with torch.no_grad():
                outputs = self.model.generate(inputs.input_ids, **generation_params)

            # 解碼完整輸出
            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # 提取生成的響應部分
            assistant_tag = "<|assistant|>"
            if assistant_tag in full_response:
                response = full_response.split(assistant_tag)[-1].strip()

                # 檢查是否有未閉合的 <|assistant|>
                user_tag = "<|user|>"
                if user_tag in response:
                    response = response.split(user_tag)[0].strip()
            else:
                # 移除輸入提示
                input_text = self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
                response = full_response
                if response.startswith(input_text):
                    response = response[len(input_text):].strip()

            # 確保不返回空響應
            if not response or len(response.strip()) < 10:
                self.logger.warning("生成的回應為空的或太短，返回默認回應")
                return "No detailed description could be generated."

            return response

        except Exception as e:
            self.logger.error(f"生成 LLM 響應時出錯: {str(e)}")
            import traceback
            self.logger.error(traceback.format_exc())
            return "Unable to generate enhanced description."

    def _clean_llm_response(self, response: str) -> str:
        """
        Clean the LLM response to ensure the output contains only clean descriptive text.
        Sometimes it will not only display the description but display tags, notes...etc

        Args:
            response: Original response from the LLM

        Returns:
            Cleaned description text
        """
        if not response:
            return ""

        # Save original response as backup
        original_response = response

        # 1. Extract content between markers (if present)
        output_start = response.find("[OUTPUT_START]")
        output_end = response.find("[OUTPUT_END]")
        if output_start != -1 and output_end != -1 and output_end > output_start:
            response = response[output_start + len("[OUTPUT_START]"):output_end].strip()

        # 2. Remove all remaining section markers and instructions
        section_markers = [
            r'\[.*?\]',                      # [any text]
            r'OUTPUT_START\s*:|OUTPUT_END\s*:',  # OUTPUT_START: or OUTPUT_END:
            r'ENHANCED DESCRIPTION\s*:',      # ENHANCED DESCRIPTION:
            r'Scene Type\s*:.*?(?=\n|$)',    # Scene Type: text
            r'Original Description\s*:.*?(?=\n|$)', # Original Description: text
            r'GOOD\s*:|BAD\s*:',             # GOOD: or BAD:
            r'PROBLEM\s*:.*?(?=\n|$)',       # PROBLEM: text
            r'</?\|(?:assistant|system|user)\|>',  # Dialog markers
            r'\(Note:.*?\)',                 # Notes in parentheses
            r'\(.*?I\'ve.*?\)',              # Common explanatory content
            r'\(.*?as per your request.*?\)' # References to instructions
        ]

        for marker in section_markers:
            response = re.sub(marker, '', response, flags=re.IGNORECASE)

        # 3. Remove common prefixes and suffixes
        prefixes_to_remove = [
            "Enhanced Description:",
            "Scene Description:",
            "Description:",
            "Here is the enhanced description:",
            "Here's the enhanced description:"
        ]

        for prefix in prefixes_to_remove:
            if response.lower().startswith(prefix.lower()):
                response = response[len(prefix):].strip()

        # 4. Remove any Context tags or text containing Context
        response = re.sub(r'<\s*Context:.*?>', '', response)
        response = re.sub(r'Context:.*?(?=\n|$)', '', response)
        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)

        # 5. Clean improper scene type references
        scene_type_pattern = r'This ([a-zA-Z_]+) (features|shows|displays|contains)'
        match = re.search(scene_type_pattern, response)
        if match and '_' in match.group(1):
            fixed_text = f"This scene {match.group(2)}"
            response = re.sub(scene_type_pattern, fixed_text, response)

        # 6. Reduce dash usage for more natural punctuation
        response = re.sub(r'—', ', ', response)
        response = re.sub(r' - ', ', ', response)

        # 7. Remove excess whitespace and line breaks
        response = response.replace('\r', ' ')
        response = re.sub(r'\n+', ' ', response)  # 將所有換行符替換為空格
        response = re.sub(r'\s{2,}', ' ', response)  # 將多個空格替換為單個空格

        # 8. Remove Markdown formatting
        response = re.sub(r'\*\*|\*|__|\|', '', response)  # Remove Markdown indicators

        # 9. Detect and remove sentence duplicates
        sentences = re.split(r'(?<=[.!?])\s+', response)
        unique_sentences = []
        seen_content = set()

        for sentence in sentences:
            # Skip empty sentences
            if not sentence.strip():
                continue

            # Create simplified version for comparison (lowercase, no punctuation)
            simplified = re.sub(r'[^\w\s]', '', sentence.lower())
            simplified = ' '.join(simplified.split())  # Standardize whitespace

            # Check if we've seen a similar sentence
            is_duplicate = False
            for existing in seen_content:
                if len(simplified) > 10 and (existing in simplified or simplified in existing):
                    is_duplicate = True
                    break

            if not is_duplicate and simplified:
                unique_sentences.append(sentence)
                seen_content.add(simplified)

        # Recombine unique sentences
        response = ' '.join(unique_sentences)

        # 10. Ensure word count is within limits (50-150 words)
        words = response.split()
        if len(words) > 200:
            # Find sentence ending near the word limit
            truncated = ' '.join(words[:200])
            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))

            if last_period > 0:
                response = truncated[:last_period+1]
            else:
                response = truncated + "."

        # 11. Check sentence completeness
        if response and not response.strip()[-1] in ['.', '!', '?']:
            # Find the last preposition or conjunction
            common_prepositions = ["into", "onto", "about", "above", "across", "after", "along", "around", "at", "before", "behind", "below", "beneath", "beside", "between", "beyond", "by", "down", "during", "except", "for", "from", "in", "inside", "near", "of", "off", "on", "over", "through", "to", "toward", "under", "up", "upon", "with", "within"]

            # Check if ending with preposition or conjunction
            last_word = response.strip().split()[-1].lower() if response.strip().split() else ""
            if last_word in common_prepositions or last_word in ["and", "or", "but"]:
                # Find the last complete sentence
                last_period = max(response.rfind('.'), response.rfind('!'), response.rfind('?'))
                if last_period > 0:
                    response = response[:last_period+1]
                else:
                    # If no complete sentence found, modify the ending
                    words = response.strip().split()
                    if words:
                        # Remove the last preposition or conjunction
                        response = " ".join(words[:-1]) + "."

        # 12. Ensure haven't over-filtered
        if not response or len(response) < 40:
            # Try to get the first meaningful paragraph from the original response
            paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
            if paragraphs:
                # Choose the longest paragraph as it's most likely the actual description
                best_para = max(paragraphs, key=len)
                # Clean it using a subset of the above rules
                best_para = re.sub(r'\[.*?\]', '', best_para)  # Remove [SECTION] markers
                best_para = re.sub(r'\s{2,}', ' ', best_para).strip()  # Clean whitespace

                if len(best_para) >= 40:
                    return best_para

            # If still no good content, return a simple message
            return "Unable to generate a valid enhanced description."

        # 13. Final cleaning - catch any missed special cases
        response = re.sub(r'</?\|.*?\|>', '', response)  # Any remaining tags
        response = re.sub(r'\(.*?\)', '', response)  # Any remaining parenthetical content
        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)  # Any remaining notes

        # Ensure proper spacing after punctuation
        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)

        # Ensure first letter is capitalized
        if response and response[0].islower():
            response = response[0].upper() + response[1:]

        # 14. 統一格式 - 確保輸出始終是單一段落
        response = re.sub(r'\s*\n\s*', ' ', response)  # 將所有換行符替換為空格
        response = ' '.join(response.split())

        return response.strip()

    def _format_objects_for_prompt(self, objects: List[Dict]) -> str:
        """格式化物體列表以用於提示"""
        if not objects:
            return "No objects detected"

        formatted = []
        for obj in objects:
            formatted.append(f"{obj['class_name']} (confidence: {obj['confidence']:.2f})")

        return "\n- " + "\n- ".join(formatted)


    def _format_clip_results(self, clip_analysis: Dict) -> str:
        """格式化CLIP分析結果以用於提示"""
        if not clip_analysis or "error" in clip_analysis:
            return "No CLIP analysis available"

        parts = ["CLIP Analysis Results:"]

        # 加上頂級場景
        top_scene, confidence = clip_analysis.get("top_scene", ("unknown", 0))
        parts.append(f"- Most likely scene: {top_scene} (confidence: {confidence:.2f})")

        # 加上視角
        viewpoint, vp_conf = clip_analysis.get("viewpoint", ("standard", 0))
        parts.append(f"- Camera viewpoint: {viewpoint} (confidence: {vp_conf:.2f})")

        # 加上物體組合
        if "object_combinations" in clip_analysis:
            combos = []
            for combo, score in clip_analysis["object_combinations"][:3]:
                combos.append(f"{combo} ({score:.2f})")
            parts.append(f"- Object combinations: {', '.join(combos)}")

        # 加上文化分析
        if "cultural_analysis" in clip_analysis:
            parts.append("- Cultural analysis:")
            for culture_type, data in clip_analysis["cultural_analysis"].items():
                best_desc = data.get("best_description", "")
                desc_conf = data.get("confidence", 0)
                parts.append(f"  * {culture_type}: {best_desc} ({desc_conf:.2f})")

        return "\n".join(parts)

    def _format_cultural_analysis(self, cultural_analysis: Dict) -> str:
        """格式化文化分析結果"""
        if not cultural_analysis:
            return "No specific cultural elements detected"

        parts = []
        for culture_type, data in cultural_analysis.items():
            best_desc = data.get("best_description", "")
            desc_conf = data.get("confidence", 0)
            parts.append(f"{culture_type}: {best_desc} (confidence: {desc_conf:.2f})")

        return "\n".join(parts)

Writing llm_enhancer.py


In [None]:
# %%writefile app.py
import re
import os
import numpy as np
import matplotlib.pyplot as plt
import gradio as gr
from typing import Dict, List, Any, Optional, Tuple
import cv2
from PIL import Image
import tempfile
import uuid
# import spaces

# from detection_model import DetectionModel
# from color_mapper import ColorMapper
# from evaluation_metrics import EvaluationMetrics
# from style import Style
# from image_processor import ImageProcessor
# from video_processor import VideoProcessor
# from llm_enhancer import LLMEnhancer

# Initialize the module-level processors used by the rest of this cell.
# The image processor is created with LLM support switched on and is pointed at the
# Llama 3.2 3B Instruct model; the video processor is built on top of that same
# image processor instance.
# NOTE(review): ImageProcessor and VideoProcessor are not imported here (the import
# lines above are commented out), so this presumably relies on earlier notebook
# cells having defined them - confirm before running this cell on a fresh kernel.
image_processor = ImageProcessor(use_llm=True, llm_model_path="meta-llama/Llama-3.2-3B-Instruct")
video_processor = VideoProcessor(image_processor)

# Helper Function
def get_all_classes():
    """Return the available detection classes as a sorted list of (id, name) pairs.

    The class table of the first loaded model held by the module-level
    ``image_processor`` is preferred, provided that table is a dict. When no
    such model is available, the built-in 80-entry COCO table is returned.
    """
    loaded_models = ()
    if image_processor and image_processor.model_instances:
        loaded_models = image_processor.model_instances.values()

    for candidate in loaded_models:
        if not (candidate and candidate.is_model_loaded):
            continue
        try:
            names_by_id = candidate.class_names
            # Only a {id: name} dict is usable; keys are coerced to int.
            if isinstance(names_by_id, dict):
                return sorted((int(class_id), label) for class_id, label in names_by_id.items())
        except Exception as e:
            print(f"Error getting class names from model: {e}")

    # Fallback: standard COCO names, where the position in the tuple is the class id.
    coco_names = (
        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
        'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
        'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
        'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
        'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
        'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
        'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
        'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
        'hair drier', 'toothbrush',
    )
    return list(enumerate(coco_names))

def _clean_scene_description(desc):
    """Strip question/answer sections and leaked code lines from a description.

    Everything from a "Questions:" or "Answers:" marker onward is dropped, then
    the text is filtered line by line: numbered question lines, ``:param`` and
    triple-quote lines are removed, and a line that looks like the start of a
    code block switches on skipping until the next blank line.

    Returns an empty string for empty input. If the filtering removes
    everything, the first non-empty paragraph of the (marker-trimmed) text is
    returned instead, so the caller still has something to show.
    """
    if not desc:
        return ""

    # Filter out question/answer style tails first.
    if "Questions:" in desc:
        desc = desc.split("Questions:")[0].strip()
    if "Answers:" in desc:
        desc = desc.split("Answers:")[0].strip()

    code_markers = (
        "class SceneDescriptionSystem",
        "def generate_scene_description",
        "def enhance_scene_descriptions",
        "def __init__",
    )

    clean_lines = []
    skip_block = False
    for line in desc.split('\n'):
        stripped = line.strip()

        # Numbered question lines such as "1. What ...".
        if re.match(r'^\d+\.\s+(What|How|Why|When|Where|Who|The)', line):
            continue
        if stripped.startswith((':param', '"""')):
            continue

        # Lines that open a code/exercise block: skip them and what follows.
        if (stripped.startswith(("Exercise", "#TEST"))
                or any(marker in line for marker in code_markers)):
            skip_block = True
            continue

        # A blank line ends the skipped block (the blank line itself is kept).
        if skip_block and stripped == "":
            skip_block = False

        if not skip_block:
            clean_lines.append(line)

    cleaned_text = '\n'.join(clean_lines)
    if cleaned_text.strip():
        return cleaned_text

    # Safety net: nothing survived the filtering.
    paragraphs = [p.strip() for p in desc.split('\n\n') if p.strip()]
    return paragraphs[0] if paragraphs else desc


def _resolve_class_filter_ids(filter_classes):
    """Translate UI class filter entries into a list of class ids.

    Each entry may be an "id: name" string, a bare id, or a bare class name.
    Entries that cannot be matched against ``get_all_classes()`` are reported
    with a warning and left out.

    Returns None when no filter was given, otherwise the list of matched ids
    (which may be empty).
    """
    if not filter_classes:
        return None

    available_classes = dict(get_all_classes())
    name_to_id = {name: class_id for class_id, name in available_classes.items()}

    class_ids = []
    for raw_entry in filter_classes:
        entry = str(raw_entry)  # tolerate ids passed as ints
        token = entry.split(":")[0].strip()
        class_id = None
        try:
            numeric_id = int(token)
            if numeric_id in available_classes:
                class_id = numeric_id
        except ValueError:
            if token in name_to_id:
                class_id = name_to_id[token]
            elif entry in name_to_id:
                # The whole entry is itself a class name (e.g. a name containing ":").
                class_id = name_to_id[entry]

        if class_id is None:
            print(f"Warning: Could not parse class filter: {raw_entry}")
        else:
            class_ids.append(class_id)

    print(f"Filtering image results for class IDs: {class_ids}")
    return class_ids


def _message_figure(message, figsize=None, **text_kwargs):
    """Build an axis-less matplotlib figure that only shows a centered message."""
    fig, ax = plt.subplots(figsize=figsize) if figsize else plt.subplots()
    ax.text(0.5, 0.5, message, ha='center', va='center', **text_kwargs)
    ax.axis('off')
    return fig


# @spaces.GPU
def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True):
    """Process one uploaded image and build every output of the image tab.

    Args:
        image: The image to analyze; passed through to ``image_processor.process_image``.
        model_name: Detection model file name.
        confidence_threshold: Minimum detection confidence.
        filter_classes: Optional list of class filters ("id: name", id or name).
        use_llm: Whether the LLM-enhanced scene description should be used.

    Returns:
        A 10-tuple: (result image, detection text, formatted stats, plot figure,
        scene description HTML, original description HTML, activities rows,
        safety concern rows, functional zones, lighting conditions).
        On any exception the same 10-slot structure is returned, filled with
        error placeholders, so the UI outputs always line up.
    """
    print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}")
    try:
        image_processor.use_llm = use_llm
        # The analyzer attribute may exist but still be None; only update a real one.
        scene_analyzer = getattr(image_processor, 'scene_analyzer', None)
        if scene_analyzer is not None:
            scene_analyzer.use_llm = use_llm
            print(f"Updated existing scene_analyzer use_llm setting to: {use_llm}")

        class_ids_to_filter = _resolve_class_filter_ids(filter_classes)

        # Call the existing image processing logic
        result_image, result_text, stats = image_processor.process_image(
            image,
            model_name,
            confidence_threshold,
            class_ids_to_filter
        )

        # Format stats for JSON display
        formatted_stats = image_processor.format_json_for_display(stats)

        # From here on a missing stats object is treated as "nothing detected".
        if not stats:
            stats = {}

        # Prepare visualization data for the plot
        if stats.get("class_statistics"):
            viz_data = image_processor.prepare_visualization_data(stats, dict(get_all_classes()))
            if "error" not in viz_data:
                plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
            else:
                plot_figure = _message_figure(viz_data["error"], figsize=(8, 6), fontsize=12)
        else:
            plot_figure = _message_figure("No detection data for plot", figsize=(8, 6), fontsize=12)

        # Extract scene analysis info (a None entry counts as "no analysis").
        scene_analysis = stats.get("scene_analysis") or {}
        print("Processing scene_analysis:", scene_analysis.keys())

        scene_desc = scene_analysis.get("description")
        if scene_desc is None:
            scene_desc = "Scene analysis requires detected objects."
        elif not isinstance(scene_desc, str):
            scene_desc = str(scene_desc)
        print(f"Original scene description (first 50 chars): {scene_desc[:50]}...")

        clean_scene_desc = _clean_scene_description(scene_desc)
        print(f"Cleaned scene description (first 50 chars): {clean_scene_desc[:50]}...")

        # Show the raw description rather than nothing if cleaning emptied it.
        if not clean_scene_desc.strip():
            clean_scene_desc = scene_desc
        if not clean_scene_desc.strip():
            print("WARNING: Scene description is empty after cleaning!")

        # Normalise the LLM description to a string so None never reaches .strip().
        enhanced_description = scene_analysis.get("enhanced_description") or ""
        if not isinstance(enhanced_description, str):
            enhanced_description = str(enhanced_description)

        # Whitespace-only text does not count as an enhanced description.
        has_enhanced = bool(use_llm and enhanced_description.strip())
        if use_llm and not has_enhanced:
            print("WARNING: LLM enhanced description is empty!")

        # Badge, sub-title and body depend on whether the LLM text is shown.
        if has_enhanced:
            llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background: linear-gradient(90deg, #38b2ac, #4299e1); color:white; font-size:0.7rem; font-weight:bold; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); border: 1px solid rgba(255, 255, 255, 0.2);">LLM Enhanced</span>'
            source_note = '(Enhanced by AI language model)'
            description_to_show = enhanced_description
        else:
            llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background-color:#718096; color:white; font-size:0.7rem; font-weight:bold;">Basic</span>'
            source_note = '(Based on object detection)'
            description_to_show = clean_scene_desc

        scene_description_html = f'''
        <div>
            <div class="section-heading" style="font-size:1.2rem; margin-top:15px;">Scene Description {llm_badge}
                <span style="font-size:0.8rem; color:#666; font-weight:normal; display:block; margin-top:2px;">
                    {source_note}
                </span>
            </div>
            <div style="padding:15px; background-color:#ffffff; border-radius:8px; border:1px solid #e2e8f0; margin-bottom:20px; box-shadow:0 1px 3px rgba(0,0,0,0.05);">
                {description_to_show}
            </div>
        </div>
        '''

        # The original description is only revealed next to an LLM-enhanced one.
        original_desc_visibility = "block" if has_enhanced else "none"
        original_desc_html = f'''
        <div id="original_scene_analysis_accordion" style="display: {original_desc_visibility};">
            <div style="padding:15px; background-color:#f0f0f0; border-radius:8px; border:1px solid #e2e8f0;">
                {clean_scene_desc}
            </div>
        </div>
        '''

        # Single-column rows for the activities Dataframe
        activities_list = scene_analysis.get("possible_activities", [])
        if activities_list:
            activities_list_data = [[activity] for activity in activities_list]
        else:
            activities_list_data = [["No specific activities inferred"]]

        # Single-column rows for the safety concerns Dataframe
        safety_concerns_list = scene_analysis.get("safety_concerns", [])
        if safety_concerns_list:
            safety_data = [[concern] for concern in safety_concerns_list]
        else:
            safety_data = [["No safety concerns detected"]]

        zones = scene_analysis.get("functional_zones", {})
        lighting = scene_analysis.get("lighting_conditions", {"time_of_day": "unknown", "confidence": 0})

        return (result_image, result_text, formatted_stats, plot_figure,
            scene_description_html, original_desc_html,
            activities_list_data, safety_data, zones, lighting)

    except Exception as e:
        print(f"Error in handle_image_upload: {e}")
        import traceback
        error_msg = f"Error processing image: {str(e)}\n{traceback.format_exc()}"
        fig = _message_figure("Processing Error", color="red")
        # Ensure return structure matches outputs even on error
        return (None, error_msg, {}, fig, f"<div>Error: {str(e)}</div>", "Error",
            [["Error"]], [["Error"]], {}, {"time_of_day": "error", "confidence": 0})

def _is_youtube_url(video_url):
    """Return True when the URL's host is youtube.com, one of its subdomains, or youtu.be."""
    from urllib.parse import urlparse

    if not isinstance(video_url, str) or not video_url.strip():
        return False

    candidate = video_url.strip()
    # Scheme-less input such as "youtube.com/watch?v=..." would otherwise parse
    # with an empty host, so give it a default scheme before parsing.
    if not re.match(r'^[A-Za-z][A-Za-z0-9+.\-]*://', candidate):
        candidate = f"https://{candidate}"

    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or "").lower()
    except ValueError:
        return False

    if parsed.scheme.lower() not in ("http", "https"):
        return False
    return host in ("youtu.be", "youtube.com") or host.endswith(".youtube.com")


def download_video_from_url(video_url, max_duration_minutes=10):
    """
    Downloads a video from a YouTube URL and returns the local path to the downloaded file.

    The host of the URL is validated (not just searched for the text
    "youtube.com"), so look-alike or unrelated URLs are rejected before
    yt-dlp is involved.

    Args:
        video_url (str): URL of the YouTube video to download
        max_duration_minutes (int): Maximum allowed video duration in minutes

    Returns:
        tuple: (Path to the downloaded video file or None, Error message or None)
    """
    output_path = None
    try:
        if not _is_youtube_url(video_url):
            return None, "Only YouTube URLs are supported at this time. Please enter a valid YouTube URL."
        video_url = video_url.strip()

        # The download goes to a uniquely named file in the system temp directory
        output_path = os.path.join(tempfile.gettempdir(), f"downloaded_{uuid.uuid4().hex}.mp4")

        # Import yt-dlp here to avoid dependency if not needed
        import yt_dlp

        # Setup yt-dlp options
        ydl_opts = {
            'format': 'best[ext=mp4]/best',  # Best quality MP4 or best available format
            'outtmpl': output_path,
            'noplaylist': True,
            'quiet': False,  # Set to True to reduce output
            'no_warnings': False,
        }

        # First extract info to check duration
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print(f"Extracting info from YouTube URL: {video_url}")
            info_dict = ydl.extract_info(video_url, download=False)

            # Check if video exists
            if not info_dict:
                return None, "Could not retrieve video information. Please check the URL."

            video_title = info_dict.get('title', 'Unknown Title')
            # The key can be present with a None value; treat that like a
            # missing duration (0) instead of failing on the comparison below.
            duration = info_dict.get('duration') or 0

            print(f"Video title: {video_title}")
            print(f"Video duration: {duration} seconds")

            # Check video duration
            if duration > max_duration_minutes * 60:
                return None, f"Video is too long ({duration} seconds). Maximum duration is {max_duration_minutes} minutes."

            # Download the video
            print(f"Downloading YouTube video: {video_title}")
            ydl.download([video_url])

        # Verify the file exists and has content
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            if os.path.exists(output_path):
                os.remove(output_path)  # drop the empty file
            return None, "Download failed: Empty or missing file."

        print(f"Successfully downloaded video to: {output_path}")
        return output_path, None

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error downloading video: {e}\n{error_details}")
        # Do not leave a partially written file behind in the temp directory.
        if output_path and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError as cleanup_error:
                print(f"Could not remove partial download {output_path}: {cleanup_error}")
        return None, f"Error downloading video: {str(e)}"


# @spaces.GPU
def handle_video_upload(video_input, video_url, input_type, model_name, confidence_threshold, process_interval):
    """Handles video upload or URL input and calls the VideoProcessor.

    Args:
        video_input: Path of the uploaded video (used when input_type is "upload").
        video_url: Video URL to download (used when input_type is "url").
        input_type: Either "upload" or "url".
        model_name: Detection model file name.
        confidence_threshold: Minimum detection confidence.
        process_interval: Frame interval between analyzed frames; converted to int.

    Returns:
        tuple: (output video path or None, summary HTML, statistics dict).
        Failures are reported through the summary HTML and an {"error": ...}
        statistics dict instead of raising.
    """
    from html import escape

    def wrap_summary(text):
        # Shared wrapper so every message gets the same styling hook.
        return f"<div class='video-summary-content-wrapper'><pre>{text}</pre></div>"

    print(f"Received video request: input_type={input_type}")
    video_path = None
    downloaded_path = None  # set only for files this function downloaded itself

    # Handle based on input type
    if input_type == "upload" and video_input:
        print("Processing uploaded video file")
        video_path = video_input
    elif input_type == "url" and video_url:
        print(f"Processing video from URL: {video_url}")
        # Download video from URL
        video_path, error_message = download_video_from_url(video_url)
        if error_message:
            # Error text can contain "<", "&" or echo the URL; escape it for the <pre>.
            return None, wrap_summary(escape(error_message)), {"error": error_message}
        downloaded_path = video_path
    else:
        print("No valid video input provided.")
        return None, wrap_summary("Please upload a video file or provide a valid video URL."), {}

    print(f"Starting video processing with: model={model_name}, confidence={confidence_threshold}, interval={process_interval}")
    output_video_path = None
    try:
        # Call the VideoProcessor method
        output_video_path, summary_text, stats_dict = video_processor.process_video_file(
            video_path=video_path,
            model_name=model_name,
            confidence_threshold=confidence_threshold,
            process_interval=int(process_interval) # Ensure interval is int
        )
        # A missing summary must not turn a successful run into an error.
        if summary_text is None:
            summary_text = ""
        print(f"Video processing function returned: path={output_video_path}, summary length={len(summary_text)}")

        # Wrap processing summary in HTML tags for consistent styling with scene understanding page
        # NOTE(review): the summary is inserted as-is, as before; confirm whether
        # the processor emits plain text (then it should be escaped too).
        summary_html = wrap_summary(summary_text)

        # Only a non-empty dict is passed on as statistics
        formatted_stats = stats_dict if stats_dict and isinstance(stats_dict, dict) else {}

        return output_video_path, summary_html, formatted_stats

    except Exception as e:
        print(f"Error in handle_video_upload: {e}")
        import traceback
        error_msg = f"Error processing video: {str(e)}\n{traceback.format_exc()}"
        # Tracebacks contain text like "<module>" that HTML would swallow unescaped.
        return None, wrap_summary(escape(error_msg)), {"error": str(e)}

    finally:
        # The downloaded source is a temp file owned by this call; remove it once
        # processing is over, unless the processor returned that same path.
        if (downloaded_path and downloaded_path != output_video_path
                and os.path.exists(downloaded_path)):
            try:
                os.remove(downloaded_path)
            except OSError as cleanup_error:
                print(f"Could not remove downloaded video {downloaded_path}: {cleanup_error}")


# Create Gradio Interface
def create_interface():
    """Creates the Gradio interface with Tabs."""
    css = Style.get_css()
    available_models = DetectionModel.get_available_models()
    model_choices = [model["model_file"] for model in available_models]
    class_choices_formatted = [f"{id}: {name}" for id, name in get_all_classes()] # Use formatted choices

    with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:

        # Header
        with gr.Group(elem_classes="app-header"):
              gr.HTML("""
                    <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
                        <h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
                        <h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Object Detection and Scene Understanding</h2>
                        <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;"><div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div></div>
                        <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
                            <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🖼️</span> Image Analysis</div>
                            <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🎬</span> Video Analysis</div>
                        </div>
                         <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
                             <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
                                 <span style="margin-right: 5px;">📱</span> iPhone users: HEIC images may not be supported.
                                 <a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG</a> before uploading if needed.
                             </p>
                         </div>
                    </div>
                """)

        # Main Content with Tabs
        with gr.Tabs(elem_classes="tabs"):

            # Tab 1: Image Processing
            with gr.Tab("Image Processing"):
                current_image_model = gr.State("yolov8m.pt") # State for image model selection
                with gr.Row(equal_height=False): # Allow columns to have different heights
                    # Left Column: Image Input & Controls
                    with gr.Column(scale=4, elem_classes="input-panel"):
                        with gr.Group():
                            gr.HTML('<div class="section-heading">Upload Image</div>')
                            image_input = gr.Image(type="pil", label="Upload an image", elem_classes="upload-box")

                            with gr.Accordion("Image Analysis Settings", open=False):
                                image_model_dropdown = gr.Dropdown(
                                    choices=model_choices,
                                    value="yolov8m.pt", # Default for images
                                    label="Select Model",
                                    info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
                                )
                                # Display model info
                                image_model_info = gr.Markdown(DetectionModel.get_model_description("yolov8m.pt"))

                                image_confidence = gr.Slider(
                                    minimum=0.1, maximum=0.9, value=0.25, step=0.05,
                                    label="Confidence Threshold",
                                    info="Minimum confidence for displaying a detected object"
                                )

                                use_llm = gr.Checkbox(
                                    label="Use LLM for enhanced scene descriptions",
                                    value=True,
                                    info="Provides more detailed and natural language descriptions (may increase processing time)"
                                )

                                with gr.Accordion("Filter Classes", open=False):
                                     gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
                                     with gr.Row():
                                         people_btn = gr.Button("People", size="sm")
                                         vehicles_btn = gr.Button("Vehicles", size="sm")
                                         animals_btn = gr.Button("Animals", size="sm")
                                         objects_btn = gr.Button("Common Objects", size="sm")
                                     image_class_filter = gr.Dropdown(
                                         choices=class_choices_formatted, # Use formatted choices
                                         multiselect=True,
                                         label="Select Classes to Display",
                                         info="Leave empty to show all detected objects"
                                     )

                        image_detect_btn = gr.Button("Analyze Image", variant="primary", elem_classes="detect-btn")

                        with gr.Group(elem_classes="how-to-use"):
                             gr.HTML('<div class="section-heading">How to Use (Image)</div>')
                             gr.Markdown("""
                                    1. Upload an image or use the camera
                                    2. (Optional) Adjust settings like confidence threshold or model size (n, m=balanced, x=accurate)
                                    3. In Analysis Settings, you can uncheck "Use LLM for enhanced scene descriptions" if you prefer faster processing
                                    4. Optionally filter to specific object classes
                                    5. Click **Detect Objects** button
                                """)
                        # Image Examples
                        gr.Examples(
                            examples=[
                                "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/room_01.jpg",
                                "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/room_02.jpg",
                                "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/street_02.jpg",
                                "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/street_04.jpg"
                                ],
                            inputs=image_input,
                            label="Example Images"
                         )

                    # Right Column: Image Results
                    with gr.Column(scale=6, elem_classes="output-panel"):
                        with gr.Tabs(elem_classes="tabs"):
                            with gr.Tab("Detection Result"):
                                image_result_image = gr.Image(type="pil", label="Detection Result")
                                gr.HTML('<div class="section-heading">Detection Details</div>')
                                image_result_text = gr.Textbox(label=None, lines=10, elem_id="detection-details", container=False)

                            with gr.Tab("Scene Understanding"):
                                gr.HTML('<div class="section-heading">Scene Analysis</div>')
                                gr.HTML("""
                                    <details class="info-details" style="margin: 5px 0 15px 0;">
                                        <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
                                            🔍 The AI Vision Scout Report: Click for important notes about this analysis
                                        </summary>
                                        <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
                                            <p style="font-size: 13px; color: #718096; margin: 0;">
                                                <b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
                                                Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
                                                Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
                                            </p>
                                        </div>
                                    </details>
                                """)

                                gr.HTML('''
                                    <div style="margin-top: 5px; padding: 6px 10px; background-color: #f0f9ff; border-radius: 4px; border-left: 3px solid #63b3ed; font-size: 12px; margin-bottom: 10px;">
                                        <p style="margin: 0; color: #4a5568;">
                                            <b>Note:</b> AI descriptions may vary slightly with each generation, reflecting the creative nature of AI. This is similar to how a person might use different words each time they describe the same image. Processing time may be longer during first use or when analyzing complex scenes, as the LLM enhancement requires additional computational resources.
                                        </p>
                                    </div>
                                    ''')
                                image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")

                                # 使用LLM增強敘述時也會顯示原本敘述內容
                                with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
                                    image_llm_description = gr.HTML(label=None, elem_id="original_scene_description_text")

                                with gr.Row():
                                     with gr.Column(scale=1):
                                         gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
                                         image_activities_list = gr.Dataframe(headers=["Activity"], datatype=["str"], row_count=5, col_count=1, wrap=True)

                                     with gr.Column(scale=1):
                                         gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
                                         image_safety_list = gr.Dataframe(headers=["Concern"], datatype=["str"], row_count=5, col_count=1, wrap=True)

                                gr.HTML('<div class="section-heading">Functional Zones</div>')
                                image_zones_json = gr.JSON(label=None, elem_classes="json-box")

                                gr.HTML('<div class="section-heading">Lighting Conditions</div>')
                                image_lighting_info = gr.JSON(label=None, elem_classes="json-box")

                            with gr.Tab("Statistics"):
                                with gr.Row():
                                    with gr.Column(scale=3, elem_classes="plot-column"):
                                        gr.HTML('<div class="section-heading">Object Distribution</div>')
                                        image_plot_output = gr.Plot(label=None, elem_classes="large-plot-container")
                                    with gr.Column(scale=2, elem_classes="stats-column"):
                                        gr.HTML('<div class="section-heading">Detection Statistics</div>')
                                        image_stats_json = gr.JSON(label=None, elem_classes="enhanced-json-display")

            # Tab 2: Video Processing
            with gr.Tab("Video Processing"):
                with gr.Row(equal_height=False):
                    # Left Column: Video Input & Controls
                    with gr.Column(scale=4, elem_classes="input-panel"):
                        with gr.Group():
                            gr.HTML('<div class="section-heading">Video Input</div>')

                            # Add input type selection
                            video_input_type = gr.Radio(
                                ["upload", "url"],
                                label="Input Method",
                                value="upload",
                                info="Choose how to provide the video"
                            )

                            # File upload (will be shown/hidden based on selection)
                            with gr.Group(elem_id="upload-video-group"):
                                video_input = gr.Video(
                                    label="Upload a video file (MP4, AVI, MOV)",
                                    sources=["upload"],
                                    visible=True
                                )

                            # URL input (will be shown/hidden based on selection)
                            with gr.Group(elem_id="url-video-group"):
                                video_url_input = gr.Textbox(
                                    label="Enter video URL (YouTube or direct video link)",
                                    placeholder="https://www.youtube.com/watch?v=...",
                                    visible=False,
                                    elem_classes="custom-video-url-input"
                                )
                                gr.HTML("""
                                    <div style="padding: 8px; margin-top: 5px; background-color: #fff8f8; border-radius: 4px; border-left: 3px solid #f87171; font-size: 12px;">
                                        <p style="margin: 0; color: #4b5563;">
                                            Note: Currently only YouTube URLs are supported. Maximum video duration is 10 minutes. Due to YouTube's anti-bot protection, some videos may not be downloadable. For protected videos, please upload a local video file instead.
                                        </p>
                                    </div>
                                """)

                            with gr.Accordion("Video Analysis Settings", open=True):
                                video_model_dropdown = gr.Dropdown(
                                    choices=model_choices,
                                    value="yolov8n.pt", # Default 'n' for video
                                    label="Select Model (Video)",
                                    info="Faster models (like 'n') are recommended"
                                )
                                video_confidence = gr.Slider(
                                    minimum=0.1, maximum=0.9, value=0.4, step=0.05,
                                    label="Confidence Threshold (Video)"
                                )
                                video_process_interval = gr.Slider(
                                    minimum=1, maximum=60, value=10, step=1, # Allow up to 60 frame interval
                                    label="Processing Interval (Frames)",
                                    info="Analyze every Nth frame (higher value = faster)"
                                )
                        video_process_btn = gr.Button("Process Video", variant="primary", elem_classes="detect-btn")

                        with gr.Group(elem_classes="how-to-use"):
                            gr.HTML('<div class="section-heading">How to Use (Video)</div>')
                            gr.Markdown("""
                            1. Choose your input method: Upload a file or enter a URL.
                            2. Adjust settings if needed (using a faster model and larger interval is recommended for longer videos).
                            3. Click "Process Video". **Processing can take a significant amount of time.**
                            4. The annotated video and summary will appear on the right when finished.
                            """)

                        # Add video examples
                        gr.HTML('<div class="section-heading">Example Videos</div>')
                        gr.HTML("""
                            <div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
                                <p style="font-size: 14px; color: #4A5568; margin: 0;">
                                    Upload any video containing objects that YOLO can detect. For testing, find sample videos
                                    <a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">here</a>.
                                </p>
                            </div>
                        """)

                    # Right Column: Video Results
                    with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
                        gr.HTML("""
                            <div class="section-heading">Video Result</div>
                            <details class="info-details" style="margin: 5px 0 15px 0;">
                                <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
                                    🎬 Video Processing Notes
                                </summary>
                                <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
                                    <p style="font-size: 13px; color: #718096; margin: 0;">
                                        The processed video includes bounding boxes around detected objects. For longer videos,
                                        consider using a faster model (like YOLOv8n) and a higher frame interval to reduce processing time.
                                    </p>
                                </div>
                            </details>
                        """)
                        video_output = gr.Video(label="Processed Video", elem_classes="video-output-container") # Output for the processed video file

                        gr.HTML('<div class="section-heading">Processing Summary</div>')
                        # 使用HTML顯示影片的摘要
                        video_summary_text = gr.HTML(
                            label=None,
                            elem_id="video-summary-html-output"
                        )

                        gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
                        video_stats_json = gr.JSON(label=None, elem_classes="video-stats-display") # Display statistics

        # Event Listeners
        # Image Model Change Handler
        image_model_dropdown.change(
            fn=lambda model: (model, DetectionModel.get_model_description(model)),
            inputs=[image_model_dropdown],
            outputs=[current_image_model, image_model_info] # Update state and description
        )

        # Image Filter Buttons
        available_classes_list = get_all_classes() # Get list of (id, name)
        people_classes_ids = [0]
        vehicles_classes_ids = [1, 2, 3, 4, 5, 6, 7, 8]
        animals_classes_ids = list(range(14, 24))
        common_objects_ids = [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73] # Bottle, cup, fork, knife, spoon, bowl, chair, couch, table, tv, laptop, phone, book

        people_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids], outputs=image_class_filter)
        vehicles_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids], outputs=image_class_filter)
        animals_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids], outputs=image_class_filter)
        objects_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids], outputs=image_class_filter)

        video_input_type.change(
            fn=lambda input_type: [
                # Show/hide file upload
                gr.update(visible=(input_type == "upload")),
                # Show/hide URL input
                gr.update(visible=(input_type == "url"))
            ],
            inputs=[video_input_type],
            outputs=[video_input, video_url_input]
        )

        image_detect_btn.click(
            fn=handle_image_upload,
            inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter, use_llm],
            outputs=[
                image_result_image, image_result_text, image_stats_json, image_plot_output,
                image_scene_description_html, image_llm_description, image_activities_list, image_safety_list, image_zones_json,
                image_lighting_info
            ]
        )

        video_process_btn.click(
            fn=handle_video_upload,
            inputs=[
                video_input,
                video_url_input,
                video_input_type,
                video_model_dropdown,
                video_confidence,
                video_process_interval
            ],
            outputs=[video_output, video_summary_text, video_stats_json]
        )

        # Footer
        gr.HTML("""
             <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
                 <div style="margin-bottom: 15px;">
                     <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
                 </div>
                 <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
                     <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
                     <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
                         <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
                     </a>
                 </div>
             </div>
         """)

    return demo


if __name__ == "__main__":
    # Build the UI once and keep a module-level handle to whatever
    # create_interface() returns (presumably the Gradio Blocks app).
    demo_interface = create_interface()

    # debug=True keeps this cell running so server errors and logs stay visible
    # in the notebook output; interrupt the cell to shut the server down.
    # In a hosted notebook such as Colab, Gradio turns on share=True by itself
    # unless share=False is passed explicitly.
    demo_interface.launch(debug=True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://117a130c808b33a7e3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Processing image with model: yolov8m.pt, confidence: 0.25, use_llm: True
Creating new model instance for yolov8m.pt
Loading model: yolov8m.pt
Successfully loaded model: yolov8m.pt
Number of classes the model can recognize: 80

image 1/1 /tmp/temp_768a6094d7514bc3a98abdfa6db793f7.jpg: 640x448 16 persons, 2 cars, 1 airplane, 2 skateboards, 134.4ms
Speed: 26.7ms preprocess, 134.4ms inference, 467.6ms postprocess per image at shape (1, 3, 640, 448)
Loading CLIP model ViT-B/32 on cuda...


2025-05-14 05:29:29,617 - LLMEnhancer - INFO - Using device: cuda
INFO:LLMEnhancer:Using device: cuda


CLIP model loaded successfully.
LLM enhancer initialized successfully.


2025-05-14 05:29:30,098 - LLMEnhancer - INFO - Model not loaded, no context to reset
INFO:LLMEnhancer:Model not loaded, no context to reset
2025-05-14 05:29:30,100 - LLMEnhancer - INFO - Loading LLM model from meta-llama/Llama-3.2-3B-Instruct with 8-bit quantization
INFO:LLMEnhancer:Loading LLM model from meta-llama/Llama-3.2-3B-Instruct with 8-bit quantization


Total GPU memory: 14.74 GB


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2025-05-14 05:30:10,074 - LLMEnhancer - INFO - Model loaded successfully with 8-bit quantization
INFO:LLMEnhancer:Model loaded successfully with 8-bit quantization
2025-05-14 05:30:10,075 - LLMEnhancer - INFO - Generating LLM response...
INFO:LLMEnhancer:Generating LLM response...
2025-05-14 05:30:10,077 - LLMEnhancer - INFO - LLM call #1
INFO:LLMEnhancer:LLM call #1


Processing scene_analysis: dict_keys(['scene_type', 'scene_name', 'confidence', 'description', 'enhanced_description', 'objects_present', 'object_count', 'regions', 'possible_activities', 'safety_concerns', 'functional_zones', 'alternative_scenes', 'lighting_conditions', 'clip_analysis'])
Original scene description (first 50 chars): From an aerial perspective,. An aerial view showin...
Cleaned scene description (first 50 chars): From an aerial perspective,. An aerial view showin...
Processing image with model: yolov8m.pt, confidence: 0.25, use_llm: True
Updated existing scene_analyzer use_llm setting to: True
Using existing model instance for yolov8m.pt

image 1/1 /tmp/temp_62ce2cb1c79a4beea9446907ef46f346.jpg: 640x448 1 person, 6 cars, 5 traffic lights, 1 handbag, 27.5ms
Speed: 3.6ms preprocess, 27.5ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 448)


2025-05-14 05:34:04,071 - LLMEnhancer - INFO - Model context reset
INFO:LLMEnhancer:Model context reset
2025-05-14 05:34:04,074 - LLMEnhancer - INFO - Generating LLM response...
INFO:LLMEnhancer:Generating LLM response...
2025-05-14 05:34:04,075 - LLMEnhancer - INFO - LLM call #2
INFO:LLMEnhancer:LLM call #2


Processing scene_analysis: dict_keys(['scene_type', 'scene_name', 'confidence', 'description', 'enhanced_description', 'objects_present', 'object_count', 'regions', 'possible_activities', 'safety_concerns', 'functional_zones', 'alternative_scenes', 'lighting_conditions', 'clip_analysis'])
Original scene description (first 50 chars): A busy urban crossroad with pedestrian crossings a...
Cleaned scene description (first 50 chars): A busy urban crossroad with pedestrian crossings a...
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://117a130c808b33a7e3.gradio.live


In [None]:
# %%writefile requirements.txt
# torch>=2.0.0
# torchvision>=0.15.0
# ultralytics>=8.0.0
# opencv-python>=4.7.0
# pillow>=9.4.0
# numpy>=1.23.5
# matplotlib>=3.7.0
# gradio>=4.0.0
# git+https://github.com/openai/CLIP.git
# yt-dlp>=2023.3.4
# requests>=2.28.1
# transformers
# accelerate
# bitsandbytes
# sentencepiece
# huggingface_hub>=0.19.0

In [2]:
# from google.colab import files

# files.download('detection_model.py')
# files.download('color_mapper.py')
# files.download('visualization_helper.py')
# files.download('evaluation_metrics.py')
# files.download('style.py')
# files.download('scene_type.py')
# files.download('confifence_templates.py')
# files.download('scene_detail_templates.py')
# files.download('object_template_fillers.py')
# files.download('safety_templates.py')
# files.download('activity_templates.py')
# files.download('object_categories.py')
# files.download('lighting_conditions.py')
# files.download('viewpoint_templates.py')
# files.download('cultural_templates.py')
# files.download('spatial_analyzer.py')
# files.download('enhance_scene_describer.py')
# files.download('lighting_analyzer.py')
# files.download('scene_description.py')
# files.download('clip_prompts.py')
# files.download('clip_analyzer.py')
# files.download('scene_analyzer.py')
# files.download('image_processor.py')
# files.download('video_processor.py')
# files.download('llm_enhancer.py')
# files.download('app.py')

# files.download('requirements.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#    # Set up example images
# examples=[
#             "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/room_01.jpg",
#             "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/room_02.jpg",
#             "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/street_02.jpg",
#             "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/street_04.jpg"
#             ],