In [None]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from collections import defaultdict
import yaml

# Add src to path
sys.path.insert(0, '../')
from src.data.dataset import GraffitiDataset, load_image_paths_from_file, get_label_path_from_image_path
from src.utils.visualization import draw_yolo_labels

# Plot defaults for the figures in this notebook: seaborn's 'whitegrid'
# style and a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## 1. Load Dataset Configuration

In [None]:
# Read the dataset description from YAML
config_path = '../configs/dataset.yaml'

with open(config_path, 'r') as config_file:
    dataset_config = yaml.safe_load(config_file)

# Echo the three fields reported below, one per line
print("Dataset Configuration:")
for caption, config_key in (
    ("Path", 'path'),
    ("Number of classes", 'nc'),
    ("Class names", 'names'),
):
    print(f"  {caption}: {dataset_config[config_key]}")

## 2. Load Image Paths

In [None]:
# Resolve the split list files relative to the dataset root
data_root = Path(dataset_config['path'])
train_txt = data_root / dataset_config['train']
val_txt = data_root / dataset_config['val']
# The `test` entry is optional. A key that is present but empty (`test:`)
# is loaded by YAML as None, and `Path / None` raises TypeError, so treat
# "missing" and "empty" the same way and fall back to the default name.
test_txt = data_root / (dataset_config.get('test') or 'test.txt')


def _load_split(list_file):
    """Return the image paths listed in `list_file`, or [] if the file does not exist."""
    if not list_file.exists():
        return []
    return load_image_paths_from_file(str(list_file), str(data_root))


# Load image paths (a split whose list file is absent is simply empty)
train_images = _load_split(train_txt)
val_images = _load_split(val_txt)
test_images = _load_split(test_txt)

print("Dataset Statistics:")
print(f"  Training images: {len(train_images)}")
print(f"  Validation images: {len(val_images)}")
print(f"  Test images: {len(test_images)}")
print(f"  Total images: {len(train_images) + len(val_images) + len(test_images)}")

## 3. Analyze Dataset Statistics

In [None]:
def analyze_images_and_labels(image_paths):
    """Collect image-size and label statistics for a list of images.

    Every image is read with OpenCV; images that cannot be read are left out
    of all statistics. The label file for an image is located with
    `get_label_path_from_image_path` and parsed line by line. Lines are
    assumed to follow the YOLO layout `class x_center y_center width height`;
    only fields 0, 3 and 4 are read here. Lines with fewer than five
    whitespace-separated fields (including blank lines) are ignored.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        dict with the keys:
            'widths', 'heights': pixel size of each readable image.
            'num_objects': for each readable image, the number of annotation
                lines that were actually parsed (0 when there is no label
                file). Ignored lines are not counted, so the sum of this
                list equals the sum of 'class_counts' and the length of the
                box lists.
            'class_counts': defaultdict(int) mapping class id -> instances.
            'box_widths', 'box_heights', 'box_areas': for each parsed
                annotation, field 3, field 4 and their product.

    Raises:
        ValueError: If a line with five or more fields has a class id that
            is not an integer literal or a width/height that is not a float.
    """
    stats = {
        'widths': [],
        'heights': [],
        'num_objects': [],
        'class_counts': defaultdict(int),
        'box_widths': [],
        'box_heights': [],
        'box_areas': []
    }

    for img_path in image_paths:
        img = cv2.imread(img_path)
        if img is None:
            # Unreadable image: exclude it from every statistic.
            continue

        h, w = img.shape[:2]
        stats['widths'].append(w)
        stats['heights'].append(h)

        label_path = get_label_path_from_image_path(img_path)
        if not os.path.exists(label_path):
            # No label file: the image contributes zero objects.
            stats['num_objects'].append(0)
            continue

        with open(label_path, 'r') as f:
            rows = [line.split() for line in f]

        parsed = 0
        for parts in rows:
            if len(parts) < 5:
                # Blank or truncated line: not a usable annotation, so it
                # must not be counted as an object either.
                continue

            class_id = int(parts[0])
            box_w = float(parts[3])
            box_h = float(parts[4])

            stats['class_counts'][class_id] += 1
            stats['box_widths'].append(box_w)
            stats['box_heights'].append(box_h)
            stats['box_areas'].append(box_w * box_h)
            parsed += 1

        stats['num_objects'].append(parsed)

    return stats

def _announce_and_analyze(split_label, split_images):
    """Print a progress line for the split, then return its statistics."""
    print(f"Analyzing {split_label} set...")
    return analyze_images_and_labels(split_images)


# Statistics for the two splits that the following sections plot
train_stats = _announce_and_analyze("training", train_images)

val_stats = _announce_and_analyze("validation", val_images)

print("Analysis complete!")

## 4. Visualize Image Dimensions

In [None]:
# One panel per image dimension, with the train and val histograms overlaid.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

dimension_panels = (
    ('widths', 'Width'),
    ('heights', 'Height'),
)

for ax, (stat_key, dim_name) in zip(axes, dimension_panels):
    ax.hist(train_stats[stat_key], bins=30, alpha=0.7, label='Train', color='blue')
    ax.hist(val_stats[stat_key], bins=30, alpha=0.7, label='Val', color='orange')
    ax.set_xlabel(f'Image {dim_name} (pixels)')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Image {dim_name} Distribution')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Mean width x height per split
for split_name, stats_for_split in (('train', train_stats), ('val', val_stats)):
    mean_width = np.mean(stats_for_split['widths'])
    mean_height = np.mean(stats_for_split['heights'])
    print(f"Average image size ({split_name}): {mean_width:.0f} x {mean_height:.0f}")

## 5. Analyze Objects per Image

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

objects_per_image = train_stats['num_objects']
# `max()` raises ValueError on an empty list (e.g. the dataset has not been
# collected yet), so supply a default and reuse the value below.
max_objects = max(objects_per_image, default=0)

# Objects per image: integer-aligned bins, one bin per object count
axes[0].hist(objects_per_image, bins=range(0, max_objects + 2),
             alpha=0.7, label='Train', color='blue')
axes[0].set_xlabel('Number of Objects per Image')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Objects per Image Distribution')
axes[0].legend()
axes[0].grid(True, alpha=0.3)


def _class_label(class_id):
    """Return the configured name for `class_id`, or a placeholder if it is not configured."""
    try:
        return dataset_config['names'][class_id]
    except (KeyError, IndexError):
        # A label file references a class the config does not define; show
        # it in the chart instead of aborting the whole cell.
        return f"class_{class_id} (unknown)"


# Class distribution, ordered by class id so the bar order does not depend
# on which class happens to be encountered first in the label files.
class_ids = sorted(train_stats['class_counts'])
class_counts = [train_stats['class_counts'][cid] for cid in class_ids]
class_names = [_class_label(cid) for cid in class_ids]

axes[1].bar(class_names, class_counts, color='green', alpha=0.7)
axes[1].set_xlabel('Class')
axes[1].set_ylabel('Number of Instances')
axes[1].set_title('Class Distribution (Training Set)')
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

if objects_per_image:
    print(f"Average objects per image (train): {np.mean(objects_per_image):.2f}")
else:
    # np.mean([]) would only yield nan plus a RuntimeWarning
    print("Average objects per image (train): N/A")
print(f"Max objects per image (train): {max_objects}")
print(f"Total annotations (train): {sum(objects_per_image)}")

## 6. Analyze Bounding Box Sizes

In [None]:
# Three histograms of the training-set box statistics: width, height, area.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

box_panels = (
    ('box_widths', 'Width', 'purple'),
    ('box_heights', 'Height', 'brown'),
    ('box_areas', 'Area', 'teal'),
)

for ax, (stat_key, quantity, panel_color) in zip(axes, box_panels):
    ax.hist(train_stats[stat_key], bins=50, alpha=0.7, color=panel_color)
    ax.set_xlabel(f'Normalized Box {quantity}')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Bounding Box {quantity} Distribution')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Mean of each box statistic over the training set
for stat_key, quantity, panel_color in box_panels:
    print(f"Average box {quantity.lower()}: {np.mean(train_stats[stat_key]):.3f}")

## 7. Visualize Sample Images with Annotations

In [None]:
# Select random samples. The generator is seeded so that Restart & Run All
# shows the same images every time; change SAMPLE_SEED to see other samples.
n_samples = 6
SAMPLE_SEED = 42
sample_rng = np.random.default_rng(SAMPLE_SEED)
n_shown = min(n_samples, len(train_images))
sample_indices = (
    sample_rng.choice(len(train_images), size=n_shown, replace=False)
    if n_shown else np.array([], dtype=int)
)

# Size the grid from n_samples (3 columns) so changing n_samples does not
# silently drop samples; n_samples = 6 gives the 2 x 3 layout.
n_cols = 3
n_rows = max(1, -(-n_samples // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, ax in enumerate(axes):
    if idx >= len(sample_indices):
        # Unused grid cell
        ax.axis('off')
        continue

    img_path = train_images[sample_indices[idx]]
    label_path = get_label_path_from_image_path(img_path)

    # Load image. cv2.imread returns None instead of raising when the file
    # cannot be read, and cv2.cvtColor would then fail, so handle it here.
    img = cv2.imread(img_path)
    if img is None:
        ax.set_title(f"Sample {idx+1} (unreadable image)")
        ax.axis('off')
        continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Load labels: keep the first five fields of every line that has them
    labels = []
    if os.path.exists(label_path):
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 5:
                    labels.append([float(p) for p in parts[:5]])

    labels = np.array(labels) if labels else np.zeros((0, 5))

    # Draw labels
    if len(labels) > 0:
        img = draw_yolo_labels(img, labels, dataset_config['names'])

    ax.imshow(img)
    ax.set_title(f"Sample {idx+1} ({len(labels)} objects)")
    ax.axis('off')

plt.tight_layout()
plt.show()

## 8. Dataset Summary

In [None]:
# The test split is analysed here so that its row holds real numbers rather
# than a placeholder 0, and so that the Total row covers the same three
# splits in every column.
test_stats = analyze_images_and_labels(test_images)


def _avg_objects(object_counts):
    """Format the mean objects per image, or 'N/A' when there are no images."""
    return f"{np.mean(object_counts):.2f}" if object_counts else 'N/A'


split_rows = [
    ('Train', train_images, train_stats['num_objects']),
    ('Validation', val_images, val_stats['num_objects']),
    ('Test', test_images, test_stats['num_objects']),
]
all_object_counts = [count for _name, _images, counts in split_rows for count in counts]

summary_data = {
    'Split': [name for name, _images, _counts in split_rows] + ['Total'],
    'Images': (
        [len(images) for _name, images, _counts in split_rows]
        + [sum(len(images) for _name, images, _counts in split_rows)]
    ),
    'Annotations': (
        [sum(counts) for _name, _images, counts in split_rows]
        + [sum(all_object_counts)]
    ),
    'Avg Objects/Image': (
        [_avg_objects(counts) for _name, _images, counts in split_rows]
        + [_avg_objects(all_object_counts)]
    ),
}

summary_df = pd.DataFrame(summary_data)
print("\n" + "="*60)
print("Dataset Summary")
print("="*60)
print(summary_df.to_string(index=False))
print("="*60)

## Next Steps

1. **Data Collection**: If the dataset is empty, collect and annotate graffiti images
2. **Data Augmentation**: Review augmentation strategies in `src/data/augmentation.py`
3. **Model Training**: Start training with `python scripts/train.py`
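4. **Hyperparameter Tuning**: Adjust training parameters based on the dataset characteristics measured above (e.g., choose the input image size with the image-dimension histograms in mind, and check the box-size distributions for very small objects)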