## 1. Setup and Data Loading

In [None]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from pathlib import Path
import json

# Import project modules
from data_analysis.parser import BDD100KParser
from data_analysis.analysis import BDD100KAnalyzer
from data_analysis.visualize import BDD100KVisualizer

# Set style
# Use seaborn's 'whitegrid' theme and a larger default figure size for the
# plots drawn in the cells below.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

# The check mark was stored as mis-decoded UTF-8 bytes; restored here.
print("✅ Imports successful!")

In [None]:
# Define paths (relative to the notebook's working directory)
TRAIN_JSON = '../data/bdd100k/labels/det_20/det_train.json'
VAL_JSON = '../data/bdd100k/labels/det_20/det_val.json'
TRAIN_IMAGES = '../data/bdd100k/images/100k/train'
VAL_IMAGES = '../data/bdd100k/images/100k/val'

# Check that every input exists. The image directories are checked as well
# as the label files, so a missing image folder is visible here rather than
# only when a later cell tries to load an image.
for _label, _location in (
    ("Train JSON", TRAIN_JSON),
    ("Val JSON", VAL_JSON),
    ("Train images", TRAIN_IMAGES),
    ("Val images", VAL_IMAGES),
):
    print(f"{_label} exists: {Path(_location).exists()}")

## 2. Data Parsing

In [None]:
# Load the training annotations through the project parser.
# `train_data` and `stats` are reused by later cells.
parser = BDD100KParser(TRAIN_JSON)
train_data = parser.parse_all()

# Summary statistics as reported by the parser
stats = parser.get_statistics()

# One row per printed line: (label, key in `stats`, format spec)
report_rows = (
    ("Total Images", 'total_images', ''),
    ("Total Objects", 'total_objects', ''),
    ("Avg Objects per Image", 'avg_objects_per_image', '.2f'),
    ("Empty Images", 'empty_images', ''),
)

print("\n=== Dataset Statistics ===")
for label, key, spec in report_rows:
    # An empty spec formats the value exactly like a plain f-string field.
    print(f"{label}: {stats[key]:{spec}}")

In [None]:
# Build a (Class, Count) table from the parser statistics, largest class first
class_dist = stats['class_distribution']
df_classes = pd.DataFrame(
    list(class_dist.items()),
    columns=['Class', 'Count'],
).sort_values('Count', ascending=False)

# Bar chart of object counts per class
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_classes, x='Class', y='Count', palette='viridis', ax=ax)
ax.set_title('Class Distribution in Training Set', fontsize=16, fontweight='bold')
ax.set_xlabel('Object Class', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Slanted, right-aligned tick labels so class names do not overlap
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Last expression of the cell: the notebook renders the table itself
df_classes

## 3. Full Analysis

In [None]:
# Run comprehensive analysis
# The analyzer is built from both annotation files (train and val).
# `results` is kept for the cells below, which read its
# 'objects_per_image' and 'anomalies' entries.
analyzer = BDD100KAnalyzer(TRAIN_JSON, VAL_JSON)
results = analyzer.run_full_analysis()

# Print summary
# Called after run_full_analysis(); presumably it reports the results
# computed above — confirm against BDD100KAnalyzer.
analyzer.print_summary()

In [None]:
# Objects-per-image distribution for the training split.
# Presumably maps "number of objects" -> "number of images"; the integer
# keys are what the range below relies on.
opi = results['objects_per_image']
train_dist = opi['train']['distribution']

# Fill in every object count from 0 up to the maximum so that counts
# missing from the distribution are drawn as zero-height bars.
max_objects = max(train_dist.keys())
bin_positions = range(max_objects + 1)
counts = [train_dist.get(n_objects, 0) for n_objects in bin_positions]

fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(bin_positions, counts, alpha=0.7, color='steelblue')
ax.set_xlabel('Number of Objects per Image', fontsize=12)
ax.set_ylabel('Number of Images', fontsize=12)
ax.set_title('Distribution of Objects per Image', fontsize=16, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## 4. Sample Image Visualization

In [None]:
# Select a random training image that has more than 5 annotated objects
samples_with_objects = [
    record for record in train_data if record['num_objects'] > 5
]

if not samples_with_objects:
    # Nothing to draw from; report it instead of letting the random draw raise.
    print("No images with more than 5 objects found.")
else:
    # Draw an index instead of handing the list to np.random.choice, which
    # would first convert the whole list of records into a NumPy array.
    sample = samples_with_objects[np.random.randint(len(samples_with_objects))]

    # Load and visualize image
    image_path = Path(TRAIN_IMAGES) / sample['image']

    if not image_path.exists():
        print(f"Image not found: {image_path}")
    else:
        # cv2.imread does not raise on an unreadable file; it returns None.
        img = cv2.imread(str(image_path))
        if img is None:
            print(f"Image could not be read: {image_path}")
        else:
            # OpenCV loads BGR; matplotlib expects RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            # Draw bounding boxes
            color = (255, 0, 0)  # Red (the image is RGB at this point)
            for obj in sample['objects']:
                x1, y1, x2, y2 = [int(coord) for coord in obj['bbox']]
                cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
                # Keep the label inside the image when the box touches the
                # top edge; otherwise place it just above the box.
                label_y = max(y1 - 5, 15)
                cv2.putText(img, obj['class'], (x1, label_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            plt.figure(figsize=(16, 10))
            plt.imshow(img)
            plt.title(f"{sample['image']} - {sample['num_objects']} objects",
                      fontsize=14, fontweight='bold')
            plt.axis('off')
            plt.tight_layout()
            plt.show()

            print("\nObjects in image:")
            for obj in sample['objects']:
                print(f"  - {obj['class']}")

## 5. Anomaly Analysis

In [None]:
# Anomaly section of the full-analysis results
anomalies = results['anomalies']

print("=== Anomaly Summary ===")
for split_label, split_key in (('Train', 'train'), ('Val', 'val')):
    print(f"Empty Images ({split_label}): {anomalies['empty_images'][split_key]}")

# (printed label, key in `anomalies`); the leading newline on the first
# label separates these lines from the empty-image lines above.
count_rows = (
    ("\nTiny Bounding Boxes", 'tiny_bboxes'),
    ("Occluded Objects", 'occluded_objects'),
    ("Overlapping Boxes", 'overlapping_boxes'),
)
for label, key in count_rows:
    print(f"{label}: {anomalies[key]['count']}")

# Bar chart: the empty-image bar uses the training split's count; the
# other bars use each anomaly's 'count' field.
anomaly_types = ['Empty Images', 'Tiny Bboxes', 'Occluded', 'Overlapping']
counts = [anomalies['empty_images']['train']]
counts += [anomalies[key]['count'] for _, key in count_rows]

colors = ['#ff6b6b', '#4ecdc4', '#45b7d1', '#ffa07a']
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(anomaly_types, counts, color=colors, alpha=0.7)
ax.set_title('Anomalies Detected in Dataset', fontsize=16, fontweight='bold')
ax.set_ylabel('Count', fontsize=12)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## 6. Key Insights and Recommendations

Based on the analysis above, here are the key insights:

### Dataset Characteristics
1. **Class Imbalance:** Significant imbalance with cars dominating the dataset
2. **Small Objects:** ~28% of objects are small, which makes them challenging to detect
3. **Varied Scenes:** Average of 11 objects per image with high variance

### Recommendations for Model Training

1. **Handle Class Imbalance:**
   - Use weighted loss function
   - Oversample rare classes
   - Focal loss for hard examples

2. **Improve Small Object Detection:**
   - Use higher input resolution
   - Multi-scale training
   - Mosaic augmentation

3. **Data Augmentation:**
   - Horizontal flips
   - Color jittering
   - Random crops and scales

4. **Model Selection:**
   - YOLOv8 recommended for speed/accuracy tradeoff
   - Start from COCO-pretrained weights
   - Fine-tune on BDD100K

## 7. Next Steps

To continue with this project:

```bash
# Train model
python ../model/train.py --model m --epochs 50 --batch 16

# Run evaluation
python ../evaluation/metrics.py --model runs/train/best.pt

# Error analysis
python ../evaluation/error_analysis.py --model runs/train/best.pt
```