# Exploratory Data Analysis - Satellite Vehicle Detection Dataset

**Research Project**: Comparative Analysis of CNN (YOLOv11) vs Vision Transformer (RF-DETR)

**Author**: Abdullah Waraich

## Overview
This notebook performs exploratory data analysis on the satellite vehicle detection dataset from Roboflow. The dataset contains satellite imagery with annotated vehicles and will be used to compare CNN vs Transformer approaches for object detection.

## 1. Setup and Data Loading

First, we install the required packages and download the dataset.

In [None]:
# Install required packages
!pip install roboflow matplotlib seaborn pandas numpy opencv-python pillow plotly

# Import essential libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Install an "ignore" filter for every warning category, so warnings raised
# by later cells are not shown in the notebook output.
# NOTE(review): this also hides warnings that may point at real problems.
warnings.filterwarnings('ignore')

# Set up plotting style: matplotlib's 'default' style sheet, with seaborn's
# "husl" palette as the colour palette for the plots that follow.
plt.style.use('default')
sns.set_palette("husl")

print("✅ Libraries imported successfully!")

In [None]:
# Download dataset from Roboflow
from roboflow import Roboflow

# Prefer the ROBOFLOW_API_KEY environment variable so the key never has to be
# saved inside the notebook. When it is not set, the placeholder below is
# used: replace it with your own key OR get the code snippet directly from
# Roboflow for downloading this dataset.
rf = Roboflow(api_key=os.environ.get("ROBOFLOW_API_KEY", "--------------"))
project = rf.workspace("ab-ml-cv").project("satalite-blffa")
version = project.version(1)
dataset = version.download("yolov11")

print(f" Dataset downloaded to: {dataset.location}")

# Define paths: each split has an "images" and a "labels" sub-directory
dataset_path = dataset.location
train_images_path = os.path.join(dataset_path, "train", "images")
train_labels_path = os.path.join(dataset_path, "train", "labels")
val_images_path = os.path.join(dataset_path, "valid", "images")
val_labels_path = os.path.join(dataset_path, "valid", "labels")
test_images_path = os.path.join(dataset_path, "test", "images")
test_labels_path = os.path.join(dataset_path, "test", "labels")

# Print the directory tree, indented by depth, listing at most 3 files per
# directory so large splits do not flood the output.
print("\n Dataset structure:")
for root, dirs, files in os.walk(dataset_path):
    # Depth below the dataset root = path separators left once the root
    # prefix has been removed from the current directory.
    level = root.replace(dataset_path, '').count(os.sep)
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 2 * (level + 1)
    for file in files[:3]:  # Show first 3 files
        print(f"{subindent}{file}")
    if len(files) > 3:
        print(f"{subindent}... and {len(files)-3} more files")

## 2. Dataset Overview and Basic Statistics

We start by understanding the basic structure of our dataset.

In [None]:
# Load class names from the data.yaml file at the dataset root
config_path = os.path.join(dataset_path, "data.yaml")
with open(config_path, 'r') as f:
    import yaml
    data_config = yaml.safe_load(f)

# Class metadata declared in the dataset configuration
class_names, num_classes = data_config['names'], data_config['nc']

separator = "=" * 40
print(" CLASS INFORMATION")
print(separator)
print(f"Number of classes: {num_classes}")
print(f"Class names: {class_names}")
print(separator)

# Count files in each split
def count_files(path):
    """Return the number of image entries directly inside ``path``.

    An entry counts as an image when its name ends in ``.jpg``, ``.jpeg`` or
    ``.png``. The comparison ignores case, so files such as ``IMG_001.JPG``
    are included in the split statistics. Only the directory itself is
    listed; sub-directories are not searched.

    Args:
        path: Directory to list.

    Returns:
        The number of matching entries as an ``int``.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    return sum(1 for name in os.listdir(path) if name.lower().endswith(image_suffixes))

# Number of images in each split, counted in train / valid / test order
train_count, val_count, test_count = (
    count_files(split_dir)
    for split_dir in (train_images_path, val_images_path, test_images_path)
)
total_images = train_count + val_count + test_count

print("\n DATASET SPLIT STATISTICS")
# Labels are left-aligned to 19 characters so the numbers line up in a column
split_rows = (
    ("Training images:", train_count),
    ("Validation images:", val_count),
    ("Test images:", test_count),
)
for split_label, split_count in split_rows:
    print(f"{split_label:<19}{split_count:,} ({split_count/total_images:.1%})")
print(f"{'Total images:':<19}{total_images:,}")


# Create a summary dictionary for later use
dataset_summary = dict(
    train_images=train_count,
    val_images=val_count,
    test_images=test_count,
    total_images=total_images,
    num_classes=num_classes,
    class_names=class_names,
)

## 3. Image Analysis

Now let's analyze the characteristics of our satellite images.

In [None]:
def analyze_images(images_path, sample_size=100):
    """Collect size statistics for the images stored in ``images_path``.

    Entries whose name ends in ``.jpg``, ``.jpeg`` or ``.png`` (in any letter
    case) are considered. When there are more than ``sample_size`` of them, a
    random subset of ``sample_size`` files is drawn without replacement using
    ``np.random.choice``. Progress is printed every 20 images.

    Args:
        images_path: Directory that contains the image files.
        sample_size: Maximum number of images to inspect.

    Returns:
        A dict with the keys ``'widths'`` and ``'heights'`` (pixels, from the
        PIL image size), ``'file_sizes'`` (bytes on disk divided by 1024),
        ``'channels'`` (number of bands PIL reports for the image mode) and
        ``'total_analyzed'`` (number of images inspected). The four lists are
        aligned: index ``k`` in each describes the same image.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    image_files = [f for f in os.listdir(images_path) if f.lower().endswith(image_suffixes)]

    # Sample images if dataset is large
    if len(image_files) > sample_size:
        image_files = np.random.choice(image_files, sample_size, replace=False)

    widths, heights, file_sizes, channels = [], [], [], []

    print(f" Analyzing {len(image_files)} images...")

    for i, img_file in enumerate(image_files):
        if i % 20 == 0:
            print(f"  Progress: {i+1}/{len(image_files)}")

        img_path = os.path.join(images_path, img_file)

        # File size on disk, converted from bytes to KB
        file_sizes.append(os.path.getsize(img_path) / 1024)

        # Read the dimensions and mode through PIL
        with Image.open(img_path) as img:
            width, height = img.size
            widths.append(width)
            heights.append(height)

            # getbands() has one entry per channel (3 for RGB, 4 for RGBA,
            # 1 for L), so its length is the channel count for every mode.
            channels.append(len(img.getbands()))

    return {
        'widths': widths,
        'heights': heights,
        'file_sizes': file_sizes,
        'channels': channels,
        'total_analyzed': len(image_files)
    }

# Analyze a sample of up to 200 training images
print(" ANALYZING TRAINING IMAGES")
train_img_stats = analyze_images(train_images_path, sample_size=200)

# Short aliases for the per-image lists reported below
sampled_widths = train_img_stats['widths']
sampled_heights = train_img_stats['heights']
sampled_sizes_kb = train_img_stats['file_sizes']

print("\n IMAGE CHARACTERISTICS")
print(f"Images analyzed: {train_img_stats['total_analyzed']}")
print(f"Width range: {min(sampled_widths)} - {max(sampled_widths)} pixels")
print(f"Height range: {min(sampled_heights)} - {max(sampled_heights)} pixels")
print(f"Average dimensions: {np.mean(sampled_widths):.0f} x {np.mean(sampled_heights):.0f}")
print(f"File size range: {min(sampled_sizes_kb):.1f} - {max(sampled_sizes_kb):.1f} KB")
print(f"Average file size: {np.mean(sampled_sizes_kb):.1f} KB")
print(f"Color channels: {Counter(train_img_stats['channels'])}")


In [None]:
# Visualize image statistics as a 2x2 grid of histograms
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Image Characteristics Analysis', fontsize=16, fontweight='bold')

# Width-to-height ratio of every sampled image
aspect_ratios = [w/h for w, h in zip(train_img_stats['widths'], train_img_stats['heights'])]

# One row per panel:
# (axis, values, bar colour, title, x label, format of the mean, unit suffix)
histogram_panels = [
    (axes[0, 0], train_img_stats['widths'], 'skyblue',
     'Image Width Distribution', 'Width (pixels)', '.0f', ''),
    (axes[0, 1], train_img_stats['heights'], 'lightgreen',
     'Image Height Distribution', 'Height (pixels)', '.0f', ''),
    (axes[1, 0], aspect_ratios, 'gold',
     'Aspect Ratio Distribution', 'Width/Height Ratio', '.2f', ''),
    (axes[1, 1], train_img_stats['file_sizes'], 'lightcoral',
     'File Size Distribution', 'File Size (KB)', '.1f', ' KB'),
]

for panel_ax, values, bar_color, panel_title, x_label, mean_format, unit in histogram_panels:
    panel_ax.hist(values, bins=30, alpha=0.7, color=bar_color, edgecolor='black')
    # Dashed red line marks the mean of the plotted values
    panel_ax.axvline(np.mean(values), color='red', linestyle='--',
                     label=f"Mean: {np.mean(values):{mean_format}}{unit}")
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Key insights
print("\n🔍 KEY INSIGHTS:")
unique_widths = len(set(train_img_stats['widths']))
unique_heights = len(set(train_img_stats['heights']))

# A single distinct width and a single distinct height means uniform dimensions
if (unique_widths, unique_heights) == (1, 1):
    print(" All images have consistent dimensions - good for training!")
else:
    print(f"  Images have varying dimensions - {unique_widths} unique widths, {unique_heights} unique heights")

# "Approximately square" means the mean ratio lies within 5% of 1.0
avg_aspect_ratio = np.mean(aspect_ratios)
if not (0.95 <= avg_aspect_ratio <= 1.05):
    print(f" Images have aspect ratio of {avg_aspect_ratio:.2f} (width/height)")
else:
    print(" Images are approximately square")

## 4. Annotation Analysis

Now let's dive into the annotations to understand our labeled data.

In [None]:
def parse_yolo_annotations(labels_path, images_path):
    """Parse YOLO label files into a list of per-object records.

    For every ``.txt`` file in ``labels_path`` the image with the same base
    name is looked up in ``images_path`` (trying ``.jpg``, ``.png`` and then
    ``.jpeg``) and opened with PIL to read its size. Label files without a
    matching image are reported and skipped. Only lines with exactly five
    whitespace-separated fields (``class x_center y_center width height``)
    are parsed; all other lines are ignored.

    Class names are read from the module-level ``class_names``. A class id
    that has no entry there is reported as ``class_<id>`` instead of
    aborting the whole parse.

    Args:
        labels_path: Directory containing the YOLO ``.txt`` label files.
        images_path: Directory containing the corresponding images.

    Returns:
        A list of dicts, one per parsed line, holding the image file name,
        class id and name, the normalized box values as read from the file,
        the box width/height/area scaled by the image size, and the image
        width and height.
    """
    # '.jpg' and '.png' keep their original priority; '.jpeg' is tried last.
    image_suffixes = ('.jpg', '.png', '.jpeg')

    annotations = []
    label_files = [f for f in os.listdir(labels_path) if f.endswith('.txt')]

    print(f" Getting {len(label_files)} annotation files...")

    for i, label_file in enumerate(label_files):
        if i % 100 == 0:
            print(f"  Progress: {i+1}/{len(label_files)}")

        # Swap only the extension: str.replace('.txt', ...) would also
        # rewrite a '.txt' that appears in the middle of the file name.
        stem = os.path.splitext(label_file)[0]
        img_file = next(
            (stem + suffix for suffix in image_suffixes
             if os.path.exists(os.path.join(images_path, stem + suffix))),
            None,
        )
        if img_file is None:
            print(f"Could not find image for {label_file}")
            continue

        # Image size is needed to turn normalized values into pixels
        with Image.open(os.path.join(images_path, img_file)) as img:
            img_width, img_height = img.size

        with open(os.path.join(labels_path, label_file), 'r') as f:
            lines = f.readlines()

        for line in lines:
            parts = line.split()
            if len(parts) != 5:
                # Blank lines and rows that are not plain bounding boxes
                continue

            class_id = int(parts[0])
            x_center, y_center, width, height = (float(value) for value in parts[1:])

            try:
                class_name = class_names[class_id]
            except (IndexError, KeyError):
                class_name = f"class_{class_id}"

            # Convert to absolute coordinates for analysis
            abs_width = width * img_width
            abs_height = height * img_height

            annotations.append({
                'image_file': img_file,
                'class_id': class_id,
                'class_name': class_name,
                'x_center_norm': x_center,
                'y_center_norm': y_center,
                'width_norm': width,
                'height_norm': height,
                'width_abs': abs_width,
                'height_abs': abs_height,
                'area_abs': abs_width * abs_height,
                'img_width': img_width,
                'img_height': img_height
            })

    return annotations

# Parse training annotations into one DataFrame row per labelled object
print(" ANALYZING TRAINING ANNOTATIONS")
train_annotations = parse_yolo_annotations(train_labels_path, train_images_path)
train_df = pd.DataFrame(train_annotations)

annotation_total = len(train_df)

print("\n ANNOTATION STATISTICS")
print(f"Total annotations: {annotation_total:,}")
# Distinct image files that contributed at least one parsed annotation
annotated_image_total = train_df['image_file'].nunique()
print(f"Images with annotations: {annotated_image_total:,}")
print(f"Average annotations per image: {annotation_total / annotated_image_total:.2f}")


# Display first few rows
print("\n Sample annotations:")
print(train_df.head())

## 5. Class Distribution Analysis

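Understanding the class balance is crucial for model training. The dataset originally had three classes: Bus, Car, and Truck. Because these classes were heavily imbalanced, they were merged into a single class, Vehicle. Note that the distribution plots below use the annotations parsed in Section 4, before the merge, so on a fresh download they still show the original classes.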

# Class Merging

In [None]:
import os
import yaml
import shutil

# Name and ID of the single class that replaces all original classes
new_class_name, new_class_id = 'vehicle', 0

# Locate data.yaml inside the downloaded dataset (reuses dataset.location)
dataset_path = dataset.location
data_yaml_path = os.path.join(dataset_path, "data.yaml")

# Read the current configuration
with open(data_yaml_path, 'r') as f:
    data_config = yaml.safe_load(f)

# Declare exactly one class in the configuration
data_config.update(names=[new_class_name], nc=1)

# Write the modified configuration back over the original file
with open(data_yaml_path, 'w') as f:
    yaml.dump(data_config, f, default_flow_style=False)

print(f"✅ Updated {data_yaml_path} with the new class: {new_class_name} (ID: {new_class_id})")

# Function to update annotation files in a directory
def update_annotations(labels_path, class_id=None):
    """Rewrite every YOLO label file in ``labels_path`` to use one class id.

    Each ``.txt`` file is read, the first field (the class id) of every
    annotation line is replaced, and the file is overwritten in place. The
    remaining fields are written back unchanged, separated by single spaces.
    Lines with five or more fields are kept, so rows that carry more than a
    plain bounding box are no longer deleted by the rewrite. Blank lines are
    removed. Lines with fewer than five fields are dropped and the number of
    dropped lines is printed.

    Args:
        labels_path: Directory containing the YOLO ``.txt`` label files.
        class_id: Class id to write. Defaults to the module-level
            ``new_class_id`` when omitted.

    Returns:
        None. The label files are modified on disk.
    """
    if class_id is None:
        class_id = new_class_id

    label_files = [f for f in os.listdir(labels_path) if f.endswith('.txt')]
    print(f"\nUpdating annotations in: {labels_path}")

    dropped_lines = 0
    for i, label_file in enumerate(label_files):
        if i % 500 == 0:
            print(f"  Progress: {i+1}/{len(label_files)}")
        label_path = os.path.join(labels_path, label_file)

        with open(label_path, 'r') as f:
            lines = f.readlines()

        updated_lines = []
        for line in lines:
            parts = line.split()
            if not parts:
                continue
            if len(parts) < 5:
                # Too short to be an annotation; counted so the loss is visible
                dropped_lines += 1
                continue
            # Every original class id becomes the new class id
            updated_lines.append(" ".join([str(class_id), *parts[1:]]) + "\n")

        # Overwrite the original file with updated annotations
        with open(label_path, 'w') as f:
            f.writelines(updated_lines)

    if dropped_lines:
        print(f"  Dropped {dropped_lines} malformed line(s) with fewer than 5 fields")

# Rewrite the label files of the train, valid, and test splits, reusing the
# label directories defined when the dataset was downloaded
for split_labels_path in (train_labels_path, val_labels_path, test_labels_path):
    update_annotations(split_labels_path)

print("\n✅ Annotation files updated successfully!")
print("\nDataset classes have been merged into a single 'vehicle' class.")

In [None]:
# Class distribution analysis: object counts per class, most frequent first
class_counts = train_df['class_name'].value_counts()
total_objects = len(train_df)

print(" CLASS DISTRIBUTION")

# One line per class: object count and share of all objects
for class_name, count in class_counts.items():
    percentage = (count / total_objects) * 100
    print(f"{class_name:10}: {count:,} ({percentage:.1f}%)")


# Visualize class distribution in three side-by-side panels
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Class Distribution Analysis', fontsize=16, fontweight='bold')
count_ax, pie_ax, mean_ax = axes

colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# Panel 1: total objects per class
bars = count_ax.bar(class_counts.index, class_counts.values,
                    color=colors, alpha=0.8, edgecolor='black')
count_ax.set_title('Objects per Class')
count_ax.set_xlabel('Class')
count_ax.set_ylabel('Number of Objects')
count_ax.grid(True, alpha=0.3)

# Write each count just above its bar (offset = 1% of the tallest bar)
for bar, count in zip(bars, class_counts.values):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + max(class_counts.values)*0.01
    count_ax.text(label_x, label_y, f'{count:,}',
                  ha='center', va='bottom', fontweight='bold')

# Panel 2: share of each class as a pie chart
wedges, texts, autotexts = pie_ax.pie(class_counts.values, labels=class_counts.index,
                                      autopct='%1.1f%%', colors=colors, startangle=90)
pie_ax.set_title('Class Distribution (%)')

# Percentage labels in bold white
for autotext in autotexts:
    autotext.set_fontweight('bold')
    autotext.set_color('white')

# Panel 3: mean number of objects of each class per annotated image
objects_per_image = train_df.groupby(['image_file', 'class_name']).size().unstack(fill_value=0)
mean_objects_per_image = objects_per_image.mean()

bars = mean_ax.bar(mean_objects_per_image.index, mean_objects_per_image.values,
                   color=colors, alpha=0.8, edgecolor='black')
mean_ax.set_title('Average Objects per Image')
mean_ax.set_xlabel('Class')
mean_ax.set_ylabel('Average Count per Image')
mean_ax.grid(True, alpha=0.3)

# Write each mean just above its bar
for bar, count in zip(bars, mean_objects_per_image.values):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + max(mean_objects_per_image.values)*0.01
    mean_ax.text(label_x, label_y, f'{count:.1f}',
                 ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Class imbalance analysis: ratio of the largest to the smallest class
max_count = class_counts.max()
min_count = class_counts.min()
imbalance_ratio = max_count / min_count

print("\n CLASS IMBALANCE ANALYSIS")
# value_counts() sorts descending, so the first/last entries are the extremes
print(f"Most frequent class: {class_counts.index[0]} ({class_counts.iloc[0]:,} objects)")
print(f"Least frequent class: {class_counts.index[-1]} ({class_counts.iloc[-1]:,} objects)")
print(f"Imbalance ratio: {imbalance_ratio:.2f}:1")

if imbalance_ratio > 3:
    imbalance_verdict = "  Significant class imbalance detected"
elif imbalance_ratio > 2:
    imbalance_verdict = "  Moderate class imbalance"
else:
    imbalance_verdict = " Relatively balanced classes"
print(imbalance_verdict)

## 6. Object Size Analysis

Understanding the object sizes is crucial for satellite imagery, as vehicles appear very small in images.

In [None]:
# Object size analysis
print(" OBJECT SIZE ANALYSIS")

# One row per measured quantity:
# (label, DataFrame column, number format, long unit, short unit)
size_metrics = (
    ("Width", 'width_abs', '.1f', 'pixels', 'px'),
    ("Height", 'height_abs', '.1f', 'pixels', 'px'),
    ("Area", 'area_abs', '.0f', 'pixels²', 'px²'),
)

# Overall size statistics: all ranges first, then all averages
print("Overall Size Statistics:")
for label, column, fmt, long_unit, _ in size_metrics:
    print(f"{label} range: {train_df[column].min():{fmt}} - {train_df[column].max():{fmt}} {long_unit}")
for label, column, fmt, long_unit, _ in size_metrics:
    print(f"Average {label.lower()}: {train_df[column].mean():{fmt}} {long_unit}")

# Size statistics by class (classes without any object are skipped)
print("\n Size Statistics by Class:")
for class_name in class_names:
    class_data = train_df[train_df['class_name'] == class_name]
    if not len(class_data) > 0:
        continue
    print(f"\n{class_name.upper()}:")
    print(f"  Count: {len(class_data):,}")
    for label, column, fmt, _, short_unit in size_metrics:
        print(f"  Avg {label.lower()}: {class_data[column].mean():{fmt}} ± {class_data[column].std():{fmt}} {short_unit}")



In [None]:
# Visualize object sizes: per-class histograms on top, box plots below
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Object Size Analysis', fontsize=16, fontweight='bold')

# One grid column per quantity: (DataFrame column, title word, axis label)
size_panels = (
    ('width_abs', 'Width', 'Width (pixels)'),
    ('height_abs', 'Height', 'Height (pixels)'),
    ('area_abs', 'Area', 'Area (pixels²)'),
)

# Top row: overlaid histograms, one per class that has at least one object
for column_index, (column, title_word, axis_label) in enumerate(size_panels):
    hist_ax = axes[0, column_index]
    for class_name in class_names:
        class_data = train_df[train_df['class_name'] == class_name]
        if len(class_data) > 0:
            hist_ax.hist(class_data[column], alpha=0.6, label=class_name, bins=30)

    hist_ax.set_title(f'{title_word} Distribution by Class')
    hist_ax.set_xlabel(axis_label)
    hist_ax.set_ylabel('Frequency')
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

# Bottom row: box plots for better comparison
for column_index, (column, title_word, axis_label) in enumerate(size_panels):
    box_ax = axes[1, column_index]
    sns.boxplot(data=train_df, x='class_name', y=column, ax=box_ax)
    box_ax.set_title(f'{title_word} Distribution (Box Plot)')
    box_ax.set_xlabel('Class')
    box_ax.set_ylabel(axis_label)
    box_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Small object analysis
print("\n SMALL OBJECT ANALYSIS")


# Define small object thresholds (common in object detection)
small_area_threshold = 32 * 32  # 32x32 pixels
medium_area_threshold = 96 * 96  # 96x96 pixels

# Split the objects into three area bands; the upper bound of each band is inclusive
object_areas = train_df['area_abs']
small_objects = train_df[object_areas <= small_area_threshold]
medium_objects = train_df[(object_areas > small_area_threshold) &
                          (object_areas <= medium_area_threshold)]
large_objects = train_df[object_areas > medium_area_threshold]

small_share = len(small_objects)/len(train_df)*100
medium_share = len(medium_objects)/len(train_df)*100
large_share = len(large_objects)/len(train_df)*100

print(f"Small objects (<= {small_area_threshold} px²): {len(small_objects):,} ({small_share:.1f}%)")
print(f"Medium objects ({small_area_threshold}-{medium_area_threshold} px²): {len(medium_objects):,} ({medium_share:.1f}%)")
print(f"Large objects (> {medium_area_threshold} px²): {len(large_objects):,} ({large_share:.1f}%)")

if len(small_objects) / len(train_df) > 0.5:
    print("\n  Majority of objects are small")


## 7. Spatial Distribution Analysis

Now we will analyse where objects typically appear in the images.

In [None]:
# Spatial distribution analysis
print(" SPATIAL DISTRIBUTION ANALYSIS")


# Create spatial heatmaps
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Object Spatial Distribution', fontsize=16, fontweight='bold')
heatmap_ax, scatter_ax = axes[0, 0], axes[0, 1]

# Normalized box centres of every annotation
x_centers = train_df['x_center_norm'].values
y_centers = train_df['y_center_norm'].values

# 20x20 two-dimensional histogram over the unit square, shown as a heatmap
hist, x_edges, y_edges = np.histogram2d(x_centers, y_centers, bins=20, range=[[0, 1], [0, 1]])
extent = [x_edges[0], x_edges[-1], y_edges[0], y_edges[-1]]

im1 = heatmap_ax.imshow(hist.T, extent=extent, origin='lower', cmap='YlOrRd', alpha=0.8)
heatmap_ax.set_title('Overall Object Distribution Heatmap')
heatmap_ax.set_xlabel('X Position (normalized)')
heatmap_ax.set_ylabel('Y Position (normalized)')
plt.colorbar(im1, ax=heatmap_ax, label='Object Count')

# Scatter plot by class; colours repeat when there are more classes than colours
colors_scatter = ['red', 'blue', 'green']
for i, class_name in enumerate(class_names):
    class_data = train_df[train_df['class_name'] == class_name]
    if not len(class_data) > 0:
        continue

    # Plot at most 1000 randomly sampled points per class
    if len(class_data) > 1000:
        class_data = class_data.sample(1000)

    scatter_ax.scatter(class_data['x_center_norm'], class_data['y_center_norm'],
                       alpha=0.6, s=10, label=class_name,
                       c=colors_scatter[i % len(colors_scatter)])

scatter_ax.set_title('Object Positions by Class')
scatter_ax.set_xlabel('X Position (normalized)')
scatter_ax.set_ylabel('Y Position (normalized)')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)
scatter_ax.set_xlim(0, 1)
scatter_ax.set_ylim(0, 1)

# Position distribution histograms: (axis, column, bar colour, axis letter)
position_panels = (
    (axes[1, 0], 'x_center_norm', 'skyblue', 'X'),
    (axes[1, 1], 'y_center_norm', 'lightgreen', 'Y'),
)
for position_ax, column, bar_color, axis_letter in position_panels:
    position_ax.hist(train_df[column], bins=30, alpha=0.7, color=bar_color, edgecolor='black')
    position_ax.axvline(0.5, color='red', linestyle='--', label='Center')
    position_ax.set_title(f'{axis_letter} Position Distribution')
    position_ax.set_xlabel(f'{axis_letter} Position (normalized)')
    position_ax.set_ylabel('Frequency')
    position_ax.legend()
    position_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Statistical analysis of positions
position_reports = (
    ("X position statistics:", 'x_center_norm'),
    ("\nY position statistics:", 'y_center_norm'),
)
for report_header, column in position_reports:
    print(report_header)
    print(f"  Mean: {train_df[column].mean():.3f} (0.5 = center)")
    print(f"  Std: {train_df[column].std():.3f}")
    print(f"  Range: {train_df[column].min():.3f} - {train_df[column].max():.3f}")

# Edge bias analysis: a box counts as "near an edge" when its centre lies
# within edge_threshold of any of the four image borders
edge_threshold = 0.1  # Objects within 10% of edges
near_left = train_df['x_center_norm'] <= edge_threshold
near_right = train_df['x_center_norm'] >= 1 - edge_threshold
near_low_y = train_df['y_center_norm'] <= edge_threshold
near_high_y = train_df['y_center_norm'] >= 1 - edge_threshold
edge_objects = train_df[near_left | near_right | near_low_y | near_high_y]

edge_percentage = len(edge_objects) / len(train_df) * 100
print(f"\n Objects near edges (within {edge_threshold*100}%): {len(edge_objects):,} ({edge_percentage:.1f}%)")

# Nothing is printed when the share falls between 10% and 30%
if edge_percentage > 30:
    print(" High concentration of objects near edges - consider data augmentation with crops")
elif edge_percentage < 10:
    print(" Objects well distributed across image - good for training")

## 8. Sample Visualization

Now we will visualise some actual images with their annotations to see our dataset.

In [None]:
def visualize_annotations(images_path, labels_path, class_names, num_samples=6):
    """Show a random sample of images with their YOLO boxes drawn on top.

    Up to ``num_samples`` label files are drawn at random (without
    replacement) from ``labels_path``. For each one the image with the same
    base name is looked up in ``images_path`` (trying ``.jpg``, ``.png`` and
    then ``.jpeg``), loaded with OpenCV, and every five-field label line is
    drawn as a rectangle with the class name above it. Samples whose image
    is missing or cannot be read are skipped and do not take up a subplot.

    The grid has three columns and at least two rows; further rows are added
    when more than six samples are requested.

    Args:
        images_path: Directory containing the image files.
        labels_path: Directory containing the YOLO ``.txt`` label files.
        class_names: Sequence indexed by class id that gives the label text.
        num_samples: Maximum number of label files to sample.

    Returns:
        None. The figure is displayed with ``plt.show()``. When there is
        nothing to sample, a message is printed and no figure is created.
    """
    label_files = [f for f in os.listdir(labels_path) if f.endswith('.txt')]
    if not label_files or num_samples <= 0:
        # Previously this case failed after the loop on an unbound index.
        print(f"No annotation files to visualize in {labels_path}")
        return

    sample_files = np.random.choice(label_files, min(num_samples, len(label_files)), replace=False)

    # Color map for different classes (cycled when there are more classes)
    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255), (0, 255, 255)]
    # '.jpg' and '.png' keep their original priority; '.jpeg' is tried last.
    image_suffixes = ('.jpg', '.png', '.jpeg')

    # 2x3 grid for up to six samples, one extra row per three more samples
    n_cols = 3
    n_rows = max(2, (len(sample_files) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows))
    axes = axes.flatten()
    fig.suptitle('Sample Images with Annotations', fontsize=16, fontweight='bold')

    # Index of the next free subplot; only advanced when an image is drawn,
    # so skipped samples never leave an empty framed axis behind.
    used_axes = 0

    for label_file in sample_files:
        # Swap only the extension: str.replace('.txt', ...) would also
        # rewrite a '.txt' that appears in the middle of the file name.
        stem = os.path.splitext(label_file)[0]
        candidates = [os.path.join(images_path, stem + suffix) for suffix in image_suffixes]
        img_path = next((path for path in candidates if os.path.exists(path)), None)
        if img_path is None:
            continue

        # Load image with OpenCV; imread returns None for unreadable files
        img = cv2.imread(img_path)
        if img is None:
            continue
        img_file = os.path.basename(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_height, img_width = img.shape[:2]

        # Load annotations
        with open(os.path.join(labels_path, label_file), 'r') as f:
            lines = f.readlines()

        object_counts = Counter()

        # Draw bounding boxes
        for line in lines:
            parts = line.split()
            if len(parts) != 5:
                continue

            class_id = int(parts[0])
            x_center = float(parts[1]) * img_width
            y_center = float(parts[2]) * img_height
            width = float(parts[3]) * img_width
            height = float(parts[4]) * img_height

            # Convert centre/size to corner coordinates
            x1 = int(x_center - width/2)
            y1 = int(y_center - height/2)
            x2 = int(x_center + width/2)
            y2 = int(y_center + height/2)

            color = colors[class_id % len(colors)]
            cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)

            label = class_names[class_id]
            cv2.putText(img, label, (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

            object_counts[label] += 1

        ax = axes[used_axes]
        used_axes += 1
        ax.imshow(img)

        # Title: file name plus the count of each class present in the image
        title_parts = [
            f"{class_name}: {object_counts[class_name]}"
            for class_name in class_names
            if object_counts.get(class_name, 0) > 0
        ]
        ax.set_title(f"{img_file}\n{', '.join(title_parts)}", fontsize=10)
        ax.axis('off')

    # Hide every subplot that did not receive an image
    for ax in axes[used_axes:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

print("\n SAMPLE IMAGES WITH ANNOTATIONS")
# Take the class names from data_config rather than from class_names.
# class_names was captured when data.yaml was first loaded, but the
# class-merging cell rewrites every label file to class id 0 and sets
# data_config['names'] to the merged class. Passing the earlier list would
# caption every box with whatever the first original class was called.
# If the merge cell has not been run, data_config['names'] still holds the
# original names, so this is correct in both cases.
visualize_annotations(train_images_path, train_labels_path, data_config['names'], num_samples=6)