# Second Dataset Analysis and Visualizations

Comprehensive analysis of the second dataset to understand:
- Label distribution
- Image properties and their relationship to labels
- Patterns in image IDs
- Visual characteristics by class


## Install and Import


In [1]:
# Install required packages
%pip install -q pandas pillow matplotlib seaborn numpy scikit-learn

import pandas as pd
import numpy as np
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
import re
# NOTE(review): this silences every warning category for the rest of the
# session; consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings('ignore')

# Set style.
# The 'seaborn-v0_8-*' style names were introduced in matplotlib 3.6; older
# releases expose the same sheet as 'seaborn-darkgrid'. Pick whichever one the
# installed matplotlib provides instead of failing the first cell.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("Libraries imported successfully!")


Note: you may need to restart the kernel to use updated packages.
Libraries imported successfully!


## Configuration


In [13]:
# Dataset locations: everything hangs off a single base folder
SECOND_DATASET_BASE_PATH = "data"
_dataset_root = Path(SECOND_DATASET_BASE_PATH)
SECOND_DATASET_TRAIN_DIR = _dataset_root.joinpath("train")
SECOND_DATASET_CSV_PATH = _dataset_root.joinpath("train_labels.csv")
SECOND_DATASET_TEST_DIR = _dataset_root.joinpath("test")

# Echo the resolved locations (one per line)
print(
    f"Train Directory: {SECOND_DATASET_TRAIN_DIR}",
    f"CSV File: {SECOND_DATASET_CSV_PATH}",
    f"Test Directory: {SECOND_DATASET_TEST_DIR}",
    sep="\n",
)


Train Directory: data/train
CSV File: data/train_labels.csv
Test Directory: data/test


## Load Dataset


In [14]:
# Read the label table; `df` is None when the CSV is absent so that the
# following cells can detect the missing dataset and skip their work.
df = pd.read_csv(SECOND_DATASET_CSV_PATH) if SECOND_DATASET_CSV_PATH.exists() else None

if df is None:
    print(f"ERROR: CSV file not found at {SECOND_DATASET_CSV_PATH}")
else:
    # Quick overview: size, schema, a preview and the class balance
    print(f"Loaded CSV with {len(df)} rows")
    print(f"Columns: {df.columns.tolist()}")
    print(f"\nFirst few rows:")
    print(df.head())
    print(f"\nLabel value counts:")
    print(df['Label'].value_counts())


KeyboardInterrupt: 

## Analyze Image Properties


In [4]:
def _extract_numeric_id(image_id):
    """Return the first run of digits in ``image_id`` as an int, or None if there is none."""
    match = re.search(r'\d+', str(image_id))
    if match is None:
        return None
    try:
        return int(match.group())
    except ValueError:
        # int() can reject extremely long digit strings; treat those as "no numeric id"
        return None


def _flatten_to_rgb(image):
    """Return ``image`` as an RGB image, compositing transparent pixels onto white."""
    # Palette images may carry transparency, so expand them to RGBA first
    if image.mode == 'P':
        image = image.convert('RGBA')
    if image.mode == 'RGBA':
        background = Image.new('RGB', image.size, (255, 255, 255))
        background.paste(image, mask=image.split()[3])  # Use alpha channel as mask
        return background
    if image.mode != 'RGB':
        return image.convert('RGB')
    return image


def analyze_image_properties(image_path, image_id, label):
    """Extract size, color and ID-derived properties from one image file.

    Args:
        image_path: Location of the image file to open.
        image_id: Identifier stored with the result; its first run of digits
            (if any) is also returned as ``image_id_numeric``.
        label: Class label stored unchanged in the result.

    Returns:
        A dict with the keys ``image_id``, ``label``, ``original_mode``,
        ``width``, ``height``, ``aspect_ratio``, ``pixel_count``,
        ``mean_r/g/b``, ``std_r/g/b``, ``brightness``, ``is_mostly_white`` and
        ``image_id_numeric``. Color statistics are computed after converting
        the image to RGB (transparency composited onto white).
        Returns None, after printing the error, if anything goes wrong.
    """
    try:
        # The context manager closes the underlying file even on errors; without
        # it every analysed image keeps a file handle open until garbage collection.
        with Image.open(image_path) as source:
            # Mode as stored on disk, before any conversion
            original_mode = source.mode
            image = _flatten_to_rgb(source)
            width, height = image.size
            # Copy the pixels out while the file is still open (PIL decodes lazily)
            img_array = np.array(image)

        aspect_ratio = width / height if height > 0 else 0

        # Per-channel statistics
        channel_stats = {}
        for channel_index, channel_name in enumerate('rgb'):
            channel = img_array[:, :, channel_index]
            channel_stats[f'mean_{channel_name}'] = channel.mean()
            channel_stats[f'std_{channel_name}'] = channel.std()

        # Brightness (average of RGB)
        brightness = (
            channel_stats['mean_r'] + channel_stats['mean_g'] + channel_stats['mean_b']
        ) / 3

        return {
            'image_id': image_id,
            'label': label,
            'original_mode': original_mode,
            'width': width,
            'height': height,
            'aspect_ratio': aspect_ratio,
            'pixel_count': width * height,
            'mean_r': channel_stats['mean_r'],
            'mean_g': channel_stats['mean_g'],
            'mean_b': channel_stats['mean_b'],
            'std_r': channel_stats['std_r'],
            'std_g': channel_stats['std_g'],
            'std_b': channel_stats['std_b'],
            'brightness': brightness,
            # Flag images whose average brightness is above 200 (out of 255)
            'is_mostly_white': brightness > 200,
            'image_id_numeric': _extract_numeric_id(image_id)
        }
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

# Analyze all images listed in the label CSV and collect their properties
if df is not None and SECOND_DATASET_TRAIN_DIR.exists():
    print("\nAnalyzing image properties...")
    image_extensions = ['.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG']
    max_missing_warnings = 10  # cap on individual "not found" lines
    image_data = []
    missing_ids = []
    failed_ids = []

    # Iterate the two columns directly: iterrows() can upcast an integer Id to
    # float when the other columns are numeric, which would turn 12 into "12.0".
    for raw_id, raw_label in zip(df['Id'], df['Label']):
        image_id = str(raw_id).zfill(5)
        label = str(raw_label).lower()

        # Try to find the image file under any of the known extensions
        image_path = None
        for ext in image_extensions:
            candidate = SECOND_DATASET_TRAIN_DIR / f"{image_id}{ext}"
            if candidate.exists():
                image_path = candidate
                break

        if image_path is None:
            missing_ids.append(image_id)
            # Throttle by number of misses, not by row position
            if len(missing_ids) <= max_missing_warnings:
                print(f"Warning: Image not found for ID {image_id}")
            continue

        props = analyze_image_properties(image_path, image_id, label)
        if props:
            image_data.append(props)
        else:
            failed_ids.append(image_id)

    # Create DataFrame from image properties
    image_df = pd.DataFrame(image_data)
    print(f"\nAnalyzed {len(image_df)} images")
    if missing_ids:
        print(f"Images not found: {len(missing_ids)} of {len(df)} rows")
    if failed_ids:
        print(f"Images that could not be processed: {len(failed_ids)}")
    print(f"\nImage properties DataFrame shape: {image_df.shape}")
    print(f"\nFirst few rows:")
    print(image_df.head())
else:
    print("Cannot analyze images - dataset not found")
    image_df = None


Cannot analyze images - dataset not found


In [5]:
if image_df is not None and len(image_df) > 0:
    # Number of images per label, most frequent first
    label_counts = image_df['label'].value_counts()
    n_labels = len(label_counts)

    fig, (count_ax, share_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: absolute counts as bars
    count_ax.bar(label_counts.index, label_counts.values,
                 color=sns.color_palette("husl", n_labels))
    count_ax.set_title('Label Distribution (Count)', fontsize=14, fontweight='bold')
    count_ax.set_xlabel('Label', fontsize=12)
    count_ax.set_ylabel('Count', fontsize=12)
    count_ax.tick_params(axis='x', rotation=45)
    count_ax.grid(axis='y', alpha=0.3)

    # Write each count just above its bar
    for bar_position, (_, count) in enumerate(label_counts.items()):
        count_ax.text(bar_position, count, str(count),
                      ha='center', va='bottom', fontweight='bold')

    # Right panel: relative shares as a pie chart
    share_ax.pie(
        label_counts.values,
        labels=label_counts.index,
        autopct='%1.1f%%',
        colors=sns.color_palette("husl", n_labels),
        startangle=90,
    )
    share_ax.set_title('Label Distribution (Percentage)', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Text summary of the same distribution
    imbalance_ratio = label_counts.max() / label_counts.min()
    print(f"\nLabel Distribution Summary:")
    print(label_counts)
    print(f"\nTotal images: {len(image_df)}")
    print(f"Number of unique labels: {n_labels}")
    print(f"\nClass imbalance ratio (max/min): {imbalance_ratio:.2f}")


## 2. Image Size and Aspect Ratio Analysis


In [6]:
if image_df is not None and len(image_df) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    labels_in_order = image_df['label'].unique()

    # (axis, column, x-axis caption, panel title) for the three per-label histograms
    histogram_panels = [
        (axes[0, 0], 'width', 'Width (pixels)', 'Width Distribution by Label'),
        (axes[0, 1], 'height', 'Height (pixels)', 'Height Distribution by Label'),
        (axes[1, 0], 'aspect_ratio', 'Aspect Ratio (width/height)', 'Aspect Ratio Distribution by Label'),
    ]
    for ax, column, x_caption, panel_title in histogram_panels:
        # One semi-transparent histogram per label, overlaid on the same axis
        for label in labels_in_order:
            label_values = image_df.loc[image_df['label'] == label, column]
            ax.hist(label_values, alpha=0.6, label=label, bins=30)
        ax.set_xlabel(x_caption, fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.set_title(panel_title, fontsize=14, fontweight='bold')
        ax.legend()
        ax.grid(alpha=0.3)

    # Box plot: Aspect ratio by label
    image_df.boxplot(column='aspect_ratio', by='label', ax=axes[1, 1])
    # With `by=`, pandas adds a figure-level "Boxplot grouped by label" suptitle,
    # which would sit on top of the whole 2x2 grid; clear it.
    fig.suptitle('')
    axes[1, 1].set_xlabel('Label', fontsize=12)
    axes[1, 1].set_ylabel('Aspect Ratio', fontsize=12)
    axes[1, 1].set_title('Aspect Ratio by Label (Box Plot)', fontsize=14, fontweight='bold')
    axes[1, 1].tick_params(axis='x', rotation=45)
    axes[1, 1].grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Summary statistics
    print("\n" + "="*70)
    print("Image Size Statistics by Label")
    print("="*70)
    size_stats = image_df.groupby('label')[['width', 'height', 'aspect_ratio', 'pixel_count']].agg(['mean', 'std', 'min', 'max'])
    print(size_stats)


## 3. Color Analysis by Label


In [7]:
if image_df is not None and len(image_df) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Per-label averages of the channel means and channel standard deviations
    by_label = image_df.groupby('label')
    rgb_means = by_label[['mean_r', 'mean_g', 'mean_b']].mean()
    rgb_stds = image_df.groupby('label')[['std_r', 'std_g', 'std_b']].mean()

    bar_positions = np.arange(len(rgb_means.index))
    bar_width = 0.25

    def _draw_rgb_bars(ax, table, legend_names, y_caption, panel_title):
        """Draw one red/green/blue bar triplet per label from a 3-column table."""
        offsets = (-bar_width, 0, bar_width)
        colours = ('red', 'green', 'blue')
        for offset, column, legend_name, colour in zip(offsets, table.columns, legend_names, colours):
            ax.bar(bar_positions + offset, table[column], bar_width,
                   label=legend_name, color=colour, alpha=0.7)
        ax.set_xlabel('Label', fontsize=12)
        ax.set_ylabel(y_caption, fontsize=12)
        ax.set_title(panel_title, fontsize=14, fontweight='bold')
        ax.set_xticks(bar_positions)
        ax.set_xticklabels(table.index, rotation=45)
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

    # Top-left: mean RGB values by label
    _draw_rgb_bars(axes[0, 0], rgb_means, ('Red', 'Green', 'Blue'),
                   'Mean RGB Value', 'Mean RGB Values by Label')

    # Top-right: brightness histograms, one per label
    brightness_ax = axes[0, 1]
    for label in image_df['label'].unique():
        brightness_ax.hist(image_df.loc[image_df['label'] == label, 'brightness'],
                           alpha=0.6, label=label, bins=30)
    brightness_ax.set_xlabel('Brightness', fontsize=12)
    brightness_ax.set_ylabel('Frequency', fontsize=12)
    brightness_ax.set_title('Brightness Distribution by Label', fontsize=14, fontweight='bold')
    brightness_ax.legend()
    brightness_ax.grid(alpha=0.3)

    # Bottom-left: RGB standard deviation by label
    _draw_rgb_bars(axes[1, 0], rgb_stds, ('Red Std', 'Green Std', 'Blue Std'),
                   'Mean RGB Standard Deviation', 'RGB Standard Deviation by Label')

    # Bottom-right: share of mostly-white images per label
    white_by_label = image_df.groupby('label')['is_mostly_white'].agg(['sum', 'count'])
    white_by_label['percentage'] = (white_by_label['sum'] / white_by_label['count']) * 100

    white_ax = axes[1, 1]
    white_ax.bar(white_by_label.index, white_by_label['percentage'],
                 color=sns.color_palette("husl", len(white_by_label)))
    white_ax.set_xlabel('Label', fontsize=12)
    white_ax.set_ylabel('Percentage of Mostly White Images (%)', fontsize=12)
    white_ax.set_title('Percentage of Mostly White Images by Label', fontsize=14, fontweight='bold')
    white_ax.tick_params(axis='x', rotation=45)
    white_ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Tabular summary of the color features
    color_stats = image_df.groupby('label')[['mean_r', 'mean_g', 'mean_b', 'brightness']].agg(['mean', 'std'])
    print("\n" + "="*70)
    print("Color Statistics by Label")
    print("="*70)
    print(color_stats)


## 4. Image ID Pattern Analysis


## 5. Image Mode Analysis


In [8]:
if image_df is not None and len(image_df) > 0:
    # Analyze image ID patterns
    image_df['image_id_length'] = image_df['image_id'].str.len()

    labels_seen = image_df['label'].unique()
    numeric_mask = image_df['image_id_numeric'].notna()
    has_numeric_ids = numeric_mask.any()

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    id_hist_ax, id_length_ax = axes[0]
    id_scatter_ax, id_range_ax = axes[1]

    def _numeric_rows_for(label):
        """Rows of `image_df` for one label that have a numeric ID."""
        return image_df[(image_df['label'] == label) & numeric_mask]

    if has_numeric_ids:
        # Top-left: numeric ID histogram per label
        for label in labels_seen:
            label_data = _numeric_rows_for(label)
            if len(label_data) > 0:
                id_hist_ax.hist(label_data['image_id_numeric'], alpha=0.6, label=label, bins=50)
        id_hist_ax.set_xlabel('Image ID (Numeric)', fontsize=12)
        id_hist_ax.set_ylabel('Frequency', fontsize=12)
        id_hist_ax.set_title('Image ID Numeric Distribution by Label', fontsize=14, fontweight='bold')
        id_hist_ax.legend()
        id_hist_ax.grid(alpha=0.3)

        # Bottom-left: numeric ID against label, one horizontal band per label
        label_unique = sorted(labels_seen)
        for label_position, label in enumerate(label_unique):
            label_data = _numeric_rows_for(label)
            if len(label_data) > 0:
                id_scatter_ax.scatter(label_data['image_id_numeric'],
                                      [label_position] * len(label_data),
                                      alpha=0.5, label=label, s=10)
        id_scatter_ax.set_xlabel('Image ID (Numeric)', fontsize=12)
        id_scatter_ax.set_ylabel('Label Index', fontsize=12)
        id_scatter_ax.set_title('Image ID vs Label (Scatter)', fontsize=14, fontweight='bold')
        id_scatter_ax.set_yticks(range(len(label_unique)))
        id_scatter_ax.set_yticklabels(label_unique)
        id_scatter_ax.legend()
        id_scatter_ax.grid(alpha=0.3)

        # Bottom-right: span of numeric IDs covered by each label
        id_ranges = image_df.groupby('label')['image_id_numeric'].agg(['min', 'max', 'mean'])
        x_pos = np.arange(len(id_ranges))
        id_range_ax.bar(x_pos, id_ranges['max'] - id_ranges['min'],
                        color=sns.color_palette("husl", len(id_ranges)), alpha=0.7)
        id_range_ax.set_xlabel('Label', fontsize=12)
        id_range_ax.set_ylabel('ID Range (max - min)', fontsize=12)
        id_range_ax.set_title('Image ID Range by Label', fontsize=14, fontweight='bold')
        id_range_ax.set_xticks(x_pos)
        id_range_ax.set_xticklabels(id_ranges.index, rotation=45)
        id_range_ax.grid(axis='y', alpha=0.3)
    else:
        # No ID contains digits: show a placeholder in the three numeric panels
        placeholder_panels = (
            (id_hist_ax, 'Image ID Numeric Distribution'),
            (id_scatter_ax, 'Image ID vs Label'),
            (id_range_ax, 'Image ID Range by Label'),
        )
        for ax, panel_title in placeholder_panels:
            ax.text(0.5, 0.5, 'No numeric IDs found', ha='center', va='center', transform=ax.transAxes)
            ax.set_title(panel_title, fontsize=14, fontweight='bold')

    # Top-right: ID string length per label (always available)
    for label in labels_seen:
        id_length_ax.hist(image_df.loc[image_df['label'] == label, 'image_id_length'],
                          alpha=0.6, label=label, bins=range(1, 20))
    id_length_ax.set_xlabel('Image ID Length', fontsize=12)
    id_length_ax.set_ylabel('Frequency', fontsize=12)
    id_length_ax.set_title('Image ID Length Distribution by Label', fontsize=14, fontweight='bold')
    id_length_ax.legend()
    id_length_ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Check if image IDs are predictive
    if has_numeric_ids:
        divider = "=" * 70
        print("\n" + divider)
        print("Image ID Statistics by Label")
        print(divider)
        id_stats = image_df.groupby('label')['image_id_numeric'].agg(['min', 'max', 'mean', 'std', 'count'])
        print(id_stats)

        # Per-label ID ranges; disjoint ranges would hint that the ID encodes the label
        print("\n" + divider)
        print("Image ID Ranges by Label (Potential Pattern Detection)")
        print(divider)
        for label in labels_seen:
            label_data = _numeric_rows_for(label)
            if len(label_data) > 0:
                print(f"{label:15s}: IDs {label_data['image_id_numeric'].min():.0f} - {label_data['image_id_numeric'].max():.0f} (mean: {label_data['image_id_numeric'].mean():.1f})")


In [9]:
if image_df is not None and len(image_df) > 0:
    # Counts of the on-disk image modes, overall and split by label
    mode_counts = image_df['original_mode'].value_counts()
    mode_by_label = pd.crosstab(image_df['label'], image_df['original_mode'])

    fig, (overall_ax, per_label_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: overall mode counts
    overall_ax.bar(mode_counts.index, mode_counts.values,
                   color=sns.color_palette("husl", len(mode_counts)))
    overall_ax.set_title('Image Mode Distribution (Overall)', fontsize=14, fontweight='bold')
    overall_ax.set_xlabel('Image Mode', fontsize=12)
    overall_ax.set_ylabel('Count', fontsize=12)
    overall_ax.grid(axis='y', alpha=0.3)

    # Right panel: stacked mode counts for every label
    mode_by_label.plot(kind='bar', stacked=True, ax=per_label_ax, colormap='Set3')
    per_label_ax.set_title('Image Mode Distribution by Label', fontsize=14, fontweight='bold')
    per_label_ax.set_xlabel('Label', fontsize=12)
    per_label_ax.set_ylabel('Count', fontsize=12)
    per_label_ax.tick_params(axis='x', rotation=45)
    per_label_ax.legend(title='Mode', bbox_to_anchor=(1.05, 1), loc='upper left')
    per_label_ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Row-wise percentages: share of each mode within a label
    mode_percentages = mode_by_label.div(mode_by_label.sum(axis=1), axis=0) * 100

    divider = "=" * 70
    report_tables = (
        ("Image Mode Distribution by Label", mode_by_label),
        ("Image Mode Percentages by Label", mode_percentages.round(2)),
    )
    for heading, table in report_tables:
        print("\n" + divider)
        print(heading)
        print(divider)
        print(table)


## 6. Correlation Analysis: Can We Predict Label from Image Properties?


In [10]:
if image_df is not None and len(image_df) > 0:
    from sklearn.preprocessing import LabelEncoder
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.model_selection import train_test_split

    TEST_FRACTION = 0.2  # share of rows held out for evaluation

    # Prepare features
    feature_cols = ['width', 'height', 'aspect_ratio', 'pixel_count',
                   'mean_r', 'mean_g', 'mean_b', 'std_r', 'std_g', 'std_b',
                   'brightness', 'is_mostly_white']

    if image_df['image_id_numeric'].notna().any():
        feature_cols.append('image_id_numeric')

    # Remove rows with missing features
    feature_df = image_df[feature_cols + ['label']].dropna()

    # A train/test split needs at least one row on each side
    if len(feature_df) >= 2:
        X = feature_df[feature_cols]
        y = feature_df['label']

        # Encode labels
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)
        n_classes = len(le.classes_)

        # Stratification raises unless every class has >= 2 rows and both
        # sides of the split can hold at least one row per class.
        n_test = int(np.ceil(TEST_FRACTION * len(feature_df)))
        n_train = len(feature_df) - n_test
        can_stratify = (
            np.bincount(y_encoded).min() >= 2
            and n_test >= n_classes
            and n_train >= n_classes
        )
        if not can_stratify:
            print("Note: too few images for some labels to stratify - using a plain random split")

        # Train a simple classifier to see if features are predictive
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_encoded,
            test_size=TEST_FRACTION,
            random_state=42,
            stratify=y_encoded if can_stratify else None,
        )

        rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print("\n" + "="*70)
        print("Feature Importance Analysis")
        print("="*70)
        print(f"\nCan we predict label from image properties?")
        print(f"Random Forest Accuracy: {accuracy*100:.2f}%")
        print(f"\n(Note: This is just to see if properties are predictive, not a real model)")

        # Feature importance
        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(feature_importance['feature'], feature_importance['importance'],
               color=sns.color_palette("husl", len(feature_importance)))
        ax.set_xlabel('Importance', fontsize=12)
        ax.set_ylabel('Feature', fontsize=12)
        ax.set_title('Feature Importance for Label Prediction', fontsize=14, fontweight='bold')
        ax.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        plt.show()

        print("\nFeature Importance Ranking:")
        print(feature_importance)

        # Classification report
        print("\n" + "="*70)
        print("Classification Report (based on image properties only)")
        print("="*70)
        # Pass the full label set explicitly: without it the report raises when a
        # class is absent from the test fold and target_names no longer lines up.
        print(classification_report(
            y_test, y_pred,
            labels=np.arange(n_classes),
            target_names=le.classes_,
            zero_division=0,
        ))
    else:
        print("Not enough data with complete features for analysis")


## 7. Sample Images by Label


In [11]:
if image_df is not None and len(image_df) > 0 and SECOND_DATASET_TRAIN_DIR.exists():
    # Show sample images from each label
    n_samples_per_label = 4
    image_extensions = ['.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG']
    unique_labels = sorted(image_df['label'].unique())

    # squeeze=False keeps `axes` two-dimensional even when there is a single label
    fig, axes = plt.subplots(len(unique_labels), n_samples_per_label,
                             figsize=(16, 4*len(unique_labels)), squeeze=False)

    # Hide every frame up front so slots of labels with fewer than
    # n_samples_per_label images stay blank instead of showing empty ticks.
    for ax in axes.flat:
        ax.axis('off')

    for label_idx, label in enumerate(unique_labels):
        label_data = image_df[image_df['label'] == label].head(n_samples_per_label)

        for sample_idx, (_, row) in enumerate(label_data.iterrows()):
            ax = axes[label_idx, sample_idx]
            image_id = row['image_id']

            # Find image file
            image_path = None
            for ext in image_extensions:
                potential_path = SECOND_DATASET_TRAIN_DIR / f"{image_id}{ext}"
                if potential_path.exists():
                    image_path = potential_path
                    break

            if image_path is None:
                ax.text(0.5, 0.5, f"Image\nnot found",
                        ha='center', va='center', transform=ax.transAxes)
                continue

            try:
                # Context manager releases the file handle as soon as pixels are copied
                with Image.open(image_path) as img:
                    # Handle palette images
                    if img.mode == 'P':
                        img = img.convert('RGBA')
                    if img.mode == 'RGBA':
                        # Composite transparency onto a white background
                        background = Image.new('RGB', img.size, (255, 255, 255))
                        background.paste(img, mask=img.split()[3])
                        img = background
                    elif img.mode != 'RGB':
                        img = img.convert('RGB')
                    pixels = np.array(img)

                ax.imshow(pixels)
                ax.set_title(f"{label}\nID: {image_id}\n{row['width']}x{row['height']}",
                             fontsize=10)
            except Exception:
                ax.text(0.5, 0.5, f"Error\nloading\nimage",
                        ha='center', va='center', transform=ax.transAxes)

    plt.tight_layout()
    plt.show()
    print(f"\nDisplayed {n_samples_per_label} sample images per label")


## 8. Summary Statistics and Insights


In [12]:
if image_df is not None and len(image_df) > 0:
    print("="*70)
    print("DATASET SUMMARY STATISTICS")
    print("="*70)

    print(f"\nTotal Images: {len(image_df)}")
    print(f"Unique Labels: {len(image_df['label'].unique())}")
    print(f"Labels: {', '.join(sorted(image_df['label'].unique()))}")

    print(f"\n" + "-"*70)
    print("Image Size Statistics:")
    print(f"  Width:  {image_df['width'].min()} - {image_df['width'].max()} (mean: {image_df['width'].mean():.1f})")
    print(f"  Height: {image_df['height'].min()} - {image_df['height'].max()} (mean: {image_df['height'].mean():.1f})")
    print(f"  Aspect Ratio: {image_df['aspect_ratio'].min():.2f} - {image_df['aspect_ratio'].max():.2f} (mean: {image_df['aspect_ratio'].mean():.2f})")

    print(f"\n" + "-"*70)
    print("Color Statistics:")
    print(f"  Mean Brightness: {image_df['brightness'].min():.1f} - {image_df['brightness'].max():.1f} (mean: {image_df['brightness'].mean():.1f})")
    print(f"  Mostly White Images: {image_df['is_mostly_white'].sum()} ({image_df['is_mostly_white'].mean()*100:.1f}%)")

    print(f"\n" + "-"*70)
    print("Image Mode Distribution:")
    for mode, count in image_df['original_mode'].value_counts().items():
        print(f"  {mode}: {count} ({count/len(image_df)*100:.1f}%)")

    print(f"\n" + "-"*70)
    print("Label Distribution:")
    label_counts = image_df['label'].value_counts()
    for label, count in label_counts.items():
        print(f"  {label:15s}: {count:4d} ({count/len(image_df)*100:5.1f}%)")

    print(f"\n" + "-"*70)
    print("Key Insights:")

    # Check whether numeric image IDs separate the labels into disjoint ranges
    numeric_ids = image_df[image_df['image_id_numeric'].notna()]
    if len(numeric_ids) > 0:
        label_ranges = {
            label: (ids.min(), ids.max())
            for label, ids in numeric_ids.groupby('label', sort=False)['image_id_numeric']
        }

        if len(label_ranges) < 2:
            # With a single label there is nothing to compare, so "non-overlapping"
            # would be trivially (and misleadingly) true.
            print("  -  Only one label has numeric IDs - ID ranges cannot be compared")
        else:
            # After sorting by lower bound, two ranges overlap somewhere iff some
            # range starts at or before the end of its predecessor.
            ordered_ranges = sorted(label_ranges.values())
            id_overlap = any(
                next_min <= prev_max
                for (_, prev_max), (next_min, _) in zip(ordered_ranges, ordered_ranges[1:])
            )

            if not id_overlap:
                print("  ⚠️  Image IDs appear to be NON-OVERLAPPING by label - IDs might be predictive!")
            else:
                print("  ✓ Image ID ranges overlap between labels - IDs are not clearly predictive")

    # Check size consistency (average within-label std of width and height)
    size_std = image_df.groupby('label')[['width', 'height']].std().mean().mean()
    if pd.isna(size_std):
        # std is undefined when every label has a single image
        print("  -  Size consistency could not be assessed (need at least two images per label)")
    elif size_std < 10:
        print(f"  ⚠️  Images have very consistent sizes (std: {size_std:.1f}) - size might be predictive")
    else:
        print(f"  ✓ Images have variable sizes (std: {size_std:.1f}) - size is less predictive")

    # Check color differences (spread of the per-label mean brightness)
    color_diff = image_df.groupby('label')['brightness'].mean().std()
    if pd.isna(color_diff):
        # std across labels is undefined with a single label
        print("  -  Brightness differences could not be assessed (need at least two labels)")
    elif color_diff > 20:
        print(f"  ⚠️  Labels have different average brightness (std: {color_diff:.1f}) - color might be predictive")
    else:
        print(f"  ✓ Labels have similar brightness (std: {color_diff:.1f}) - color is less predictive")

    print("\n" + "="*70)
