# Data Exploration
## Endangered Species Image Classifier

This notebook explores the dataset for the endangered species classifier project.

In [None]:
# Import required libraries
import os
import sys

# Make the project root importable so `config.model_config` resolves when the
# notebook is run from its own sub-directory.
sys.path.append('..')

from datasets import load_dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

from config.model_config import CONSERVATION_CLASSES, GEOGRAPHIC_REGIONS

## Load Dataset

Load the rare species dataset from Hugging Face.

In [None]:
# Load the rare-species dataset from the Hugging Face Hub.
dataset_name = 'imageomics/rare-species'
print(f"Loading dataset: {dataset_name}")

# Only the load itself is guarded; the success messages live in `else` so a
# failure while printing is not misreported as a loading error.
# The broad catch is deliberate for a best-effort notebook cell: on failure
# `dataset` is not assigned here, and the exploration cells check
# `'dataset' in locals()` before using it.
try:
    dataset = load_dataset(dataset_name)
except Exception as e:
    print(f"Error loading dataset: {e}")
    # NOTE(review): no local-data fallback is implemented in this cell; the
    # message below only announces one. TODO: add the fallback or reword.
    print("Using local data if available...")
else:
    print("Dataset loaded successfully!")
    print(f"Dataset structure: {dataset}")

## Explore Data Structure

In [None]:
# Summarise the loaded dataset: size of every split, one example record, and
# the column names of the train split. Skipped when `dataset` was never bound.
if 'dataset' in locals():
    print("Dataset splits:")
    for split_name, split_data in dataset.items():
        print(f"  {split_name}: {len(split_data)} samples")

    # First record of the train split
    print("\nFirst example:")
    train_split = dataset['train']
    print(train_split[0])

    # Schema of the train split
    print("\nColumn names:")
    print(train_split.column_names)

## Visualize Sample Images

In [None]:
# Preview the first few training images in a 2x3 grid and save the figure.
# NOTE: images are taken in dataset order (the first six records); no
# conservation-category column is consulted, so the grid is not guaranteed to
# show one image per category.
if 'dataset' in locals():
    train_split = dataset['train']

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    # Hide every axis up front so panels left unused (split has fewer than six
    # samples) render blank instead of as empty framed plots.
    for panel in axes:
        panel.axis('off')

    num_shown = min(len(axes), len(train_split))
    for idx in range(num_shown):
        sample = train_split[idx]
        axes[idx].imshow(sample['image'])
        axes[idx].set_title(f"Sample {idx+1}")

    plt.tight_layout()
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs('../visualizations', exist_ok=True)
    plt.savefig('../visualizations/sample_images.png', dpi=100)
    plt.show()

## Analyze Class Distribution

In [None]:
# List the conservation status categories and plot their distribution.
print("Conservation Status Categories:")
for status in CONSERVATION_CLASSES:
    print(f"  {status}")

# Bar plot of samples per conservation status.
# NOTE: `counts` is a hard-coded placeholder, not computed from the dataset.
# It should have one entry per class, since ax.bar needs matching lengths.
# TODO: replace with real per-class counts.
fig, ax = plt.subplots(figsize=(10, 6))
categories = CONSERVATION_CLASSES
counts = [10, 15, 20, 12, 5, 8]  # Placeholder counts

ax.bar(categories, counts, color='steelblue')
ax.set_xlabel('Conservation Status')
ax.set_ylabel('Number of Samples')
ax.set_title('Distribution of Conservation Status Categories')
plt.tight_layout()
# savefig does not create missing folders, so make sure the target exists.
os.makedirs('../visualizations', exist_ok=True)
plt.savefig('../visualizations/class_distribution.png', dpi=100)
plt.show()

## Analyze Geographic Distribution

In [None]:
# List the geographic regions and plot the species distribution across them.
print("Geographic Regions:")
for region in GEOGRAPHIC_REGIONS:
    print(f"  {region}")

# Bar plot of species per region.
# NOTE: `counts` is a hard-coded placeholder, not computed from the dataset.
# It should have one entry per region, since ax.bar needs matching lengths.
# TODO: replace with real per-region counts.
fig, ax = plt.subplots(figsize=(12, 6))
regions = GEOGRAPHIC_REGIONS
counts = [25, 30, 15, 20, 18, 12, 8, 22]  # Placeholder counts

ax.bar(regions, counts, color='forestgreen')
ax.set_xlabel('Geographic Region')
ax.set_ylabel('Number of Species')
ax.set_title('Species Distribution Across Geographic Regions')
# Slant the region labels so long names do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# savefig does not create missing folders, so make sure the target exists.
os.makedirs('../visualizations', exist_ok=True)
plt.savefig('../visualizations/geographic_distribution_map.png', dpi=100)
plt.show()

## Summary Statistics

In [None]:
# Print a short summary of the exploration: how many conservation categories
# and geographic regions the project configuration defines.
separator = "=" * 50
print(separator)
print("DATA EXPLORATION SUMMARY")
print(separator)
print(f"\nTotal conservation categories: {len(CONSERVATION_CLASSES)}")
print(f"Total geographic regions: {len(GEOGRAPHIC_REGIONS)}")

# Only claim readiness when a dataset is actually bound in this session; the
# load cell leaves `dataset` unassigned when loading fails.
if 'dataset' in locals():
    print("\nDataset ready for preprocessing!")
else:
    print("\nDataset was not loaded; resolve the loading error before preprocessing.")