In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os
from PIL import Image

# Load the image metadata (one row per image; the columns used below are
# 'file_path' and the target label 'family').
df = pd.read_csv('metadata.csv')

# Root directory with the unzipped images. It can be overridden through the
# RARE_SPECIES_ROOT environment variable so the notebook also runs on other
# machines; the default is the original local path.
data_root_path = os.environ.get('RARE_SPECIES_ROOT', r'C:\Users\inesb\Downloads\rare_species')

# Remove rows with missing file paths or family labels.
# This has to happen BEFORE the paths are joined: a NaN file_path is a float,
# and os.path.join raises TypeError when given a non-string component.
df = df.dropna(subset=['file_path', 'family']).reset_index(drop=True)

# Build the absolute path of every image
df['full_path'] = df['file_path'].apply(lambda rel_path: os.path.join(data_root_path, rel_path))

# Check for missing files (astype(bool) keeps the mask boolean even if df is empty)
df['exists'] = df['full_path'].apply(os.path.exists).astype(bool)
missing = df[~df['exists']]

print("Missing images:", len(missing))
if len(missing) > 0:
    print(missing[['file_path']].head())

# Drop rows whose image is not on disk
df = df[df['exists']].reset_index(drop=True)

# Fully duplicated metadata rows: report them, then keep the first occurrence
duplicate_rows = df[df.duplicated()]
print("Duplicate metadata rows:")
print(duplicate_rows)
df = df.drop_duplicates().reset_index(drop=True)

# Rows pointing at the same image file: report them, then keep the first occurrence
duplicate_paths = df[df.duplicated(subset='full_path')]
print("Duplicate file paths:")
print(duplicate_paths)
df = df.drop_duplicates(subset='full_path').reset_index(drop=True)

# Encode each category of the target variable as an integer code
# (codes are assigned in order of first appearance by pd.factorize)
df['family_encoded'] = pd.factorize(df['family'])[0]
unique_families = df['family'].unique()
print(df['family'].nunique()) # 202

# Check the class distribution throughout the dataset (number of images per class)
target_distribution = df['full_path'].groupby(df["family_encoded"]).count()
print(target_distribution.describe())
# The count of images per class varies from 29 to 300, so we can consider this an imbalanced dataset


Missing images: 0
Duplicate metadata rows:
Empty DataFrame
Columns: [rare_species_id, eol_content_id, eol_page_id, kingdom, phylum, family, file_path, full_path, exists]
Index: []
Duplicate file paths:
Empty DataFrame
Columns: [rare_species_id, eol_content_id, eol_page_id, kingdom, phylum, family, file_path, full_path, exists]
Index: []
202
count    202.000000
mean      59.321782
std       54.326637
min       29.000000
25%       30.000000
50%       30.000000
75%       60.000000
max      300.000000
Name: full_path, dtype: float64


In [2]:
# Stratified split: 70% train, 15% validation, 15% test
TEST_FRACTION = 0.15  # share of the full dataset held out for testing
VAL_FRACTION = 0.15   # share of the full dataset used for validation
SPLIT_SEED = 42       # fixed seed so the split is reproducible

# Step 1: hold out the test set, stratified on the family label.
train_df, test_df = train_test_split(
    df,
    test_size=TEST_FRACTION,
    stratify=df['family'],
    random_state=SPLIT_SEED,
)

# Step 2: take the validation set out of the remaining rows. Its fraction must be
# rescaled relative to what is left (0.15 / 0.85, roughly 0.1765) so that it is
# still 15% of the full dataset.
val_fraction_of_remainder = VAL_FRACTION / (1 - TEST_FRACTION)
train_df, val_df = train_test_split(
    train_df,
    test_size=val_fraction_of_remainder,
    stratify=train_df['family'],
    random_state=SPLIT_SEED,
)

# Sanity check: the three splits together account for every row of df
assert len(train_df) + len(val_df) + len(test_df) == len(df)

print(f"Train shape: {train_df.shape}") # Train shape: (8387, 10)
print(f"Val shape: {val_df.shape}") # Val shape: (1798, 10)
print(f"Test shape: {test_df.shape}") # Test shape: (1798, 10)

Train shape: (8387, 10)
Val shape: (1798, 10)
Test shape: (1798, 10)
