In [2]:
import os
import pandas as pd
from PIL import Image

# Define data directory path.
# The scan below expects this directory to contain one sub-folder per class,
# each holding that class's image files. A relative path is resolved against
# the notebook's current working directory.
DATA_DIR = 'data'  # Adjust this path to match your dataset location


In [3]:
# Get image metadata
def get_image_metadata(image_path):
    """Open the image at ``image_path`` and collect its basic attributes.

    Args:
        image_path: Path of the image file, passed straight to ``Image.open``.

    Returns:
        dict with the keys ``'size'``, ``'mode'``, ``'format'`` and ``'info'``,
        each holding the opened image's attribute of the same name.

    Any exception raised while opening the file propagates to the caller.
    """
    attribute_names = ('size', 'mode', 'format', 'info')
    # The context manager releases the image as soon as the attributes are read.
    with Image.open(image_path) as opened:
        return {name: getattr(opened, name) for name in attribute_names}

# Get metadata for all images in dataset.
# DATA_DIR is expected to hold one sub-folder per class; top-level entries
# that are not directories are skipped.
image_metadata = []
# os.listdir() returns entries in arbitrary order, so both levels are sorted
# to keep the row order (and therefore the saved CSV) reproducible.
for class_folder in sorted(os.listdir(DATA_DIR)):
    class_path = os.path.join(DATA_DIR, class_folder)
    if not os.path.isdir(class_path):
        continue
    for image_name in sorted(os.listdir(class_path)):
        image_path = os.path.join(class_path, image_name)
        try:
            metadata = get_image_metadata(image_path)
        except Exception as e:
            # Best effort: report files that cannot be read and keep scanning
            # instead of aborting the whole pass.
            print(f"Error processing {image_path}: {e}")
        else:
            metadata['class'] = class_folder
            metadata['filename'] = image_name
            image_metadata.append(metadata)

# Convert to DataFrame for analysis.
# The columns are listed explicitly so the frame still has them when no image
# was read; a column-less frame would make lookups such as
# metadata_df['size'] raise KeyError.
metadata_df = pd.DataFrame(
    image_metadata,
    columns=['size', 'mode', 'format', 'info', 'class', 'filename'],
)

# Print summary statistics: the total row count, then the value counts of
# each categorical column under its own heading.
print("\nImage Dataset Metadata Summary:")
print(f"Total number of images: {len(metadata_df)}")
for _heading, _column in (
    ("\nImage sizes:", 'size'),
    ("\nImage formats:", 'format'),
    ("\nImage modes:", 'mode'),
    ("\nImages per class:", 'class'),
):
    print(_heading)
    print(metadata_df[_column].value_counts())

# Save metadata to CSV (index omitted; written to the working directory)
metadata_df.to_csv('image_metadata.csv', index=False)



Image Dataset Metadata Summary:
Total number of images: 35625

Image sizes:
size
(250, 250)     770
(1280, 720)    641
(800, 800)     521
(300, 300)     514
(600, 600)     502
              ... 
(780, 438)       1
(906, 882)       1
(700, 764)       1
(650, 807)       1
(698, 530)       1
Name: count, Length: 11239, dtype: int64

Image formats:
format
JPEG    29227
PNG      6384
BMP        10
GIF         4
Name: count, dtype: int64

Image modes:
mode
RGB     31632
RGBA     2512
P        1067
L         407
1           3
LA          3
CMYK        1
Name: count, dtype: int64

Images per class:
class
Pikachu       510
Charmander    485
Mewtwo        470
Psyduck       448
Squirtle      440
             ... 
Krabby        147
Paras         138
Kabuto        132
Onix          123
Persian       119
Name: count, Length: 149, dtype: int64


In [4]:
# Inspect the column names of the metadata DataFrame
metadata_df.columns

Index(['size', 'mode', 'format', 'info', 'class', 'filename'], dtype='object')