In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image

In [None]:
# Global plot appearance applied to the figures created in this notebook.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Define paths: one sub-folder per category under the raw image directory.
data_path = '../data/sem_images/raw'
brittle_path, ductile_path = (
    os.path.join(data_path, folder) for folder in ('Brittle', 'Ductile')
)

# Echo the configured locations.
print(f"Data directory: {data_path}")
print(f"Brittle images path: {brittle_path}")
print(f"Ductile images path: {ductile_path}")

In [None]:
def explore_directory(path: Path) -> list[str]:
    """List the image files found directly inside a directory.

    Args:
        path: Directory to scan. A plain ``str`` is accepted as well as a
            ``Path``.

    Returns:
        The names (not full paths) of the regular files whose name ends,
        case-insensitively, in .png, .jpg, .jpeg, .tiff, .tif or .bmp,
        sorted by name. If ``path`` is not an existing directory a message
        is printed and an empty list is returned.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp')

    path = Path(path)
    # is_dir() instead of exists(): a path that points at a regular file
    # would pass exists() and then fail when listed.
    if not path.is_dir():
        print(f"Directory {path} does not exist!")
        return []

    # Directory listing order is arbitrary, so sort to make every
    # downstream "first N files" selection reproducible across runs.
    # is_file() skips sub-directories that happen to have an image-like name.
    return sorted(
        entry.name
        for entry in path.iterdir()
        if entry.is_file() and entry.name.lower().endswith(image_extensions)
    )

# Scan each category folder for image file names.
brittle_files = explore_directory(Path(brittle_path))
ductile_files = explore_directory(Path(ductile_path))

# Per-category and combined counts.
n_brittle = len(brittle_files)
n_ductile = len(ductile_files)
print(f"Number of Brittle images: {n_brittle}")
print(f"Number of Ductile images: {n_ductile}")
print(f"Total images: {n_brittle + n_ductile}")

# Preview at most five file names per category, numbered from 1.
for heading, names in (
    ("\nFirst 5 Brittle files:", brittle_files),
    ("\nFirst 5 Ductile files:", ductile_files),
):
    print(heading)
    for position, name in enumerate(names[:5], start=1):
        print(f"  {position}. {name}")

In [None]:
def get_image_info(file_path: Path) -> dict | None:
    """Get image information such as dimensions and format."""
    try:
        with Image.open(file_path) as img:
            return {
                'width': img.width,
                'height': img.height,
                'mode': img.mode,
                'format': img.format,
                'size_bytes': os.path.getsize(file_path),
            }
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Collect image information for both categories into one list of records.
# The column list mirrors the keys returned by get_image_info plus the two
# keys added below; keep it in sync if either changes.
info_columns = [
    'width', 'height', 'mode', 'format', 'size_bytes', 'filename', 'category',
]
image_data = []

for category, folder, filenames in (
    ('Brittle', brittle_path, brittle_files),
    ('Ductile', ductile_path, ductile_files),
):
    print(f"Analyzing {category} images...")
    for file in filenames:
        file_path = os.path.join(folder, file)
        info = get_image_info(file_path)
        # get_image_info returns None for files it could not read; skip those.
        if info:
            info['filename'] = file
            info['category'] = category
            image_data.append(info)

# Create DataFrame. Passing the columns explicitly means an empty result
# (missing folders, no readable images) still has every expected column,
# so later cells that index df['category'] etc. do not raise KeyError.
df = pd.DataFrame(image_data, columns=info_columns)
print(f"\nSuccessfully analyzed {len(df)} images")

In [None]:
# Headline counts: total rows and the value counts of three label columns.
print("=== IMAGE DATASET OVERVIEW ===")
print(f"Total images: {len(df)}")

category_counts = df['category'].value_counts().to_dict()
print(f"Categories: {category_counts}")

format_counts = df['format'].value_counts().to_dict()
print(f"\nImage formats: {format_counts}")

mode_counts = df['mode'].value_counts().to_dict()
print(f"Color modes: {mode_counts}")

# Summary statistics for the numeric columns, overall and per category.
numeric_columns = ['width', 'height', 'size_bytes']

print("\n=== DIMENSION STATISTICS ===")
overall_stats = df[numeric_columns].describe()
print(overall_stats)

print("\n=== STATISTICS BY CATEGORY ===")
per_category_stats = df.groupby('category')[numeric_columns].describe()
print(per_category_stats)

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Aspect ratio is stored on df as a new column; the other panels read
# existing columns (file size is converted from bytes to KB for plotting).
df['aspect_ratio'] = df['width'] / df['height']

# One entry per panel: (axis, values to plot, x-axis label, title).
panels = (
    (axes[0, 0], df['width'],
     'Width (pixels)', 'Image Width Distribution'),
    (axes[0, 1], df['height'],
     'Height (pixels)', 'Image Height Distribution'),
    (axes[1, 0], df['size_bytes'] / 1024,
     'File Size (KB)', 'File Size Distribution'),
    (axes[1, 1], df['aspect_ratio'],
     'Aspect Ratio (Width/Height)', 'Aspect Ratio Distribution'),
)

for ax, values, x_label, title in panels:
    # Overlay one semi-transparent histogram per category, Brittle first.
    for label in ('Brittle', 'Ductile'):
        ax.hist(
            values[df['category'] == label],
            alpha=0.7, label=label, bins=20,
        )
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
def display_sample_images(category: str, n_samples: int = 6) -> None:
    """Display a grid of sample images from a given category.

    File names are taken from the module-level ``df`` (rows whose
    'category' equals ``category``, in frame order) and are opened from
    ``brittle_path`` when ``category`` is 'Brittle', otherwise from
    ``ductile_path``.

    Args:
        category: Category label to filter ``df`` on.
        n_samples: Maximum number of images to show. Capped at the number
            of files available; values below zero are treated as zero.
    """
    n_cols = 3
    category_files = df[df['category'] == category]['filename'].tolist()
    n_samples = max(0, min(n_samples, len(category_files)))

    # Keep the 2x3 layout for up to six images; grow by whole rows beyond
    # that so a larger n_samples no longer indexes past the grid.
    n_rows = max(2, (n_samples + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
    fig.suptitle(f'Sample {category} Images', fontsize=16)

    folder = brittle_path if category == 'Brittle' else ductile_path

    # axes.flat walks the grid row by row, matching the fill order of
    # row = i // 3, col = i % 3.
    for ax, filename in zip(axes.flat, category_files[:n_samples]):
        # Context manager closes the file handle once the pixels are drawn.
        with Image.open(os.path.join(folder, filename)) as img:
            ax.imshow(img, cmap='gray' if img.mode == 'L' else None)
            # \u00d7 is the multiplication sign; written as an escape so the
            # title cannot be mangled by a wrong file encoding.
            ax.set_title(f'{filename}\n{img.size[0]}\u00d7{img.size[1]}')
        ax.axis('off')

    # Hide any unused subplots
    for ax in axes.flat[n_samples:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Show one sample grid per category, Brittle first and then Ductile.
for sample_category in ('Brittle', 'Ductile'):
    display_sample_images(sample_category)