# Image Processing and Feature Extraction
This notebook loads member images, applies augmentations, and extracts features for analysis.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import cv2
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

## 1. Load Member Images and Print Their Details

In [None]:
# Set up paths
# File names are split on "_" below: the text before the first underscore is
# taken as the member name.
image_dir = Path('data/images')

# Extensions treated as images. They are compared case-insensitively so files
# such as "alice_happy.JPG" are not silently skipped on case-sensitive file systems.
IMAGE_EXTENSIONS = ('.jpeg', '.jpg', '.png')


def find_image_files(directory, extensions=IMAGE_EXTENSIONS):
    """
    Return the image files located directly inside ``directory``, sorted by path.

    Parameters
    ----------
    directory : str or Path
        Folder to scan (not recursive).
    extensions : iterable of str
        File suffixes, including the leading dot, that count as images.
        Matching ignores case.

    Returns
    -------
    list of Path
        Matching regular files in sorted order; an empty list when the
        directory does not exist or holds no matching file.
    """
    directory = Path(directory)
    if not directory.is_dir():
        return []
    wanted = {ext.lower() for ext in extensions}
    return sorted(
        entry for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() in wanted
    )


# Get all image files
image_files = find_image_files(image_dir)
if not image_files:
    # Make the problem visible here instead of failing later with an empty table.
    print(f"Warning: no image files found in '{image_dir}'")

# Extract member names
members = sorted({f.stem.split('_')[0] for f in image_files})
print(f"Members found: {members}")
print(f"Total images: {len(image_files)}")

In [None]:
# Display basic information without plotting (to save resources)
print("\nImage Details:")
for img_path in image_files:
    # Open each file in a context manager so its handle is released as soon as
    # the size and mode have been printed, instead of staying open until
    # garbage collection.
    with Image.open(img_path) as img:
        print(f"{img_path.name}: Size={img.size}, Mode={img.mode}")

print("\nSkipping visualization - only extracting features to CSV")

## 2. Apply Augmentations to Images
We'll apply the following augmentations:
- Rotation (90°, 180°, 270°)
- Horizontal and Vertical Flipping
- Grayscale conversion
- Random rotation (-30° to 30°)
- Brightness adjustment (planned; not yet implemented in `apply_augmentations` below)

In [None]:
def apply_augmentations(image_path):
    """
    Apply the notebook's augmentations to one image and return them as arrays.

    The file is opened in a context manager so its handle is closed once all
    arrays have been built. Images whose mode is neither 'RGB' nor 'L' (for
    example RGBA or palette PNGs) are converted to 'RGB' first, so every colour
    result has exactly three channels.

    Parameters
    ----------
    image_path : str or Path
        Location of the image file to load.

    Returns
    -------
    dict
        Maps augmentation name to a numpy array. Keys: 'original', 'rotate_90',
        'rotate_180', 'rotate_270', 'flip_horizontal', 'flip_vertical',
        'grayscale' and 'rotate_random'.

    Notes
    -----
    'rotate_random' draws its angle from numpy's global random state, so call
    ``np.random.seed`` beforehand for reproducible output.
    """
    with Image.open(image_path) as source:
        img = source if source.mode in ('RGB', 'L') else source.convert('RGB')

        # randint excludes its upper bound, so 31 is needed for +30 degrees to
        # be reachable.
        random_angle = np.random.randint(-30, 31)

        augmentations = {
            'original': np.array(img),
            # expand=True grows the canvas so quarter turns are not cropped
            'rotate_90': np.array(img.rotate(90, expand=True)),
            'rotate_180': np.array(img.rotate(180)),
            'rotate_270': np.array(img.rotate(270, expand=True)),
            'flip_horizontal': np.array(img.transpose(Image.FLIP_LEFT_RIGHT)),
            'flip_vertical': np.array(img.transpose(Image.FLIP_TOP_BOTTOM)),
            'grayscale': np.array(img.convert('L')),
            'rotate_random': np.array(img.rotate(random_angle)),
        }

    return augmentations

In [None]:
# Skip visualization of augmentations - only extracting features.
# No preview figures are drawn in this cell; it only logs that the visual step
# is skipped, since the notebook's output is the feature CSV written later on.
print("Skipping augmentation visualization - only extracting features to CSV")

## 3. Extract Image Features
We'll extract multiple types of features:
- Color histograms (RGB)
- Grayscale histogram
- Statistical features (mean, std, min, max, median)
- Edge features using Canny edge detection
- HOG (Histogram of Oriented Gradients) features

In [None]:
from skimage.feature import hog
from skimage import exposure

def extract_color_histogram(img_array, bins=32):
    """
    Extract normalized intensity histograms, one per colour channel.

    Parameters
    ----------
    img_array : numpy.ndarray
        A 3-D array is treated as a colour image and its first three channels
        are used; any other array is treated as grayscale.
    bins : int
        Number of histogram bins over the value range [0, 256).

    Returns
    -------
    list
        ``3 * bins`` values. For a colour image these are the three channel
        histograms in order, each summing to 1. For a grayscale image the
        single histogram is followed by ``2 * bins`` zeros so both cases
        produce the same length.

    Notes
    -----
    When a histogram has no counts at all (empty image, or every value outside
    [0, 256)) it is returned as all zeros instead of the NaNs that dividing by
    a zero total would give.
    """
    def _normalized_hist(values):
        counts, _ = np.histogram(values, bins=bins, range=(0, 256))
        total = counts.sum()
        if total == 0:
            # Nothing was counted: avoid 0/0 and report an empty histogram.
            return [0.0] * bins
        return (counts / total).tolist()

    features = []

    if len(img_array.shape) == 3:  # Color image
        for i in range(3):  # RGB channels
            features.extend(_normalized_hist(img_array[:, :, i]))
    else:  # Grayscale
        features.extend(_normalized_hist(img_array))
        # Pad with zeros to match RGB size
        features.extend([0] * (bins * 2))

    return features

def extract_statistical_features(img_array):
    """
    Summarise pixel intensities with five statistics per channel.

    For a 3-D (colour) array the first three channels are each described by
    mean, standard deviation, minimum, maximum and median, in that order,
    giving 15 values. Any other array is treated as grayscale: its five
    statistics are followed by ten zeros so the result is always 15 long.
    """
    def _five_stats(pixels):
        # Order matters: callers pair these values with column names by position.
        return [
            np.mean(pixels),
            np.std(pixels),
            np.min(pixels),
            np.max(pixels),
            np.median(pixels),
        ]

    is_grayscale = len(img_array.shape) != 3
    if is_grayscale:
        # Zero-fill the slots the two missing channels would have occupied.
        return _five_stats(img_array) + [0] * 10

    stats = []
    for channel_index in range(3):
        stats += _five_stats(img_array[:, :, channel_index])
    return stats

def extract_edge_features(img_array):
    """
    Describe how much edge content an image has, using Canny edge detection.

    A 3-D input is first reduced to a single channel with OpenCV's
    RGB-to-gray conversion; any other input is used unchanged. Canny is run
    with thresholds 100 and 200.

    Returns a list of three numbers: the fraction of pixels in the edge map
    that are non-zero, the mean of the edge map, and its standard deviation.
    """
    is_color = len(img_array.shape) == 3
    grayscale = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) if is_color else img_array

    edge_map = cv2.Canny(grayscale, 100, 200)

    # Share of pixels flagged as edges
    edge_density = np.sum(edge_map > 0) / edge_map.size

    return [edge_density, np.mean(edge_map), np.std(edge_map)]

def extract_hog_features(img_array, resize_shape=(128, 128), max_features=100):
    """
    Extract HOG (Histogram of Oriented Gradients) features.

    Parameters
    ----------
    img_array : numpy.ndarray
        A 3-D array is converted to a single channel with OpenCV's RGB-to-gray
        conversion; any other array is used as is.
    resize_shape : tuple of int
        Target size handed to ``cv2.resize`` so every image yields a descriptor
        of the same length.
        NOTE(review): OpenCV reads this as (width, height) - confirm before
        using a non-square value.
    max_features : int or None
        Number of leading descriptor values to keep. The default of 100 keeps
        the output small; pass ``None`` to return the whole descriptor.

    Returns
    -------
    list of float
        The first ``max_features`` HOG values, or all of them when
        ``max_features`` is ``None``.

    Raises
    ------
    ValueError
        If ``max_features`` is negative.
    """
    if max_features is not None and max_features < 0:
        raise ValueError(f"max_features must be non-negative or None, got {max_features}")

    # Convert to grayscale if needed
    if len(img_array.shape) == 3:
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    else:
        gray = img_array

    # Resize for consistent feature size
    resized = cv2.resize(gray, resize_shape)

    # Extract HOG features
    fd = hog(resized, orientations=9, pixels_per_cell=(8, 8),
             cells_per_block=(2, 2), visualize=False)

    if max_features is None:
        return fd.tolist()

    # Keep only the leading values to keep the feature table manageable
    return fd[:max_features].tolist()

def extract_all_features(img_path, augmentation_type='original'):
    """
    Extract every feature group for one image (optionally augmented) as a flat dict.

    Parameters
    ----------
    img_path : str or Path
        Image file. Its stem is split on "_": the first part becomes
        'member_name' and the remaining parts, re-joined with "_", become
        'expression'.
    augmentation_type : str
        'original', or one of the keys returned by ``apply_augmentations``.

    Returns
    -------
    dict
        Metadata ('image_name', 'member_name', 'expression', 'augmentation',
        'image_width', 'image_height') followed by 'hist_<i>' histogram values,
        per-channel statistics ('r_mean' ... 'b_median'), 'edge_density',
        'edge_mean', 'edge_std' and 'hog_<i>' values. 'image_width' and
        'image_height' are the dimensions of the file as stored on disk, also
        for augmented rows.

    Raises
    ------
    ValueError
        If ``augmentation_type`` is neither 'original' nor produced by
        ``apply_augmentations``. Falling back to the original pixels would
        store a row labelled with an augmentation that was never applied.
    """
    img_path = Path(img_path)  # plain strings are accepted; .name/.stem are used below

    # Read everything needed from the file inside a context manager so the
    # handle is closed before any further work happens.
    with Image.open(img_path) as img:
        image_width, image_height = img.size
        img_array = None
        if augmentation_type == 'original':
            # Modes other than RGB / L (RGBA, palette, ...) are converted so the
            # feature extractors receive either 3 channels or a 2-D array.
            pixels = img if img.mode in ('RGB', 'L') else img.convert('RGB')
            img_array = np.array(pixels)

    # Apply augmentation if needed
    if augmentation_type != 'original':
        augmented = apply_augmentations(img_path)
        if augmentation_type not in augmented:
            raise ValueError(
                f"Unknown augmentation type '{augmentation_type}'; "
                f"expected 'original' or one of {sorted(augmented)}"
            )
        img_array = augmented[augmentation_type]

    # Drop an alpha channel if one is still present: the RGB-to-gray conversion
    # used by the edge and HOG extractors rejects 4-channel input.
    if img_array.ndim == 3 and img_array.shape[2] == 4:
        img_array = np.ascontiguousarray(img_array[:, :, :3])
    elif img_array.ndim == 3 and img_array.shape[2] == 2:
        # gray + alpha: keep the gray plane only
        img_array = np.ascontiguousarray(img_array[:, :, 0])

    name_parts = img_path.stem.split('_')

    # Extract all feature types
    features = {
        'image_name': img_path.name,
        'member_name': name_parts[0],
        'expression': '_'.join(name_parts[1:]),
        'augmentation': augmentation_type,
        'image_width': image_width,
        'image_height': image_height,
    }

    # Color histogram features
    for i, val in enumerate(extract_color_histogram(img_array)):
        features[f'hist_{i}'] = val

    # Statistical features: values arrive channel by channel, five per channel,
    # in the same order as the names built here.
    stat_names = [
        f'{channel}_{stat}'
        for channel in ('r', 'g', 'b')
        for stat in ('mean', 'std', 'min', 'max', 'median')
    ]
    for name, val in zip(stat_names, extract_statistical_features(img_array)):
        features[name] = val

    # Edge features
    edge_features = extract_edge_features(img_array)
    features['edge_density'] = edge_features[0]
    features['edge_mean'] = edge_features[1]
    features['edge_std'] = edge_features[2]

    # HOG features
    for i, val in enumerate(extract_hog_features(img_array)):
        features[f'hog_{i}'] = val

    return features

## 4. Extract Features for All Images and Augmentations

In [None]:
# Extract features for all images and their augmentations

# The 'rotate_random' augmentation draws its angle from numpy's global random
# state; seeding it here makes a Restart & Run All produce the same rows.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

all_features = []
failed_extractions = []  # (image name, augmentation, error message) for every failure

augmentation_types = ['original', 'rotate_90', 'rotate_180', 'rotate_270',
                     'flip_horizontal', 'flip_vertical', 'grayscale', 'rotate_random']

print("Extracting features from images...")
for img_path in image_files:
    print(f"Processing: {img_path.name}")

    for aug_type in augmentation_types:
        try:
            features = extract_all_features(img_path, aug_type)
        except Exception as e:
            # Best effort: one unreadable image must not stop the whole run,
            # but every failure is recorded so it is not lost in the log.
            failed_extractions.append((img_path.name, aug_type, str(e)))
            print(f"  Error with {aug_type}: {str(e)}")
        else:
            all_features.append(features)

print(f"\nTotal feature vectors extracted: {len(all_features)}")
if failed_extractions:
    print(f"Failed extractions: {len(failed_extractions)} (see the error messages above)")

## 5. Save Features to CSV

In [None]:
# Create DataFrame (one row per image/augmentation pair)
features_df = pd.DataFrame(all_features)

# Save to CSV
output_path = 'image_features.csv'
features_df.to_csv(output_path, index=False)

print(f"Features saved to: {output_path}")
print(f"Shape: {features_df.shape}")
print(f"\nFirst few rows:")
print(features_df.head())

# Display summary statistics
print("\n" + "="*80)
print("FEATURE EXTRACTION SUMMARY")
print("="*80)
print(f"Total images processed: {len(image_files)}")
print(f"Total augmentations per image: {len(augmentation_types)}")
print(f"Total feature vectors: {len(features_df)}")
print(f"Features per vector: {len(features_df.columns)}")
print(f"\nMembers: {', '.join(members)}")
print(f"\nImages per member:")
if features_df.empty:
    # With no rows the frame has no columns either, so selecting
    # 'augmentation' would raise KeyError; report the situation instead.
    print("No feature vectors were extracted - nothing to count")
else:
    # Count each source image once by looking only at its un-augmented row
    is_original = features_df['augmentation'] == 'original'
    print(features_df.loc[is_original, 'member_name'].value_counts())

In [None]:
# List every column of the feature table that was written to the CSV,
# numbered from 1 and right-aligned in a three-character field.
print("\nColumn names in image_features.csv:")
print("-" * 80)
column_names = list(features_df.columns)
for position in range(len(column_names)):
    print(f"{position + 1:3d}. {column_names[position]}")

print(f"\nTotal columns: {len(column_names)}")