# Data Preparation - RAF-DB Face Expression Recognition

**Project:** FER AI with BLIP Fine-tuning  
**Dataset:** Balanced RAF-DB Dataset (7575 Grayscale)  
**Source:** https://www.kaggle.com/datasets/dollyprajapati182/balanced-raf-db-dataset-7575-grayscale  
**Purpose:** Import, explore, and visualize the dataset for BLIP model fine-tuning

---

## Notebook Overview
1. Environment Setup & Configuration
2. Data Loading
3. Data Exploration & Statistics
4. Data Visualization
5. Data Preprocessing Pipeline (face detection, cropping, normalization)
6. Data Quality Checks
7. Export Metadata for MLOps Pipeline

## 1. Environment Setup & Configuration

In [None]:
# Import required libraries
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
from collections import Counter
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

print("✓ Libraries imported successfully")

In [None]:
# Detect environment (Colab vs Kaggle vs Local)
# This cell defines IS_COLAB, IS_KAGGLE and BASE_PATH, which later cells read.
import sys
import os

# Check for Colab
# The import doubles as the probe: a successful import is treated as "running on Colab".
try:
    from google.colab import drive
    IS_COLAB = True
    print("✓ Running on Google Colab")
except ImportError:
    IS_COLAB = False

# Check for Kaggle by looking at environment variables or file paths
if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ or 'KAGGLE_DATA_PROXY_TOKEN' in os.environ:
    IS_KAGGLE = True
    print("✓ Running on Kaggle")
else:
    IS_KAGGLE = False

# Mount Google Drive only in Colab
# The Colab branch is tested first, so it wins when both flags are True.
if IS_COLAB:
    try:
        drive.mount('/content/drive')
        BASE_PATH = '/content/drive/MyDrive/FER_AI_Project'
        print(f"✓ Google Drive mounted at {BASE_PATH}")
    except Exception as e:
        print(f"Failed to mount Google Drive: {e}")
        # NOTE(review): the Colab fallback reuses the Kaggle working directory;
        # presumably this covers Kaggle images where google.colab is importable — confirm.
        BASE_PATH = '/kaggle/working'
        print(f"Using fallback path: {BASE_PATH}")

elif IS_KAGGLE:
    # Kaggle environment - use Kaggle paths
    BASE_PATH = '/kaggle/working'
    
    # You might also want to check for Kaggle datasets
    # If your data is in Kaggle datasets, you can access them at /kaggle/input/
    print(f"✓ Kaggle environment detected. Base path: {BASE_PATH}")
    
    # Common dataset locations in Kaggle
    # DATA_INPUT_PATH is only defined in this branch.
    DATA_INPUT_PATH = '/kaggle/input'
    if os.path.exists(DATA_INPUT_PATH):
        print(f"✓ Kaggle input datasets available at: {DATA_INPUT_PATH}")
        
else:
    # Local environment
    # NOTE(review): machine-specific absolute Windows path; adjust before running elsewhere.
    BASE_PATH = r'c:\Users\famil\Desktop\ghaith\Projects\FER_AI_Project'
    print(f"✓ Local machine detected. Base path: {BASE_PATH}")

# Create directory if it doesn't exist
os.makedirs(BASE_PATH, exist_ok=True)
print(f"✓ Using base path: {BASE_PATH}")

In [None]:
# Configuration Parameters
# Central settings dictionary read by every later cell.
CONFIG = {
    'project_name': 'FER_AI_BLIP',
    'data_version': '1.0',
    'timestamp': datetime.now().strftime('%Y%m%d_%H%M%S'),
    # Same precedence as the BASE_PATH selection: Colab first, then Kaggle, then local.
    # Kaggle is reported as its own environment instead of being labelled 'local'.
    'environment': 'colab' if IS_COLAB else ('kaggle' if IS_KAGGLE else 'local'),

    # Paths (will be updated after download)
    'data_root': f'{BASE_PATH}/data',
    'raw_data_path': f'{BASE_PATH}/data/raw',  # This will be updated after kagglehub download
    'processed_data_path': f'{BASE_PATH}/data/processed',
    'metadata_path': f'{BASE_PATH}/data/metadata',

    # Dataset splits
    'train_dir': 'train',
    'val_dir': 'val',
    'test_dir': 'test',

    # Emotion folder names
    'emotion_folders': ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise'],

    # Emotion labels mapping (folder name -> display label)
    'emotion_labels': {
        'angry': 'Anger',
        'disgust': 'Disgust',
        'fear': 'Fear',
        'happy': 'Happiness',
        'neutral': 'Neutral',
        'sad': 'Sadness',
        'surprise': 'Surprise'
    },

    # Numeric mapping for model training (folder name -> class id)
    'emotion_to_id': {
        'angry': 0,
        'disgust': 1,
        'fear': 2,
        'happy': 3,
        'neutral': 4,
        'sad': 5,
        'surprise': 6
    },

    # Random seed for reproducibility
    'random_seed': 42
}

# Set random seed
np.random.seed(CONFIG['random_seed'])

print("✓ Configuration loaded")
print(f"Environment: {CONFIG['environment'].upper()}")
print(f"Base Path: {BASE_PATH}")

In [None]:
# Ensure the output folders used by later cells exist
directories = [CONFIG[key] for key in ('data_root', 'processed_data_path', 'metadata_path')]

for directory in directories:
    # Same semantics as Path.mkdir(parents=True, exist_ok=True)
    os.makedirs(directory, exist_ok=True)
    print(f"✓ Created/verified: {directory}")

print("\nDirectory structure ready!")

## 2. Data Loading

In [None]:
# Auto-download dataset from Kaggle
import kagglehub

# Download latest version
dataset_name = "dollyprajapati182/balanced-raf-db-dataset-7575-grayscale"
print(f"Downloading dataset: {dataset_name}")
# The value returned by dataset_download is used below as the dataset's local location.
download_path = kagglehub.dataset_download(dataset_name)
print(f"✓ Dataset downloaded to: {download_path}")

# Update raw data path to the downloaded location
# From here on CONFIG['raw_data_path'] no longer points at BASE_PATH/data/raw.
CONFIG['raw_data_path'] = download_path
print(f"\nDataset location: {CONFIG['raw_data_path']}")

In [None]:
# Function to check if dataset is mounted/available
def check_dataset_availability(data_path):
    """
    Report whether the dataset under ``data_path`` has the expected layout.

    Looks for the ``train``/``val``/``test`` directories and, inside each one
    that exists, counts the ``.jpg``/``.png``/``.jpeg`` files of every folder
    named in ``CONFIG['emotion_folders']``. Progress is printed as it goes.

    Args:
        data_path: Root directory of the dataset.

    Returns:
        dict: ``dataset_found`` (True only when all three splits exist),
        ``splits`` (per split: ``exists``, ``emotions`` counts, ``total``),
        ``total_images`` and ``missing_folders`` (``"split/emotion"`` strings).
    """
    from pathlib import Path

    split_names = ['train', 'val', 'test']
    image_patterns = ('*.jpg', '*.png', '*.jpeg')

    status = {
        'dataset_found': False,
        'splits': {},
        'total_images': 0,
        'missing_folders': []
    }

    root = Path(data_path)

    # Nothing to inspect when the root itself is absent
    if not root.exists():
        print(f"❌ Dataset path not found: {data_path}")
        return status

    print(f"✓ Dataset path exists: {data_path}\n")

    for split in split_names:
        split_dir = root / split
        split_info = {
            'exists': split_dir.exists(),
            'emotions': {},
            'total': 0
        }
        status['splits'][split] = split_info

        if not split_info['exists']:
            print(f"❌ {split.upper()} split NOT found")
            continue

        print(f"✓ {split.upper()} split found")

        for emotion in CONFIG['emotion_folders']:
            emotion_dir = split_dir / emotion

            if not emotion_dir.exists():
                status['missing_folders'].append(f"{split}/{emotion}")
                continue

            # One glob per supported extension, summed into a single count
            n_images = sum(len(list(emotion_dir.glob(pattern))) for pattern in image_patterns)
            split_info['emotions'][emotion] = n_images
            split_info['total'] += n_images
            status['total_images'] += n_images

        print(f"  - Total images: {split_info['total']}")

    # Emotion folders may be missing; only the three split directories decide readiness
    status['dataset_found'] = all(status['splits'][s]['exists'] for s in split_names)

    print(f"\n{'='*50}")
    print(f"Dataset Status: {'✓ READY' if status['dataset_found'] else '❌ INCOMPLETE'}")
    print(f"Total Images Found: {status['total_images']}")
    if status['missing_folders']:
        print(f"Missing Folders: {len(status['missing_folders'])}")
    print(f"{'='*50}\n")

    return status

# Validate the downloaded dataset layout before any further processing
dataset_status = check_dataset_availability(CONFIG['raw_data_path'])

if dataset_status['dataset_found']:
    print("✓ Dataset is ready for processing!")
else:
    print("⚠️ Dataset not complete. Please ensure it's properly downloaded.")

In [None]:
def load_dataset_metadata(data_path, split_name):
    """
    Load metadata for a dataset split (train/val/test)

    Args:
        data_path: Path to the dataset
        split_name: Name of the split (train, val, test)

    Returns:
        DataFrame with one row per image and the columns 'image_path',
        'filename', 'emotion_folder', 'emotion_id', 'emotion_label', 'split'.
        The columns are present even when no image is found, so callers can
        always select them.
    """
    # Single schema shared by the empty and the populated result
    columns = ['image_path', 'filename', 'emotion_folder', 'emotion_id', 'emotion_label', 'split']

    split_path = Path(data_path) / split_name

    if not split_path.exists():
        print(f"⚠️  Warning: {split_path} does not exist!")
        return pd.DataFrame(columns=columns)

    image_data = []

    # Iterate through emotion folders (angry, disgust, fear, etc.)
    for emotion_folder_name in CONFIG['emotion_folders']:
        emotion_folder = split_path / emotion_folder_name

        if not emotion_folder.exists():
            continue

        # Sorting makes the row order independent of filesystem listing order,
        # which keeps seeded sampling reproducible across machines.
        image_files = sorted(
            img_path
            for pattern in ('*.jpg', '*.png', '*.jpeg')
            for img_path in emotion_folder.glob(pattern)
        )

        emotion_id = CONFIG['emotion_to_id'][emotion_folder_name]
        emotion_label = CONFIG['emotion_labels'][emotion_folder_name]

        for img_path in image_files:
            image_data.append({
                'image_path': str(img_path),
                'filename': img_path.name,
                'emotion_folder': emotion_folder_name,
                'emotion_id': emotion_id,
                'emotion_label': emotion_label,
                'split': split_name
            })

    # Passing columns explicitly keeps the schema when image_data is empty
    df = pd.DataFrame(image_data, columns=columns)
    print(f"✓ Loaded {len(df)} images from {split_name} split")

    return df

print("Data loading function defined")

In [None]:
# Build one metadata frame per split, then a combined frame
print("Loading dataset splits...\n")

train_df, val_df, test_df = [
    load_dataset_metadata(CONFIG['raw_data_path'], CONFIG[split_key])
    for split_key in ('train_dir', 'val_dir', 'test_dir')
]

# Stack the three splits with a fresh 0..N-1 index
full_dataset_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

print(f"\n{'='*50}")
print(f"Total dataset size: {len(full_dataset_df)} images")
print(f"{'='*50}")

## 3. Data Exploration & Statistics

In [None]:
# Display sample of the dataset
print("Dataset Sample:")
display(full_dataset_df.head(10))

print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
full_dataset_df.info()

In [None]:
# Distribution statistics by split
print("Distribution by Split:")
# Row count per split, ordered by split name
split_dist = full_dataset_df['split'].value_counts().sort_index()
print(split_dist)
print(f"\nPercentages:")
# Share of the whole dataset held by each split, in percent
print(split_dist / len(full_dataset_df) * 100)

In [None]:
# Distribution statistics by emotion
print("Distribution by Emotion (Overall):")
# Image count per emotion label across all splits, ordered by label
emotion_dist = full_dataset_df['emotion_label'].value_counts().sort_index()
print(emotion_dist)

print("\nDistribution by Emotion (Per Split):")
# Contingency table: rows are splits, columns are emotion labels
split_emotion_dist = pd.crosstab(full_dataset_df['split'], 
                                  full_dataset_df['emotion_label'])
display(split_emotion_dist)

In [None]:
# Sample image analysis
print("Analyzing sample images...\n")

def analyze_image_properties(df, num_samples=100):
    """
    Analyze image properties (dimensions, channels, file size) on a random sample

    Args:
        df: DataFrame with an 'image_path' column
        num_samples: Maximum number of images to sample

    Returns:
        DataFrame with the columns 'width', 'height', 'channels' and
        'file_size_kb', one row per image that could be read, or None when
        df is empty. Unreadable images are reported and skipped.
    """
    if len(df) == 0:
        return None

    sample_df = df.sample(min(num_samples, len(df)), random_state=CONFIG['random_seed'])

    records = []

    for img_path in sample_df['image_path']:
        try:
            # The context manager closes the file handle after each image,
            # so sampling many files does not accumulate open descriptors.
            with Image.open(img_path) as img:
                width, height = img.size
                channels = len(img.getbands())
            file_size = os.path.getsize(img_path) / 1024  # KB
        except Exception as e:
            # Best effort: report the failure and move on to the next image
            print(f"Error loading {img_path}: {e}")
            continue

        records.append({
            'width': width,
            'height': height,
            'channels': channels,
            'file_size_kb': file_size
        })

    return pd.DataFrame(records, columns=['width', 'height', 'channels', 'file_size_kb'])

# Sample up to 100 images; the result is None when the dataset frame is empty
image_props_df = analyze_image_properties(full_dataset_df, num_samples=100)

if image_props_df is not None:
    print("Image Properties Statistics:")
    display(image_props_df.describe())
    
    print(f"\nUnique image dimensions:")
    # Count of sampled images per (width, height) pair, most common first
    print(image_props_df.groupby(['width', 'height']).size().sort_values(ascending=False))

## 4. Data Visualization

In [None]:
# Visualize class distribution across splits
# Left panel: overall count per emotion. Right panel: counts grouped by split.
if len(full_dataset_df) == 0:
    print("⚠️ No data available to visualize. Please ensure the dataset is downloaded and placed in the correct directory.")
    print(f"Expected location: {CONFIG['raw_data_path']}")
else:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    
    # Overall distribution
    emotion_counts = full_dataset_df['emotion_label'].value_counts()
    axes[0].bar(emotion_counts.index, emotion_counts.values, color='steelblue', alpha=0.8)
    axes[0].set_title('Overall Emotion Distribution', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Emotion')
    axes[0].set_ylabel('Count')
    axes[0].tick_params(axis='x', rotation=45)
    axes[0].grid(axis='y', alpha=0.3)
    
    # Add count labels on bars
    # The label sits a fixed 20 count-units above each bar (data coordinates).
    for i, (emotion, count) in enumerate(emotion_counts.items()):
        axes[0].text(i, count + 20, str(count), ha='center', va='bottom', fontweight='bold')
    
    # Distribution by split
    # Rows are splits, columns are emotions; missing combinations become 0.
    split_emotion_pivot = full_dataset_df.groupby(['split', 'emotion_label']).size().unstack(fill_value=0)
    split_emotion_pivot.plot(kind='bar', ax=axes[1], width=0.8)
    axes[1].set_title('Emotion Distribution by Split', fontsize=14, fontweight='bold')
    axes[1].set_xlabel('Split')
    axes[1].set_ylabel('Count')
    axes[1].tick_params(axis='x', rotation=0)
    # Legend is anchored outside the plot area, to the right
    axes[1].legend(title='Emotion', bbox_to_anchor=(1.05, 1), loc='upper left')
    axes[1].grid(axis='y', alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    print("✓ Class distribution visualized")

In [None]:
# Visualize sample images from each emotion class
def visualize_emotion_samples(df, num_samples_per_emotion=3):
    """
    Display sample images for each emotion class

    Draws a grid with one row per emotion label and up to
    ``num_samples_per_emotion`` randomly sampled images per row.

    Args:
        df: DataFrame with 'emotion_label', 'image_path' and 'filename' columns
        num_samples_per_emotion: Number of grid columns (images per emotion)

    Returns:
        None. Prints a message and returns early when df is empty.
    """
    if len(df) == 0:
        print("No data to visualize")
        return

    emotions = sorted(df['emotion_label'].unique())
    num_emotions = len(emotions)

    # squeeze=False always yields a 2-D axes array, so axes[i, j] is valid
    # even with a single emotion or a single image per emotion.
    fig, axes = plt.subplots(num_emotions, num_samples_per_emotion,
                             figsize=(num_samples_per_emotion * 3, num_emotions * 3),
                             squeeze=False)

    # Hide every axis up front so grid cells without an image stay blank
    # when a class has fewer images than requested.
    for ax in axes.flat:
        ax.axis('off')

    for i, emotion in enumerate(emotions):
        emotion_df = df[df['emotion_label'] == emotion]
        samples = emotion_df.sample(min(num_samples_per_emotion, len(emotion_df)),
                                    random_state=CONFIG['random_seed'])

        for j, (_, row) in enumerate(samples.iterrows()):
            ax = axes[i, j]

            try:
                # Close the file handle as soon as the image has been drawn
                with Image.open(row['image_path']) as img:
                    ax.imshow(img, cmap='gray')

                # Only the first image of a row carries the emotion name
                if j == 0:
                    ax.set_title(f"{emotion}\n{row['filename']}",
                                 fontsize=10, fontweight='bold', loc='left')
                else:
                    ax.set_title(row['filename'], fontsize=9, loc='left')
            except Exception as e:
                # Best effort: mark the cell instead of aborting the whole figure
                ax.text(0.5, 0.5, f'Error loading\nimage',
                        ha='center', va='center', transform=ax.transAxes)

    plt.suptitle('Sample Images by Emotion Class', fontsize=16, fontweight='bold', y=1.00)
    plt.tight_layout()
    plt.show()

print("Displaying sample images from each emotion class...\n")
# Five images per emotion, drawn from the combined train/val/test frame
visualize_emotion_samples(full_dataset_df, num_samples_per_emotion=5)

In [None]:
# Plot the sampled image properties on a 2x2 grid:
# three histograms (width, height, file size) and a bar chart of channel counts
if image_props_df is not None and len(image_props_df) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # (axis, column, colour, title, x label) for each histogram panel
    histogram_panels = [
        (axes[0, 0], 'width', 'skyblue', 'Image Width Distribution', 'Width (pixels)'),
        (axes[0, 1], 'height', 'lightcoral', 'Image Height Distribution', 'Height (pixels)'),
        (axes[1, 0], 'file_size_kb', 'lightgreen', 'File Size Distribution', 'File Size (KB)'),
    ]

    for panel, column, colour, title, x_label in histogram_panels:
        panel.hist(image_props_df[column], bins=20, color=colour, edgecolor='black', alpha=0.7)
        panel.set_title(title, fontweight='bold')
        panel.set_xlabel(x_label)
        panel.set_ylabel('Frequency')
        panel.grid(alpha=0.3)

    # Channels are discrete, so they get a bar chart instead of a histogram
    channel_counts = image_props_df['channels'].value_counts().sort_index()
    channel_panel = axes[1, 1]
    channel_panel.bar(channel_counts.index, channel_counts.values, color='orange', alpha=0.7)
    channel_panel.set_title('Image Channels Distribution', fontweight='bold')
    channel_panel.set_xlabel('Number of Channels')
    channel_panel.set_ylabel('Frequency')
    channel_panel.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("✓ Image properties visualized")

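## 5. Data Preprocessing Pipeline

> **Note:** The cells in this section appear in reverse dependency order. Run them from the bottom up (library install and detector setup → `detect_and_crop_face` → `normalize_image` → `preprocess_image_for_training` → `process_and_save_dataset`) before running any cell that calls these functions.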

In [None]:
# Process the entire dataset (UNCOMMENT TO RUN)
# This will process all images and save them to the processed directory

# OPTION 1: Process without normalization (save raw cropped faces)
# processing_stats = process_and_save_dataset(
#     full_dataset_df, 
#     CONFIG['processed_data_path'],
#     target_size=(224, 224),
#     save_normalized=False
# )

# OPTION 2: Process small sample for testing (recommended first)
# if len(full_dataset_df) > 0:
#     sample_df = full_dataset_df.sample(min(100, len(full_dataset_df)), random_state=CONFIG['random_seed'])
#     processing_stats = process_and_save_dataset(
#         sample_df, 
#         CONFIG['processed_data_path'] + '_sample',
#         target_size=(224, 224),
#         save_normalized=False
#     )

print("Uncomment the code above to process the dataset")

In [None]:
# Process and save preprocessed dataset
def process_and_save_dataset(df, output_base_path, target_size=(224, 224), save_normalized=False):
    """
    Process entire dataset and save to disk

    Every image is run through preprocess_image_for_training and written to
    ``output_base_path/<split>/<emotion_folder>/<filename>``.

    Args:
        df: DataFrame with image metadata ('image_path', 'filename',
            'emotion_folder' and 'split' columns are read)
        output_base_path: Base path to save processed images
        target_size: Target image size
        save_normalized: Whether to run normalization before saving; the
            result is mapped back to 0-255 so it can be written as an image

    Returns:
        dict with the counters 'total', 'processed', 'failed' and 'no_face'.
        'no_face' counts images for which preprocess_image_for_training
        returned None; 'failed' counts images that raised or could not be
        written.
    """
    # tqdm is optional: without it the loop simply runs with no progress bar
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **kwargs):
            return iterable

    output_base_path = Path(output_base_path)
    output_base_path.mkdir(parents=True, exist_ok=True)

    stats = {
        'total': len(df),
        'processed': 0,
        'failed': 0,
        'no_face': 0
    }

    print(f"Processing {stats['total']} images...")
    print(f"Output path: {output_base_path}\n")

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing images"):
        try:
            # Create output directory structure: split/emotion/
            output_dir = output_base_path / row['split'] / row['emotion_folder']
            output_dir.mkdir(parents=True, exist_ok=True)

            # Process image
            processed_image = preprocess_image_for_training(
                row['image_path'],
                target_size=target_size,
                normalize=save_normalized
            )

            if processed_image is None:
                stats['no_face'] += 1
                continue

            output_path = output_dir / row['filename']

            if save_normalized:
                # Denormalize for saving; these constants mirror the default
                # mean/std of normalize_image.
                img_to_save = ((processed_image * [0.229, 0.224, 0.225]) + [0.485, 0.456, 0.406]) * 255
                img_to_save = np.clip(img_to_save, 0, 255).astype(np.uint8)
            else:
                img_to_save = processed_image

            # Convert RGB to BGR for OpenCV saving
            img_to_save = cv2.cvtColor(img_to_save, cv2.COLOR_RGB2BGR)

            # cv2.imwrite signals a failed write through its return value,
            # so turn that into an error instead of counting it as success.
            if not cv2.imwrite(str(output_path), img_to_save):
                raise IOError(f"cv2.imwrite could not write {output_path}")

            stats['processed'] += 1

        except Exception as e:
            stats['failed'] += 1
            if stats['failed'] <= 5:  # Show first 5 errors
                print(f"Error processing {row['filename']}: {e}")

    print(f"\n{'='*50}")
    print(f"Processing Complete!")
    print(f"{'='*50}")
    print(f"Total images: {stats['total']}")
    print(f"Successfully processed: {stats['processed']}")
    print(f"No face detected: {stats['no_face']}")
    print(f"Failed: {stats['failed']}")
    print(f"{'='*50}\n")

    return stats

print("✓ Batch processing function defined")

In [None]:
# Full Preprocessing Pipeline
def preprocess_image_for_training(image_path, target_size=(224, 224), normalize=True):
    """
    Run the preprocessing steps for a single image:
    1. Detect, crop and resize the face via detect_and_crop_face
    2. Optionally normalize pixel values via normalize_image

    Args:
        image_path: Path to input image
        target_size: Target image size (width, height)
        normalize: Whether to normalize the image

    Returns:
        The preprocessed image, or None when detect_and_crop_face returns None
    """
    face_image = detect_and_crop_face(image_path, target_size=target_size)

    # Normalization only applies when a usable image came back
    if face_image is not None and normalize:
        return normalize_image(face_image)

    return face_image

print("✓ Complete preprocessing pipeline defined")

In [None]:
# Test face detection on sample images
# Top row: original images. Bottom row: output of detect_and_crop_face.
print("Testing face detection on sample images...\n")

if len(full_dataset_df) > 0:
    # Sample up to 5 images
    sample_images = full_dataset_df.sample(min(5, len(full_dataset_df)), random_state=CONFIG['random_seed'])
    num_samples = len(sample_images)

    # Size the grid to the images actually sampled so no empty framed axes
    # remain; squeeze=False keeps axes 2-D even for a single column.
    fig, axes = plt.subplots(2, num_samples, figsize=(15, 6), squeeze=False)

    for idx, (_, row) in enumerate(sample_images.iterrows()):
        # Original image (file handle is closed once it has been drawn)
        with Image.open(row['image_path']) as original:
            axes[0, idx].imshow(original, cmap='gray')
        axes[0, idx].set_title(f"Original\n{row['emotion_label']}", fontsize=9)
        axes[0, idx].axis('off')

        # Processed image (face detected + cropped)
        processed = detect_and_crop_face(row['image_path'])
        if processed is not None:
            axes[1, idx].imshow(processed)
            axes[1, idx].set_title("Cropped Face", fontsize=9)
        else:
            axes[1, idx].text(0.5, 0.5, 'No face\ndetected',
                              ha='center', va='center', transform=axes[1, idx].transAxes)
        axes[1, idx].axis('off')

    plt.suptitle('Face Detection & Cropping Test', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("✓ Face detection test complete")
else:
    print("⚠️ No images available for testing")

In [None]:
# Normalization Function
def normalize_image(image, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """
    Normalize image using ImageNet statistics (standard for pretrained models)

    Args:
        image: Input image array with values in 0-255; a 2-D (grayscale)
            array is expanded to three identical channels
        mean: Per-channel mean values (any sequence of three numbers)
        std: Per-channel standard deviation values (any sequence of three numbers)

    Returns:
        Floating-point array computed as (image / 255 - mean) / std
    """
    # Convert to float and scale to [0, 1]
    image = image.astype(np.float32) / 255.0

    # Handle grayscale images (convert to RGB)
    if image.ndim == 2:
        image = np.stack([image] * 3, axis=-1)

    # Defaults are immutable tuples; convert explicitly so lists, tuples and
    # arrays all broadcast the same way over the channel axis.
    mean = np.asarray(mean)
    std = np.asarray(std)

    # Normalize using mean and std
    return (image - mean) / std

print("✓ Normalization function defined")

In [None]:
# Face Detection and Cropping Function
def detect_and_crop_face(image_path, target_size=(224, 224), margin=20):
    """
    Detect and crop face from image using MTCNN

    Uses the module-level ``detector``. When several faces are reported, the
    one with the largest bounding box is used.

    Args:
        image_path: Path to the input image
        target_size: Output image size (width, height)
        margin: Margin around detected face in pixels

    Returns:
        RGB image resized to target_size: the cropped face, or the whole
        image when no usable face box is found. None when the image cannot
        be read or processing raises.
    """
    try:
        # Read image
        img = cv2.imread(str(image_path))
        if img is None:
            return None

        # Convert BGR to RGB (MTCNN expects RGB)
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_height, img_width = img_rgb.shape[:2]

        # Detect faces
        detections = detector.detect_faces(img_rgb)

        if len(detections) == 0:
            # No face detected, resize original image
            return cv2.resize(img_rgb, target_size)

        # Pick the largest face explicitly; the detector's list order is not
        # relied on to put it first.
        detection = max(detections, key=lambda d: d['box'][2] * d['box'][3])
        x, y, width, height = detection['box']

        # Expand the box by the margin and clamp each edge independently.
        # The far edges are derived from the unclamped origin, so clamping
        # x or y to 0 does not push the right/bottom edge further out.
        left = max(0, x - margin)
        top = max(0, y - margin)
        right = min(img_width, x + width + margin)
        bottom = min(img_height, y + height + margin)

        if right <= left or bottom <= top:
            # Degenerate box: fall back to the whole image, as for "no face"
            return cv2.resize(img_rgb, target_size)

        # Crop face
        face = img_rgb[top:bottom, left:right]

        # Resize to target size
        return cv2.resize(face, target_size)

    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

print("✓ Face detection function defined")

In [None]:
# Install required libraries for face detection
!pip install opencv-python-headless mtcnn -q

import cv2
from mtcnn import MTCNN

# Initialize face detector
detector = MTCNN()

print("✓ Face detection libraries installed and initialized")

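## 6. Data Quality Checks

The preceding cells prepare images for training:
1. **Face Detection & Cropping**: Detect and crop faces from images
2. **Normalization**: Standardize pixel values for model input

The cell below validates the dataset itself: missing files, corrupted images (on a sample), duplicate filenames, and class balance.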

In [None]:
# Data quality validation
# Builds quality_report, which the metadata export cell writes to JSON.
print("Running Data Quality Checks...\n")

quality_report = {
    'total_images': len(full_dataset_df),
    'missing_files': 0,
    'corrupted_files': 0,
    'duplicate_files': 0,
    'issues': []
}

# Check for missing files
print("1. Checking for missing files...")
for _, row in full_dataset_df.iterrows():
    if not os.path.exists(row['image_path']):
        quality_report['missing_files'] += 1
        quality_report['issues'].append(f"Missing: {row['image_path']}")

print(f"   Missing files: {quality_report['missing_files']}")

# Check for corrupted files (sample)
print("\n2. Checking for corrupted files (sampling 100 images)...")
sample_check = full_dataset_df.sample(min(100, len(full_dataset_df)), random_state=CONFIG['random_seed'])

for _, row in sample_check.iterrows():
    try:
        # The context manager closes the file handle after verification
        with Image.open(row['image_path']) as img:
            img.verify()  # Verify image integrity
    except Exception as e:
        quality_report['corrupted_files'] += 1
        quality_report['issues'].append(f"Corrupted: {row['image_path']} - {str(e)}")

print(f"   Corrupted files (in sample): {quality_report['corrupted_files']}")

# Check for duplicate filenames
print("\n3. Checking for duplicate filenames...")
# Cast to a plain int: the pandas sum is a NumPy integer, which json.dump
# cannot serialize natively and would emit as a string via default=str.
duplicate_filenames = int(full_dataset_df['filename'].duplicated().sum())
quality_report['duplicate_files'] = duplicate_filenames
print(f"   Duplicate filenames: {duplicate_filenames}")

# Check class balance
print("\n4. Checking class balance...")
emotion_counts = full_dataset_df['emotion_label'].value_counts()
max_count = emotion_counts.max()
min_count = emotion_counts.min()
# Ratio of the largest to the smallest class; infinite when a class is empty
imbalance_ratio = max_count / min_count if min_count > 0 else float('inf')

print(f"   Class imbalance ratio: {imbalance_ratio:.2f}")
if imbalance_ratio > 2.0:
    quality_report['issues'].append(f"Warning: Class imbalance detected (ratio: {imbalance_ratio:.2f})")

print(f"\n{'='*50}")
print("Data Quality Summary:")
print(f"{'='*50}")
print(f"Total Images: {quality_report['total_images']}")
print(f"Missing Files: {quality_report['missing_files']}")
print(f"Corrupted Files: {quality_report['corrupted_files']}")
print(f"Duplicate Files: {quality_report['duplicate_files']}")
print(f"\nIssues Found: {len(quality_report['issues'])}")

if quality_report['issues']:
    print("\nIssue Details:")
    for issue in quality_report['issues'][:10]:  # Show first 10
        print(f"  - {issue}")
else:
    print("\n✓ No critical issues found!")

## 7. Export Metadata for Training Pipeline

In [None]:
# Save dataset metadata for MLOps tracking
# Writes a timestamped CSV and JSON plus fixed-name per-split CSVs into CONFIG['metadata_path'].
print("Exporting metadata for MLOps pipeline...\n")

# Export full dataset metadata
metadata_file = Path(CONFIG['metadata_path']) / f"dataset_metadata_{CONFIG['timestamp']}.csv"
full_dataset_df.to_csv(metadata_file, index=False)
print(f"✓ Dataset metadata saved: {metadata_file}")

# Export dataset statistics
dataset_stats = {
    'metadata': {
        'project_name': CONFIG['project_name'],
        'data_version': CONFIG['data_version'],
        'timestamp': CONFIG['timestamp'],
        'total_images': len(full_dataset_df)
    },
    'splits': {
        'train': len(train_df),
        'val': len(val_df),
        'test': len(test_df)
    },
    'emotion_distribution': full_dataset_df['emotion_label'].value_counts().to_dict(),
    'quality_report': quality_report,
    'config': CONFIG
}

# Image property averages are added only when the sampling step produced rows
if image_props_df is not None and len(image_props_df) > 0:
    dataset_stats['image_properties'] = {
        'mean_width': float(image_props_df['width'].mean()),
        'mean_height': float(image_props_df['height'].mean()),
        'mean_file_size_kb': float(image_props_df['file_size_kb'].mean()),
        # Most frequent channel count in the sample
        'channels': int(image_props_df['channels'].mode()[0])
    }

stats_file = Path(CONFIG['metadata_path']) / f"dataset_stats_{CONFIG['timestamp']}.json"
with open(stats_file, 'w') as f:
    # default=str converts any value json cannot serialize natively into its string form
    json.dump(dataset_stats, f, indent=2, default=str)

print(f"✓ Dataset statistics saved: {stats_file}")

# Export split-specific CSVs for training pipeline
# These names carry no timestamp, so each run overwrites the previous files.
train_df.to_csv(Path(CONFIG['metadata_path']) / 'train_metadata.csv', index=False)
val_df.to_csv(Path(CONFIG['metadata_path']) / 'val_metadata.csv', index=False)
test_df.to_csv(Path(CONFIG['metadata_path']) / 'test_metadata.csv', index=False)

print(f"✓ Split-specific metadata saved")

print("\n" + "="*50)
print("✓ All metadata exported successfully!")
print("="*50)

## Summary & Next Steps

### Current Status:
- ✓ Dataset loaded and validated
- ✓ Data exploration and visualization completed
- ✓ Quality checks performed
- ✓ Metadata exported for MLOps pipeline

### Next Steps for BLIP Fine-tuning:
1. **Data Preprocessing**: Resize images, normalize, augmentation
2. **Feature Engineering**: Prepare emotion labels for BLIP format
3. **Model Setup**: Configure BLIP model for fine-tuning
4. **Training Pipeline**: Implement training with MLOps tracking
5. **Evaluation**: Model performance on test set
6. **Deployment**: Model versioning and serving

### MLOps Considerations:
- Data versioning with DVC or similar
- Experiment tracking with MLflow/Weights & Biases
- Model registry for version control
- CI/CD pipeline for automated training
- Monitoring and logging infrastructure

In [None]:
# Print final summary
print("\n" + "="*60)
print(" " * 15 + "DATA PREPARATION COMPLETE")
print("="*60)
print(f"\nDataset: RAF-DB Balanced (Grayscale)")
print(f"Total Images: {len(full_dataset_df):,}")
print(f"Train: {len(train_df):,} | Val: {len(val_df):,} | Test: {len(test_df):,}")
print(f"\nEmotion Classes: {len(CONFIG['emotion_labels'])}")
print(f"Format: Grayscale")
print(f"\nMetadata Location: {CONFIG['metadata_path']}")
print(f"Timestamp: {CONFIG['timestamp']}")
print("\n" + "="*60)

# Save dataset metadata to Google Drive for training notebook
# NOTE(review): the target is BASE_PATH/data/metadata in every environment;
# it is only on Google Drive when the Colab mount succeeded — confirm the messages below.
print("\n" + "="*70)
print("Saving dataset metadata to Google Drive for training notebook...")
print("="*70 + "\n")

# Create metadata directory on Google Drive
# Built from BASE_PATH the same way as CONFIG['metadata_path'].
metadata_dir = Path(BASE_PATH) / 'data' / 'metadata'
metadata_dir.mkdir(parents=True, exist_ok=True)

# Save full dataset metadata as CSV
# Fixed file name (no timestamp), so it is overwritten on every run.
dataset_csv_path = metadata_dir / 'dataset_metadata.csv'
full_dataset_df.to_csv(dataset_csv_path, index=False)
print(f"✓ Full dataset metadata: {dataset_csv_path}")

# Save split-specific metadata
# Uses the same file names as the split CSVs written by the export cell.
train_df.to_csv(metadata_dir / 'train_metadata.csv', index=False)
val_df.to_csv(metadata_dir / 'val_metadata.csv', index=False)
test_df.to_csv(metadata_dir / 'test_metadata.csv', index=False)
print(f"✓ Split-specific metadata saved")

# Save dataset summary JSON
summary_json = {
    'total_images': len(full_dataset_df),
    'train_images': len(train_df),
    'val_images': len(val_df),
    'test_images': len(test_df),
    'emotions': CONFIG['emotion_labels'],
    'timestamp': CONFIG['timestamp'],
    # NOTE(review): records BASE_PATH, while the images are loaded from
    # CONFIG['raw_data_path'] (the kagglehub download) — verify which one consumers expect.
    'data_location': BASE_PATH
}

with open(metadata_dir / 'dataset_summary.json', 'w') as f:
    json.dump(summary_json, f, indent=2)

print(f"✓ Dataset summary saved")
print(f"\n✓ All metadata saved to Google Drive: {metadata_dir}")
print(f"\nNext step: Run 02_blip_training.ipynb to train BLIP on this dataset")
print("Ready for BLIP fine-tuning pipeline!")
print("="*60)