In [4]:
import os
import pandas as pd
import numpy as np
import librosa
import soundfile as sf
import random
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

# Configuration
TARGET_SR = 16000
TARGET_DURATION = 3.0  # seconds
SAMPLES_PER_EMOTION = 30
OUTPUT_DIR = 'validation_dataset'
AUDIO_DIR = os.path.join(OUTPUT_DIR, 'audio_files')

# Create output directories
os.makedirs(AUDIO_DIR, exist_ok=True)

print("🎵 Speech Emotion Recognition Validation Dataset Generator")
print("=" * 60)

# ================================
# 1. DATASET LOADING AND EXPLORATION
# ================================

def load_crema_d(base_path):
    """Load CREMA-D dataset information"""
    audio_files = []
    crema_path = base_path
    
    # CREMA-D emotion mapping
    emotion_map = {
        'ANG': 'angry',
        'DIS': 'disgust', 
        'FEA': 'fear',
        'HAP': 'happy',
        'NEU': 'neutral',
        'SAD': 'sad'
    }
    
    for file in os.listdir(crema_path):
        if file.endswith('.wav'):
            parts = file.split('_')
            if len(parts) >= 3:
                actor_id = parts[0]
                emotion_code = parts[2]
                
                if emotion_code in emotion_map:
                    audio_files.append({
                        'filename': file,
                        'filepath': os.path.join(crema_path, file),
                        'emotion': emotion_map[emotion_code],
                        'actor_id': actor_id,
                        'dataset': 'CREMA-D',
                        'original_emotion_code': emotion_code
                    })
    
    return audio_files

def load_ravdess(base_path):
    """Load RAVDESS dataset information"""
    audio_files = []
    
    # RAVDESS emotion mapping (from filename position 3)
    emotion_map = {
        '01': 'neutral',
        '02': 'calm', 
        '03': 'happy',
        '04': 'sad',
        '05': 'angry',
        '06': 'fear',
        '07': 'disgust',
        '08': 'surprise'
    }
    
    # Find all audio files in RAVDESS structure
    for root, dirs, files in os.walk(base_path):
        for file in files:
            if file.endswith('.wav') and file.startswith('03-01-'):  # Audio-only speech files
                parts = file.split('-')
                if len(parts) >= 7:
                    emotion_code = parts[2]
                    actor_id = parts[6].split('.')[0]
                    
                    if emotion_code in emotion_map:
                        # Map 'calm' to 'neutral' for consistency
                        emotion = emotion_map[emotion_code]
                        if emotion == 'calm':
                            emotion = 'neutral'
                            
                        audio_files.append({
                            'filename': file,
                            'filepath': os.path.join(root, file),
                            'emotion': emotion,
                            'actor_id': f"R{actor_id}",  # Prefix to distinguish from CREMA-D
                            'dataset': 'RAVDESS',
                            'original_emotion_code': emotion_code
                        })
    
    return audio_files

# Load datasets
print("📂 Loading CREMA-D dataset...")
crema_files = load_crema_d('/kaggle/input/cremad/AudioWAV')
print(f"   Found {len(crema_files)} CREMA-D files")

print("📂 Loading RAVDESS dataset...")
ravdess_files = load_ravdess('/kaggle/input/ravdess-emotional-speech-audio')
print(f"   Found {len(ravdess_files)} RAVDESS files")

# Combine and analyze
all_files = crema_files + ravdess_files
df_all = pd.DataFrame(all_files)

print("\n📊 Dataset Overview:")
overview_table = df_all.groupby(['emotion', 'dataset']).size().unstack(fill_value=0)
print(overview_table)
print(f"\nTotal files loaded: {len(df_all)}")
print(f"Emotions available in both datasets: {set(df_all[df_all['dataset']=='CREMA-D']['emotion']) & set(df_all[df_all['dataset']=='RAVDESS']['emotion'])}")
print(f"Emotions only in RAVDESS: {set(df_all[df_all['dataset']=='RAVDESS']['emotion']) - set(df_all[df_all['dataset']=='CREMA-D']['emotion'])}")
print(f"Emotions only in CREMA-D: {set(df_all[df_all['dataset']=='CREMA-D']['emotion']) - set(df_all[df_all['dataset']=='RAVDESS']['emotion'])}")

# ================================
# 2. BALANCED SAMPLING STRATEGY
# ================================

def sample_balanced_dataset(df, samples_per_emotion=30):
    """Sample a balanced dataset with speaker variety from both datasets"""
    final_samples = []
    
    # Define target emotions
    target_emotions = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
    
    for emotion in target_emotions:
        print(f"\n🎯 Sampling {emotion} emotion:")
        
        if emotion == 'surprise':
            # Surprise only from RAVDESS (CREMA-D doesn't have surprise)
            emotion_files = df[(df['emotion'] == emotion) & (df['dataset'] == 'RAVDESS')].copy()
            print(f"   Available RAVDESS files: {len(emotion_files)}")
            
            if len(emotion_files) == 0:
                print(f"   ⚠️  No files found for {emotion}")
                continue
                
            # Sample from RAVDESS only
            if len(emotion_files) >= samples_per_emotion:
                sampled = emotion_files.sample(n=samples_per_emotion, random_state=42).to_dict('records')
            else:
                sampled = emotion_files.to_dict('records')
                print(f"   ⚠️  Only {len(sampled)} files available (less than {samples_per_emotion})")
        
        else:
            # For other emotions, sample from BOTH datasets
            crema_files = df[(df['emotion'] == emotion) & (df['dataset'] == 'CREMA-D')].copy()
            ravdess_files = df[(df['emotion'] == emotion) & (df['dataset'] == 'RAVDESS')].copy()
            
            print(f"   Available CREMA-D files: {len(crema_files)}")
            print(f"   Available RAVDESS files: {len(ravdess_files)}")
            
            # Calculate how many samples from each dataset
            total_available = len(crema_files) + len(ravdess_files)
            if total_available == 0:
                print(f"   ⚠️  No files found for {emotion}")
                continue
            
            # Aim for roughly equal split between datasets when possible
            if len(crema_files) > 0 and len(ravdess_files) > 0:
                # Both datasets have this emotion
                crema_samples = min(samples_per_emotion // 2, len(crema_files))
                ravdess_samples = min(samples_per_emotion - crema_samples, len(ravdess_files))
                
                # If one dataset can't provide enough, take more from the other
                if crema_samples < samples_per_emotion // 2 and len(ravdess_files) > ravdess_samples:
                    additional_needed = (samples_per_emotion // 2) - crema_samples
                    ravdess_samples = min(ravdess_samples + additional_needed, len(ravdess_files))
                elif ravdess_samples < (samples_per_emotion - samples_per_emotion // 2) and len(crema_files) > crema_samples:
                    additional_needed = samples_per_emotion - crema_samples - ravdess_samples
                    crema_samples = min(crema_samples + additional_needed, len(crema_files))
                
                print(f"   Sampling {crema_samples} from CREMA-D, {ravdess_samples} from RAVDESS")
                
                sampled = []
                if crema_samples > 0:
                    sampled.extend(crema_files.sample(n=crema_samples, random_state=42).to_dict('records'))
                if ravdess_samples > 0:
                    sampled.extend(ravdess_files.sample(n=ravdess_samples, random_state=42).to_dict('records'))
                    
            elif len(crema_files) > 0:
                # Only CREMA-D has this emotion
                n_samples = min(samples_per_emotion, len(crema_files))
                sampled = crema_files.sample(n=n_samples, random_state=42).to_dict('records')
                print(f"   Sampling {n_samples} from CREMA-D only")
                
            elif len(ravdess_files) > 0:
                # Only RAVDESS has this emotion
                n_samples = min(samples_per_emotion, len(ravdess_files))
                sampled = ravdess_files.sample(n=n_samples, random_state=42).to_dict('records')
                print(f"   Sampling {n_samples} from RAVDESS only")
                
            else:
                sampled = []
        
        print(f"   ✅ Selected: {len(sampled)} files")
        if len(sampled) > 0:
            datasets_used = set([s['dataset'] for s in sampled])
            actors_used = len(set([s['actor_id'] for s in sampled]))
            print(f"   📊 Datasets used: {', '.join(datasets_used)}")
            print(f"   🎭 Unique actors: {actors_used}")
        
        final_samples.extend(sampled)
    
    return final_samples

# Sample balanced dataset
print("\n🎲 Creating balanced sample...")
selected_samples = sample_balanced_dataset(df_all, SAMPLES_PER_EMOTION)
print(f"\n✅ Total selected samples: {len(selected_samples)}")

# ================================
# 3. AUDIO PROCESSING FUNCTIONS
# ================================

def apply_conservative_perturbations(audio, sr):
    """Apply conservative perturbations to audio"""
    perturbations = []
    
    # Random selection of 1-2 perturbations
    available_perturbations = ['noise', 'pitch', 'time_stretch', 'volume']
    selected = random.sample(available_perturbations, k=random.randint(1, 2))
    
    for perturbation in selected:
        if perturbation == 'noise':
            # Add subtle white noise (SNR 20-25 dB)
            noise_level = random.uniform(0.002, 0.005)
            noise = np.random.normal(0, noise_level, len(audio))
            audio = audio + noise
            perturbations.append(f"noise_{noise_level:.3f}")
            
        elif perturbation == 'pitch':
            # Subtle pitch shift (±1 semitone)
            n_steps = random.uniform(-0.3, 0.3)
            audio = librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)
            perturbations.append(f"pitch_{n_steps:.2f}")
            
        elif perturbation == 'time_stretch':
            # Subtle time stretching (0.97-1.03x)
            rate = random.uniform(0.98, 1.02)
            audio = librosa.effects.time_stretch(audio, rate=rate)
            perturbations.append(f"stretch_{rate:.3f}")
            
        elif perturbation == 'volume':
            # Subtle volume scaling (±3 dB)
            db_change = random.uniform(-2, 2)
            scale_factor = 10 ** (db_change / 20)
            audio = audio * scale_factor
            perturbations.append(f"volume_{db_change:.1f}dB")
    
    return audio, perturbations

def normalize_audio_duration(audio, sr, target_duration=3.0):
    """Normalize audio to target duration"""
    target_length = int(target_duration * sr)
    
    if len(audio) > target_length:
        # Take center portion
        start = (len(audio) - target_length) // 2
        audio = audio[start:start + target_length]
    elif len(audio) < target_length:
        # Pad with silence
        pad_length = target_length - len(audio)
        audio = np.pad(audio, (0, pad_length), mode='constant', constant_values=0)
    
    return audio

def apply_loudness_normalization(audio, target_lufs=-23.0):
    """Apply loudness normalization (simplified version)"""
    # RMS-based normalization as approximation
    rms = np.sqrt(np.mean(audio**2))
    if rms > 0:
        # Target RMS corresponding to approximately -23 LUFS
        target_rms = 0.1  # Adjust based on testing
        scale_factor = target_rms / rms
        audio = audio * scale_factor
    
    # Prevent clipping
    max_val = np.max(np.abs(audio))
    if max_val > 0.95:
        audio = audio * (0.95 / max_val)
    
    return audio

# ================================
# 4. PROCESS ALL SAMPLES
# ================================

def process_audio_file(sample_info, output_idx):
    """Process a single audio file"""
    try:
        # Load audio
        audio, orig_sr = librosa.load(sample_info['filepath'], sr=None)
        
        # Resample to target sample rate
        if orig_sr != TARGET_SR:
            audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=TARGET_SR)
        
        # Normalize duration to 3 seconds
        audio = normalize_audio_duration(audio, TARGET_SR, TARGET_DURATION)
        
        # Apply conservative perturbations
        audio, perturbations = apply_conservative_perturbations(audio, TARGET_SR)
        
        # Apply loudness normalization
        audio = apply_loudness_normalization(audio)
        
        # Generate output filename
        output_filename = f"val_{output_idx:03d}.wav"
        output_path = os.path.join(AUDIO_DIR, output_filename)
        
        # Save processed audio
        sf.write(output_path, audio, TARGET_SR)
        
        # Create metadata record
        metadata = {
            'filename': output_filename,
            'emotion_label': sample_info['emotion'],
            'original_filename': sample_info['filename'],
            'original_dataset': sample_info['dataset'],
            'actor_id': sample_info['actor_id'],
            'original_emotion_code': sample_info['original_emotion_code'],
            'duration_seconds': TARGET_DURATION,
            'sample_rate': TARGET_SR,
            'perturbations_applied': ','.join(perturbations),
            'processing_success': True
        }
        
        return metadata
        
    except Exception as e:
        print(f"❌ Error processing {sample_info['filename']}: {str(e)}")
        return {
            'filename': f"val_{output_idx:03d}.wav",
            'emotion_label': sample_info['emotion'],
            'original_filename': sample_info['filename'],
            'processing_success': False,
            'error': str(e)
        }

# Process all selected samples
print("\n🔄 Processing audio files...")
processed_metadata = []

for idx, sample in enumerate(selected_samples):
    if (idx + 1) % 20 == 0:
        print(f"   Processed {idx + 1}/{len(selected_samples)} files...")
    
    metadata = process_audio_file(sample, idx + 1)
    processed_metadata.append(metadata)

# ================================
# 5. CREATE OUTPUT CSV AND SUMMARY
# ================================

# Create comprehensive CSV
df_output = pd.DataFrame(processed_metadata)

# Add additional useful columns
df_output['emotion_id'] = df_output['emotion_label'].map({
    'angry': 0, 'disgust': 1, 'fear': 2, 'happy': 3, 
    'neutral': 4, 'sad': 5, 'surprise': 6
})

# Save CSV
csv_path = os.path.join(OUTPUT_DIR, 'validation_labels.csv')
df_output.to_csv(csv_path, index=False)

# Create summary statistics
successful_files = df_output[df_output['processing_success'] == True]

print("\n📋 VALIDATION DATASET SUMMARY")
print("=" * 50)
print(f"Total files processed: {len(df_output)}")
print(f"Successfully processed: {len(successful_files)}")
print(f"Audio specifications: {TARGET_SR}Hz, {TARGET_DURATION}s duration")

print(f"\n📊 Emotion Distribution:")
emotion_counts = successful_files['emotion_label'].value_counts().sort_index()
for emotion, count in emotion_counts.items():
    print(f"   {emotion.capitalize()}: {count} samples")

print(f"\n📊 Dataset Source Distribution:")
source_counts = successful_files['original_dataset'].value_counts()
for source, count in source_counts.items():
    print(f"   {source}: {count} samples")

print(f"\n📊 Perturbation Statistics:")
all_perturbations = []
for perturbs in successful_files['perturbations_applied'].dropna():
    if perturbs:
        all_perturbations.extend(perturbs.split(','))

from collections import Counter
perturbation_counts = Counter([p.split('_')[0] for p in all_perturbations])
for perturbation, count in perturbation_counts.items():
    print(f"   {perturbation}: {count} applications")

print(f"\n💾 Output Files:")
print(f"   Audio files: {OUTPUT_DIR}/audio_files/ ({len(successful_files)} WAV files)")
print(f"   Metadata CSV: {csv_path}")

print(f"\n✅ Validation dataset generation complete!")
print(f"📁 Files saved to: {OUTPUT_DIR}/")

# Display first few rows of the CSV for verification
print(f"\n📋 Sample CSV contents:")
print(successful_files[['filename', 'emotion_label', 'original_dataset', 'actor_id']].head(10))

🎵 Speech Emotion Recognition Validation Dataset Generator
📂 Loading CREMA-D dataset...
   Found 7442 CREMA-D files
📂 Loading RAVDESS dataset...
   Found 2880 RAVDESS files

📊 Dataset Overview:
dataset   CREMA-D  RAVDESS
emotion                   
angry        1271      384
disgust      1271      384
fear         1271      384
happy        1271      384
neutral      1087      576
sad          1271      384
surprise        0      384

Total files loaded: 10322
Emotions available in both datasets: {'neutral', 'angry', 'disgust', 'sad', 'happy', 'fear'}
Emotions only in RAVDESS: {'surprise'}
Emotions only in CREMA-D: set()

🎲 Creating balanced sample...

🎯 Sampling angry emotion:
   Available CREMA-D files: 1271
   Available RAVDESS files: 384
   Sampling 15 from CREMA-D, 15 from RAVDESS
   ✅ Selected: 30 files
   📊 Datasets used: CREMA-D, RAVDESS
   🎭 Unique actors: 23

🎯 Sampling disgust emotion:
   Available CREMA-D files: 1271
   Available RAVDESS files: 384
   Sampling 15 from CREMA-D

In [5]:
import shutil
shutil.make_archive('validation_dataset', 'zip', 'validation_dataset')
print("📦 Validation dataset zipped! Download 'validation_dataset.zip' from the output section.")

📦 Validation dataset zipped! Download 'validation_dataset.zip' from the output section.
