In [7]:
import os
import glob
import pandas as pd
import cv2
import soundfile as sf

# üÜï YOUR CREMA-D FOLDER PATH
CREMA_ROOT = r"/Users/anzheladavityan/Desktop/crema-d-mirror"

def parse_crema_filename(filename):
    """Parse CREMA-D: 1001_IEO_HAP_HI.wav ‚Üí RAVDESS format"""
    base = os.path.splitext(filename)[0]
    parts = base.split('_')
    if len(parts) != 4:
        return None
    
    actor = parts[0]        # 1001
    statement = parts[1]    # IEO
    emotion = parts[2]      # HAP
    intensity = parts[3]    # HI
    
    # Map to RAVDESS numeric codes
    emotion_map = {'ANG': '01', 'DIS': '02', 'FEA': '03', 'HAP': '04', 'NEU': '05', 'SAD': '06'}
    intensity_map = {'LO': '01', 'MD': '02', 'HI': '03', 'XX': '01'}
    
    return {
        'Actor': actor,
        'Statement': statement,
        'Emotion': emotion_map.get(emotion, '05'),
        'Emotional_intensity': intensity_map.get(intensity, '01'),
        'Modality': '02',  # Video=02 (FLV), Audio=01 (WAV)
        'Vocal_channel': '01',#CREMA-D contains only speech, not song, to be similar to ravdess
        'Repetition': '01' #it doesnt have repetition, so to be similar to ravdess
    }

# Scan files
print("üîç Scanning CREMA-D...")
wav_files = glob.glob(os.path.join(CREMA_ROOT, '**', 'AudioWAV', '*.wav'), recursive=True)
flv_files = glob.glob(os.path.join(CREMA_ROOT, '**', 'VideoFlash', '*.flv'), recursive=True)

print(f"üîä AudioWAV/*.wav: {len(wav_files)}")
print(f"üìπ VideoFlash/*.flv: {len(flv_files)}")

output_csv = os.path.join(CREMA_ROOT, 'crema_data_manifest.csv')
if os.path.exists(output_csv):
    os.remove(output_csv)

total_rows = 0
first_write = True
global_row_id = 1
video_frame_count = 0

print("\nüîÑ Processing...")

# üé• VIDEO FLV FILES: YOUR EXACT RAVDESS FORMAT
print("Processing VideoFlash FLV (frame-level)...")
for full_path in flv_files:
    filename = os.path.basename(full_path)
    parsed = parse_crema_filename(filename)
    if not parsed:
        continue
    
    cap = cv2.VideoCapture(full_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    video_frame_count += total_frames
    fps = cap.get(cv2.CAP_PROP_FPS)
    duration = total_frames / fps if fps > 0 else 0
    cap.release()
    
    # üü¢ YOUR EXACT RAVDESS VIDEO FORMAT
    video_rows = []
    for frame_idx in range(total_frames):
        row = {
            'ID': global_row_id,
            'file_path': full_path,
            'filename': filename,
            'frame_idx': frame_idx,
            'file_type': 'video',
            'Modality': parsed['Modality'],
            'Vocal_channel': parsed['Vocal_channel'],
            'Emotion': parsed['Emotion'],
            'Emotional_intensity': parsed['Emotional_intensity'],
            'Statement': parsed['Statement'],
            'Repetition': parsed['Repetition'],
            'Actor': parsed['Actor'],
            'total_frames': total_frames,           # üü¢ YOUR COLUMN
            'fps': round(fps, 2),                   # üü¢ YOUR COLUMN
            'duration_seconds': round(duration, 2)  # üü¢ YOUR COLUMN
        }
        video_rows.append(row)
        global_row_id += 1
    
    pd.DataFrame(video_rows).to_csv(output_csv, mode='a', header=first_write, 
                                   chunksize=10000, index=False)
    total_rows += len(video_rows)
    first_write = False

print(f"‚úÖ Videos: {len(flv_files)} ‚Üí {video_frame_count:,} frame rows")

# üîä AUDIO WAV FILES: YOUR EXACT RAVDESS FORMAT  
print("Processing AudioWAV (single row per file)...")
for full_path in wav_files:
    filename = os.path.basename(full_path)
    parsed = parse_crema_filename(filename.replace('.wav', '.flv'))  # Use video naming
    if not parsed:
        continue
    
    audio_info = sf.info(full_path)
    duration = audio_info.duration
    
    # üü¢ YOUR EXACT RAVDESS VIDEO FORMAT (adapted for audio)
    row = {
        'ID': global_row_id,
        'file_path': full_path,
        'filename': filename,
        'frame_idx': -1,  # Single metadata row
        'file_type': 'audio',
        'Modality': '01',  # Audio modality
        'Vocal_channel': parsed['Vocal_channel'],
        'Emotion': parsed['Emotion'],
        'Emotional_intensity': parsed['Emotional_intensity'],
        'Statement': parsed['Statement'],
        'Repetition': parsed['Repetition'],
        'Actor': parsed['Actor'],
        'total_frames': 1,                      # Single "frame" for audio metadata
        'fps': None,                            # No FPS for audio
        'duration_seconds': round(duration, 2)
    }
    global_row_id += 1
    
    pd.DataFrame([row]).to_csv(output_csv, mode='a', header=False, 
                              chunksize=10000, index=False)
    total_rows += 1

print(f"\n‚úÖ COMPLETE CREMA-D!")
print(f"üìä EXACT COUNTS:")
print(f"üìπ VideoFlash: {len(flv_files)} ‚Üí {video_frame_count:,} frame rows")
print(f"üîä AudioWAV: {len(wav_files)} ‚Üí {len(wav_files):,} metadata rows")
print(f"üìÑ Total rows: {total_rows:,}")
print(f"üÜî Final ID: {global_row_id-1}")
print(f"üìÅ crema_data_manifest.csv")
print(f"‚úÖ EXACTLY YOUR 15 RAVDESS COLUMNS!")


üîç Scanning CREMA-D...
üîä AudioWAV/*.wav: 7442
üìπ VideoFlash/*.flv: 7442

üîÑ Processing...
Processing VideoFlash FLV (frame-level)...
‚úÖ Videos: 7442 ‚Üí 570,622 frame rows
Processing AudioWAV (single row per file)...

‚úÖ COMPLETE CREMA-D!
üìä EXACT COUNTS:
üìπ VideoFlash: 7442 ‚Üí 570,622 frame rows
üîä AudioWAV: 7442 ‚Üí 7,442 metadata rows
üìÑ Total rows: 578,064
üÜî Final ID: 578064
üìÅ crema_data_manifest.csv
‚úÖ EXACTLY YOUR 15 RAVDESS COLUMNS!


In [2]:
df = pd.read_csv('/Users/anzheladavityan/Desktop/crema-d-mirror/crema_data_manifest.csv')
df.head()

Unnamed: 0,ID,file_path,filename,frame_idx,file_type,Modality,Vocal_channel,Emotion,Emotional_intensity,Statement,Repetition,Actor,total_frames,fps,duration_seconds
0,1,/Users/anzheladavityan/Desktop/crema-d-mirror/...,1027_IWL_HAP_XX.flv,0,video,2,1,4,1,IWL,1,1027,69,29.97,2.3
1,2,/Users/anzheladavityan/Desktop/crema-d-mirror/...,1027_IWL_HAP_XX.flv,1,video,2,1,4,1,IWL,1,1027,69,29.97,2.3
2,3,/Users/anzheladavityan/Desktop/crema-d-mirror/...,1027_IWL_HAP_XX.flv,2,video,2,1,4,1,IWL,1,1027,69,29.97,2.3
3,4,/Users/anzheladavityan/Desktop/crema-d-mirror/...,1027_IWL_HAP_XX.flv,3,video,2,1,4,1,IWL,1,1027,69,29.97,2.3
4,5,/Users/anzheladavityan/Desktop/crema-d-mirror/...,1027_IWL_HAP_XX.flv,4,video,2,1,4,1,IWL,1,1027,69,29.97,2.3


In [3]:
import pandas as pd
import numpy as np

print("üìã COLUMNS:", list(df.columns))
print(f"üìä Shape: {df.shape}")

print("\nüîç CREMA-D SANITY CHECK")
print("=" * 60)

# ---------------------------------------------------
# 1Ô∏è‚É£ BASIC STRUCTURE
# ---------------------------------------------------
print("\nüìä BASIC STATS")

total_rows = len(df)
unique_files = df['filename'].nunique()

video_df = df[df['file_type'] == 'video']
audio_df = df[df['file_type'] == 'audio']

video_files = video_df['filename'].nunique()
audio_files = audio_df['filename'].nunique()

print(f"   Total rows (frame-expanded): {total_rows:,}")
print(f"   Unique files total: {unique_files:,}")
print(f"   Unique video files: {video_files:,}")
print(f"   Unique audio files: {audio_files:,}")

# ---------------------------------------------------
# 2Ô∏è‚É£ DATASET EXPECTATIONS (CREMA-D TRUE SPECS)
# ---------------------------------------------------
print("\nüìö DATASET SPEC CHECK (CREMA-D Official)")

expected_files = 7442
expected_total = 14884

print(f"   Expected video files: 7,442")
print(f"   Expected audio files: 7,442")
print(f"   Expected total files: 14,884")

# ---------------------------------------------------
# 3Ô∏è‚É£ ACTOR / LABEL VALIDATION
# ---------------------------------------------------
print("\nüé≠ LABEL VALIDATION")

actors_unique = df['Actor'].nunique()
actor_min = df['Actor'].min()
actor_max = df['Actor'].max()

emotion_unique = df['Emotion'].nunique()
intensity_unique = df['Emotional_intensity'].nunique()
statement_unique = df['Statement'].nunique()

print(f"   Actors: {actors_unique} | Range: {actor_min}-{actor_max} (expect 1001‚Äì1091)")
print(f"   Emotions: {emotion_unique} (expect 6)")
print(f"   Intensity levels: {intensity_unique} (expect 3)")
print(f"   Statements: {statement_unique} (expect 12)")

# Safe numeric validation
valid_actor = actor_min >= 1001 and actor_max <= 1091
valid_emotion = set(df['Emotion'].astype(int).unique()) == {1,2,3,4,5,6}
valid_intensity = set(df['Emotional_intensity'].astype(int).unique()) == {1,2,3}

print(f"   Actor range valid ‚úì: {valid_actor}")
print(f"   Emotion codes valid ‚úì: {valid_emotion}")
print(f"   Intensity codes valid ‚úì: {valid_intensity}")

# ---------------------------------------------------
# 4Ô∏è‚É£ VIDEO VALIDATION
# ---------------------------------------------------
print("\nüìπ VIDEO VALIDATION")

if len(video_df) > 0:
    frames_per_file = video_df.groupby('filename')['frame_idx'].count()
    
    print(f"   Frames per video: {frames_per_file.min()} - {frames_per_file.max()}")
    print(f"   Avg frames: {frames_per_file.mean():.0f}")
    
    if 'fps' in df.columns:
        print(f"   FPS avg: {video_df['fps'].mean():.1f}")
    
    if 'duration_seconds' in df.columns:
        print(f"   Duration avg: {video_df.groupby('filename')['duration_seconds'].first().mean():.2f}s")

    # Logical validation (CREMA-D videos ~2‚Äì4 sec @30fps ‚âà 60‚Äì120 frames)
    realistic_frames = frames_per_file.between(30, 2000).all()
    print(f"   Frame count realistic ‚úì: {realistic_frames}")
else:
    realistic_frames = False

# ---------------------------------------------------
# 5Ô∏è‚É£ AUDIO VALIDATION
# ---------------------------------------------------
print("\nüîä AUDIO VALIDATION")

if len(audio_df) > 0:
    print(f"   frame_idx all -1 ‚úì: {(audio_df['frame_idx'] == -1).all()}")
    
    if 'duration_seconds' in df.columns:
        print(f"   Duration avg: {audio_df['duration_seconds'].mean():.2f}s")

# ---------------------------------------------------
# 6Ô∏è‚É£ ID VALIDATION
# ---------------------------------------------------
print("\nüÜî ID VALIDATION")

id_unique = df['ID'].is_unique
id_sequential = df['ID'].is_monotonic_increasing

print(f"   Unique IDs ‚úì: {id_unique}")
print(f"   Sequential IDs ‚úì: {id_sequential}")

# ---------------------------------------------------
# 7Ô∏è‚É£ FINAL VERDICT
# ---------------------------------------------------
print("\n" + "="*60)
print("üéØ FINAL VERDICT")

checks = [
    ("Correct video file count", video_files == expected_files),
    ("Correct audio file count", audio_files == expected_files),
    ("Correct total file count", unique_files == expected_total),
    ("Valid actor range", valid_actor),
    ("Valid emotion codes", valid_emotion),
    ("Valid intensity codes", valid_intensity),
    ("Realistic video frames", realistic_frames),
    ("Unique IDs", id_unique)
]

passed = sum(1 for _, ok in checks if ok)
print(f"‚úÖ {passed}/{len(checks)} CHECKS PASSED\n")

for name, ok in checks:
    print(f"{'‚úÖ PASS' if ok else '‚ùå FAIL':<8} {name}")

print("\nüìà SUMMARY")
print(f"   Files: {unique_files:,}")
print(f"   Actors: {actors_unique}")
print(f"   Emotions: {emotion_unique}")

üìã COLUMNS: ['ID', 'file_path', 'filename', 'frame_idx', 'file_type', 'Modality', 'Vocal_channel', 'Emotion', 'Emotional_intensity', 'Statement', 'Repetition', 'Actor', 'total_frames', 'fps', 'duration_seconds']
üìä Shape: (578064, 15)

üîç CREMA-D SANITY CHECK

üìä BASIC STATS
   Total rows (frame-expanded): 578,064
   Unique files total: 14,884
   Unique video files: 7,442
   Unique audio files: 7,442

üìö DATASET SPEC CHECK (CREMA-D Official)
   Expected video files: 7,442
   Expected audio files: 7,442
   Expected total files: 14,884

üé≠ LABEL VALIDATION
   Actors: 91 | Range: 1001-1091 (expect 1001‚Äì1091)
   Emotions: 6 (expect 6)
   Intensity levels: 3 (expect 3)
   Statements: 12 (expect 12)
   Actor range valid ‚úì: True
   Emotion codes valid ‚úì: True
   Intensity codes valid ‚úì: True

üìπ VIDEO VALIDATION
   Frames per video: 2 - 1851
   Avg frames: 77
   FPS avg: 30.0
   Duration avg: 2.56s
   Frame count realistic ‚úì: False

üîä AUDIO VALIDATION
   frame_idx a