In [13]:
# ==========================
# 📦 Data Preprocessing: Unified Cell (Google Colab)
# ==========================

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import libraries
import os
import zipfile
import pandas as pd
import librosa

# Logging helper
def log(msg):
    print(f"[INFO] {msg}")

# Main preprocessing function
def extract_and_label_data_colab(ravdess_zip, cremad_zip, extract_root, save_path=None, preview=True):
    """
    Extracts audio files from ZIPs, parses emotion labels, performs cleaning,
    and returns a labeled DataFrame. Saves to CSV if path is provided.
    """

    # Setup directories
    ravdess_path = os.path.join(extract_root, "ravdess")
    cremad_path = os.path.join(extract_root, "cremad")
    os.makedirs(ravdess_path, exist_ok=True)
    os.makedirs(cremad_path, exist_ok=True)

    # Extract ZIP files
    log("Extracting RAVDESS...")
    with zipfile.ZipFile(ravdess_zip, 'r') as zip_ref:
        zip_ref.extractall(ravdess_path)

    log("Extracting CREMA-D...")
    with zipfile.ZipFile(cremad_zip, 'r') as zip_ref:
        zip_ref.extractall(cremad_path)

    target_emotions = ['happy', 'sad', 'neutral', 'angry']

    # Parse emotion from filename
    def parse_ravdess(filename):
        try:
            code = int(filename.split('-')[2])
            return {1: 'neutral', 3: 'happy', 4: 'sad', 5: 'angry'}.get(code)
        except:
            return None

    def parse_cremad(filename):
        try:
            code = filename.split('_')[2]
            return {'ANG': 'angry', 'HAP': 'happy', 'NEU': 'neutral', 'SAD': 'sad'}.get(code)
        except:
            return None

    # Collect metadata
    ravdess_data = []
    for root, _, files in os.walk(ravdess_path):
        for file in files:
            if file.endswith('.wav'):
                emotion = parse_ravdess(file)
                full_path = os.path.join(root, file)
                if emotion in target_emotions:
                    ravdess_data.append({
                        'file_path': full_path,
                        'emotion': emotion,
                        'dataset': 'RAVDESS'
                    })

    cremad_data = []
    for root, _, files in os.walk(cremad_path):
        for file in files:
            if file.endswith('.wav'):
                emotion = parse_cremad(file)
                full_path = os.path.join(root, file)
                if emotion in target_emotions:
                    cremad_data.append({
                        'file_path': full_path,
                        'emotion': emotion,
                        'dataset': 'CREMA-D'
                    })

    df = pd.DataFrame(ravdess_data + cremad_data)
    log(f"Initial size: {df.shape}")

    # ========================
    # ✅ Data Cleaning
    # ========================

    # Drop nulls
    df.dropna(subset=['file_path', 'emotion'], inplace=True)

    # Remove missing or broken files
    df = df[df['file_path'].apply(lambda x: os.path.exists(x))]

    # Remove files < 1 sec (optional quality check)
    def is_valid_duration(path, min_sec=1.0):
        try:
            duration = librosa.get_duration(path=path)
            return duration >= min_sec
        except:
            return False

    df = df[df['file_path'].apply(is_valid_duration)]
    df.reset_index(drop=True, inplace=True)

    # Show stats
    if preview:
        log("✅ Sample rows:")
        print(df.head())
        log("📊 Emotion distribution:")
        print(df['emotion'].value_counts())
        log("📂 Dataset distribution:")
        print(df['dataset'].value_counts())

    # Save to CSV
    if save_path:
        df.to_csv(save_path, index=False)
        log(f"📁 Saved to: {save_path}")

    return df

# ================
# ✅ Run It
# ================
df = extract_and_label_data_colab(
    ravdess_zip="/content/drive/MyDrive/capstone_data/archive.zip",
    cremad_zip="/content/drive/MyDrive/capstone_data/crema.zip",
    extract_root="/content/dataset",
    save_path="/content/drive/MyDrive/capstone_data/combined_clean_metadata.csv",
    preview=True
)


Mounted at /content/drive
[INFO] Extracting RAVDESS...
[INFO] Extracting CREMA-D...
[INFO] Initial size: (6244, 3)
[INFO] ✅ Sample rows:
                                           file_path emotion  dataset
0  /content/dataset/ravdess/Actor_15/03-01-05-02-...   angry  RAVDESS
1  /content/dataset/ravdess/Actor_15/03-01-05-02-...   angry  RAVDESS
2  /content/dataset/ravdess/Actor_15/03-01-03-02-...   happy  RAVDESS
3  /content/dataset/ravdess/Actor_15/03-01-03-01-...   happy  RAVDESS
4  /content/dataset/ravdess/Actor_15/03-01-03-02-...   happy  RAVDESS
[INFO] 📊 Emotion distribution:
emotion
angry      1655
happy      1655
sad        1655
neutral    1279
Name: count, dtype: int64
[INFO] 📂 Dataset distribution:
dataset
CREMA-D    4900
RAVDESS    1344
Name: count, dtype: int64
[INFO] 📁 Saved to: /content/drive/MyDrive/capstone_data/combined_clean_metadata.csv
