<a href="https://colab.research.google.com/github/DGautam11/Audio-Emotion-Recognition/blob/main/notebooks/01_data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Audio Data ETL Pipeline
*End-to-end pipeline for ingesting raw audio, normalizing metadata, and serializing tensors for Wav2Vec2 training.*


## Environment Configuration

In [None]:
%%capture
!pip install datasets transformers


In [None]:

import json
import os

import librosa
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_from_disk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import Wav2Vec2Processor

# google.colab only exists on Colab. Guard the import so this cell does not
# fail on a local machine; the configuration cell below picks local paths
# when the package is missing.
try:
    from google.colab import drive
except ImportError:
    drive = None


In [None]:


# Pick storage locations: Google Drive when running on Colab, otherwise
# folders that sit next to the notebooks directory.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Running on Google Colab")
    IS_COLAB, BASE_PATH, OUTPUT_PATH = (
        True,
        "/content/drive/MyDrive/Datasets/",
        "/content/drive/MyDrive/wav2vec2-processed-data/",
    )
except ImportError:
    print("Running Locally")
    # Assumes a 'datasets' folder next to the notebooks folder
    IS_COLAB, BASE_PATH, OUTPUT_PATH = (
        False,
        "../datasets/",
        "../wav2vec2-processed-data/",
    )

# The output directory must exist before anything is written into it
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Per-dataset source directories, all derived from BASE_PATH
# (adjust the sub-folder names if your copy is laid out differently)
RAVDESS_PATH, CREMA_PATH, TESS_PATH, SAVEE_PATH = (
    os.path.join(BASE_PATH, *parts)
    for parts in (
        ("Ravdess", "audio_speech_actors_01-24"),
        ("Crema",),
        ("Tess",),
        ("Savee",),
    )
)

print(f" Looking for data in: {BASE_PATH}")
print(f" Processed data will be saved to: {OUTPUT_PATH}")

## 1. PHASE 1: EXTRACTION


### 1.1 Metadata Aggregation (Emotion Dictionaries)

*Different datasets use different labeling schemes (e.g., RAVDESS uses "01", CREMA uses "ANG"). We map all of them to a unified 8-emotion schema (neutral, calm, happy, sad, angry, fearful, disgust, surprised).*

In [None]:


# RAVDESS: two-digit code string -> unified emotion name
RAVDESS_EMOTIONS = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

# CREMA: three-letter code -> unified emotion name
CREMA_EMOTIONS = {
    "SAD": "sad",
    "ANG": "angry",
    "DIS": "disgust",
    "FEA": "fearful",
    "HAP": "happy",
    "NEU": "neutral",
}

# SAVEE: one- or two-letter code -> unified emotion name
SAVEE_EMOTIONS = {
    "a": "angry",
    "d": "disgust",
    "f": "fearful",
    "h": "happy",
    "n": "neutral",
    "sa": "sad",
    "su": "surprised",
}

### 1.2 Source Traversal Functions
*Logic: Walk through directory trees, extract file paths, and decode filenames into labels.*

In [None]:
# Create functions to process each dataset

def process_ravdess(path, emotion_map=None):
    """Collect RAVDESS audio paths together with their emotion labels.

    ``path`` is expected to contain one sub-folder per actor. Inside each
    folder, every ``.wav`` file has a dash-separated name whose third field
    is the emotion code.

    Args:
        path: Root directory that holds the actor folders.
        emotion_map: Optional mapping from emotion code to emotion name.
            Defaults to the module-level ``RAVDESS_EMOTIONS``.

    Returns:
        pandas.DataFrame with ``file_path`` and ``emotion`` columns. Both
        columns are present even when no audio file is found.

    Raises:
        KeyError: If a file's emotion code is missing from the mapping.
        IndexError: If a ``.wav`` name has fewer than three dash-separated
            fields.
    """
    if emotion_map is None:
        emotion_map = RAVDESS_EMOTIONS

    records = []
    # sorted() makes the row order independent of filesystem listing order.
    for actor_folder in sorted(os.listdir(path)):
        actor_path = os.path.join(path, actor_folder)
        # Stray files next to the actor folders would make os.listdir fail.
        if not os.path.isdir(actor_path):
            continue
        for filename in sorted(os.listdir(actor_path)):
            if not filename.endswith(".wav"):
                continue
            emotion_code = filename.split("-")[2]
            records.append({
                "file_path": os.path.join(actor_path, filename),
                "emotion": emotion_map[emotion_code],
            })
    return pd.DataFrame(records, columns=["file_path", "emotion"])

def process_crema(path, emotion_map=None):
    """Collect CREMA audio paths together with their emotion labels.

    Every ``.wav`` file directly inside ``path`` is expected to have an
    underscore-separated name whose third field is the emotion code.

    Args:
        path: Directory that holds the ``.wav`` files.
        emotion_map: Optional mapping from emotion code to emotion name.
            Defaults to the module-level ``CREMA_EMOTIONS``.

    Returns:
        pandas.DataFrame with ``file_path`` and ``emotion`` columns. Both
        columns are present even when no audio file is found.

    Raises:
        KeyError: If a file's emotion code is missing from the mapping.
        IndexError: If a ``.wav`` name has fewer than three
            underscore-separated fields.
    """
    if emotion_map is None:
        emotion_map = CREMA_EMOTIONS

    # sorted() makes the row order independent of filesystem listing order.
    records = [
        {
            "file_path": os.path.join(path, filename),
            "emotion": emotion_map[filename.split("_")[2]],
        }
        for filename in sorted(os.listdir(path))
        if filename.endswith(".wav")
    ]
    return pd.DataFrame(records, columns=["file_path", "emotion"])

def process_tess(path):
    """Collect TESS audio paths together with their emotion labels.

    ``path`` is expected to contain one sub-folder per emotion. The label is
    the lower-cased text after the last underscore in the folder name, with
    'fear' normalised to 'fearful' and anything containing 'surprise'
    normalised to 'surprised'.

    Args:
        path: Root directory that holds the emotion folders.

    Returns:
        pandas.DataFrame with ``file_path`` and ``emotion`` columns. Both
        columns are present even when no audio file is found.
    """
    records = []
    # sorted() makes the row order independent of filesystem listing order.
    for emotion_folder in sorted(os.listdir(path)):
        folder_path = os.path.join(path, emotion_folder)
        # Ignore stray files that sit next to the emotion folders.
        if not os.path.isdir(folder_path):
            continue

        emotion = emotion_folder.split("_")[-1].lower()
        # Align the folder spellings with the unified label names.
        if emotion == 'fear':
            emotion = 'fearful'
        elif 'surprise' in emotion:
            emotion = 'surprised'

        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith(".wav"):
                records.append({
                    "file_path": os.path.join(folder_path, filename),
                    "emotion": emotion,
                })
    return pd.DataFrame(records, columns=["file_path", "emotion"])


def process_savee(path, emotion_map=None):
    """Collect SAVEE audio paths together with their emotion labels.

    Every ``.wav`` file directly inside ``path`` is expected to have an
    underscore in its name; the emotion code is read from the start of the
    part after the first underscore.

    Args:
        path: Directory that holds the ``.wav`` files.
        emotion_map: Optional mapping from emotion code to emotion name.
            Defaults to the module-level ``SAVEE_EMOTIONS``.

    Returns:
        pandas.DataFrame with ``file_path`` and ``emotion`` columns. Both
        columns are present even when no audio file is found.

    Raises:
        KeyError: If a file's emotion code is missing from the mapping.
        IndexError: If a ``.wav`` name has no underscore-separated second part.
    """
    if emotion_map is None:
        emotion_map = SAVEE_EMOTIONS

    records = []
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".wav"):
            continue
        token = filename.split('_')[1]
        # Codes starting with 's' are two letters long ('sa', 'su');
        # every other code is a single letter.
        emotion_code = token[:2] if token[0] == 's' else token[0]
        records.append({
            "file_path": os.path.join(path, filename),
            "emotion": emotion_map[emotion_code],
        })
    return pd.DataFrame(records, columns=["file_path", "emotion"])





In [None]:
# Execution: run each dataset-specific parser against its source directory
ravdess_df, crema_df, tess_df, savee_df = (
    process_ravdess(RAVDESS_PATH),
    process_crema(CREMA_PATH),
    process_tess(TESS_PATH),
    process_savee(SAVEE_PATH),
)

In [None]:
# Aggregation: stack the four per-dataset metadata tables into one frame
# with a fresh 0..n-1 index
combined_df = pd.concat(
    objs=[ravdess_df, crema_df, tess_df, savee_df],
    ignore_index=True,
)

In [None]:
# Validation: keep only the rows whose audio file actually exists on disk
file_exists = combined_df["file_path"].map(os.path.exists).astype(bool)
combined_df = combined_df[file_exists]

# Shuffle with a fixed seed so the row order, and therefore the seeded
# train/test split built from it, is the same on every run.
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Audio Files: {len(combined_df)}")
print(combined_df['emotion'].value_counts())

In [None]:
# "Calm" (192 samples) is too small on its own and is similar to "Neutral",
# so the two are folded into a single class.
# Direction matters: rows labelled 'neutral' are relabelled 'calm', so the
# merged class is named 'calm'.
combined_df['emotion'] = combined_df['emotion'].replace({'neutral': 'calm'})
print(combined_df['emotion'].value_counts())


## 2. PHASE 2: TRANSFORMATION (Process)
 *Normalizing labels and processing audio signals.*

### 2.1 Label Encoding

In [None]:
# Turn emotion names into integer class ids
label_encoder = LabelEncoder()
combined_df['label'] = label_encoder.fit_transform(combined_df['emotion'])

# Lookup tables in both directions; id2label uses string keys
id2label = dict((str(idx), name) for idx, name in enumerate(label_encoder.classes_))
label2id = dict((name, idx) for idx, name in enumerate(label_encoder.classes_))

# Write both tables into one JSON file in the output directory
label_file_path = os.path.join(OUTPUT_PATH, "label_mapping.json")

with open(label_file_path, "w") as f:
    json.dump({"id2label": id2label, "label2id": label2id}, f)

print(f" Label mapping saved to: {label_file_path}")
print(f"   Mapping: {id2label}")

In [None]:
# Keep a CSV copy of the labelled metadata table (without the index column)
# so the exact rows used can be inspected later
combined_df.to_csv(
    os.path.join(OUTPUT_PATH, 'processed_audio_data.csv'),
    index=False,
)

### 2.2 Train and Test Split

In [None]:
# Stratified 80/20 split: each emotion keeps the same share in both sets,
# and the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    combined_df,
    test_size=0.2,
    stratify=combined_df['label'],
    random_state=42,
)

# preserve_index=False: after the split the frames carry a non-contiguous
# index, which from_pandas would otherwise store in the dataset as an extra
# '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

### 2.3 Audio Signal Processing

In [None]:
# Load the pretrained processor for the 'facebook/wav2vec2-base-960h' checkpoint.
# preprocess() below calls it on each loaded waveform to obtain `input_values`.
processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')

In [None]:
def preprocess(audio, sampling_rate=16000, max_length=32000):
    """Load one audio file and convert it into fixed-length model inputs.

    Intended to be passed to ``Dataset.map``. ``map`` does not drop a row
    when the mapped function returns ``None`` (``None`` means "nothing to
    update"), so a failure is reported by returning a dict whose
    ``input_values`` is ``None``. The caller can then remove those rows with
    a filter on that column.

    Args:
        audio: Mapping with ``file_path`` (path to the audio file) and
            ``label`` (integer class id).
        sampling_rate: Rate in Hz the audio is resampled to when loaded and
            the rate reported to the processor.
        max_length: Number of samples each example is truncated or padded to.

    Returns:
        dict with ``input_values`` (tensor whose last dimension is
        ``max_length``) and ``labels`` (scalar tensor). Both are ``None``
        when the file could not be processed.
    """
    # Both values are None so that every non-null entry of a column keeps a
    # single type (tensor) when the dataset writer converts a batch of rows.
    failed = {"input_values": None, "labels": None}

    try:
        speech_array, _ = librosa.load(audio['file_path'], sr=sampling_rate, mono=True)

        # Feature extraction: truncate long clips, pad short ones
        inputs = processor(
            speech_array,
            sampling_rate=sampling_rate,
            return_tensors='pt',
            truncation=True,
            max_length=max_length,
            padding="max_length"
        )
        input_values = inputs.input_values[0]

        # If the processor did not produce the expected length, discard the file.
        if input_values.shape[-1] != max_length:
            return failed

        return {
            "input_values": input_values,
            "labels": torch.tensor(audio["label"])
        }
    except Exception as e:
        # .get() so a missing key cannot raise again inside the handler
        print(f"Error processing {audio.get('file_path')}: {e}")
        return failed

In [None]:

# *Applying signal processing and filtering out corrupt audio files.*

def _has_input_values(example):
    """Return True when the row carries non-null ``input_values``."""
    return example is not None and example.get("input_values") is not None

# Training split: transform every row, then keep only rows with input values
training_set = train_dataset.map(preprocess).filter(_has_input_values)
print(f" {len(training_set)} valid samples.")

# Test split: same two steps
test_set = test_dataset.map(preprocess).filter(_has_input_values)
print(f" {len(test_set)} valid samples.")



## 3. PHASE 3: FORMATTING & SERIALIZATION (Load)
 *Setting PyTorch format and saving the clean dataset to disk.*

In [None]:

# Expose only the model inputs and labels, as torch tensors, so the
# PyTorch DataLoader in the next notebook can consume the data directly
for split_data in (training_set, test_set):
    split_data.set_format(type="torch", columns=["input_values", "labels"])

# Write both splits into the output directory
print(f"Saving processed data to: {OUTPUT_PATH}")

for split_name, split_data in (("train_dataset", training_set), ("test_dataset", test_set)):
    split_data.save_to_disk(os.path.join(OUTPUT_PATH, split_name))

print("ETL Pipeline Complete.  Ready for Notebook 02_wav2vec_finetuning.")

In [None]:
# Confirm that the expected artifacts were written to the output directory
print(f"📂 Checking Output Path: {OUTPUT_PATH}")
print(f"   - Train Data Exists? {os.path.exists(os.path.join(OUTPUT_PATH, 'train_dataset'))}")
print(f"   - Test Data Exists?  {os.path.exists(os.path.join(OUTPUT_PATH, 'test_dataset'))}")
print(f"   - Mapping Exists?    {os.path.exists(os.path.join(OUTPUT_PATH, 'label_mapping.json'))}")

# Read the mapping back from disk and show what was actually saved
with open(os.path.join(OUTPUT_PATH, 'label_mapping.json'), 'r') as f:
    saved_mapping = json.load(f)

print(f"\n🔗 Final Label Mapping ({len(saved_mapping['id2label'])} classes):")
print(saved_mapping['id2label'])