<a href="https://colab.research.google.com/github/Baah134/Baah134/blob/main/SER_CARINE/Paper_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
from tqdm import tqdm

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Two input paths (Your manual split folders)
PATH_A = "/content/drive/MyDrive/DeepLearning/External/RAVDESS Emotional Speech Audio/audio_speech_actors_01-24/"
PATH_B = "/content/drive/MyDrive/DeepLearning/External/RAVDESS Emotional Speech Audio/Test/"

OUTPUT_PATH = "dual_ravdess_processed_data/"

# RAVDESS Mapping (Merging Neutral 01 + Calm 02)
# Classes: 0=Neutral, 1=Happy, 2=Sad, 3=Angry, 4=Fear, 5=Disgust, 6=Surprise
EMOTION_MAP = {
    1: 0, 2: 0,
    3: 1, 4: 2, 5: 3, 6: 4, 7: 5, 8: 6
}

# Technical Specs (Li et al. 2022)
SAMPLE_RATE = 16000
DURATION = 5.0 # Fixed duration to get ~504 frames
TOTAL_SAMPLES = int(SAMPLE_RATE * DURATION) # 80,000

# STFT Specs
N_FFT = 512
WIN_LENGTH = 400 # 25ms
HOP_LENGTH = 160 # 10ms (This yields 500-504 frames for 5s)
N_MELS = 60      # Paper Table 1
FMAX = 8000      # Paper Comparison Result

# ==========================================
# 2. CUSTOM IMEL MATH (The Innovation)
# ==========================================
def hz_to_imel(freq):
    return 2595 * np.exp(1 + freq / 700.0)

def imel_to_hz(imel):
    return 700 * (np.log(imel / 2595.0) - 1)

def imel_filter_bank(sr, n_fft, n_mels=60, fmin=0.0, fmax=8000):
    """Generates the Inverse-Mel Triangular Filter Bank"""
    imel_min = hz_to_imel(fmin)
    imel_max = hz_to_imel(fmax)

    # Create evenly spaced points in IMel scale
    imel_points = np.linspace(imel_min, imel_max, n_mels + 2)
    hz_points = imel_to_hz(imel_points)

    # Convert to FFT bins
    bin_points = np.floor((n_fft + 1) * hz_points / sr).astype(int)

    filters = np.zeros((n_mels, 1 + n_fft // 2))

    for i in range(n_mels):
        f_m_minus = bin_points[i]
        f_m = bin_points[i+1]
        f_m_plus = bin_points[i+2]

        for k in range(f_m_minus, f_m):
            filters[i, k] = (k - f_m_minus) / (f_m - f_m_minus)
        for k in range(f_m, f_m_plus):
            filters[i, k] = (f_m_plus - k) / (f_m_plus - f_m)

    return filters

# ==========================================
# 3. FEATURE EXTRACTOR
# ==========================================
def extract_dual_features(audio_path):
    try:
        # 1. Load & Pad
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        if len(y) < TOTAL_SAMPLES:
            y = np.pad(y, (0, TOTAL_SAMPLES - len(y)))
        else:
            y = y[:TOTAL_SAMPLES]

        # 2. Pre-emphasis
        # y = lfilter([1, -0.97], [1], y) # Optional, often handled by Mel filter design

        # 3. Power Spectrum
        D = np.abs(librosa.stft(y, n_fft=N_FFT, win_length=WIN_LENGTH, hop_length=HOP_LENGTH))**2

        # 4. Channel A: Mel Spectrogram
        mel_filters = librosa.filters.mel(sr=sr, n_fft=N_FFT, n_mels=N_MELS, fmax=FMAX)
        mel_spec = np.dot(mel_filters, D)
        log_mel = librosa.power_to_db(mel_spec, ref=np.max)

        # 5. Channel B: IMel Spectrogram
        imel_filters = imel_filter_bank(sr, N_FFT, N_MELS, fmax=FMAX)
        imel_spec = np.dot(imel_filters, D)
        log_imel = librosa.power_to_db(imel_spec, ref=np.max)

        # Shape Check: Should be (60, ~500)
        # We need strict shape for CNN. Let's crop/pad to 504 width (Table 1)
        TARGET_WIDTH = 504

        def fix_width(spec):
            if spec.shape[1] > TARGET_WIDTH:
                return spec[:, :TARGET_WIDTH]
            else:
                return np.pad(spec, ((0,0), (0, TARGET_WIDTH - spec.shape[1])))

        log_mel = fix_width(log_mel)
        log_imel = fix_width(log_imel)

        # Add Channel Dim: (60, 504, 1)
        return log_mel[..., np.newaxis], log_imel[..., np.newaxis]

    except Exception as e:
        print(f"Error: {e}")
        return None, None

# ==========================================
# 4. PROCESSING LOOP (MERGING)
# ==========================================
def process_ravdess_unified():
    # Master Lists
    X_mel_all = []
    X_imel_all = []
    Y_all = []
    S_all = [] # Speakers

    paths_to_scan = [PATH_A, PATH_B]

    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)

    print("Starting Dual-Channel Extraction...")

    # Iterate over both directories (Train Folder AND Test Folder)
    for root_path in paths_to_scan:
        if not os.path.exists(root_path):
            print(f"Skipping missing path: {root_path}")
            continue

        # Get Actor Folders
        actor_folders = [d for d in os.listdir(root_path) if os.path.isdir(os.path.join(root_path, d))]

        for actor in tqdm(actor_folders, desc=f"Scanning {os.path.basename(root_path)}"):
            actor_path = os.path.join(root_path, actor)
            files = [f for f in os.listdir(actor_path) if f.endswith('.wav')]

            for file_name in files:
                # Parse Filename: 03-01-06-01-02-01-24.wav
                parts = file_name.split('.')[0].split('-')
                if len(parts) < 7: continue

                emotion_code = int(parts[2])
                speaker_id = parts[6] # '24'

                if emotion_code not in EMOTION_MAP: continue
                label = EMOTION_MAP[emotion_code]

                # Extract
                full_path = os.path.join(actor_path, file_name)
                mel, imel = extract_dual_features(full_path)

                if mel is not None:
                    X_mel_all.append(mel)
                    X_imel_all.append(imel)
                    Y_all.append(label)
                    S_all.append(speaker_id)

    # Convert to Arrays
    print("\nConverting to Numpy Arrays...")
    X_mel_all = np.array(X_mel_all)
    X_imel_all = np.array(X_imel_all)
    Y_all = np.array(Y_all)
    S_all = np.array(S_all)

    # Clean NaNs
    X_mel_all = np.nan_to_num(X_mel_all)
    X_imel_all = np.nan_to_num(X_imel_all)

    print(f"\nFinal Dataset Stats:")
    print(f"Mel Shape:  {X_mel_all.shape}")
    print(f"IMel Shape: {X_imel_all.shape}")
    print(f"Labels:     {Y_all.shape}")

    # Save
    print(f"Saving to {OUTPUT_PATH}...")
    np.save(os.path.join(OUTPUT_PATH, "X_mel_all.npy"), X_mel_all)
    np.save(os.path.join(OUTPUT_PATH, "X_imel_all.npy"), X_imel_all)
    np.save(os.path.join(OUTPUT_PATH, "Y_all.npy"), Y_all)
    np.save(os.path.join(OUTPUT_PATH, "S_all.npy"), S_all)
    print("Done.")

if __name__ == "__main__":
    process_ravdess_unified()

Starting Dual-Channel Extraction...


Scanning : 100%|██████████| 22/22 [14:35<00:00, 39.79s/it]
Scanning : 100%|██████████| 2/2 [01:16<00:00, 38.24s/it]



Converting to Numpy Arrays...

Final Dataset Stats:
Mel Shape:  (1440, 60, 504, 1)
IMel Shape: (1440, 60, 504, 1)
Labels:     (1440,)
Saving to dual_ravdess_processed_data/...
Done.


In [5]:
!zip -r dual_ravdess_processed_data.zip dual_ravdess_processed_data/

  adding: dual_ravdess_processed_data/ (stored 0%)
  adding: dual_ravdess_processed_data/X_mel_all.npy (deflated 58%)
  adding: dual_ravdess_processed_data/Y_all.npy (deflated 91%)
  adding: dual_ravdess_processed_data/S_all.npy (deflated 98%)
  adding: dual_ravdess_processed_data/X_imel_all.npy (deflated 75%)


In [None]:
!unzip -q dual_ravdess_processed_data.zip -d ./