<a href="https://colab.research.google.com/github/Baah134/Baah134/blob/main/SER_CARINE/Paper_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Notebook setup cell: install dependencies and mount Google Drive.
# The leading "!" is an IPython shell escape, so this line only works inside a notebook runtime.
!pip install librosa numpy scipy PyWavelets
# Dataset directory on the mounted Drive (presumably the EMoDB .wav files; verify the folder contents).
DATASET_PATH = "/content/drive/MyDrive/DeepLearning/External/EMoDB/"
# google.colab is only importable on Colab; mounting exposes Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import numpy as np
import librosa
import scipy.stats
import pywt
from scipy.signal import lfilter

# ==========================================
# 1. SETUP & CONFIGURATION
# ==========================================
# Path from your code snippet
# Input directory (scanned for .wav files) and output directory (for the .npy arrays).
DATASET_PATH = "/content/drive/MyDrive/DeepLearning/External/EMoDB/"
OUTPUT_PATH = "processed_data/"

# The seven emotion classes; list position defines the integer label.
CLASSES = [
    'Angry',
    'Boredom',
    'Disgust',
    'Anxiety',
    'Happiness',
    'Sadness',
    'Neutral',
]

# Single-letter emotion code found in the file name -> emotion name.
# 'A' is kept as 'Anxiety' here (the same class is often called 'Fear' in the literature).
CODE_TO_EMOTION = dict(
    W='Angry',
    L='Boredom',
    E='Disgust',
    A='Anxiety',
    F='Happiness',
    T='Sadness',
    N='Neutral',
)

# Emotion name -> integer class index (0..6), following the order of CLASSES.
EMOTION_TO_INT = dict(zip(CLASSES, range(len(CLASSES))))

# ==========================================
# 2. FEATURE EXTRACTOR (BHANGALE ET AL.)
# ==========================================
def extract_bhangale_features(audio_path):
    """
    Build the fixed-length 715-dim feature vector for one audio file.

    The layout is intended to follow Bhangale et al. (2023). The signal is
    loaded at 16 kHz, padded/truncated to 4 s, pre-emphasised, and described
    by these groups, concatenated in this order:

        39   mean of MFCC, delta and delta-delta (13 each)
         1   mean RMS energy
       199   zero-crossing rate per frame
       199   spectral centroid per frame
        13   LPC coefficients (order 13, leading coefficient dropped)
        56   wavelet-packet statistics (8 level-3 nodes x 7 statistics)
         1   mean spectral roll-off
       199   kurtosis of each STFT magnitude frame
         3   jitter, shimmer (placeholder 0.0), mean pitch
         5   formants (placeholder zeros)

    Args:
        audio_path: Path of the audio file, handed to ``librosa.load``.

    Returns:
        1-D numpy array of length 715. NaN/Inf values are replaced by 0.0.
    """
    # Load audio, resampled to 16 kHz.
    y, sr = librosa.load(audio_path, sr=16000)

    # Standardize to 4 seconds (64000 samples): zero-pad short clips, cut long ones.
    target_length = 64000
    y = _fix_length(y, target_length)

    # Pre-emphasis filter: y[n] - 0.97 * y[n-1].
    y = lfilter([1, -0.97], [1], y)

    # Frame settings: 640 samples = 40 ms at 16 kHz, hop of 320 = 50% overlap.
    n_fft = 640
    hop_length = 320
    n_frames = 199

    # --- A. TIME-SERIES FEATURES (length 199 each) ---
    zcr = _fix_length(
        librosa.feature.zero_crossing_rate(y, frame_length=n_fft, hop_length=hop_length)[0],
        n_frames,
    )
    centroid = _fix_length(
        librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length)[0],
        n_frames,
    )
    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
    # Constant (e.g. zero-padded) frames have zero variance, so kurtosis can be NaN there;
    # those entries are cleaned up by the final nan_to_num below.
    kurtosis = _fix_length(scipy.stats.kurtosis(S, axis=0), n_frames)

    # --- B. STATIC FEATURES ---
    # MFCC (39): 13 coefficients plus first and second deltas, averaged over time.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=n_fft, hop_length=hop_length)
    mfcc_combined = np.concatenate(
        (mfcc, librosa.feature.delta(mfcc), librosa.feature.delta(mfcc, order=2)),
        axis=0,
    )
    mfcc_global = np.mean(mfcc_combined, axis=1)

    # Scalars: mean RMS energy and mean spectral roll-off.
    rms_global = np.array(
        [np.mean(librosa.feature.rms(y=y, frame_length=n_fft, hop_length=hop_length)[0])]
    )
    rolloff_global = np.array(
        [np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length)[0])]
    )

    # "LPCC" slot (13). NOTE: these are the raw LPC coefficients with the leading
    # coefficient dropped; no LPC -> cepstrum conversion is applied here.
    lpc_coeffs = librosa.lpc(y, order=13)
    lpcc_global = _fix_length(lpc_coeffs[1:], 13)

    # Wavelet Packet Transform (56): 7 statistics for each level-3 node.
    wp = pywt.WaveletPacket(data=y, wavelet='db2', mode='symmetric', maxlevel=3)
    wpt_features = []
    for node in wp.get_level(3, 'natural'):
        d = node.data
        wpt_features.extend([
            np.mean(d),
            np.median(d),
            np.std(d),
            np.var(d),
            scipy.stats.skew(d),
            scipy.stats.kurtosis(d),
            np.sum(d**2),
        ])
    wpt_global = np.array(wpt_features)

    # Voice Quality (3) & Formants (5)
    # pyin must be given the real sample rate; its default (22050 Hz) would
    # mis-scale every pitch estimate for this 16 kHz signal.
    f0, _, _ = librosa.pyin(
        y,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        sr=sr,
    )
    voiced_f0 = f0[~np.isnan(f0)]  # pyin marks unvoiced frames with NaN
    pitch_val = np.mean(voiced_f0) if len(voiced_f0) > 0 else 0.0
    # Jitter needs at least two voiced frames; otherwise the diff is empty and its mean is NaN.
    if len(voiced_f0) > 1 and pitch_val > 0:
        jitter = np.mean(np.abs(np.diff(voiced_f0))) / pitch_val
    else:
        jitter = 0.0
    shimmer = 0.0  # Placeholder
    formants_vec = np.zeros(5)  # Placeholder
    vq_features = np.array([jitter, shimmer, pitch_val])

    # Concatenate in the fixed order documented above.
    features = np.concatenate([
        mfcc_global,
        rms_global,
        zcr,
        centroid,
        lpcc_global,
        wpt_global,
        rolloff_global,
        kurtosis,
        vq_features,
        formants_vec,
    ])

    # Replace NaN/Inf from degenerate statistics so they cannot poison training.
    return np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)

def _fix_length(arr, target_len):
    if len(arr) < target_len: return np.pad(arr, (0, target_len - len(arr)))
    return arr[:target_len]

# ==========================================
# 3. MAIN PROCESSING LOOP
# ==========================================
def process_emodb_data():
    """
    Extract features for every .wav file in DATASET_PATH and save them as .npy arrays.

    File names are parsed positionally (example: ``03a01Wa.wav``): characters
    0-1 are the speaker ID and character 5 is the emotion code, which is
    mapped through CODE_TO_EMOTION and EMOTION_TO_INT. Files with an unknown
    code, a wrong feature length, or any processing error are reported with
    ``print`` and skipped.

    Writes three files into OUTPUT_PATH (created if missing):
        X_emodb.npy  features, shape (n_files, 715, 1)
        Y_emodb.npy  integer emotion labels, shape (n_files,)
        S_emodb.npy  speaker ID strings, shape (n_files,)

    Returns:
        None.
    """
    expected_dim = 715

    X_features = []
    Y_labels = []
    S_speakers = []  # Speaker IDs, kept so a speaker-based split can be made later

    print(f"Reading files from: {DATASET_PATH}")
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the saved arrays is reproducible between runs and machines.
    files = sorted(os.listdir(DATASET_PATH))

    for file_name in files:
        file_path = os.path.join(DATASET_PATH, file_name)

        # 1. Filter out folders or non-wav files
        if not os.path.isfile(file_path) or not file_name.endswith('.wav'):
            continue

        # Best-effort per file: one bad recording must not abort the whole run.
        try:
            # 2. Extract info from the file name (EMODB format: 03a01Wa.wav)
            speaker_id = file_name[0:2]
            emotion_code = file_name[5]

            # 3. Validate emotion code
            if emotion_code not in CODE_TO_EMOTION:
                print(f"Skipping {file_name}: Unknown code '{emotion_code}'")
                continue

            emotion_name = CODE_TO_EMOTION[emotion_code]
            label_int = EMOTION_TO_INT[emotion_name]

            # 4. Extract features
            features = extract_bhangale_features(file_path)

            # 5. Store only vectors of the expected length
            if features.shape[0] == expected_dim:
                X_features.append(features)
                Y_labels.append(label_int)
                S_speakers.append(speaker_id)
            else:
                print(f"Error shape {features.shape} in {file_name}")

        except Exception as e:
            print(f"Error processing {file_name}: {e}")

    # ==========================================
    # 4. SAVE ARRAYS
    # ==========================================
    if X_features:
        # Add a trailing channel axis for the 1D CNN: (Batch, 715, 1)
        X = np.array(X_features)[..., np.newaxis]
    else:
        # Keep the documented rank even when nothing was processed.
        X = np.empty((0, expected_dim, 1))
        print(f"Warning: no usable .wav files found in {DATASET_PATH}")
    Y = np.array(Y_labels)
    S = np.array(S_speakers)

    print(f"\n--- DONE ---")
    print(f"Processed: {len(X_features)} files")
    print(f"X Shape: {X.shape}") # Should be (535, 715, 1)
    print(f"Y Shape: {Y.shape}")
    print(f"Speakers Extracted: {len(np.unique(S))}") # Should be 10 for EMODB

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    np.save(os.path.join(OUTPUT_PATH, "X_emodb.npy"), X)
    np.save(os.path.join(OUTPUT_PATH, "Y_emodb.npy"), Y)
    np.save(os.path.join(OUTPUT_PATH, "S_emodb.npy"), S) # Speakers, for a custom split later
    print(f"Saved .npy files to {OUTPUT_PATH}")

# Entry point: run the whole preprocessing when this module's __name__ is "__main__".
if __name__ == "__main__":
    process_emodb_data()

Reading files from: /content/drive/MyDrive/DeepLearning/External/EMoDB/


  kurtosis = _fix_length(scipy.stats.kurtosis(S, axis=0), 199)
