In [1]:
# ! pip install tensorflow-directml-plugin
# ! pip install librosa
# ! pip uninstall numpy
# ! pip install numpy==1.26.4

In [2]:
import os
import glob
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [3]:
def extract_label(file_path):
    """
    Map an audio file's name to its emotion label.

    The emotion is encoded by the second-to-last character of the filename
    stem (directory and extension removed), compared case-insensitively:
        W - anger
        L - boredom
        E - disgust
        A - anxiety
        F - happiness
        T - sadness
        N - neutral

    Returns:
        The emotion name, or 'unknown' (after printing a warning) when the
        letter is not in the table above.

    Raises:
        ValueError: if the stem has fewer than two characters.
    """
    base_name = os.path.basename(file_path)
    stem, _ = os.path.splitext(base_name)
    if len(stem) < 2:
        raise ValueError(f"Filename '{base_name}' is too short to extract label.")

    # The emotion code is the penultimate character of the stem
    emotion_code = stem[-2].upper()
    emotions_by_code = dict(
        E='disgust',
        L='boredom',
        W='anger',
        A='anxiety',
        F='happiness',
        T='sadness',
        N='neutral',
    )
    if emotion_code in emotions_by_code:
        return emotions_by_code[emotion_code]

    print(f"Warning: Unknown sentiment for file {base_name} with letter '{emotion_code}'")
    return 'unknown'

In [4]:
def load_data(dataset_path, sr=22050, duration=3, n_mels=128):
    """
    Loads WAV files from the given directory, converts each to a log-scaled
    mel-spectrogram, and extracts labels based on the filename.

    Files are processed in sorted order so that the returned arrays (and any
    seeded split performed on them) are the same on every machine; plain
    glob order depends on the filesystem.

    Parameters:
      - dataset_path: path to the directory containing .wav files
      - sr: sampling rate audio is loaded at
      - duration: length (in seconds, may be fractional) to which each audio
        clip is truncated or zero-padded
      - n_mels: number of mel bands for the spectrogram

    Returns:
      - X: numpy array with one dB-scaled mel-spectrogram per successfully
        processed file (empty when no file could be processed)
      - y: numpy array of the matching sentiment labels

    Files that fail to load or process are reported on stdout and skipped.
    """
    # Every clip is forced to exactly this many samples; int() keeps np.pad
    # happy when `duration` is fractional.
    target_len = int(round(sr * duration))

    files = sorted(glob.glob(os.path.join(dataset_path, '*.wav')))
    print(f"Found {len(files)} audio files.")

    X = []
    y = []
    for file in files:
        try:
            label = extract_label(file)
            # Load at most `duration` seconds of audio
            audio, _ = librosa.load(file, sr=sr, duration=duration)
            # Truncate defensively, then zero-pad short clips, so all
            # spectrograms share one shape and stack into a regular array.
            audio = audio[:target_len]
            if len(audio) < target_len:
                audio = np.pad(audio, (0, target_len - len(audio)), 'constant')
            # Compute the mel-spectrogram
            mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels)
            # Convert to log scale (dB), relative to each clip's own peak
            mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
            X.append(mel_spec)
            y.append(label)
        except Exception as e:
            # Best-effort: one bad file must not abort the whole load
            print(f"Error processing {file}: {e}")
    X = np.array(X)
    y = np.array(y)
    return X, y


In [5]:
def preprocess_labels(y):
    """
    Turn string labels into one-hot vectors.

    Returns a tuple of (one-hot encoded labels, fitted LabelEncoder); the
    encoder is handed back so callers can read `classes_` or invert the
    encoding later.
    """
    encoder = LabelEncoder()
    class_indices = encoder.fit_transform(y)
    return to_categorical(class_indices), encoder

In [None]:
def build_model(input_shape, num_classes):
    """
    Assemble the (uncompiled) CNN classifier.

    Layout: three convolutional stages (2x64, 2x128, 1x256 filters, all 3x3
    with 'same' padding and ReLU), each followed by batch normalization, 2x2
    max pooling and progressively stronger dropout (0.2 / 0.3 / 0.4); then a
    flattened 512-unit dense head with batch normalization and 0.5 dropout,
    ending in a softmax over `num_classes`.
    """
    layer_stack = [
        # Stage 1: two 64-filter convolutions
        Conv2D(64, (3, 3), padding='same', activation='relu', input_shape=input_shape),
        Conv2D(64, (3, 3), padding='same', activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.2),

        # Stage 2: two 128-filter convolutions
        Conv2D(128, (3, 3), padding='same', activation='relu'),
        Conv2D(128, (3, 3), padding='same', activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        # Stage 3: a single 256-filter convolution
        Conv2D(256, (3, 3), padding='same', activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.4),

        # Classification head
        Flatten(),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]
    return Sequential(layer_stack)

In [8]:
def scheduler(epoch, lr, step=100, factor=0.5):
    """
    Step-decay learning-rate schedule, usable with LearningRateScheduler.

    Parameters:
      - epoch: zero-based epoch index
      - lr: learning rate currently in effect
      - step: decay interval in epochs (default 100)
      - factor: multiplier applied at each decay point (default 0.5, i.e. halve)

    Returns:
      - `lr * factor` when `epoch` is a positive multiple of `step`,
        otherwise `lr` unchanged (epoch 0 never decays).

    Raises:
      - ValueError: if `step` is smaller than 1.
    """
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step}")
    # With the defaults: halve the learning rate every 100 epochs
    if epoch > 0 and epoch % step == 0:
        return lr * factor
    return lr

In [9]:

def main():
    """
    Run the full pipeline: load the dataset, encode labels, make a stratified
    80/20 train/validation split, build and compile the CNN, train it with
    early stopping and learning-rate callbacks, then print the validation
    loss and accuracy.

    Raises:
      - ValueError: if no audio clip could be loaded from the dataset path.
    """
    # One seed for Python, NumPy and TensorFlow so weight initialisation,
    # shuffling and the split below are repeatable.
    seed = 42
    tf.keras.utils.set_random_seed(seed)

    # Built with os.path.join: the literal 'EmoDB\wav' relies on the invalid
    # escape sequence '\w' and only works on Windows.
    dataset_path = os.path.join('EmoDB', 'wav')  # Update with your dataset path
    X, y = load_data(dataset_path)
    if len(X) == 0:
        # Fail early with a clear message instead of an obscure error later
        raise ValueError(f"No audio data could be loaded from '{dataset_path}'.")
    print("Shape before expanding dims:", X.shape)
    # Expand dims to add a channel dimension for CNN input
    X = X[..., np.newaxis]
    print("Shape after expanding dims:", X.shape)

    y_onehot, le = preprocess_labels(y)
    print("Detected labels:", le.classes_)

    # Stratified and shuffled train-test split (stratify needs class indices,
    # not one-hot rows)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_onehot, test_size=0.2, random_state=seed, shuffle=True,
        stratify=np.argmax(y_onehot, axis=1)
    )

    input_shape = X_train.shape[1:]
    num_classes = y_onehot.shape[1]
    model = build_model(input_shape, num_classes)

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    # Define callbacks
    early_stop = EarlyStopping(monitor='val_loss', patience=200, restore_best_weights=True)
    lr_scheduler = LearningRateScheduler(scheduler)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=100, min_lr=1e-6)

    epochs = 2000
    batch_size = 32
    model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        callbacks=[early_stop, lr_scheduler, reduce_lr]
    )

    loss, acc = model.evaluate(X_val, y_val)
    print(f"Validation Loss: {loss:.4f}, Validation Accuracy: {acc:.4f}")

    # model.save('sentiment_cnn_model.h5')
    # print("Model saved as 'sentiment_cnn_model.h5'")


In [10]:
# Entry point: run the whole load/train/evaluate pipeline when this cell
# (or the file, as a script) is executed directly.
if __name__ == '__main__':
    main()

Found 535 audio files.
Shape before expanding dims: (535, 128, 130)
Shape after expanding dims: (535, 128, 130, 1)
Detected labels: ['anger' 'anxiety' 'boredom' 'disgust' 'happiness' 'neutral' 'sadness']
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 128, 130, 16)      160       
                                                                 
 batch_normalization (BatchN  (None, 128, 130, 16)     64        
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 64, 65, 16)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 64, 65, 16)        0         
                                                  