##                               Speech Emotion Recognition

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Activation, MaxPooling2D, Flatten, Dense, Dropout, Bidirectional, LSTM, Attention
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import IPython.display as ipd

In [None]:
import librosa

In [None]:
pip install librosa audiomentations 

In [None]:
pip install soundfile

In [None]:
import audiomentations as A
import soundfile as sf
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


## Using TESS dataframes

In [None]:
## Root folder of the TESS recordings; each sub-folder read below holds the clips of one emotion
tess_path = '/kaggle/input/toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data'

In [None]:
## Walk every folder under tess_path and record, for each file inside it,
## the full file path together with the emotion parsed from the folder name.
tess_file_paths = []
tess_labels = []

for folder in os.listdir(tess_path):
    folder_path = os.path.join(tess_path, folder)
    ## drop the first four characters of the folder name
    ## (e.g. 'OAF_Fear' -> 'Fear') and lowercase the remainder
    label = folder[4:].lower()

    files_in_folder = os.listdir(folder_path)
    ## one path entry and one matching label entry per file
    tess_file_paths.extend(os.path.join(folder_path, name) for name in files_in_folder)
    tess_labels.extend([label] * len(files_in_folder))

In [None]:
## Tabulate the collected TESS file paths next to their emotion labels
tess_data = pd.DataFrame(dict(paths=tess_file_paths, emotions=tess_labels))

## preview the first 5 rows
tess_data.head()

In [None]:


# Augmentation pipeline: Gaussian noise, pitch shift and time stretch, each configured with p=0.5
augment = A.Compose([
    A.AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    A.PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
    A.TimeStretch(min_rate=0.8, max_rate=1.2, p=0.5),
])

# Output folder for the augmented clips (created if missing)
augmented_dir = "augmented_audio"
os.makedirs(augmented_dir, exist_ok=True)

# Paths and labels of the augmented clips that were written successfully
augmented_file_paths = []
augmented_labels = []

for file_path, label in zip(tess_file_paths, tess_labels):
    # sr=None keeps the file's own sampling rate
    audio, sr = librosa.load(file_path, sr=None)

    # One augmented variant per source clip
    augmented_audio = augment(audio, sample_rate=sr)

    # <name>.wav -> augmented_audio/<name>_augmented.wav
    augmented_name = os.path.basename(file_path).replace('.wav', '_augmented.wav')
    augmented_file_path = os.path.join(augmented_dir, augmented_name)
    try:
        sf.write(augmented_file_path, augmented_audio, sr)
    except Exception as e:
        # A failed write is reported and the clip is left out of the lists
        print(f"Error writing {augmented_file_path}: {e}")
    else:
        augmented_file_paths.append(augmented_file_path)
        augmented_labels.append(label)


## Combine Augmented and TESS data

In [None]:
# Original TESS clips first, followed by their augmented copies
all_file_paths = [*tess_file_paths, *augmented_file_paths]
all_labels = [*tess_labels, *augmented_labels]

# One row per clip: file path and emotion label
emotion_data = pd.DataFrame(dict(paths=all_file_paths, emotions=all_labels))

# Preview the first 5 rows of the combined table
print(emotion_data.head())


In [None]:
# Renames applied to the folder-derived labels so that variant spellings
# collapse onto the label names used by the later cells of this notebook.
label_mapping = {
    'disguist': 'disgust', 
    'fear': 'fearful', 
    'pleasant_surprised': 'pleasant_surprise'
}

In [None]:
# Apply the renames; labels that are not keys of label_mapping are left unchanged
emotion_data['emotions'] = emotion_data['emotions'].replace(label_mapping)


In [None]:
# Sanity check after the renaming step
print(emotion_data['emotions'].unique())  # Should show the corrected list of unique emotions


In [None]:


# Bar chart of the number of clips per emotion, ordered by descending count
emotion_counts = emotion_data['emotions'].value_counts()

plt.figure(figsize=(10, 5))
sns.countplot(x=emotion_data['emotions'], order=emotion_counts.index)
plt.xticks(rotation=45)
plt.title("Updated Emotion Counts")
plt.show()


In [None]:
# Function to create waveplot
def create_waveplot(data, sr, e):
    """Plot the time-domain waveform of an audio signal.

    Args:
        data: audio samples, e.g. as returned by ``librosa.load``.
        sr: sampling rate of ``data``.
        e: emotion label; used only in the figure title.
    """
    # `import librosa` alone does not expose the `display` submodule on every
    # librosa release, so import it explicitly before using it.
    import librosa.display

    plt.figure(figsize=(10, 3))
    plt.title('Waveplot for audio with {} emotion'.format(e), size=15)
    librosa.display.waveshow(data, sr=sr)
    plt.show()

# Function to create spectrogram
def create_spectrogram(data, sr, e):
    """Plot a log-frequency, dB-scaled spectrogram of an audio signal.

    Args:
        data: audio samples, e.g. as returned by ``librosa.load``.
        sr: sampling rate of ``data``.
        e: emotion label; used only in the figure title.
    """
    # `import librosa` alone does not expose the `display` submodule on every
    # librosa release, so import it explicitly before using it.
    import librosa.display

    plt.figure(figsize=(10, 3))
    plt.title('Spectrogram for audio with {} emotion'.format(e), size=15)
    # The STFT is complex-valued. Take the magnitude explicitly so that
    # amplitude_to_db receives real amplitudes instead of warning about
    # (and silently discarding) the phase.
    magnitude = np.abs(librosa.stft(data))
    D = librosa.amplitude_to_db(magnitude, ref=np.max)
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.show()

# Take the second clip labelled with the chosen emotion
emotion = 'disgust'
path = emotion_data.loc[emotion_data['emotions'] == emotion, 'paths'].iloc[1]

# Load the clip (no sr given, so librosa's default loading rate applies)
data, sampling_rate = librosa.load(path)

# Waveform and spectrogram, then an inline audio player as the cell output
create_waveplot(data, sampling_rate, emotion)
create_spectrogram(data, sampling_rate, emotion)

ipd.Audio(data, rate=sampling_rate)

In [None]:
# Second 'angry' clip: waveform, spectrogram and inline audio player
emotion = 'angry'
path = emotion_data.loc[emotion_data['emotions'] == emotion, 'paths'].iloc[1]
data, sampling_rate = librosa.load(path)
create_waveplot(data, sampling_rate, emotion)
create_spectrogram(data, sampling_rate, emotion)
ipd.Audio(data, rate=sampling_rate)

In [None]:
# Second 'fearful' clip: waveform, spectrogram and inline audio player
emotion = 'fearful'
path = emotion_data.loc[emotion_data['emotions'] == emotion, 'paths'].iloc[1]
data, sampling_rate = librosa.load(path)
create_waveplot(data, sampling_rate, emotion)
create_spectrogram(data, sampling_rate, emotion)

ipd.Audio(data, rate=sampling_rate)

In [None]:
# Second 'sad' clip: waveform, spectrogram and inline audio player
emotion = 'sad'
path = emotion_data.loc[emotion_data['emotions'] == emotion, 'paths'].iloc[1]
data, sampling_rate = librosa.load(path)
create_waveplot(data, sampling_rate, emotion)
create_spectrogram(data, sampling_rate, emotion)

ipd.Audio(data, rate=sampling_rate)

In [None]:
# Second 'happy' clip: waveform, spectrogram and inline audio player
emotion = 'happy'
path = emotion_data.loc[emotion_data['emotions'] == emotion, 'paths'].iloc[1]
data, sampling_rate = librosa.load(path)
create_waveplot(data, sampling_rate, emotion)
create_spectrogram(data, sampling_rate, emotion)

ipd.Audio(data, rate=sampling_rate)

In [None]:
# Second 'neutral' clip: waveform, spectrogram and inline audio player
emotion = 'neutral'
path = emotion_data.loc[emotion_data['emotions'] == emotion, 'paths'].iloc[1]
data, sampling_rate = librosa.load(path)
create_waveplot(data, sampling_rate, emotion)
create_spectrogram(data, sampling_rate, emotion)

ipd.Audio(data, rate=sampling_rate)

In [None]:
# Second 'pleasant_surprise' clip: waveform, spectrogram and inline audio player
emotion = 'pleasant_surprise'
path = emotion_data.loc[emotion_data['emotions'] == emotion, 'paths'].iloc[1]
data, sampling_rate = librosa.load(path)
create_waveplot(data, sampling_rate, emotion)
create_spectrogram(data, sampling_rate, emotion)

ipd.Audio(data, rate=sampling_rate)

In [None]:
## Summarise every clip as two fixed-length vectors: the averaged
## mel spectrogram and the averaged MFCCs (n_mfcc=40).
mel_features = []
mfcc_features = []

## visit the clips in table order
for audio_path in emotion_data['paths']:
    data, sample_rate = librosa.load(audio_path)

    ## transpose, then average over axis 0 -> one value per mel band
    mel_spec = librosa.feature.melspectrogram(y=data, sr=sample_rate)
    mel_features.append(np.mean(mel_spec.T, axis=0))

    ## same reduction for the MFCC matrix -> one value per coefficient
    mfcc = librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=40)
    mfcc_features.append(np.mean(mfcc.T, axis=0))

In [None]:
## Stack the per-clip vectors into arrays (one row per clip)
mel_features_array = np.asarray(mel_features)
mfcc_features_array = np.asarray(mfcc_features)

## report both shapes
print(f"Shape of MEL features : {mel_features_array.shape}")
print(f"Shape of MFCC features : {mfcc_features_array.shape}")

In [None]:
## Place the MEL and MFCC arrays side by side so each row holds both feature sets of one clip
features = np.hstack((mel_features_array, mfcc_features_array))

## now lets check shape
print(f"Shape of feature data : {features.shape}")

In [None]:
## Wrap the feature matrix in a dataframe (columns are named 0, 1, 2, ...)
df_features = pd.DataFrame(features)
## and attach it column-wise to the path/emotion table, aligned on the row index
data = pd.concat([emotion_data, df_features], axis='columns')
## preview the first 5 rows
data.head()

In [None]:
## The file path is not a model input, so leave that column out
data_new = data.drop('paths', axis=1)
data_new.head()

In [None]:
## X: every column except the label; y: the emotion labels as a NumPy array
X = data_new.drop('emotions', axis=1)
y = data_new['emotions'].to_numpy()

In [None]:
## The emotion labels are strings, so one-hot encode them (one output column per distinct label)
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
## the encoder is given a 2-D input, so present y as a single column
y_as_column = y[:, np.newaxis]
y_encoded = encoder.fit_transform(y_as_column)

## names of the generated one-hot columns
encoder.get_feature_names_out()

In [None]:
## Shape of the one-hot label matrix produced by the encoder
print(f"Shape of y_encoded : {y_encoded.shape}")

In [None]:
## Convert the encoder output to a dense array via .toarray(), then display it
y_encoded = y_encoded.toarray()
y_encoded

In [None]:
## 80/20 shuffled train/test split with a fixed seed
## NOTE(review): the table mixes original clips with augmented copies of the same
## clips, so a random split can put a clip and its copy on opposite sides — confirm this is acceptable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

print(f"Shape of X_train : {X_train.shape}")
print(f"Shape of X_test : {X_test.shape}")
print(f"Shape of y_train : {y_train.shape}")
print(f"Shape of y_test : {y_test.shape}")

In [None]:
## Min-max scale the features; the scaler is fitted on the training split only
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

## apply the training-set scaling to both splits
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape)
print(X_test.shape)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, LSTM, Dense, Dropout, TimeDistributed, Bidirectional
from tensorflow.keras.optimizers import Adam

In [None]:
# Insert a length-1 middle axis: (samples, features) -> (samples, 1, features)
n_train, n_train_features = X_train.shape[0], X_train.shape[1]
X_train = X_train.reshape(n_train, 1, n_train_features)

n_test, n_test_features = X_test.shape[0], X_test.shape[1]
X_test = X_test.reshape(n_test, 1, n_test_features)

In [None]:
pip install tensorflow-addons


## CNN-A-BILSTM MODEL

In [None]:
from tensorflow.keras.layers import Lambda
# NOTE(review): this empty Sequential is never used — `model` is rebound to the functional Model built later in this cell
model = Sequential()
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Activation, MaxPooling2D, Dropout, Flatten, Dense, Bidirectional, LSTM, Lambda
from tensorflow.keras.optimizers import RMSprop

# Define Attention Layer
class AttentionLayer(tf.keras.layers.Layer):
    """Pool a sequence into a single vector using learned per-step weights.

    Assumes a 3-D input (batch, steps, features) — the code indexes the
    last two input dimensions and reduces over axis 1. The output drops
    axis 1.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        feature_dim = input_shape[-1]
        step_count = input_shape[-2]
        # One weight per feature and one bias per step
        self.W = self.add_weight(name="att_weight", shape=(feature_dim,), initializer="normal", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(step_count,), initializer="zeros", trainable=True)
        super().build(input_shape)

    def call(self, x):
        # Score each step: contract the feature axis with W, add the bias, squash with tanh
        scores = tf.nn.tanh(tf.tensordot(x, self.W, axes=1) + self.b)
        # Normalise the scores across axis 1 so they sum to 1
        weights = tf.nn.softmax(scores, axis=1)
        # Weighted sum of the inputs over axis 1
        weighted_inputs = x * tf.expand_dims(weights, -1)
        return tf.reduce_sum(weighted_inputs, axis=1)

# Define Model
# Each sample is fed as a single-channel 2-D map sized from X_train's last two dimensions
n_steps, n_features = X_train.shape[1], X_train.shape[2]
input_layer = Input(shape=(n_steps, n_features, 1))

# Two convolutional stages, each: Conv2D -> BatchNorm -> MaxPool (pool 2 along the last spatial axis) -> Dropout
x = input_layer
for n_filters, kernel_size, drop_rate in ((64, (5, 5), 0.2), (128, (3, 3), 0.3)):
    x = Conv2D(n_filters, kernel_size, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=(1, 2), padding='same')(x)
    x = Dropout(drop_rate)(x)

# Flatten the convolutional output and project it to 256 units
x = Flatten()(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)

# Add a length-1 axis at position 1 so the recurrent layers receive a 3-D tensor
x = Lambda(lambda t: tf.expand_dims(t, axis=1))(x)
x = Bidirectional(LSTM(256, return_sequences=True))(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)

# Collapse axis 1 with the attention layer
x = AttentionLayer()(x)

# Classification head: one softmax output per one-hot column of y_train
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
out = Dense(y_train.shape[1], activation='softmax')(x)

# Assemble and compile with RMSprop and categorical cross-entropy
model = Model(inputs=input_layer, outputs=out)
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(learning_rate=0.0005),
    metrics=['accuracy'],
)


In [None]:
# Print the layer-by-layer summary of the model
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has gone 10 epochs without improving, restoring the best weights seen
# NOTE(review): the model.fit call visible in this notebook does not pass this callback — confirm whether that is intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [None]:
from tensorflow.keras.losses import CategoricalCrossentropy

# Focal loss function
def focal_loss(gamma=2.0, alpha=0.25):
    """Build a focal-loss function usable as a Keras ``loss=`` argument.

    Args:
        gamma: focusing exponent; larger values down-weight samples the
            model already classifies confidently.
        alpha: constant scale factor applied to the loss.

    Returns:
        A function ``loss_fn(y_true, y_pred)`` that returns one loss value
        per sample; Keras applies its own reduction on top of it.
    """
    def loss_fn(y_true, y_pred):
        # Per-sample cross-entropy. The CategoricalCrossentropy() loss object
        # averages over the batch by default, which would make the focal
        # weight below depend on the batch mean instead of on each sample.
        cross_entropy = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
        # For one-hot targets exp(-CE) is the probability given to the true class
        pt = tf.exp(-cross_entropy)
        return alpha * (1 - pt) ** gamma * cross_entropy
    return loss_fn


In [None]:
# Halve the learning rate when val_loss has not improved for 5 epochs, with a floor of 1e-6
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
)

from tensorflow.keras.callbacks import LearningRateScheduler

def clr_schedule(epoch, lr):
    """Return a triangular cyclical learning rate for the given epoch.

    The rate starts at 5e-4 on epoch 0, falls linearly to 1e-5 at epoch 15,
    rises back to 5e-4 at epoch 30, and repeats with that 30-epoch period.

    Args:
        epoch: zero-based epoch index.
        lr: current learning rate; accepted for the callback signature but not used.
    """
    lr_floor, lr_ceiling, half_period = 1e-5, 5e-4, 15
    position_in_cycle = epoch % (2 * half_period)
    # Distance from the mid-cycle trough, in epochs (0 at the trough, half_period at the peaks)
    distance_from_trough = abs(position_in_cycle - half_period)
    return lr_floor + (lr_ceiling - lr_floor) * distance_from_trough / half_period

# Wrap the cyclical schedule as a Keras callback
# NOTE(review): the model.fit call visible in this notebook passes only lr_schedule, not this callback — confirm whether that is intended.
clr_callback = LearningRateScheduler(clr_schedule)


In [None]:
from tensorflow.keras import metrics
# Re-compile with Adam and the focal loss, tracking precision and recall alongside accuracy
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss=focal_loss(),
    metrics=['accuracy', metrics.Precision(), metrics.Recall()],
)

In [None]:
# Train for 100 epochs in batches of 128; the test split doubles as the validation set
# NOTE(review): only lr_schedule is passed as a callback — early_stopping and clr_callback
# are defined in earlier cells but unused here; confirm whether that is intended.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=128,
    callbacks=[lr_schedule],
)


In [None]:
# evaluate() returns the loss followed by the compiled metrics, in order
evaluation = model.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy, test_precision, test_recall = evaluation

print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")

In [None]:

# Left-hand panel of a 1x2 figure: training vs validation accuracy per epoch
plt.figure(figsize=(12, 4))
accuracy_ax = plt.subplot(1, 2, 1)
accuracy_ax.plot(history.history['accuracy'], label='Train Accuracy')
accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
accuracy_ax.legend()
accuracy_ax.set_title('Accuracy')

In [None]:
# Right-hand panel of a 1x2 grid: training vs validation loss per epoch
loss_ax = plt.subplot(1, 2, 2)
loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.legend()
loss_ax.set_title('Loss')
plt.show()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicted class probabilities on the test set, reduced to class indices
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)
accuracy = accuracy_score(y_true_labels, y_pred_labels)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

# Column i of the one-hot labels corresponds to encoder.categories_[0][i],
# so use those names instead of bare integer ids in the report.
class_names = [str(name) for name in encoder.categories_[0]]

# Compute Precision, Recall, F1-Score
print(
    "Classification Report:\n",
    classification_report(
        y_true_labels,
        y_pred_labels,
        # list every class id explicitly so the names stay aligned even if a class is absent
        labels=np.arange(len(class_names)),
        target_names=class_names,
        digits=4,
    ),
)


In [None]:
from sklearn.metrics import confusion_matrix

# Class names in the same order as the one-hot columns (and therefore as the
# argmax indices). A hand-written label list can drift out of that order and
# mislabel the axes, so read the order from the fitted encoder.
class_names = [str(name) for name in encoder.categories_[0]]
class_ids = np.arange(len(class_names))

# Compute Confusion Matrix (rows: true class, columns: predicted class);
# passing `labels` keeps one row/column per class even if a class is absent.
conf_matrix = confusion_matrix(y_true_labels, y_pred_labels, labels=class_ids)

# Normalize Confusion Matrix (Row-wise Normalization); rows with no true
# samples are left as zeros instead of dividing by zero.
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_norm = np.divide(
    conf_matrix,
    row_totals,
    out=np.zeros(conf_matrix.shape, dtype=float),
    where=row_totals != 0,
)

# Plot Confusion Matrix with Accuracy
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix_norm,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Normalized Confusion Matrix (Accuracy per Class)")
plt.show()
