<a href="https://colab.research.google.com/github/A190nux/Speech-Emotion-Recogniton/blob/main/Speech_Emotion_Recognition_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from scipy.io import wavfile
import os.path
import IPython.display
import seaborn as sns
import librosa
import librosa.display
import soundfile

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras import utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Conv1D, MaxPooling1D, Flatten, BatchNormalization
from keras import optimizers

import warnings

In [2]:
!pip install pydub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import re
import os
import wave
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pydub import AudioSegment
from IPython.display import Audio, display
import librosa as lib
import librosa.display

# Mount Google Drive at /content/drive; the dataset paths configured below
# live under this mount point. Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import tensorflow as tf
# Record the TensorFlow version in the cell output alongside the results.
print("TensorFlow version:", tf.__version__)

from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D
from tensorflow.keras import Model

from sklearn.model_selection import train_test_split

TensorFlow version: 2.12.0


In [5]:
# Directory on the mounted Drive that holds the .wav clips.
PATH = "/content/drive/MyDrive/Colab-Notebooks/Crema"
# One sample clip, derived from PATH so the two locations cannot drift apart.
AUDIO_PATH = f"{PATH}/1001_DFA_ANG_XX.wav"

In [6]:
def zeroCrossingRate(audio):
  """Return the zero-crossing rate of `audio`, averaged into a single value."""
  zcr_per_frame = lib.feature.zero_crossing_rate(audio)
  return zcr_per_frame.mean()

In [7]:
def energy(audio):
  """Return the mean RMS energy of `audio`, computed from its STFT magnitude."""
  # Working from the spectrogram rather than the raw samples gives a more
  # accurate picture of energy over time, because its frames can be windowed.
  magnitude, _ = lib.magphase(lib.stft(audio))
  rms_per_frame = lib.feature.rms(S=magnitude)
  return rms_per_frame.mean()

In [8]:
def melSpectrogram(audio, sr, n_fft=200, n_mels=128):
  """Return the log-scaled (dB) mel spectrogram of `audio`.

  Args:
    audio: waveform samples (the caller passes the array from `lib.load`).
    sr: sampling rate of `audio`.
    n_fft: FFT window length in samples. Defaults to 200, the value that
      used to be hard-coded here.
    n_mels: number of mel bands. Defaults to 128, librosa's own default,
      so existing two-argument calls produce the same output as before.

  Returns:
    2-D array with one row per mel band and one column per frame.

  NOTE(review): n_fft=200 yields only 101 frequency bins, fewer than the 128
  mel bands; this is presumably what triggers the librosa warning shown when
  the data is loaded. It is also shorter than librosa's default hop of 512
  samples, so stretches of the signal fall between analysis windows.
  Consider passing a larger n_fft.
  """
  mel_power = lib.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft, n_mels=n_mels)
  return lib.power_to_db(mel_power)

In [9]:
def chromaStft(audio, sr):
  """Return the chroma features of `audio` collapsed to a single mean value."""
  magnitude = np.abs(librosa.stft(audio))
  # NOTE(review): the STFT above uses librosa's default window length, so the
  # n_fft=200 passed here does not describe `magnitude` — confirm the intent.
  chroma = librosa.feature.chroma_stft(S=magnitude, sr=sr, n_fft=200)
  return chroma.mean()

In [10]:
def mfcc(data, sr):
  """Return the MFCCs of `data` collapsed to a single mean value."""
  coefficients = librosa.feature.mfcc(y=data, sr=sr, n_fft=200)
  # No axis is given, so the mean runs over every value in the MFCC array.
  return coefficients.mean()

In [11]:
def tonnetz(data, sr):
  """Return the tonnetz features of `data` collapsed to a single mean value."""
  tonal_features = librosa.feature.tonnetz(y=data, sr=sr)
  return tonal_features.mean()

In [12]:
def playAudio(audio_file):
  """Show an inline audio player for the .wav file at `audio_file`.

  The clip is loaded with pydub, exported to a temporary WAV file, read back
  as bytes and handed to `IPython.display.Audio`. The temporary file is
  removed again even if exporting or reading fails.

  Args:
    audio_file: path to a .wav file.
  """
  temp_path = 'temp_audio.wav'
  audio = AudioSegment.from_wav(audio_file)
  try:
    # export() returns the handle it wrote to; close it so nothing stays open.
    audio.export(temp_path, format='wav').close()
    with open(temp_path, 'rb') as wav_file:
      audio_data = wav_file.read()
  finally:
    # Delete the temporary audio file, whether or not the steps above succeeded.
    if os.path.exists(temp_path):
      os.remove(temp_path)
  # Play the audio
  display(Audio(audio_data))

In [13]:
def visualize_waveform(audio, sr):
    """Plot the amplitude of `audio` against time in seconds.

    Args:
        audio: sequence of waveform samples.
        sr: sampling rate in samples per second; converts sample indices
            into seconds for the x-axis.
    """
    # Without this conversion the x-axis would show sample indices, not time.
    times = np.arange(len(audio)) / sr
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(times, audio)
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Amplitude')
    ax.set_title('Waveform')
    plt.show()

In [14]:
def loadAndListenToAudiolib(dataset_path, class_name):
  """Load every clip of one class and return its log-mel spectrograms.

  Args:
    dataset_path: directory containing the .wav files.
    class_name: label code that appears in the file name (e.g. "ANG");
      clips are selected with the glob pattern `*{class_name}*.wav`.

  Returns:
    (D, Y): D is a list with one spectrogram (from `melSpectrogram`) per
    clip; Y is a list of the same length holding `class_name` for each clip.
  """
  # Sort the matches: glob returns files in arbitrary order, and a stable
  # order is needed for the seeded train/test split to be reproducible.
  audio_files = sorted(glob.glob(os.path.join(dataset_path, f"*{class_name}*.wav")))
  print(class_name)
  # Only the mel spectrogram is used as a feature. The scalar extractors
  # (zeroCrossingRate, energy, chromaStft, mfcc, tonnetz) and the
  # playAudio / visualize_waveform helpers are intentionally not called here.
  D = []
  for audio_file in audio_files:
    # load the audio file
    audio, sr = lib.load(audio_file)
    # extract mel spectrogram
    D.append(melSpectrogram(audio, sr))
  Y = [class_name] * len(D)
  return D, Y

In [15]:
def loadDatalib(PATH, classes=("SAD", "ANG", "DIS", "FEA", "HAP", "NEU")):
  """Load the spectrograms and labels of every class under `PATH`.

  Args:
    PATH: directory containing the .wav files.
    classes: label codes to load, in order. Defaults to the six codes that
      used to be hard-coded here.

  Returns:
    (D, Y): D is a list of spectrograms (one per clip, classes in the order
    given); Y is a 1-D NumPy array of the matching label strings.
  """
  D, Y = [], []
  for cls in classes:
    d, y = loadAndListenToAudiolib(PATH, cls)
    D.extend(d)
    # Collect labels in a plain list and convert once at the end; this avoids
    # relying on NumPy promoting an empty float array to a string array.
    Y.extend(y)
  return D, np.array(Y)

In [16]:
class cnnModel(Model):
  """Three Conv2D + MaxPool2D stages followed by a two-layer dense classifier.

  Args:
    num_classes: size of the softmax output layer. Defaults to 6, matching
      the six class codes listed in `loadDatalib`.
  """

  def __init__(self, num_classes=6):
    super().__init__()
    # Conv layer + maxpool
    self.conv1 = Conv2D(512, 5, activation='relu')
    self.maxpool1 = MaxPool2D(pool_size=(5, 5), strides=2)

    self.conv2 = Conv2D(512, 5, activation='relu')
    self.maxpool2 = MaxPool2D(pool_size=(5, 5), strides=2)

    self.conv3 = Conv2D(128, 5, activation='relu')
    self.maxpool3 = MaxPool2D(pool_size=(5, 5), strides=2)

    # Fully connected layers; the last one emits one probability per class.
    self.flatten = Flatten()
    self.d1 = Dense(256, activation='relu')
    self.d2 = Dense(num_classes, activation='softmax')

  def call(self, x, training=None):
    """Run the forward pass and return the per-class softmax probabilities.

    Args:
      x: batch of inputs with a trailing channel axis (the notebook feeds
        arrays reshaped to (samples, 128, 256, 1)).
      training: accepted so the model follows the standard Keras `call`
        signature used by `train_step` / `test_step`. None of the current
        layers behaves differently in training, so it is not forwarded.
    """
    x = self.conv1(x)
    x = self.maxpool1(x)

    x = self.conv2(x)
    x = self.maxpool2(x)

    x = self.conv3(x)
    x = self.maxpool3(x)

    x = self.flatten(x)
    x = self.d1(x)
    return self.d2(x)

In [17]:
# Single global model instance; train_step and test_step refer to it by name.
model = cnnModel()

In [18]:
# Categorical cross-entropy on the model's softmax outputs; the labels fed to
# it later in the notebook are one-hot encoded.
loss_object = tf.keras.losses.CategoricalCrossentropy()
# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()

In [19]:
# Metrics accumulated over the batches of an epoch; the training loop resets
# them at the start of every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')

# Same pair of metrics for the held-out test set.
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')

In [20]:
@tf.function
def train_step(audio, labels):
  """Run one optimisation step on a batch and update the training metrics."""
  with tf.GradientTape() as tape:
    # The training flag only matters for layers that behave differently
    # during training and inference (e.g. Dropout).
    probs = model(audio, training=True)
    batch_loss = loss_object(labels, probs)

  # Read the variables only after the forward pass above has run.
  weights = model.trainable_variables
  grads = tape.gradient(batch_loss, weights)
  optimizer.apply_gradients(zip(grads, weights))

  train_loss.update_state(batch_loss)
  train_accuracy.update_state(labels, probs)


In [21]:
@tf.function
def test_step(audio, labels):
  """Evaluate the model on one batch and update the test metrics."""
  # The training flag only matters for layers that behave differently
  # during training and inference (e.g. Dropout).
  probs = model(audio, training=False)
  batch_loss = loss_object(labels, probs)

  test_loss.update_state(batch_loss)
  test_accuracy.update_state(labels, probs)


In [22]:
# Load every clip under PATH: D is a list of spectrograms, Y the matching
# class labels. The class code is printed as each class is loaded.
D, Y = loadDatalib(PATH)

SAD


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


ANG
DIS
FEA
HAP
NEU


In [23]:
# Dry run of the padding on one spectrogram: zero-pad the end of the second
# axis up to 256 columns and check the resulting shape.
np.pad(D[1], ((0, 0), (0, 256 - D[1].shape[1]))).shape

(128, 256)

In [24]:
# All spectrograms must share one shape before they can be stacked.
N_FRAMES = 256  # target length of the second axis of every spectrogram

# fix_length zero-pads the end of shorter spectrograms exactly like the manual
# np.pad call did, and trims longer ones instead of failing on a negative
# pad width. Building a new array also avoids mutating the list in place.
# NOTE(review): the spectrograms are in dB, where 0 is not silence — confirm
# that zero is the intended padding value.
D = np.array([librosa.util.fix_length(spec, size=N_FRAMES, axis=1) for spec in D])

In [25]:
# Sanity check: (number of clips, spectrogram rows, padded columns).
D.shape

(7442, 128, 256)

In [26]:
# Sanity check: one label per clip, so the length must match D.shape[0].
Y.shape

(7442,)

In [27]:
# Wrap the label array in a single-column DataFrame for one-hot encoding.
df = pd.DataFrame(Y)

In [28]:
# Peek at the first labels; the column is named 0 because none was given.
df.head()

Unnamed: 0,0
0,SAD
1,SAD
2,SAD
3,SAD
4,SAD


In [29]:
# Use the get_dummies() method to one-hot encode the labels
one_hot_labels = pd.get_dummies(df)

# Convert the DataFrame to a NumPy array
one_hot_labels = one_hot_labels.to_numpy()

print(one_hot_labels.shape)

(7442, 6)


In [30]:
# From here on Y holds the one-hot matrix instead of the label strings.
Y = one_hot_labels

In [31]:
# Sanity check: (number of clips, number of classes).
Y.shape

(7442, 6)

In [32]:
# 70/30 split, stratified on the one-hot labels, with a fixed seed so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    D,
    Y,
    test_size=0.3,
    random_state=69,
    shuffle=True,
    stratify=Y,
)

In [33]:
# Show the shape of each split: train/test features, then train/test labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(5209, 128, 256)
(2233, 128, 256)
(5209, 6)
(2233, 6)


In [34]:
# Add a trailing channel axis of size 1 for the Conv2D layers. The sizes are
# read from the arrays themselves, so this keeps working if the dataset or
# the split ratio changes, and re-running the cell leaves the shape as is.
X_train = X_train.reshape(*X_train.shape[:3], 1)
X_test = X_test.reshape(*X_test.shape[:3], 1)

In [35]:
# Collect the distinct one-hot rows present in the test labels.
unicheck = np.unique(y_test, axis=0)

In [36]:
# Display them: one row per class that occurs in the test split.
unicheck

array([[0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]], dtype=uint8)

In [37]:
BATCH_SIZE = 8
# Elements held in the shuffle buffer. Kept well below the training-set size
# because every element is a full spectrogram; the split above has already
# shuffled the samples once.
SHUFFLE_BUFFER = 1024

# Reshuffle the training samples on every pass so that batches differ between
# epochs; the test set keeps a fixed order.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(SHUFFLE_BUFFER)
    .batch(BATCH_SIZE)
)

test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(BATCH_SIZE)

In [None]:
EPOCHS = 5

for epoch in range(EPOCHS):
  # Reset the metrics at the start of the next epoch.
  # reset_state() is the current name; reset_states() is its deprecated alias.
  for metric in (train_loss, train_accuracy, test_loss, test_accuracy):
    metric.reset_state()

  # One full pass over the training batches, updating the weights.
  for audio, labels in train_ds:
    train_step(audio, labels)

  # Evaluate on the held-out batches without updating the weights.
  for test_audio, test_labels in test_ds:
    test_step(test_audio, test_labels)

  print(
    f'Epoch {epoch + 1}, '
    f'Loss: {train_loss.result()}, '
    f'Accuracy: {train_accuracy.result() * 100}, '
    f'Test Loss: {test_loss.result()}, '
    f'Test Accuracy: {test_accuracy.result() * 100}'
  )

Epoch 1, Loss: 3.7987558841705322, Accuracy: 29.679399490356445, Test Loss: 1.562891960144043, Test Accuracy: 37.79668426513672
Epoch 2, Loss: 1.7149198055267334, Accuracy: 24.45766830444336, Test Loss: 1.7898541688919067, Test Accuracy: 17.151813507080078
