In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import librosa # audio processing
from IPython.display import Audio # playing audio
from matplotlib import pyplot as plt # plots
import librosa.display
!pip install noisereduce
import noisereduce as nr
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling2D,BatchNormalization, MaxPooling1D, ReLU
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, ZeroPadding2D
from tensorflow.keras import optimizers
from tensorflow.keras.losses import SparseCategoricalCrossentropy, sparse_categorical_crossentropy
from tensorflow.python.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input/speech-emotion-recognition-en/Crema'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Dataset

In [None]:
# Parallel containers filled by the loading cell: one entry per audio clip.
# data  -> the audio signal
# label -> the sentiment (for classification)
# meta  -> metadata (actor_sentence_sentiment_pitch)
data, label, meta = [], [], []

# Rate passed to librosa.load so that every clip shares the same sampling rate.
sampling_rate = 18000

In [None]:
def play_plot(index):
    """Print the metadata of one clip and return an audio player for it.

    Args:
        index: position of the clip in the global ``meta`` / ``data`` containers.

    Returns:
        An ``IPython.display.Audio`` widget for the clip. A notebook only renders
        the widget when it is the value of the cell's last expression (or is
        passed to ``display``), so the object is returned instead of being
        created and silently discarded.

    Note:
        Despite the name, no waveform is plotted here.
    """
    print(meta[index])
    return Audio(data=data[index], rate=sampling_rate)

In [None]:
def adjust_length(time_series_list, length):
    """Force every 1-D series in ``time_series_list`` to exactly ``length`` samples.

    The list is modified in place: shorter series are zero-padded at the end,
    longer ones are truncated. Every entry is replaced by a new NumPy array.

    Args:
        time_series_list: mutable sequence of 1-D array-likes.
        length: target number of samples per series.

    Returns:
        None.
    """
    for position, series in enumerate(time_series_list):
        series = np.asarray(series)
        missing = length - len(series)
        if missing > 0:
            # np.pad keeps the dtype of the series; appending a Python list of
            # int zeros would upcast float32 audio to float64 for padded clips only.
            time_series_list[position] = np.pad(series, (0, missing))
        else:
            time_series_list[position] = series[:length].copy()

In [None]:
def check_for_nan(l):
    """Return True if ``l`` contains at least one NaN, otherwise False.

    Numeric input is checked with a single vectorised ``np.isnan`` call instead
    of converting every sample to a string. Input that NumPy cannot treat as
    numeric falls back to the element-wise comparison of ``str(x)`` with 'nan'.

    Args:
        l: iterable of values (typically a 1-D audio signal).

    Returns:
        bool
    """
    try:
        values = np.asarray(l)
    except ValueError:
        # e.g. ragged nested sequences; handled by the element-wise fallback.
        values = None

    if values is not None:
        kind = values.dtype.kind
        if kind in "fc":
            return bool(np.isnan(values).any())
        if kind in "biu":
            # Boolean and integer data cannot hold NaN.
            return False

    return any(str(x) == 'nan' for x in l)

In [None]:
# Maps the emotion code found in a file name to its integer class id.
# Insertion order follows the class ids (0..5).
emotions_dict = {
    'SAD': 0,
    'ANG': 1,
    'DIS': 2,
    'FEA': 3,
    'HAP': 4,
    'NEU': 5,
}

In [None]:
# Load every clip into lists. This takes a long time, so it should only need to
# run once. The containers are rebuilt from scratch here so that re-running the
# cell neither duplicates entries nor fails because `data` was already
# converted to a NumPy array at the end of a previous run.
data = []
label = []
meta = []
length_sum = 0  # total number of samples loaded, counted before padding/truncation
list_a = []  # not used by the cells shown in this notebook
list_b = []  # not used by the cells shown in this notebook
for dirname, _, filenames in os.walk('/kaggle/input/speech-emotion-recognition-en/Crema'):
    for filename in filenames:
        # File name without its 4-character extension.
        meta.append(filename[:-4])
        full_filename = os.path.join(dirname, filename)
        # The third underscore-separated field of the file name is the emotion code.
        sentiment = filename.split('_')[2]
        label.append(emotions_dict[sentiment])
        signal, sr = librosa.load(full_filename, sr = sampling_rate)
        reduced_noise = nr.reduce_noise(y=signal, sr=sampling_rate)
        # Keep the raw signal whenever noise reduction produced NaNs.
        if not check_for_nan(reduced_noise):
            signal = reduced_noise
        data.append(signal)
        length_sum += len(signal)
        if (len(data)%100 == 0):
            print(len(data), " audio loaded")
n = len(data)
# Pad or truncate every clip to exactly 3 seconds so they stack into one 2-D array.
adjust_length(data, 3*sampling_rate)
data = np.array(data)

## Display metadata, play audio and plot waveform

In [None]:
# Position (in `data` / `meta`) of the clip that the following cells play and plot.
index = 5

In [None]:
# Inline audio player for the selected clip; it is rendered because it is the
# value of the cell's last expression.
Audio(data=data[index], rate=sampling_rate)

In [None]:
# Waveform of the selected clip, titled with the clip's metadata string.
sample_title = meta[index]
sample_signal = data[index]

fig = plt.figure(figsize=(15, 5))
fig.suptitle(sample_title, fontsize=15)
librosa.display.waveshow(sample_signal, sr=sampling_rate)

## Feature Extraction

In [None]:
def feature_extraction_1D(data):
    """Build a single-row feature matrix for one audio signal.

    Eight frame-wise descriptors are computed with librosa and concatenated
    along axis 1, in this order: zero-crossing rate, RMS energy, the mean over
    13 MFCCs per frame, spectral centroid, spectral bandwidth, spectral
    flatness, spectral roll-off (librosa's default ``roll_percent``) and
    spectral roll-off at ``roll_percent=0.01``.

    The MFCC mean is reshaped with ``-1`` instead of a fixed frame count, so
    the function works for any clip length rather than only for clips that
    yield exactly 106 frames.

    Args:
        data: 1-D audio signal sampled at the global ``sampling_rate``.

    Returns:
        2-D NumPy array with one row holding all concatenated features
        (each librosa feature is expected to be a ``(1, n_frames)`` row).
    """
    # Average the 13 coefficients of each frame, then restore the leading axis
    # so the result lines up with the other (1, n_frames) features.
    mfcc = librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=13)
    mfcc_mean = np.mean(mfcc, axis=0).reshape(1, -1)

    feature_rows = [
        # Zero Crossing rate
        librosa.feature.zero_crossing_rate(y=data),
        # Energy
        librosa.feature.rms(y=data),
        # Mel-frequency cepstral coefficient (mean over coefficients)
        mfcc_mean,
        # Spectral Centroid
        librosa.feature.spectral_centroid(y=data, sr=sampling_rate),
        # Spectral Bandwidth
        librosa.feature.spectral_bandwidth(y=data, sr=sampling_rate),
        # Spectral Flatness
        librosa.feature.spectral_flatness(y=data),
        # Spectral Rolloff maximum frequencies
        librosa.feature.spectral_rolloff(y=data, sr=sampling_rate),
        # Spectral Rolloff minimum frequencies
        librosa.feature.spectral_rolloff(y=data, sr=sampling_rate, roll_percent=0.01),
    ]

    return np.concatenate(feature_rows, axis=1)

In [None]:
# Extract one flat feature vector per clip.
# The class label is deliberately NOT appended to the vector: this array is
# used as the model input, so including the label would leak the target into
# the features. Labels stay in the separate `label` list.
data_features_extracted_1D = []
for signal in data:
    data_features_extracted_1D.append(np.squeeze(feature_extraction_1D(signal)))
    if (len(data_features_extracted_1D)%100 == 0):
        print(len(data_features_extracted_1D), " entry processed")
data_features_extracted_1D = np.array(data_features_extracted_1D)
print(data_features_extracted_1D.shape)

# Split Data

In [None]:
def split_1D(x, y, test_size=0.3, val_size=0.05, random_state=1):
    """Split features and labels into stratified train / validation / test sets.

    The data is first split into train and test; the validation set is then
    carved out of the training portion. Both splits are stratified on the
    labels and use the same ``random_state``.

    Args:
        x: feature array, one row per sample.
        y: labels aligned with ``x``.
        test_size: share of all samples held out for testing.
        val_size: share of the remaining training samples used for validation.
        random_state: seed passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(x_train, x_val, x_test, y_train, y_test, y_val)``.
        Note the order of the label splits: ``y_test`` comes before ``y_val``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state, stratify=y
    )
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=val_size, random_state=random_state, stratify=y_train
    )
    return x_train, x_val, x_test, y_train, y_test, y_val

In [None]:
# split_1D returns the label splits in the order (train, test, val), so the
# names on the left must follow that same order.
x_train, x_val, x_test, y_train, y_test, y_val = split_1D(data_features_extracted_1D, label)

# 1D Model

In [None]:
# Conv1D is given input_shape=(n_features, 1) below, so append a trailing
# axis of size 1 to each 2-D feature matrix.
x_train, x_val, x_test = (
    features[:, :, np.newaxis] for features in (x_train, x_val, x_test)
)

In [None]:
# Stack of Conv1D blocks (2-2-3-3-3 convolutions, each block followed by
# max-pooling) and two 4096-unit dense layers, ending in a 6-way softmax
# (one output per entry of emotions_dict).
model=Sequential()

model.add(Conv1D(input_shape=(x_train.shape[1], 1),filters=64,kernel_size=3,padding="same", activation="relu"))
model.add(Conv1D(filters=64,kernel_size=3,padding="same", activation="relu"))
model.add(MaxPooling1D(pool_size=2, strides=2))
model.add(Conv1D(filters=128, kernel_size=3, padding="same", activation="relu"))
model.add(Conv1D(filters=128, kernel_size=3, padding="same", activation="relu"))
model.add(MaxPooling1D(pool_size=2, strides=2))
model.add(Conv1D(filters=256, kernel_size=3, padding="same", activation="relu"))
model.add(Conv1D(filters=256, kernel_size=3, padding="same", activation="relu"))
model.add(Conv1D(filters=256, kernel_size=3, padding="same", activation="relu"))
model.add(MaxPooling1D(pool_size=2, strides=2))
model.add(Conv1D(filters=512, kernel_size=3, padding="same", activation="relu"))
model.add(Conv1D(filters=512, kernel_size=3, padding="same", activation="relu"))
model.add(Conv1D(filters=512, kernel_size=3, padding="same", activation="relu"))
model.add(MaxPooling1D(pool_size=2,strides=2))
model.add(Conv1D(filters=512, kernel_size=3, padding="same", activation="relu"))
model.add(Conv1D(filters=512, kernel_size=3, padding="same", activation="relu"))
model.add(Conv1D(filters=512, kernel_size=3, padding="same", activation="relu"))
model.add(MaxPooling1D(pool_size=2,strides=2))

model.add(Flatten())
model.add(Dense(units=4096,activation="relu"))
model.add(Dense(units=4096,activation="relu"))
model.add(Dense(units=6, activation="softmax"))

opt = optimizers.Adam()
model.compile(optimizer=opt , loss = 'SparseCategoricalCrossentropy' , metrics = ['accuracy'])

# The model is compiled with metrics=['accuracy'], so the validation metric is
# logged as 'val_accuracy' (the same key read from history.history when the
# curves are plotted). Monitoring 'val_acc' would never find a value, leaving
# both callbacks inactive. The deprecated `period` argument is dropped; saving
# once per epoch is the default.
checkpoint = ModelCheckpoint("checkpoint_1D.h5", monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=False, mode='auto')
early = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=20, verbose=1, mode='auto')

history=model.fit(np.array(x_train), np.array(y_train), epochs=150, validation_data=(np.array(x_val), np.array(y_val)), callbacks=[checkpoint, early])
# NOTE(review): this saves the model as it is after the last training epoch;
# the best-scoring epoch is the one written to "checkpoint_1D.h5".
model.save("./best_model_1D")

In [None]:
# Per-epoch curves recorded by model.fit.
accuracy, val_accuracy, loss, val_loss = (
    history.history[key] for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(len(accuracy))

# One figure per quantity: training values as dots, validation values as a line.
curves = (
    (accuracy, val_accuracy, 'Training accuracy', 'Validation accuracy', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
)
for panel, (train_curve, val_curve, train_label, val_label, panel_title) in enumerate(curves):
    if panel > 0:
        # Start a new figure for every quantity after the first one.
        plt.figure()
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(panel_title)
    plt.legend()
plt.show()

In [None]:
# Class probabilities for every test sample, one row per sample.
predicted_probabilities = model.predict(x_test)
# Take the most probable class directly. Rounding the probabilities first
# would turn every row without a probability above 0.5 into all zeros, and
# argmax would then always report class 0 for those samples.
predicted_classes = np.argmax(predicted_probabilities, axis=1)

# Compare against an array so the comparison is always element-wise.
y_test_array = np.asarray(y_test)

correct = np.where(predicted_classes==y_test_array)[0]
print ("Found %d correct labels" % len(correct))

incorrect = np.where(predicted_classes!=y_test_array)[0]
print ("Found %d incorrect labels" % len(incorrect))

In [None]:
# Per-class precision/recall, followed by a confusion-matrix heatmap whose tick
# labels are the emotion codes in emotions_dict insertion order.
emotion_names = list(emotions_dict)

print(classification_report(y_test, predicted_classes))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted_classes)
sns.heatmap(
    conf_mat,
    annot=True,
    fmt="d",
    cmap='ocean_r',
    xticklabels=emotion_names,
    yticklabels=emotion_names,
)

# Best per-epoch train/validation accuracy, and the share of correct test predictions.
n_evaluated = len(correct) + len(incorrect)
print('val accuracy:', max(val_accuracy)*100)
print('train accuracy:', max(accuracy)*100)
print("test accuracy", (len(correct)*100/n_evaluated))

In [None]:
# Predict the emotion of a single recording.
# model.predict needs a feature array shaped like the training input, not a
# file path, so the clip goes through the same steps as the training data:
# load -> noise reduction (skipped if it yields NaNs) -> fixed 3 s length ->
# feature extraction -> add batch and channel axes.
sample_path = '/kaggle/input/speech-emotion-recognition-en/Crema/1001_DFA_HAP_XX.wav'
sample_signal, _ = librosa.load(sample_path, sr=sampling_rate)
sample_denoised = nr.reduce_noise(y=sample_signal, sr=sampling_rate)
if not check_for_nan(sample_denoised):
    sample_signal = sample_denoised

sample_batch = [sample_signal]
adjust_length(sample_batch, 3*sampling_rate)
sample_features = np.squeeze(feature_extraction_1D(sample_batch[0]))

# Fail with a clear message if the vector does not match the layout the model
# was fitted on.
if sample_features.shape[0] != model.input_shape[1]:
    raise ValueError(
        "feature vector has %d values but the model expects %d"
        % (sample_features.shape[0], model.input_shape[1])
    )

# Shape (1, n_features, 1): a batch holding one sample with one channel.
y = model.predict(sample_features[np.newaxis, :, np.newaxis])