# Code for Kazakh speech-based Speech Emotion Recognition: feature extraction

## How the code works:

* Importation of libraries (librosa, numpy, tensorflow, matplotlib, keras, pandas)

* Import the data (a CSV table that lists the name and emotion of each clip we have in our folder)

* store the audio file

In [5]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.pyplot import specgram

import tensorflow as tf
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import seaborn as sns

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras import regularizers

import os
import glob
import IPython.display as ipd
import plotly.express as px
import scipy.io.wavfile
import sys
import warnings

# Keep the notebook output readable: when the interpreter was started without
# any explicit -W options, hide every warning; DeprecationWarning is hidden
# in all cases.
if not sys.warnoptions:
    warnings.simplefilter(action="ignore")
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [6]:
# Mount Google Drive at /content/drive so that the dataset paths used below
# (all under /content/drive/MyDrive/...) resolve inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
dataset_path = "/content/drive/MyDrive/KazSER/Dialogue Dataset/Input.csv" # path to the CSV table listing each clip's file name and emotion label

In [8]:
# Load the dataset index and keep only the two columns used further down:
# the clip name (renamed to 'File') and its emotion label.
df = pd.read_csv(dataset_path)

audio = df[['FileName', 'Emotion']].rename(columns={'FileName': 'File'})

print(audio)

           File   Emotion
0    dia1_clip1   neutral
1    dia1_clip2   neutral
2    dia1_clip2   neutral
3    dia1_clip4   neutral
4    dia1_clip5   neutral
5    dia1_clip6       joy
6    dia1_clip7   neutral
7    dia1_clip8       joy
8    dia1_clip9     anger
9    dia2_clip1       joy
10   dia2_clip2   neutral
11   dia2_clip3   neutral
12   dia2_clip4       joy
13   dia3_clip1     anger
14   dia3_clip2      fear
15   dia3_clip3     anger
16   dia3_clip4       joy
17   dia3_clip5       joy
18   dia3_clip6     anger
19   dia3_clip7   disgust
20   dia3_clip8     anger
21   dia3_clip9   neutral
22  dia3_clip10   neutral
23  dia3_clip11       joy
24  dia3_clip12   neutral
25   dia4_clip1  surprise
26   dia4_clip2   neutral
27   dia4_clip3   neutral
28   dia4_clip4  surprise
29   dia4_clip5   neutral
30   dia4_clip6   neutral
31   dia4_clip7       joy
32   dia4_clip8       joy
33   dia4_clip9   neutral
34  dia4_clip10   neutral
35   dia5_clip1   neutral
36   dia5_clip2   neutral
37   dia5_cl

In [9]:
# dir_list = os.listdir(dataset_path)

# file_emotion = []

# unique_emotions = {1: 0, 2:0, 3: 0, 4: 0, 5: 0, 6:0, 7: 0, 8: 0}

# # file_path = []
# # for i in dir_list:
# #     actor = os.listdir(dataset_path + i)
# #     for f in actor:
# #         part = f.split('.')[0].split('-')
# #         if (unique_emotions[int(part[2])] <= 99):
# #           file_emotion.append(int(part[2]))
# #           file_path.append(dataset_path + i + '/' + f)
# #           unique_emotions[int(part[2])] += 1

# emotion_df = pd.DataFrame(file_emotion, columns=[''])

# path_df = pd.DataFrame(file_path, columns=['path'])
# df = pd.concat([emotion_df, path_df], axis=1)

# df.emotion.replace({1:'neutral', 2:'neutral', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust',
#                              8:'surprise'},
#                             inplace=True)
# print(df.emotion.value_counts())

In [10]:
audio = pd.concat([audio], axis = 0) # row-wise concatenation of several datasets into one; only a single frame is listed here, so the content is unchanged

In [11]:
# Show how the emotion labels are distributed in the input data: one bar per
# emotion, plus a marginal box plot above the histogram.
px_fig = px.histogram(
    data_frame=audio,
    x='Emotion',
    color='Emotion',
    marginal='box',
    title='Emotion Count',
)
px_fig.update_layout(bargap=0.2)
px_fig.show()

In [12]:
#to create waveplot of audio file
def create_waveplot(data, sr, e):
    """Plot the waveform of one audio clip and show the figure.

    data -- audio samples handed to librosa.display.waveshow
    sr   -- sampling rate of ``data``
    e    -- emotion label; only used in the figure title
    """
    heading = 'Waveplot for audio with {} emotion'.format(e)
    plt.figure(figsize=(10, 3))
    plt.title(heading, size=15)
    librosa.display.waveshow(data, sr=sr)
    plt.show()

In [13]:
#to create a spectrogram
def create_spectrogram(data, sr, e):
    """Draw the dB-scaled STFT magnitude spectrogram of one audio clip.

    data -- audio samples
    sr   -- sampling rate of ``data``
    e    -- emotion label; only used in the figure title

    Note: unlike create_waveplot, this function does not call plt.show();
    the figure is left open for the caller / notebook to render.
    """
    heading = 'Spectrogram for audio with {} emotion'.format(e)

    # Short-time Fourier transform -> magnitude -> decibels.
    magnitude = abs(librosa.stft(data))
    magnitude_db = librosa.amplitude_to_db(magnitude)

    plt.figure(figsize=(12, 3))
    plt.title(heading, size=15)
    # Linear frequency axis; pass y_axis='log' instead for a log-frequency view.
    librosa.display.specshow(magnitude_db, sr=sr, x_axis='time', y_axis='hz')
    plt.colorbar()

In [14]:
#data augmentation helpers: each returns a modified copy of the audio signal
def noise(data):
    """Return ``data`` with additive Gaussian noise.

    The noise amplitude is a random fraction (uniform draw scaled by 0.035)
    of the largest sample value in ``data``. The input array is not modified;
    a new array is returned.
    """
    amplitude = 0.035 * np.random.uniform() * np.amax(data)
    gaussian = np.random.normal(size=data.shape[0])
    return data + amplitude * gaussian

def stretch(data, rate=0.80):
    """Time-stretch ``data`` with librosa, forwarding ``rate`` unchanged.

    With the default rate of 0.80 the clip is presumably slowed down
    (librosa treats rates below 1 as slower) -- see librosa's documentation.
    """
    stretched = librosa.effects.time_stretch(data, rate=rate)
    return stretched

def shift(data):
    """Circularly shift ``data`` by a random number of samples.

    The offset is a uniform draw from [-5, 5) scaled by 1000 and truncated to
    an integer; samples rolled past one end re-enter at the other (np.roll).
    """
    offset = int(1000 * np.random.uniform(low=-5, high=5))
    return np.roll(data, offset)

def pitch(data, sampling_rate, pitch_factor=0.8):
    """Pitch-shift ``data`` with librosa.

    ``pitch_factor`` is forwarded as librosa's ``n_steps`` argument and
    ``sampling_rate`` as ``sr``.
    """
    shifted = librosa.effects.pitch_shift(
        data,
        sr=sampling_rate,
        n_steps=pitch_factor,
    )
    return shifted

def higher_speed(data, speed_factor = 1.2):
    """Speed-change ``data`` with librosa's time_stretch.

    ``speed_factor`` is forwarded as the ``rate`` argument (default 1.2).
    """
    faster = librosa.effects.time_stretch(data, rate=speed_factor)
    return faster

# def lower_speed(data, speed_factor = 0.8):
#     return librosa.effects.time_stretch(data, rate = speed_factor)

In [15]:
def extract_features(data, sample_rate):
    """Return a 1-D feature vector for one audio signal.

    Only MFCCs are active: the coefficients are averaged over time (mean over
    the frame axis) and returned as a flat array.

    data        -- audio samples
    sample_rate -- sampling rate of ``data``
    """
    # Start from an empty vector so that further feature groups can simply be
    # stacked onto it.
    features = np.array([])

    # Optional feature groups, currently disabled. To enable one, uncomment it
    # and stack it onto `features` with np.hstack in the same way as the MFCCs.
    #
    # Zero-crossing rate:
    #   zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    # Chroma STFT:
    #   stft = np.abs(librosa.stft(data))
    #   chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    # Root mean square value:
    #   rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    # Mel spectrogram:
    #   mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)

    # MFCC - the main feature type used here.
    coefficients = librosa.feature.mfcc(y=data, sr=sample_rate)
    averaged = np.mean(coefficients.T, axis=0)
    features = np.hstack((features, averaged))

    return features

#Function to get the MFCC features of one audio file and of each augmented version of it
def get_features(path):
    """Load one audio file and return features for it and its augmentations.

    The file is loaded with ``offset=0.6`` (no ``duration`` is passed). Six
    versions of the signal are then fed to extract_features, and the resulting
    vectors are stacked as rows in this order: original, noised, stretched,
    shifted, pitched, sped up.

    path -- location of the audio file, passed to librosa.load
    """
    data, sample_rate = librosa.load(path, offset=0.6)

    # Built in this order so the random draws (noise first, then shift)
    # happen in a fixed sequence.
    variants = (
        data,                       # without augmentation
        noise(data),                # noised
        stretch(data),              # stretched
        shift(data),                # shifted
        pitch(data, sample_rate),   # pitched
        higher_speed(data),         # speed up
        # A "speed down" variant is intentionally not generated.
    )

    rows = [extract_features(signal, sample_rate) for signal in variants]
    return np.vstack(rows)

In [16]:

# def get_features_meld(path):
#     # duration and offset are used to take care of the no audio in start and the ending of each audio files as seen above.
#     data, sample_rate = librosa.load(path)

#     #without augmentation
#     res1 = extract_features(data, sample_rate)
#     result = np.array(res1)

#     #noised
#     noise_data = noise(data)
#     res2 = extract_features(noise_data,  sample_rate)
#     result = np.vstack((result, res2)) # stacking vertically

#     #stretched
#     stretch_data = stretch(data)
#     res3 = extract_features(stretch_data,  sample_rate)
#     result = np.vstack((result, res3))

#     #shifted
#     shift_data = shift(data)
#     res4 = extract_features(shift_data,  sample_rate)
#     result = np.vstack((result, res4))

#     #pitched
#     pitch_data = pitch(data, sample_rate)
#     res5 = extract_features(pitch_data,  sample_rate)
#     result = np.vstack((result, res5))

#     #speed up
#     higher_speed_data = higher_speed(data)
#     res6 = extract_features(higher_speed_data,  sample_rate)
#     result = np.vstack((result, res6))

#     # #speed down
#     # lower_speed_data = higher_speed(data)
#     # res7 = extract_features(lower_speed_data,  sample_rate)
#     # result = np.vstack((result, res7))

#     return result

In [17]:
# Collect, for every audio file, its feature rows in X and the matching
# emotion labels in Y (both lists stay aligned index by index).
X, Y = [], []
audio_path = '/content/drive/MyDrive/KazSER/Dialogue Dataset/Videos/Audio only/'

# '/content/drive/MyDrive/KazSER/Dataset/Videos/Audio only/emotion 1 - neutral - audio.mp3'

for path, emotion in zip(audio.File, audio.Emotion):
    feature = get_features(audio_path + path + '.mp3')
    # get_features returns one row per version of the clip (original plus
    # augmentations), so the label is repeated once for every returned row.
    X.extend(feature)
    Y.extend([emotion] * len(feature))

In [18]:
# #function to get the audio file from video file
# from moviepy.editor import VideoFileClip

# def extract_audio(path):

#   video_clip = VideoFileClip(path)

#   audio_clip = video_clip.audio

#   audio_path = '/content/audio.wav'

#   audio_clip.write_audiofile(audio_path)

#   return audio_path

In [19]:
# Create a new DataFrame holding the MFCC feature rows plus a 'labels' column
# with the emotions, and store it as a new CSV file (without the row index).
Features = pd.DataFrame(X).assign(labels=Y)
Features.to_csv('features-kazser-dialogues-2.csv', index=False)

In [20]:
# Inspect the result: the first 100 rows (features + label), followed by
# summary statistics of the feature columns.
display(Features.head(100))
display(Features.describe())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,labels
0,-382.777466,95.185249,-9.166131,14.621998,-1.048449,6.800313,-4.792756,-1.155354,-3.982703,2.512948,...,-0.365112,-7.758314,0.169606,-8.093513,-4.014623,-7.481653,-3.599602,-5.959622,-3.666107,neutral
1,-382.776546,95.184388,-9.165718,14.620529,-1.047388,6.799871,-4.793028,-1.153296,-3.982802,2.511928,...,-0.367623,-7.756356,0.168930,-8.093755,-4.014647,-7.482607,-3.597698,-5.960827,-3.665056,neutral
2,-416.953094,94.256798,-10.248082,14.283892,-0.973696,6.933626,-4.728953,-0.912788,-3.058490,1.987085,...,-0.916115,-6.788033,0.345472,-8.168850,-2.842900,-7.050879,-3.677244,-4.828530,-4.087322,neutral
3,-380.192657,96.508324,-7.976822,13.526684,-1.560805,7.099102,-5.150238,-1.769677,-4.373225,1.861434,...,-0.705358,-8.465516,0.336240,-7.578731,-3.303786,-7.280236,-3.762106,-6.008490,-3.252334,neutral
4,-409.745575,93.558792,-10.563017,14.027417,-1.078533,6.355930,-6.004551,-2.727870,-3.792922,0.537500,...,-2.608426,-8.699957,0.746265,-8.661509,-2.722612,-8.063672,-1.977104,-6.360843,0.069875,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-339.810211,53.988480,22.498755,15.501820,-1.832121,11.264935,-5.337697,2.752551,-4.222995,-3.324195,...,-0.212312,-3.758683,4.636116,1.750077,-1.332264,0.394417,-0.531932,0.693858,1.278345,anger
96,-278.548401,68.819633,-8.563692,23.438206,0.504112,-13.273792,-13.118335,1.487600,-6.966363,-6.160036,...,-2.319573,-6.142572,-0.911222,-3.846950,-5.140581,-7.793033,-6.408188,-8.625794,-1.294529,joy
97,-262.834190,56.439940,-2.760192,18.106546,1.036476,-12.831987,-12.727324,1.242575,-6.883751,-6.557708,...,-2.693303,-5.576122,-1.281508,-4.087177,-5.600710,-7.473568,-6.617495,-8.104533,-1.596436,joy
98,-308.953186,64.089661,-8.303349,20.963326,1.137694,-14.384514,-13.905827,2.350412,-7.023786,-6.129883,...,-2.127836,-5.979624,-1.411682,-3.914438,-5.248547,-8.026722,-6.423928,-8.539291,-0.457856,joy


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0,258.0
mean,-309.745403,83.259624,7.746035,15.344857,0.313558,4.958985,-5.764549,-0.759015,-3.891657,0.007498,-4.478833,-0.44232,-4.706715,-0.522075,-2.734138,-1.787967,-3.081666,-0.3669,-3.004225,-0.770274
std,85.893766,32.939978,15.699494,10.426798,7.764167,8.414901,8.99374,6.341365,6.557352,6.356394,6.671027,5.151612,5.093827,4.594292,4.507747,3.921158,3.650833,3.54884,3.534033,3.329929
min,-522.254639,6.579367,-27.967146,-13.485168,-22.240274,-17.422338,-28.920195,-14.902534,-19.733994,-13.427852,-18.471052,-13.310358,-16.891483,-10.853231,-16.664679,-12.839865,-12.341676,-7.951094,-18.048126,-12.63846
25%,-372.274651,60.148136,-4.379825,8.917585,-3.237051,0.076305,-12.57092,-5.340332,-7.784252,-5.12882,-10.504535,-5.115309,-9.10753,-3.498691,-5.183381,-4.39585,-5.898934,-2.914141,-5.654115,-2.24471
50%,-306.999585,83.231823,8.296974,15.379624,1.10337,6.396354,-4.811497,-1.808006,-3.930093,1.286519,-3.599944,0.817056,-4.223341,-0.934318,-2.302652,-1.457215,-2.740263,0.046958,-2.466779,-0.325534
75%,-253.839493,109.50713,15.995073,21.058527,5.057837,10.694432,0.66104,2.738049,-0.354467,4.772467,1.3042,3.296988,-0.480842,1.992709,-0.187885,1.057691,-0.225087,2.091483,-0.495888,1.386457
max,-100.886375,141.911072,55.185116,40.25346,14.541234,28.323032,11.96376,16.359356,9.664433,13.559478,6.020531,8.428116,3.75524,21.247648,11.226429,7.088062,5.875615,9.536966,6.331724,9.371258
