In [377]:
import soundfile # to read audio file
import numpy as np
import scipy
import librosa # to extract speech features
import glob
import pandas as pd
from  scipy.sparse import csr_matrix
import os
import pickle # to save model after training
from sklearn.model_selection import train_test_split # for splitting training and testing
from sklearn.neural_network import MLPClassifier # multi-layer perceptron model
from sklearn.metrics import accuracy_score # to measure how good we are

In [418]:
def extract_feature(file_name, **kwargs):
    """
    Extract feature from audio file `file_name`
        Features supported:
            - MFCC (mfcc)
            - Chroma (chroma)
            - MEL Spectrogram Frequency (mel)
            - Contrast (contrast)
            - Tonnetz (tonnetz)
        e.g:
        `features = extract_feature(path, mel=True, mfcc=True)`
    """ 
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        #if chroma or contrast:
        #stft = np.abs(librosa.stft(X))
        audio=np.frombuffer(X,dtype=np.int16)
        stft = librosa.feature.melspectrogram(audio.astype('float32'), sr= sample_rate)
        result = np.array([])
        if mfcc:
            #mfccs = np.mean(librosa.feature.mfcc(y=X, sr=16000, n_mfcc=40).T, axis=0)
            mfc= np.mean(librosa.feature.mfcc(y=X, sr=16000, S=stft, n_mfcc=40).T,axis=0)
            #result=result.reshape(40,334)
            #result = result.reshape(1,total_length)
            result = np.hstack((result, mfc))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=16000).T,axis=0)
            result = np.hstack((result, chroma))
        if mel:  
            mel = np.mean(librosa.feature.melspectrogram(y=X,S=stft, sr=16000).T,axis=0)
            rows = len(mel)
            columns = 1
            total_length = rows * columns
            #mel=mel.reshape(1,total_length)
            result = np.hstack((result, mel))
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=16000).T,axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            #tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), S=stft,sr=sample_rate).T,axis=0)
            tonnetz = np.mean(librosa.feature.tonnetz(y=X, sr=16000, chroma=chroma).T,axis=0)
            result = np.hstack((result, tonnetz))
    return result

In [379]:
AVAILABLE_EMOTIONS = {
    "angry",
    "surprised",
    "happy"
}

In [380]:
 df = pd.read_csv('C:/Users/Engy/Desktop/arabic-natural-audio-dataset/ANAD.csv')
 df.head()
#d = df.loc[df['name'] == "V2_1 (1).wav'"]
#df.loc[df['name'] == "V2_1 (1).wav'", 'Emotion '].values[0]


Unnamed: 0,name,Emotion,pcm_intensity_sma_max,pcm_intensity_sma_min,pcm_intensity_sma_range,pcm_intensity_sma_maxPos,pcm_intensity_sma_minPos,pcm_intensity_sma_amean,pcm_intensity_sma_linregc1,pcm_intensity_sma_linregc2,...,F0env_sma_de_linregerrQ,F0env_sma_de_stddev,F0env_sma_de_skewness,F0env_sma_de_kurtosis,F0env_sma_de_quartile1,F0env_sma_de_quartile2,F0env_sma_de_quartile3,F0env_sma_de_iqr12,F0env_sma_de_iqr23,F0env_sma_de_iqr13
0,V2_1 (1).wav',surprised,2e-06,0,2e-06,39,0,0.0,0.0,0.0,...,44.29016,6.804174,3.144908,12.62142,-0.208212,0.0,1.81985,0.208212,1.81985,2.028063
1,V2_1 (2).wav',surprised,2e-06,0,2e-06,38,0,0.0,0.0,0.0,...,15.2841,4.709333,2.091973,7.180433,-0.068987,0.610156,2.934069,0.679144,2.323913,3.003057
2,V2_1 (3).wav',surprised,2e-06,0,2e-06,38,0,0.0,0.0,0.0,...,46.80196,7.886595,2.563328,9.669317,-0.837921,0.0,1.825973,0.837921,1.825973,2.663894
3,V2_1 (4).wav',surprised,2e-06,0,2e-06,38,0,0.0,0.0,0.0,...,26.65183,5.670303,1.83799,7.500129,0.0,0.215477,3.138976,0.215477,2.923499,3.138976
4,V2_1 (5).wav',surprised,1e-06,0,1e-06,57,0,0.0,0.0,0.0,...,16.19031,4.621727,1.855579,5.687108,-0.105829,0.327793,2.982076,0.433622,2.654283,3.087905


In [389]:
def load_data(test_size):
    X, y = [], []
    for file in glob.glob("C:/Users/Engy/Desktop/arabic-natural-audio-dataset/1sec_segmented_*/1sec_segmented_*/*.wav"):
        # get the base name of the audio file
        basename = os.path.basename(file)
        basename = basename + "'"
        #s = pd.Series(df('name'))
        # get the emotion label
        #emotion = int2emotion[basename.split("-")[2]]
        #print(basename)
        #df= df.to_string(index=False)
        #print(df['name'].values)
        if (basename in df['name'].values):
            #print(basename)
            emotion =  df.loc[df['name'] == basename, 'Emotion '].values[0] 
            # extract speech features
            features = extract_feature(file, mfcc=True,chroma=True,mel=True,contrast=True,tonnetz=True)
            # add to data
            X.append(features)
            y.append(emotion)
            # split the data to training and testing and return it
    
    return train_test_split(np.array(X), y, test_size=0.25, random_state=14)

In [424]:
X_train, X_test, y_train, y_test = load_data(test_size=0.25)



In [425]:
model_params = {
    'alpha': 0.01,
    'batch_size': 256,
    'epsilon': 1e-08, 
    'hidden_layer_sizes': (300,), 
    'learning_rate': 'adaptive', 
    'max_iter': 500, 
}
model = MLPClassifier(**model_params)

In [426]:
#X_tr= (X_train).reshape(1000,-1)
#y_tr = np.array(y_train).reshape(-1,1)
print(X_train.shape)
print(y_tr.shape)
#print(X_train)
model.fit(X_train,y_train)

(1000, 188)
(1000, 1)




MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=500, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [427]:
print(X_test.shape)
y_pred = model.predict(X_test)

(334, 188)


In [428]:
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 89.82%


In [396]:
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))


In [405]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
Acc = loaded_model.score(X_test, y_test) * 100
print("Accuracy: {:.2f}%".format(Acc))

Accuracy: 91.62%


In [448]:
def predict_record():
        X_record= []
        # extract speech features
        features = extract_feature('Sample.wav', mfcc=True,chroma=True,mel=True,contrast=True,tonnetz=True)
        # add to data
        X_record.append(features)
        y_record = model.predict(np.array(X_record))
        print(y_record)
        return y_record

In [449]:
Out_Emotion = predict_record()

['happy']


