# Audio Model

In [89]:
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import backend as K

In [50]:
audio_data_fp = "../mmsd_raw_data/converted_utterances"
dataset_csv_path = "../normalized_mustard_dataset.csv"

In [90]:
"""
From https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
"""
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [77]:
def create_mfcc(filepath:str, n_fft:int, hop_length: int, n_mfcc:int = 13):
    """
    Creates MFCC for file at filepath

    :param filepath: Location of file to be used
    :param n_fft: Number of Fast Fourier Transforms
    :param hop_length: Number of Hops within samples
    :param n_mfcc: Number of MFCC's to be outputted
    :return: Array containing mean of all MFCC's
    """
    signal, sample_rate = librosa.load(filepath)
    mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mfcc=n_mfcc)
    mean_mfccs = np.mean(mfccs.T,axis=0)
    return mean_mfccs

In [51]:
def create_mfcc_features(data):
    """
    Creates a list of Mel-Frequency Co-Efficients
    """
    hop_length = 512
    n_fft = 2048

    _mfcc_df = pd.DataFrame(columns=["features", "sarcasm_state"])

    tqdm_data = tqdm(zip(data["file_name"], data["sarcasm"]))

    for file_name,sarcasm_state in tqdm_data:
        tqdm_data.set_description(f"Creating MFCC for {file_name}")
        fp = f"{audio_data_fp}/{str(file_name)}"
        mean_mfccs = create_mfcc(fp, n_fft, hop_length)
        _mfcc_df = _mfcc_df.append({
            "features": mean_mfccs,
            "sarcasm_state": sarcasm_state
        }, ignore_index=True)

    return _mfcc_df

In [38]:
dataset_df = pd.read_csv(dataset_csv_path)
mfcc_df = create_mfcc_features(dataset_df)

Creating MFCC for 2_524.wav: : 690it [02:44,  4.19it/s]  


In [71]:
mfcc_df.head()

Unnamed: 0,features,sarcasm_state
0,"[-146.32887, 99.83219, -50.631638, 9.274939, -...",True
1,"[-147.90367, 104.595116, -39.535, 4.108271, -2...",True
2,"[-21.120022, 86.11223, -35.381474, 22.640253, ...",False
3,"[-23.78374, 70.78856, -34.25742, 23.38274, -16...",False
4,"[1.6350226, 82.73289, -46.96253, 14.140306, -2...",True


In [72]:
X = np.array(mfcc_df['features'].tolist())
y = np.array(mfcc_df['sarcasm_state'].tolist())

In [73]:
X.shape

(690, 13)

In [75]:
# Label Encoder for getting sarcasm state
label_encoder = LabelEncoder()
y = to_categorical(label_encoder.fit_transform(y))

In [76]:
y.shape

(690, 2)

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=0)

In [85]:
# Number of Classes
label_count = y.shape[1]

In [86]:
model = Sequential()
model.add(Dense(100, input_shape=(13,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(200))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(100))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(label_count))
model.add(Activation('softmax'))

model.summary()

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_47 (Dense)             (None, 100)               1400      
_________________________________________________________________
activation_35 (Activation)   (None, 100)               0         
_________________________________________________________________
dropout_35 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_48 (Dense)             (None, 200)               20200     
_________________________________________________________________
activation_36 (Activation)   (None, 200)               0         
_________________________________________________________________
dropout_36 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_49 (Dense)             (None, 100)             

In [96]:
model.compile(loss="categorical_crossentropy", metrics=['accuracy',f1_m,precision_m,recall_m], optimizer='adam')

In [97]:
num_epochs = 100
num_batch_size = 32

checkpointer = ModelCheckpoint(filepath="../models/audio_model.h5", verbose=True, save_best_only=True)

model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, callbacks=[checkpointer])

Train on 552 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100


<tensorflow.python.keras.callbacks.History at 0x7f89badd7d50>

## Testing Model

In [99]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=0)
print(f"loss: {loss}\naccuracy:{accuracy}\nf1_score:{f1_score}\nprecision:{precision}\nrecall:{recall}")

loss: 0.8415306720180787
accuracy:0.6449275612831116
f1_score:0.6662499308586121
precision:0.6662499904632568
recall:0.6662499904632568


In [85]:
test_file_path = "../mmsd_raw_data/converted_utterances/2_626.wav"
audio, sample_rate = librosa.load(test_file_path)
mfcc_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
mfcc_scaled_features = np.mean(mfcc_features.T, axis=0)

mfcc_scaled_features = mfcc_scaled_features.reshape(1, -1)

In [90]:
predicted_label = model.predict_classes(mfcc_scaled_features)
predicted_class = label_encoder.inverse_transform(predicted_label)
print(f"Sarcastic: {predicted_class[0]}")

Sarcastic: False
