In [1]:
#imports
import pandas as pd
import numpy as np
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import time
from scipy.io import wavfile as wav
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics 
#keras
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import Nadam
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
#Settings
# Root folder of the dataset; spectro_bot() expects one sub-folder per class label.
audio_dataset_path = "C:\\Users\\Filip\\Desktop\\Jupyter\\wav_dataset"
# Sample recordings used for the prediction demo at the end of the notebook.
test_file_on = "\\".join([audio_dataset_path, "on", "3cc595de_nohash_1.wav"])
test_file_down = "\\".join([audio_dataset_path, "down", "b87bdb22_nohash_1.wav"])
test_file_right = "\\".join([audio_dataset_path, "right", "2aca1e72_nohash_1.wav"])
# Class names; the position in this list is used as the numeric label.
class_label = [
    "down",
    "go",
    "left",
    "on",
    "right",
    "stop",
    "up",
]
# NOTE(review): wav_sample_rate is not referenced by any cell visible here.
wav_sample_rate = 16000
# Number of MFCC coefficients per file (also the width of the model input).
num_mfcc = 40
# Number of spectrograms to make (per class).
num_files = 500
# Training hyper-parameters.
num_epochs = 120
num_batch = 32

In [3]:
def get_spectrogram(file_name):
    """Compute a fixed-length MFCC feature vector for one audio file.

    The audio is loaded with librosa and turned into ``num_mfcc`` MFCC
    coefficients per frame; the coefficients are then averaged over all
    frames, so clips of any duration yield a vector of the same length.

    NOTE(review): ``librosa.load`` is called without ``sr``, so librosa
    resamples to its own default rate; the ``wav_sample_rate`` setting is
    not used here. Confirm this is intended.

    Parameters
    ----------
    file_name : str
        Path of the audio file to read.

    Returns
    -------
    numpy.ndarray or None
        1-D array holding the per-coefficient mean, or ``None`` when the
        file could not be loaded or processed (the error is printed).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type="kaiser_fast")
        mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=num_mfcc)
        # Transpose so frames run along axis 0, then average them away.
        scaled = np.mean(mfcc.T, axis=0)
    except Exception as e:
        # Best-effort: report the unreadable file and let the caller skip it.
        # (The original handler named a non-existent `Except` class, which
        # turned every failure into a NameError.)
        print("Error with file: ", file_name, "-", e)
        return None
    return scaled

In [4]:
def spectro_bot(dataset_path):
    """Build a shuffled table of MFCC features for every class folder.

    For each name in ``class_label`` the matching sub-folder of
    ``dataset_path`` is scanned and up to ``num_files`` files are converted
    with ``get_spectrogram``. Files whose feature extraction fails are
    skipped and do not count towards the per-class quota.

    Parameters
    ----------
    dataset_path : str
        Root folder containing one sub-folder per class label.

    Returns
    -------
    pandas.DataFrame
        Rows in random order with two columns: ``entries`` (the feature
        array of one file) and ``label`` (the index of its class in
        ``class_label``).
    """
    entries = []
    start_time = time.time()
    for label_index, dir_name in enumerate(class_label):
        print(dir_name)
        dir_path = os.path.join(dataset_path, dir_name)
        collected = 0
        # os.listdir() order is unspecified; sorting makes the selected
        # subset of files the same on every run.
        for file_name in sorted(os.listdir(dir_path)):
            data = get_spectrogram(os.path.join(dir_path, file_name))
            # Anything that is not an array signals a failed extraction;
            # keeping it would break the later conversion to a numeric array.
            if not isinstance(data, np.ndarray):
                continue
            entries.append([data, label_index])
            collected += 1
            if collected == num_files:
                break
    entries_data_frame = pd.DataFrame(entries, columns=["entries", "label"])
    # Shuffle the rows so classes are interleaved, then renumber the index.
    entries_data_frame = entries_data_frame.sample(frac=1).reset_index(drop=True)
    finish_time = time.time()
    print("Finished processing {} files in {} seconds".
          format(len(entries_data_frame), finish_time-start_time))
    return entries_data_frame

In [5]:
# Extract features for the whole dataset (slow: reads every audio file).
data_frame = spectro_bot(audio_dataset_path)

# Turn the two DataFrame columns into plain numpy arrays:
# X holds the feature vectors, y the integer class indices.
X = np.array(data_frame["entries"].tolist())
y = np.array(data_frame["label"].tolist())

# One-hot encode the labels; `le` is kept so predictions can be mapped back.
le = LabelEncoder()
yy = to_categorical(le.fit_transform(y))

# Hold out 25% of the samples; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, yy, test_size=0.25, random_state=42
)



down
go
left
on
right
stop
up
Finished processing 3500 files in 41.03398537635803 seconds


In [6]:
# Number of output units = width of the one-hot label matrix.
num_labels = yy.shape[1]

#=====================MODEL===========================
# Fully connected classifier: two hidden ReLU layers with dropout,
# followed by a softmax over the classes.
model = Sequential([
    # input layer: one value per MFCC coefficient
    Dense(300, input_shape=(num_mfcc,)),
    Activation('relu'),
    Dropout(0.3),
    # second layer
    Dense(600),
    Activation('relu'),
    Dropout(0.5),
    # output layer
    Dense(num_labels),
    Activation('softmax'),
])
#====================================================

In [7]:
# Compile the network: Nadam optimizer, categorical cross-entropy loss
# (labels are one-hot encoded) and accuracy as the reported metric.
model.compile(optimizer='Nadam', 
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# Print the layer-by-layer overview of the model.
model.summary()
# Calculate pre-training accuracy on the test split: a baseline for the
# untrained network. The code reads score[1] as the accuracy value.
score = model.evaluate(x_test, y_test, verbose=0)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 300)               12300     
_________________________________________________________________
activation_1 (Activation)    (None, 300)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 600)               180600    
_________________________________________________________________
activation_2 (Activation)    (None, 600)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 7)                

In [8]:
# Train the model and report the wall-clock time the fit took.
# NOTE(review): the test split is also passed as validation data, so it is
# seen (for monitoring) during training; there is no separate validation set.
start_time = time.time()
model.fit(x_train, y_train, 
          batch_size=num_batch, 
          epochs=num_epochs, 
          validation_data=(x_test, y_test), 
          verbose=1)
finish_time = time.time()
print("Training finished in {} seconds".format(finish_time-start_time))

Train on 2625 samples, validate on 875 samples
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120


Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78/120
Epoch 79/120
Epoch 80/120
Epoch 81/120
Epoch 82/120
Epoch 83/120
Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120
Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120
Epoch 98/120
Epoch 99/120
Epoch 100/120
Epoch 101/120
Epoch 102/120
Epoch 103/120
Epoch 104/120
Epoch 105/120
Epoch 106/120
Epoch 107/120
Epoch 108/120
Epoch 109/120
Epoch 110/120
Epoch 111/120


Epoch 112/120
Epoch 113/120
Epoch 114/120
Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120
Training finished in 43.14533448219299 seconds


In [9]:
# Evaluate the trained model on the training and the testing split.
# `score` is reused for both results; index 1 is read as the accuracy.
# NOTE(review): in the recorded run training accuracy (~0.90) is far above
# testing accuracy (~0.58), which suggests the model is overfitting.
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.9020952582359314
Testing Accuracy:  0.5771428346633911


In [10]:
def extract_feature(file_name):
    """Compute the MFCC feature vector of one audio file for prediction.

    Performs the same computation as ``get_spectrogram`` but wraps the
    result in an extra leading axis, giving a batch of one sample that can
    be passed directly to the model (see ``print_prediction``).

    Parameters
    ----------
    file_name : str
        Path of the audio file to read.

    Returns
    -------
    numpy.ndarray or None
        2-D array with a single row containing the per-coefficient MFCC
        mean, or ``None`` when the file could not be loaded or processed.
    """
    try:
        audio_data, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        # Use the shared num_mfcc setting so the feature width always matches
        # the model's input layer (the original hard-coded 40 here).
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=num_mfcc)
        # Average over frames to get one value per coefficient.
        mfccsscaled = np.mean(mfccs.T, axis=0)

    except Exception as e:
        # The original handler printed the undefined name `file`, which
        # raised a NameError instead of reporting the problem.
        print("Error encountered while parsing file: ", file_name)
        return None

    return np.array([mfccsscaled])

In [11]:
def print_prediction(file_name, model):
    """Print the predicted class and the per-class probabilities for a file.

    The file is converted with ``extract_feature`` and run through ``model``
    once. The most probable class is printed by name, followed by one line
    per encoded label with its probability.

    Parameters
    ----------
    file_name : str
        Path of the audio file to classify.
    model :
        Trained Keras model exposing ``predict``.

    Returns
    -------
    None
        Output is printed; nothing is returned.
    """
    prediction_feature = extract_feature(file_name)
    # extract_feature signals failure with a non-array value; there is
    # nothing to predict in that case.
    if not isinstance(prediction_feature, np.ndarray):
        print("Could not extract features from: ", file_name)
        return

    # One forward pass gives the probabilities; the class is their argmax.
    # This replaces Sequential.predict_classes / predict_proba, which newer
    # Keras releases no longer provide, and avoids running the model twice.
    predicted_proba = model.predict(prediction_feature, verbose=0)[0]
    predicted_vector = np.array([np.argmax(predicted_proba)])
    predicted_class = le.inverse_transform(predicted_vector)
    print("The predicted class is:", class_label[predicted_class[0]], '\n')

    # Map every output index back through the label encoder in one call.
    categories = le.inverse_transform(np.arange(len(predicted_proba)))
    for category, probability in zip(categories, predicted_proba):
        print(category, "\t\t : ", format(probability, '.32f') )


In [14]:
# Demo: classify one known "right" recording with the trained model.
# NOTE(review): this cell's execution count (14) skips 12-13, so it was not
# run in a clean top-to-bottom pass; re-run the notebook from a fresh kernel.
print_prediction(test_file_right, model)

The predicted class is: right 

0 		 :  0.09295421093702316284179687500000
1 		 :  0.27327957749366760253906250000000
2 		 :  0.01250028889626264572143554687500
3 		 :  0.11387594044208526611328125000000
4 		 :  0.50309169292449951171875000000000
5 		 :  0.00311328377574682235717773437500
6 		 :  0.00118502764962613582611083984375
