In [1]:
from google.colab import drive
drive.mount('/mydrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /mydrive


## Extract data from zip file

In [2]:
# import package to unzip .7zip file
!apt-get install p7zip-full
!ls

Reading package lists... Done
Building dependency tree       
Reading state information... Done
p7zip-full is already the newest version (16.02+dfsg-6).
0 upgraded, 0 newly installed, 0 to remove and 28 not upgraded.
sample_data


In [0]:
# create local directory and copy zip file into it
!mkdir /kaggle_audio/
!cp /mydrive/My\ Drive/DL_Project/03_Speech_to_Text/Speech_Commands_Recognition/datasets/kaggle_audio_dataset/train.7z /kaggle_audio/

In [4]:
# locally unzip the audio files in train.7z
!cd /kaggle_audio/ && p7zip -d train.7z


7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 1121103842 bytes (1070 MiB)

Extracting archive: train.7z
--
Path = train.7z
Type = 7z
Physical Size = 1121103842
Headers Size = 389133
Method = Delta LZMA2:24
Solid = +
Blocks = 2

  0%      0% 39 - train/audio/_background_noise_/dude_miaowing.wav                                                            0% 41 - train/audio/_background_noise_/pink_noise.wav                                                         0% 42 - train/audio/_background_noise_/running_tap.wav

## Imports

In [0]:
%matplotlib inline

import IPython.display
import numpy as np
import matplotlib.pyplot as plt
import copy
from scipy.io import wavfile
from scipy.signal import butter, lfilter
import scipy.ndimage

## Data Loader

In [6]:
!cd /kaggle_audio/train && ls

audio  LICENSE	README.md  testing_list.txt  validation_list.txt


In [7]:
DATADIR = '/kaggle_audio/' # unzipped train and test data
OUTDIR = './model-k' # just a random name
# Data Loading
import os
import re
from glob import glob

POSSIBLE_LABELS = 'yes no up down left right on off stop go silence unknown'.split()
id2name = {i: name for i, name in enumerate(POSSIBLE_LABELS)}
name2id = {name: i for i, name in id2name.items()}

def load_data(data_dir):
    """ Return 2 lists of tuples:
    [(class_id, user_id, path), ...] for train
    [(class_id, user_id, path), ...] for validation
    """
    # Just a simple regexp for paths with three groups:
    # prefix, label, user_id
    pattern = re.compile("(.+\/)?(\w+)\/([^_]+)_.+wav")
    all_files = glob(os.path.join(data_dir, 'train/audio/*/*wav'))

    with open(os.path.join(data_dir, 'train/validation_list.txt'), 'r') as fin:
        validation_files = fin.readlines()
    valset = set()
    for entry in validation_files:
        r = re.match(pattern, entry)
        if r:
            valset.add(r.group(3))

    possible = set(POSSIBLE_LABELS)
    train, val = [], []
    for entry in all_files:
        r = re.match(pattern, entry)
        if r:
            label, uid = r.group(2), r.group(3)
            if label == '_background_noise_':
                label = 'silence'
            if label not in possible:
                label = 'unknown'

            label_id = name2id[label]

            sample = (label_id, uid, entry)
            if uid in valset:
                val.append(sample)
            else:
                train.append(sample)

    print('There are {} train and {} val samples'.format(len(train), len(val)))
    return train, val

trainset, valset = load_data(DATADIR)

There are 57929 train and 6798 val samples


In [8]:
trainset_sub = trainset[2000:2200]
valset_sub = valset[:10]
trainset_sub

[(11, '7bae88ed', '/kaggle_audio/train/audio/four/7bae88ed_nohash_1.wav'),
 (11, 'cc71bada', '/kaggle_audio/train/audio/four/cc71bada_nohash_0.wav'),
 (11, '617de221', '/kaggle_audio/train/audio/four/617de221_nohash_1.wav'),
 (11, 'f4f59743', '/kaggle_audio/train/audio/four/f4f59743_nohash_1.wav'),
 (11, 'fb24c826', '/kaggle_audio/train/audio/four/fb24c826_nohash_0.wav'),
 (11, 'fbf3dd31', '/kaggle_audio/train/audio/four/fbf3dd31_nohash_0.wav'),
 (11, 'b9db6e42', '/kaggle_audio/train/audio/four/b9db6e42_nohash_0.wav'),
 (11, 'bab36420', '/kaggle_audio/train/audio/four/bab36420_nohash_2.wav'),
 (11, '5705a0f9', '/kaggle_audio/train/audio/four/5705a0f9_nohash_1.wav'),
 (11, '5f47fdf9', '/kaggle_audio/train/audio/four/5f47fdf9_nohash_1.wav'),
 (11, '637c702a', '/kaggle_audio/train/audio/four/637c702a_nohash_0.wav'),
 (11, '9a7c1f83', '/kaggle_audio/train/audio/four/9a7c1f83_nohash_1.wav'),
 (11, 'b49caed3', '/kaggle_audio/train/audio/four/b49caed3_nohash_2.wav'),
 (11, 'e7ebf59c', '/kaggl

# Import MFCC Package

In [9]:
!pip install python_speech_features

Collecting python_speech_features
  Downloading https://files.pythonhosted.org/packages/ff/d1/94c59e20a2631985fbd2124c45177abaa9e0a4eee8ba8a305aa26fc02a8e/python_speech_features-0.6.tar.gz
Building wheels for collected packages: python-speech-features
  Building wheel for python-speech-features (setup.py) ... [?25l[?25hdone
  Created wheel for python-speech-features: filename=python_speech_features-0.6-cp36-none-any.whl size=5889 sha256=82fa41fe44541e1c8749d14d0b7e69f1b76c4828ba345bcf3ba2872f6951b9b6
  Stored in directory: /root/.cache/pip/wheels/3c/42/7c/f60e9d1b40015cd69b213ad90f7c18a9264cd745b9888134be
Successfully built python-speech-features
Installing collected packages: python-speech-features
Successfully installed python-speech-features-0.6


In [10]:
!git clone https://github.com/jameslyons/python_speech_features
%cd python_speech_features
!python setup.py develop

Cloning into 'python_speech_features'...
remote: Enumerating objects: 427, done.[K
remote: Total 427 (delta 0), reused 0 (delta 0), pack-reused 427[K
Receiving objects: 100% (427/427), 218.49 KiB | 416.00 KiB/s, done.
Resolving deltas: 100% (207/207), done.
/content/python_speech_features
running develop
running egg_info
creating python_speech_features.egg-info
writing python_speech_features.egg-info/PKG-INFO
writing dependency_links to python_speech_features.egg-info/dependency_links.txt
writing requirements to python_speech_features.egg-info/requires.txt
writing top-level names to python_speech_features.egg-info/top_level.txt
writing manifest file 'python_speech_features.egg-info/SOURCES.txt'
writing manifest file 'python_speech_features.egg-info/SOURCES.txt'
running build_ext
Creating /usr/local/lib/python3.6/dist-packages/python-speech-features.egg-link (link to .)
Adding python-speech-features 0.6 to easy-install.pth file

Installed /content/python_speech_features
Processing dep

In [11]:
%cd ..
!git clone https://github.com/Tony607/Acoustic_Diagnosis
%cd Acoustic_Diagnosis

/content
Cloning into 'Acoustic_Diagnosis'...
remote: Enumerating objects: 32, done.[K
remote: Total 32 (delta 0), reused 0 (delta 0), pack-reused 32[K
Unpacking objects: 100% (32/32), done.
/content/Acoustic_Diagnosis


# Sequential Model

### Initialization

In [12]:
import glob
import pandas as pd
from scipy import stats
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
from sklearn.model_selection import train_test_split
%matplotlib inline
plt.style.use('ggplot')

### Test Data

In [0]:
# Need current directory to be /content/Acoustic_Diagnosis/
# %cd content/Acoustic_Diagnosis/
base_dir = "./data/"
sound_file_paths = ["new_60Hz_de_10s_22khz.wav","new_60Hz_re_10s_22khz.wav",
                    "used_60Hz_de_10s_22khz.wav","used_60Hz_re_10s_22khz.wav",
                    "red_60Hz_de_10s_22khz.wav","red_60Hz_re_10s_22khz.wav"]
# Output tags
sound_names = ["new","new",
               "used","used",
               "red","red"]

### Google Speech Data

In [47]:
# Need directory to be top level
%cd ..
base_dir = "./"
sound_names = []
sound_file_paths = []
i = 1
for sample in trainset:
  if i%1 == 0:
    class_id, user_id, filepath = sample
    file_locations,file_names = os.path.split(filepath)
    _,sound_label = os.path.split(file_locations)
    if sound_label in ['yes','no','on','off','left','right','stop','go','up','down']:
      sound_file_paths.append(filepath)
      sound_names.append(sound_label)
  i+=1

print(sound_names[-10:])
sound_file_paths[-10:]

/
['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no']


['/kaggle_audio/train/audio/no/1995cf9e_nohash_0.wav',
 '/kaggle_audio/train/audio/no/c79159aa_nohash_1.wav',
 '/kaggle_audio/train/audio/no/f292725f_nohash_0.wav',
 '/kaggle_audio/train/audio/no/10ace7eb_nohash_2.wav',
 '/kaggle_audio/train/audio/no/5c39594f_nohash_2.wav',
 '/kaggle_audio/train/audio/no/01bb6a2a_nohash_1.wav',
 '/kaggle_audio/train/audio/no/cb802c63_nohash_0.wav',
 '/kaggle_audio/train/audio/no/1df483c0_nohash_2.wav',
 '/kaggle_audio/train/audio/no/850e2222_nohash_0.wav',
 '/kaggle_audio/train/audio/no/a3fc7884_nohash_0.wav']

### MFCC Feature Extraction

In [0]:
def feature_normalize(dataset):
    mu = np.mean(dataset, axis=0)
    sigma = np.std(dataset, axis=0)
    return (dataset - mu) / sigma
def windows(data, window_size):
    start = 0
    while start < len(data):
        yield int(start), int(start + window_size)
        start += (window_size / 2) # stepping at half window size
def extract_features(base_dir, sound_file_paths, sound_names ,bands = 20, frames = 41):
    window_size = 512 * (frames - 1)
    mfccs = []
    labels = []
    for i, sound_file_path in enumerate(sound_file_paths):
        sound_file_full_path = os.path.join(base_dir, sound_file_path)
        sound_clip,s = librosa.load(sound_file_full_path)
        sound_clip = feature_normalize(sound_clip)
        label = sound_names[i]
        for (start,end) in windows(sound_clip,window_size):
            if(len(sound_clip[start:end]) == window_size):
                signal = sound_clip[start:end]
                # y: audio time series, sr: sampling rate, n_mfcc: number of MFCCs to return
                # librosa.feature.mfcc() function return numpy array with shape (bands, frames)
                # transpose since the model expects time axis(frames) come first
                mfcc = librosa.feature.mfcc(y=signal, sr=s, n_mfcc = bands).T 
                mfccs.append(mfcc)
                labels.append(label)
    features = np.asarray(mfccs)
    return np.array(features), np.array(labels,dtype = np.str)

def one_hot_encode(labels):
    return np.asarray(pd.get_dummies(labels), dtype = np.float32)

In [0]:
bands = 20
frames = 41
features,labels = extract_features(base_dir, sound_file_paths, sound_names, bands = bands, frames = frames)
labels = one_hot_encode(labels)

In [0]:
%cd content
import pickle

filename = 'training_features'
with open(filename, 'wb') as f:
    pickle.dump(features, f)

filename = 'training_labels'
with open(filename, 'wb') as f:
    pickle.dump(labels, f)

In [0]:
import pickle

filename = 'training_features'
with open(filename, 'rb') as f:
    features = pickle.load(f)

filename = 'training_labels'
with open(filename, 'rb') as f:
    labels = pickle.load(f)

In [0]:
unique_labels = set()
for x in sound_names:
    unique_labels.add(x)
print(unique_labels)
len(unique_labels)

In [0]:
# Display one MFCC feature vector for one frame of one sample
print(features[-1][-1])
print(features.shape)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=10)

### Build Model

In [0]:
learning_rate = 0.001
training_iters = 10000
batch_size = 20
display_step = 500

# Network Parameters
n_input = bands 
n_steps = frames
n_hidden = 64
n_classes = labels.shape[1] # new , other , red

In [0]:
tf.reset_default_graph()
x = tf.placeholder("float", [None, n_steps, n_input])
y = tf.placeholder("float", [None, n_classes])
keep_prob = tf.placeholder(tf.float32, name='keep_prob')

weight = tf.Variable(tf.random_normal([n_hidden, n_classes]))
bias = tf.Variable(tf.random_normal([n_classes]))

### RNN

In [0]:
def lstm_cell(lstm_size, keep_prob):
    cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    return tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob = keep_prob)
def RNN(x, weight, bias, keep_prob, num_layers=2):
    lstm_layers = tf.contrib.rnn.MultiRNNCell([lstm_cell(n_hidden, keep_prob) for _ in range(num_layers)])
    output, state = tf.nn.dynamic_rnn(lstm_layers, x, dtype = tf.float32)
    output = tf.transpose(output, [1, 0, 2])
    last = tf.gather(output, int(output.get_shape()[0]) - 1)
    return tf.nn.softmax(tf.matmul(last, weight) + bias)

In [0]:
prediction = RNN(x, weight, bias, keep_prob)
# Define loss and optimizer
loss_f = -tf.reduce_sum(y * tf.log(prediction))
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss_f)
# Evaluate model
correct_pred = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

### Training

In [0]:
saver = tf.train.Saver()
session=tf.InteractiveSession()
# Initializing the variables
session.run(tf.global_variables_initializer())

for itr in range(training_iters):    
    offset = (itr * batch_size) % (labels.shape[0] - batch_size)
    batch_x = X_train[offset:(offset + batch_size), :, :]
    batch_y = y_train[offset:(offset + batch_size), :]
    _, c = session.run([optimizer, loss_f],feed_dict={x: batch_x, y : batch_y, keep_prob: 0.95})

    if itr % display_step == 0:
        # Calculate batch accuracy
        acc = session.run(accuracy, feed_dict={x: batch_x, y: batch_y, keep_prob: 1})
        # Calculate batch loss
        loss = session.run(loss_f, feed_dict={x: batch_x, y: batch_y, keep_prob: 1})
        print("Iter " + str(itr) + ", Minibatch Loss= " + \
              "{:.6f}".format(loss) + ", Training Accuracy= " + \
              "{:.5f}".format(acc))

print('Test accuracy: ',round(session.run(accuracy, feed_dict={x: X_test, y: y_test, keep_prob: 1}) , 3))
saver.save(session, save_path = "./model/mfcc_audio.ckpt")

### Validation

##### Validation Data

In [0]:
test_sound_names = []
test_sound_file_paths = []
i = 1
for sample in valset:
  if i%1 == 0:
    class_id, user_id, filepath = sample
    file_locations,file_names = os.path.split(filepath)
    test_sound_file_paths.append(filepath)
    _,sound_label = os.path.split(file_locations)
    test_sound_names.append(sound_label)
  i+=1

unknowns = ["Unknown"] * len(test_sound_file_paths)

print(test_sound_names[-10:])
test_sound_file_paths[-10:]

Validate

In [0]:
test_features,_ = extract_features(base_dir, test_sound_file_paths, unknowns)

In [0]:
filename = 'testing_features'
with open(filename, 'wb') as f:
    pickle.dump(test_features, f)

In [0]:
filename = 'testing_features'
with open(filename, 'rb') as f:
    labels = pickle.load(f)

In [0]:
y_predicts = session.run(prediction, feed_dict={x: test_features, keep_prob: 1})
predicted_logit = stats.mode(np.argmax(y_predicts,1))[0][0]
list_labels = list(unique_labels)
predicted_label = list_labels[predicted_logit]
predicted_probability = stats.mode(np.argmax(y_predicts,1))[1][0] / len(y_predicts)

In [0]:
(predicted_label, predicted_probability)

In [0]:
predicted_class = np.argmax(y_predicts,1)
actual_class = np.argmax(one_hot_encode(test_sound_names),1)

correct = 0
wrong = 0

for j in predicted_class:
  if predicted_class[j] == actual_class[j]:
    correct += 1
  else:
    wrong += 1

validation_accuracy = correct / (correct + wrong)
print(validation_accuracy)

In [0]:
print(y_predicts[1])


Results:
Training Accuracy = 91.2%,
Validation Accuracy = 82.0%, full set