In [7]:
import glob
import os
import librosa
import librosa.core
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
%matplotlib inline
plt.style.use('ggplot')

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['legend.fontsize'] = 11
plt.rcParams['figure.titlesize'] = 13

In [35]:
def windows(data, window_size):
    start = 0
    while start < len(data):
        yield int(start), int(start + window_size)
        start += (window_size / 2)

def extract_features(parent_dir,sub_dirs,file_ext="*.wav",bands = 60, frames = 41):
    window_size = 512 * (frames - 1)
    log_specgrams = []
    labels = []
    for l, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            sound_clip,s = librosa.load(fn)
            print(fn)
            label = fn.split('/')[2].split('-')[1].split(".")[0]
            for (start,end) in windows(sound_clip,window_size):
                if(len(sound_clip[start:end]) == window_size):
                    signal = sound_clip[start:end]
                    melspec = librosa.feature.melspectrogram(signal, n_mels = bands)
                    logspec = librosa.amplitude_to_db(melspec)
                    logspec = logspec.T.flatten()[:, np.newaxis].T
                    log_specgrams.append(logspec)
                    labels.append(label)
            
    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams),bands,frames,1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis = 3)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
    
    return np.array(features), np.array(labels,dtype = np.int)

def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels,n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

In [42]:
parent_dir = 'data'
sub_dirs= ['Cryingbaby','else']
features,labels = extract_features(parent_dir,sub_dirs)
labels = one_hot_encode(labels)

data/Cryingbaby/Cryingbaby-0-13.wav
data/Cryingbaby/Cryingbaby-0-4.wav
data/Cryingbaby/Cryingbaby-0-5.wav
data/Cryingbaby/Cryingbaby-0-12.wav
data/Cryingbaby/Cryingbaby-0-10.wav
data/Cryingbaby/Cryingbaby-0-38.wav
data/Cryingbaby/Cryingbaby-0-7.wav
data/Cryingbaby/Cryingbaby-0-6.wav
data/Cryingbaby/Cryingbaby-0-39.wav
data/Cryingbaby/Cryingbaby-0-11.wav
data/Cryingbaby/Cryingbaby-0-29.wav
data/Cryingbaby/Cryingbaby-0-15.wav
data/Cryingbaby/Cryingbaby-0-2.wav
data/Cryingbaby/Cryingbaby-0-3.wav
data/Cryingbaby/Cryingbaby-0-14.wav
data/Cryingbaby/Cryingbaby-0-28.wav
data/Cryingbaby/Cryingbaby-0-16.wav
data/Cryingbaby/Cryingbaby-0-1.wav
data/Cryingbaby/Cryingbaby-0-17.wav
data/Cryingbaby/Cryingbaby-0-64.wav
data/Cryingbaby/Cryingbaby-0-58.wav
data/Cryingbaby/Cryingbaby-0-59.wav
data/Cryingbaby/Cryingbaby-0-65.wav
data/Cryingbaby/Cryingbaby-0-67.wav
data/Cryingbaby/Cryingbaby-0-66.wav
data/Cryingbaby/Cryingbaby-0-62.wav
data/Cryingbaby/Cryingbaby-0-63.wav
data/Cryingbaby/Cryingbaby-0-49.wav

In [1]:
import os
path = "data/else/"
files = os.listdir(path)
index =0
for file in files:
    if file.split(".")[-1] =="wav":
        index +=1
        os.rename(path +"/"+ file,f"{path}/else-1-{index}.wav")
        print(file)
        if index ==68:
            break
print(files)

laugh_1.m4a_68.wav
laugh_1.m4a_40.wav
laugh_1.m4a_54.wav
laugh_1.m4a_5.wav
laugh_1.m4a_83.wav
silence.wav_21.wav
silence.wav_35.wav
silence.wav_34.wav
silence.wav_20.wav
laugh_1.m4a_82.wav
laugh_1.m4a_4.wav
laugh_1.m4a_55.wav
laugh_1.m4a_41.wav
laugh_1.m4a_69.wav
laugh_1.m4a_57.wav
laugh_1.m4a_43.wav
laugh_1.m4a_6.wav
laugh_1.m4a_80.wav
silence.wav_36.wav
silence.wav_22.wav
silence.wav_23.wav
silence.wav_37.wav
laugh_1.m4a_81.wav
laugh_1.m4a_7.wav
laugh_1.m4a_42.wav
laugh_1.m4a_56.wav
laugh_1.m4a_52.wav
laugh_1.m4a_46.wav
laugh_1.m4a_85.wav
laugh_1.m4a_3.wav
silence.wav_33.wav
silence.wav_27.wav
silence.wav_26.wav
silence.wav_32.wav
laugh_1.m4a_2.wav
laugh_1.m4a_84.wav
laugh_1.m4a_47.wav
laugh_1.m4a_53.wav
laugh_1.m4a_45.wav
laugh_1.m4a_51.wav
laugh_1.m4a_79.wav
laugh_1.m4a_86.wav
laugh_1.m4a_0.wav
laugh_2.m4a_8.wav
silence.wav_18.wav
silence.wav_24.wav
silence.wav_30.wav
silence.wav_31.wav
silence.wav_25.wav
silence.wav_19.wav
laugh_2.m4a_9.wav
laugh_1.m4a_1.wav
laugh_1.m4a_87.wav
lau

In [43]:
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev = 0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(1.0, shape = shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x,W,strides=[1,2,2,1], padding='SAME')

def apply_convolution(x,kernel_size,num_channels,depth):
    weights = weight_variable([kernel_size, kernel_size, num_channels, depth])
    biases = bias_variable([depth])
    return tf.nn.relu(tf.add(conv2d(x, weights),biases))

def apply_max_pool(x,kernel_size,stride_size):
    return tf.nn.max_pool(x, ksize=[1, kernel_size, kernel_size, 1], 
                          strides=[1, stride_size, stride_size, 1], padding='SAME')

In [44]:
rnd_indices = np.random.rand(len(labels)) < 0.70

train_x = features[rnd_indices]
train_y = labels[rnd_indices]
test_x = features[~rnd_indices]
test_y = labels[~rnd_indices]

In [75]:
frames = 41
bands = 60

feature_size = 2460 #60x41
num_labels = 2
num_channels = 2

batch_size = 20
kernel_size = 50
depth = 20
num_hidden = 400

learning_rate = 0.005
training_iterations = 200

In [76]:
X = tf.placeholder(tf.float32, shape=[None,bands,frames,num_channels])
Y = tf.placeholder(tf.float32, shape=[None,num_labels])

cov = apply_convolution(X,kernel_size,num_channels,depth)

shape = cov.get_shape().as_list()
cov_flat = tf.reshape(cov, [-1, shape[1] * shape[2] * shape[3]])

f_weights = weight_variable([shape[1] * shape[2] * depth, num_hidden])
f_biases = bias_variable([num_hidden])
f = tf.nn.sigmoid(tf.add(tf.matmul(cov_flat, f_weights),f_biases))

out_weights = weight_variable([num_hidden, num_labels])
out_biases = bias_variable([num_labels])
y_ = tf.nn.softmax(tf.matmul(f, out_weights) + out_biases)

In [77]:
cross_entropy = -tf.reduce_sum(Y * tf.log(y_))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_,1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [78]:
cost_history = np.empty(shape=[1],dtype=float)
with tf.Session() as session:
    tf.global_variables_initializer().run()

    for itr in range(training_iterations):    
        offset = (itr * batch_size) % (train_y.shape[0] - batch_size)
        batch_x = train_x[offset:(offset + batch_size), :, :, :]
        batch_y = train_y[offset:(offset + batch_size), :]
        
        _, c = session.run([optimizer, cross_entropy],feed_dict={X: batch_x, Y : batch_y})
        cost_history = np.append(cost_history,c)
        if itr%10 == 0:
            print(round(session.run(accuracy, feed_dict={X: test_x, Y: test_y}) , 3))
    
    print('Test accuracy: ',round(session.run(accuracy, feed_dict={X: test_x, Y: test_y}) , 3))
    fig = plt.figure(figsize=(15,10))
    plt.plot(cost_history)
    plt.axis([0,training_iterations,0,np.max(cost_history)])
    plt.show()

0.264
0.264


KeyboardInterrupt: 