In this project I build a binary classifier for audio files. The dataset contains recordings of the
words "barbie" and "puppy" in a 39/39 proportion. Because this is quite a small dataset, I decided to
apply audio augmentation. For classification, I created spectrograms of the audio files and trained a small CNN model.

In [5]:
import os
import tensorflow as tf
import matplotlib
import keras.backend as K
import numpy as np
from thinkdsp import read_wave, Wave
from PIL import Image
matplotlib.style.use('ggplot')

Note!
I've changed some code in the thinkdsp module: I added a plot_and_save function to the Spectrogram class.

In [None]:
# Create the folder layout used by the rest of the notebook.
# `exist_ok=True` makes the cell safe to re-run and, unlike one try/except
# wrapped around a chain of os.mkdir calls, still creates whichever folders
# are missing when only some of them already exist. Real failures (for
# example a permission error) are no longer reported as "already exist".
# ./audio_files/n0 and ./audio_files/n1 are included because the
# augmentation and copy cells write into them.
for split in ("audio_files", "training", "validation"):
    for label in ("n0", "n1"):
        os.makedirs(f"./{split}/{label}", exist_ok=True)
print("Done!")

As our dataset is quite small, we need to perform data augmentation. The function make_augmentation adds
a noise signal to each original wave file and saves the result.
The function takes several parameters:
signal: noise to add
folder: folder of data to augment (barbie or puppy)
paths: names of the audio files inside that folder
index: number of the augmentation; it is used as a prefix when naming the output files
direct: directory to save files
n: 0/1, depending on barbie or puppy
amp: amplitude of noise

In [6]:
# File names of the original recordings, one listing per word folder.
barbie_paths, puppy_paths = (
    os.listdir(f"./barbie_vs_puppy/{word}") for word in ("barbie", "puppy")
)

def make_augmentation(signal, folder, paths, index, direct, n, amp,
                      seg_duration=2, n_samples=96000):
    """Write a noisy copy of every clip listed in `paths`.

    Each clip is read from ./barbie_vs_puppy/{folder}/, cut to a window of
    `seg_duration` seconds (centred when the clip is longer than that),
    zero-padded up to `n_samples` samples when it is shorter, summed with a
    noise wave produced by `signal`, and written to
    ./{direct}/n{n}/{index}{path}.

    Parameters:
        signal: object whose make_wave(duration=..., framerate=...) result
            is used as the noise (the notebook passes thinkdsp noise signals).
        folder: sub-folder of ./barbie_vs_puppy to read from.
        paths: iterable of file names inside that folder.
        index: augmentation number, used as a prefix of the output file name.
        direct: directory (relative to the working directory) to write into.
        n: class label used for the n{n} output sub-folder.
        amp: amplitude the noise wave is normalized to.
        seg_duration: length in seconds of the window cut from each clip.
        n_samples: minimum number of samples per clip; shorter clips are
            zero-padded. The default equals the previously hard-coded 96000.
    """
    for path in paths:
        audio = read_wave(f'./barbie_vs_puppy/{folder}/{path}')
        dur = audio.duration
        # Long clips: take the middle window. Short clips: start at 0.
        offset = (dur - seg_duration) / 2 if dur > seg_duration else 0
        segment = audio.segment(offset, seg_duration)

        ys = segment.ys
        missing = n_samples - len(ys)
        if missing > 0:
            # Pad only when the clip is short. With a plain "!=" test a
            # segment longer than n_samples reached np.zeros with a negative
            # size, which raises ValueError.
            ys = np.concatenate((ys, np.zeros((missing,))))
        clean = Wave(ys=ys, framerate=segment.framerate)

        noise = signal.make_wave(duration=clean.duration,
                                 framerate=clean.framerate)
        noise.normalize(amp=amp)
        noisy = Wave(ys=clean.ys + noise.ys, framerate=segment.framerate)

        # The bytes exposed by make_audio().data are written unchanged.
        with open(f'./{direct}/n{n}/{index}{path}', 'wb') as f:
            f.write(noisy.make_audio().data)


In [None]:
from thinkdsp import UncorrelatedGaussianNoise, BrownianNoise, UncorrelatedUniformNoise

ugn = UncorrelatedGaussianNoise()
bn = BrownianNoise()
uun = UncorrelatedUniformNoise()

# One (noise, amplitude) pair per augmentation. The position in this list is
# the augmentation index 0..8: Gaussian 0-2, Brownian 3-5, uniform 6-8.
augmentation_plan = [
    (noise, level)
    for noise in (ugn, bn, uun)
    for level in (0.08, 0.15, 0.2)
]

# Apply the full plan to "barbie" (label 0) first, then to "puppy" (label 1).
for word, word_paths, label in (
    ("barbie", barbie_paths, 0),
    ("puppy", puppy_paths, 1),
):
    for index, (noise, amp) in enumerate(augmentation_plan):
        make_augmentation(noise, word, word_paths, index, "audio_files", label, amp)

Copying original files to augmentation folder

In [None]:
import shutil

# Put the untouched recordings next to their augmented copies:
# barbie -> audio_files/n0, puppy -> audio_files/n1.
for label, (word, names) in enumerate(
    (("barbie", barbie_paths), ("puppy", puppy_paths))
):
    for elem in names:
        shutil.copy(f"./barbie_vs_puppy/{word}/{elem}", f"./audio_files/n{label}/{elem}")

Now let's create a spectrogram of each file.
The ThinkDSP file was changed for that purpose: a plot_and_save function was added to the Spectrogram class.
Creating and saving the spectrograms is a time-consuming process; it may take about 1 hour, depending on
the configuration of the machine.

In [7]:
import time
def create_images(path, n):
    """Render a spectrogram PNG for every audio file of class `n`.

    For each file name in `path`, the audio is read from
    audio_files/n{n}/ and its spectrogram is saved as
    ./training/n{n}/<name>.png. Files whose PNG is already present in
    ./training/n{n} are skipped. After each file the elapsed time and the
    running index are printed.

    Parameters:
        path: iterable of audio file names (not full paths).
        n: class label selecting the n{n} sub-folders.

    NOTE(review): only ./training/n{n} is checked, so images that the split
    cell has already moved to ./validation/n{n} would be rendered again if
    this is re-run after the split.
    """
    # A set makes the "already rendered?" check O(1) per file.
    existing = set(os.listdir(f"./training/n{n}"))
    for i, elem in enumerate(path):
        start = time.time()
        # splitext removes only the final extension, so a name such as
        # "clip.v2.wav" maps to "clip.v2.png" rather than "clip.png".
        elem_name = os.path.splitext(elem)[0]
        if f"{elem_name}.png" not in existing:
            wave = read_wave(f"audio_files/n{n}/{elem}")
            # Re-wrap the samples in a fresh Wave before building the
            # spectrogram. NOTE(review): probably redundant; confirm against
            # the modified thinkdsp before dropping it.
            new_audio = Wave(ys=wave.ys, framerate=wave.framerate)
            new_audio.make_spectrogram(256).plot_and_save(
                path=f"./training/n{n}/{elem_name}.png"
            )
        print(time.time() - start, i)

In [11]:
# List both class folders first, then render spectrograms for label 0
# (barbie) followed by label 1 (puppy).
barbie_audio_paths = os.listdir("./audio_files/n0")
puppy_audio_paths = os.listdir("./audio_files/n1")
for label, audio_names in enumerate((barbie_audio_paths, puppy_audio_paths)):
    create_images(audio_names, label)


0.0 0
0.0 1
0.0 2
0.0 3
0.0 4
0.0 5
0.0 6
0.0 7
0.0 8
0.0 9
0.0 10
0.0 11
0.0 12
0.0 13
0.0 14
0.0 15
0.0 16
0.0 17
0.0 18
0.0 19
0.0 20
0.0 21
0.0 22
0.0 23
0.0 24
0.0 25
0.0 26
0.0 27
0.0 28
0.0 29
0.0 30
0.0 31
0.0 32
0.0 33
0.0 34
0.0 35
0.0 36
0.0 37
0.0 38
0.0 39
0.0 40
0.0 41
0.0 42
0.0 43
0.0 44
0.0 45
0.0 46
0.0 47
0.0 48
0.0 49
0.0 50
0.0 51
0.0 52
0.0 53
0.0 54
0.0 55
0.0 56
0.0 57
0.0 58
0.0 59
0.0 60
0.0 61
0.0 62
0.0 63
0.0 64
0.0 65
0.0 66
0.0 67
0.0 68
0.0 69
0.0 70
0.0 71
0.0 72
0.0 73
0.0 74
0.0 75
0.0 76
0.0 77
0.0 78
0.0 79
0.0 80
0.0 81
0.0 82
0.0 83
0.0 84
0.0 85
0.0 86
0.0 87
0.0 88
0.0 89
0.0 90
0.0 91
0.0 92
0.0 93
0.0 94
0.0 95
0.0 96
0.0 97
0.0 98
0.0 99
0.0 100
0.0 101
0.0 102
0.0 103
0.0 104
0.0 105
0.0 106
0.0 107
0.0 108
0.0 109
0.0 110
0.0 111
0.0 112
0.0 113
0.0 114
0.0 115
0.0 116
0.0 117
0.0 118
0.0 119
0.0 120
0.0 121
0.0 122
0.0 123
0.0 124
0.0 125
0.0 126
0.0 127
0.0 128
0.0 129
0.0 130
0.0 131
0.0 132
First print 0.015625476837158203
Print after s

Randomly splitting the data into training and validation datasets in an 80/20 proportion.

In [14]:
import random

# Fraction of each class moved from ./training to ./validation.
VALID_FRACTION = 0.2
# A private, seeded generator makes the split reproducible without touching
# the global `random` state.
split_rng = random.Random(42)

# os.listdir returns names in arbitrary order; sorting first means the same
# seed always selects the same files.
barbie = sorted(os.listdir("./training/n0"))
puppy = sorted(os.listdir("./training/n1"))

if os.listdir("./validation/n0") or os.listdir("./validation/n1"):
    # Re-running the cell would otherwise move a further 20 % of what is
    # left in ./training and silently skew the 80/20 proportion.
    print("Validation folders are not empty - split skipped.")
    files_barbie, files_puppy = [], []
else:
    files_barbie = split_rng.sample(barbie, int(len(barbie) * VALID_FRACTION))
    files_puppy = split_rng.sample(puppy, int(len(puppy) * VALID_FRACTION))

for elem in files_barbie:
    shutil.move(f"./training/n0/{elem}", f"./validation/n0/{elem}")
for elem in files_puppy:
    shutil.move(f"./training/n1/{elem}", f"./validation/n1/{elem}")

Preprocessing the images before passing them to the Keras CNN: they are simply resized to (224, 224).
The process takes ~100 seconds.

In [28]:
barbie_train = os.listdir("./training/n0")
puppy_train = os.listdir("./training/n1")
barbie_val = os.listdir("./validation/n0")
puppy_val = os.listdir("./validation/n1")

# Resize every spectrogram in place to 224x224, one folder at a time, and
# print the cumulative elapsed time after each folder.
start = time.time()
for folder, names in (
    ("./training/n0", barbie_train),
    ("./training/n1", puppy_train),
    ("./validation/n0", barbie_val),
    ("./validation/n1", puppy_val),
):
    for elem in names:
        target = f"{folder}/{elem}"
        img = Image.open(target).resize((224,224))
        img.save(target)
    print(time.time()-start)

50.882320404052734
79.73747110366821
89.74546885490417
103.28654456138611


Creating the CNN, training it and then saving the weights. The model is a simple CNN with 3 convolutional layers.

In [29]:
# Size passed as `target_size` to flow_from_directory; it matches the
# 224x224 size the spectrogram images were resized to.
IMAGE_SHAPE = (224, 224)
# Root folders handed to flow_from_directory; each holds the n0/ and n1/
# class sub-folders.
TRAINING_DATA_DIR = 'training/'
VALID_DATA_DIR = 'validation/'

In [37]:
# A single generator rescales pixel values by 1/255 for both datasets.
datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

# Training batches are shuffled; validation batches keep directory order.
train_generator, valid_generator = (
    datagen.flow_from_directory(
        data_dir,
        shuffle=shuffled,
        target_size=IMAGE_SHAPE,
    )
    for data_dir, shuffled in ((TRAINING_DATA_DIR, True), (VALID_DATA_DIR, False))
)

Found 624 images belonging to 2 classes.
Found 156 images belonging to 2 classes.


In [38]:
def build_model(num_classes, input_shape=(224, 224, 3)):
    """Build the small CNN used for spectrogram classification.

    The network is three Conv2D(3x3, relu) + MaxPooling2D(2x2, stride 2)
    stages with 16, 16 and 32 filters, followed by Flatten, Dense(32, relu)
    and a softmax Dense layer with `num_classes` units.

    Parameters:
        num_classes: number of units in the softmax output layer.
        input_shape: shape given to the first convolution as `input_shape`.
            The default is the previously hard-coded (224, 224, 3).

    Returns:
        An uncompiled tf.keras.Sequential model.
    """
    layers = tf.keras.layers
    return tf.keras.Sequential([
        layers.Conv2D(filters=16, kernel_size=(3, 3), activation='relu',
                      input_shape=input_shape),
        layers.MaxPooling2D(pool_size=(2, 2), strides=2),
        layers.Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2), strides=2),
        layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2), strides=2),
        layers.Flatten(),
        layers.Dense(32, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ])


model = build_model(num_classes=2)

In [39]:
def get_f1(y_true, y_pred):
    """Return an F1 score computed from the given label and prediction tensors.

    Values are clipped to [0, 1] and rounded before being summed, so each
    entry counts as 0 or 1. K.epsilon() is added to every denominator to
    avoid division by zero.
    """
    def _count(values):
        # Sum of the entries after clipping to [0, 1] and rounding.
        return K.sum(K.round(K.clip(values, 0, 1)))

    hits = _count(y_true * y_pred)
    actual = _count(y_true)
    predicted = _count(y_pred)

    precision = hits / (predicted + K.epsilon())
    recall = hits / (actual + K.epsilon())
    return 2*(precision*recall)/(precision+recall+K.epsilon())

In [40]:
# Adam with learning rate 0.001, categorical cross-entropy loss, and the
# custom get_f1 function as the only reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[get_f1]
)
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_9 (Conv2D)           (None, 222, 222, 16)      448       
                                                                 
 max_pooling2d_9 (MaxPooling  (None, 111, 111, 16)     0         
 2D)                                                             
                                                                 
 conv2d_10 (Conv2D)          (None, 109, 109, 16)      2320      
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 54, 54, 16)       0         
 g2D)                                                            
                                                                 
 conv2d_11 (Conv2D)          (None, 52, 52, 32)        4640      
                                                                 
 max_pooling2d_11 (MaxPoolin  (None, 26, 26, 32)      

In [41]:
EPOCHS = 2
# Read the batch size back from the generator instead of hard-coding it.
# flow_from_directory was called without `batch_size`, so the generators use
# their own default; the earlier constant of 16 did not describe them and had
# to be compensated with an unexplained "// 2" in the step counts.
BATCH_SIZE = train_generator.batch_size
history = model.fit(
    train_generator,
    # Full batches only; max() keeps at least one step on a tiny dataset.
    steps_per_epoch=max(1, train_generator.samples // BATCH_SIZE),
    epochs=EPOCHS,
    validation_data=valid_generator,
    # len() counts the trailing partial batch too, so every validation image
    # is scored instead of dropping the last few.
    validation_steps=len(valid_generator),
    verbose=1,
)

Epoch 1/2
Epoch 2/2


In [42]:
# Save the trained weights; the string is passed to save_weights as the
# target path. NOTE(review): the on-disk format and whether a file
# extension is required depend on the installed Keras version - confirm.
model.save_weights('barbie-puppyRecognition')