In [1]:
%matplotlib inline
import os
import pandas as pd
import glob
import numpy as np

In [2]:
import librosa
import librosa.display
import pylab
import matplotlib
import gc
import matplotlib.pyplot as plt

In [3]:
class Opts:
    """Paths and audio/feature settings shared by the cells of this notebook.

    Constructing an instance also creates the train/test image folders and
    the CSV folder when they do not exist yet.

    Raises:
        FileExistsError: if one of those paths exists but is not a directory.
    """

    def __init__(self):
        # Folders the spectrogram images are written to.
        self.train_data_path = "./working_v2/train/"
        self.test_data_path = "./working_v2/test/"
        # Folder holding the train/test metadata CSV files.
        self.csv_path = "./meta"
        # Root folder that contains the audio fold sub-folders.
        self.data_path = "./data/"
        # Destination folder and file name of the saved model.
        self.model_save_path = "./output/"
        self.model_name = "cnn_v2.h5"
        # Audio loading and mel-spectrogram parameters.
        self.sample_rate = 22050
        self.hop_length = 256
        self.win_length = 1024
        self.n_mels = 60
        self.n_fft = 1024  # 75% overlapping with hop_length=256

        # exist_ok=True avoids the check-then-create race of
        # `if not os.path.exists(...)` and makes re-instantiation a no-op.
        for directory in (self.train_data_path, self.test_data_path, self.csv_path):
            os.makedirs(directory, exist_ok=True)


In [4]:
# Maps the integer class id parsed from an audio file name to its class name.
# The list position is the class id.
CLASS_ID = dict(enumerate([
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]))

def _get_meta_info(filename):
    infos = filename.split('-')
    return [int(info) for info in infos]

In [5]:
def _wav_files(data_path, folds):
    """Collect the ``*.wav`` paths found in each fold folder under ``data_path``."""
    files = []
    for fold in folds:
        files += glob.glob(os.path.join(data_path, fold, "*.wav"))
    return files


def _audio_files_to_frame(file_paths):
    """Build the metadata frame (path, name, label, class name) for audio files."""
    labels, class_names, file_names = [], [], []
    for file_path in file_paths:
        # basename handles whichever separator glob produced on this platform.
        base_name = os.path.basename(file_path)
        # splitext drops only the extension; str.strip('.wav') would instead
        # strip any of the characters '.', 'w', 'a', 'v' from both ends.
        stem = os.path.splitext(base_name)[0]
        # The name is expected to hold four dash-separated integers, the
        # second of which is the class id.
        _, class_id, _, _ = _get_meta_info(stem)
        labels.append(class_id)
        class_names.append(CLASS_ID[class_id])
        # Everything before the first dot: the same rule gen_spectrogram_set
        # uses to name the image files, so the two stay in sync.
        file_names.append(base_name.split('.')[0])
    return pd.DataFrame({'file_paths': list(file_paths),
                         'file_names': file_names,
                         'labels': labels,
                         'class_names': class_names})


def input_to_target(opts, train_folds=("fold1", "fold2"), test_folds=("fold3",)):
    """Index the audio files and write the train/test metadata CSVs.

    Args:
        opts: configuration object providing ``data_path`` (root of the fold
            folders) and ``csv_path`` (folder the CSV files are written to).
        train_folds: fold folder names under ``opts.data_path`` used for training.
        test_folds: fold folder names under ``opts.data_path`` used for testing.

    Returns:
        ``(train_file_df, test_file_df)``; each frame has the columns
        ``file_paths``, ``file_names``, ``labels`` and ``class_names``.

    Raises:
        ValueError: if a file name does not consist of four dash-separated integers.
        KeyError: if a parsed class id is not a key of ``CLASS_ID``.

    Side effects:
        Writes ``train.csv`` and ``test.csv`` into ``opts.csv_path``.
    """
    train_file_df = _audio_files_to_frame(_wav_files(opts.data_path, train_folds))
    test_file_df = _audio_files_to_frame(_wav_files(opts.data_path, test_folds))

    # The frame index is written as the first CSV column (pandas default).
    train_file_df.to_csv(os.path.join(opts.csv_path, "train.csv"))
    test_file_df.to_csv(os.path.join(opts.csv_path, "test.csv"))

    return train_file_df, test_file_df

In [6]:
def create_spectrogram(file_path, title, opts, flag="train"):
    """Render the log-mel spectrogram of one audio file to a borderless JPEG.

    Args:
        file_path: path of the audio file to load.
        title: base name (no extension) of the image file to write.
        opts: configuration object providing ``sample_rate``, ``n_fft``,
            ``n_mels``, ``hop_length``, ``win_length`` and the
            ``train_data_path`` / ``test_data_path`` output folders.
        flag: ``"train"`` writes into ``opts.train_data_path``; any other
            value writes into ``opts.test_data_path``.

    Side effects:
        Switches matplotlib to non-interactive mode and writes
        ``<output folder>/<title>.jpg``.
    """
    plt.interactive(False)
    clip, _ = librosa.load(file_path, sr=opts.sample_rate)
    clip_normed = librosa.util.normalize(clip)

    mel_power = librosa.feature.melspectrogram(y=clip_normed, sr=opts.sample_rate,
                                               n_fft=opts.n_fft,
                                               n_mels=opts.n_mels,
                                               hop_length=opts.hop_length,
                                               win_length=opts.win_length)
    # dB scale relative to the loudest bin of this clip.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    out_dir = opts.train_data_path if flag == "train" else opts.test_data_path
    filename = os.path.join(out_dir, title + '.jpg')

    fig = plt.figure(figsize=[0.72, 0.72])
    try:
        ax = fig.add_subplot(111)  # single 1x1 subplot

        # Hide both axes and the frame so only the spectrogram is saved.
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        ax.set_frame_on(False)

        # Draw on and save this specific figure instead of relying on
        # pyplot's implicit "current" figure/axes.
        librosa.display.specshow(mel_db, ax=ax)
        fig.savefig(filename, dpi=400, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even when plotting or saving fails, and
        # close only this figure so unrelated open figures are left alone.
        plt.close(fig)

In [7]:
def gen_spectrogram_set(df, opts, flag="train"):
    """Write one spectrogram image for every audio file listed in ``df``.

    Args:
        df: frame with a ``file_paths`` column of audio file paths.
        opts: configuration object forwarded to ``create_spectrogram``.
        flag: forwarded to ``create_spectrogram`` to select the output folder.
    """
    for file_path in df['file_paths']:
        # basename copes with either path separator; the image is named after
        # the part of the file name before its first dot.
        title = os.path.basename(file_path).split('.')[0]
        create_spectrogram(file_path, title, opts, flag)

In [8]:
# Shared configuration object; constructing it also creates the output folders.
opts = Opts()

In [9]:

# Index the audio files (this also writes train.csv / test.csv), then render
# one spectrogram image per file into the train and test image folders.
train_file_df, test_file_df = input_to_target(opts)
gen_spectrogram_set(train_file_df, opts)
gen_spectrogram_set(test_file_df, opts, flag="test")

In [10]:
def df_from_csv(csv_path):
    """Load the CSV file at ``csv_path`` into a DataFrame using pandas defaults."""
    return pd.read_csv(csv_path)
# Locations of the metadata CSVs inside opts.csv_path.
train_csv_path = os.path.join(opts.csv_path, "train.csv")
test_csv_path = os.path.join(opts.csv_path, "test.csv")

# Rebinds train_file_df / test_file_df to frames read back from the CSV files.
train_file_df, test_file_df = df_from_csv(train_csv_path), df_from_csv(test_csv_path)

In [11]:
from keras_preprocessing.image import ImageDataGenerator
# Shared by the training and validation iterators: 15% of the rows are reserved
# for the "validation" subset.
data_gen = ImageDataGenerator(rescale=1./255., validation_split=0.15) # multiplies every pixel value by 1/255

def append_ext(fn):
    """Return ``fn`` with a ".jpg" extension, adding it only when it is missing.

    Leaving already-suffixed names untouched keeps the function idempotent,
    so applying it twice to the same column (e.g. when the cell is re-run)
    cannot produce ".jpg.jpg" names that match no image file.
    """
    return fn if fn.endswith(".jpg") else fn + ".jpg"
# Point file_names at the image files by passing every name through append_ext.
# NOTE: the column is overwritten in place, so re-running this cell passes the
# already-suffixed names through append_ext again.
train_file_df["file_names"] = train_file_df["file_names"].apply(append_ext)
test_file_df["file_names"] = test_file_df["file_names"].apply(append_ext)

# Keyword arguments common to both iterators; they differ only in `subset`,
# which selects the training or the validation share of train_file_df.
_flow_kwargs = dict(
    dataframe=train_file_df,
    directory=opts.train_data_path,
    x_col="file_names",
    y_col="class_names",
    batch_size=32,
    seed=42,
    shuffle=True,
    class_mode="categorical",
    target_size=(64, 64),
)

train_gen = data_gen.flow_from_dataframe(subset="training", **_flow_kwargs)

valid_gen = data_gen.flow_from_dataframe(subset="validation", **_flow_kwargs)

Found 1497 images belonging to 10 classes.
Found 264 images belonging to 10 classes.


In [12]:
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.models import Sequential, Model
from keras.models import load_model
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
from tensorflow.python.keras.utils.data_utils import Sequence

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [13]:
# Three conv stages (32/64 -> 64/64 -> 128/128 filters), each ending in
# 2x2 max-pooling and dropout, followed by a 512-unit dense layer and a
# 10-way softmax. The layers and their order are the same as before.
model = Sequential([
    # Stage 1 -- takes 64x64 RGB images.
    Conv2D(32, (3, 3), padding='same', input_shape=(64, 64, 3)),
    Activation('relu'),
    Conv2D(64, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Stage 2
    Conv2D(64, (3, 3), padding='same'),
    Activation('relu'),
    Conv2D(64, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.5),
    # Stage 3
    Conv2D(128, (3, 3), padding='same'),
    Activation('relu'),
    Conv2D(128, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.5),
    # Classifier head: one output per class.
    Flatten(),
    Dense(512),
    Activation('relu'),
    Dropout(0.5),
    Dense(10, activation='softmax'),
])
# Use the RMSprop class rather than the legacy lowercase `rmsprop` alias,
# which newer Keras releases no longer provide.
model.compile(optimizer=optimizers.RMSprop(lr=0.0005, decay=1e-6),
              loss="categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 64, 64, 32)        896       
_________________________________________________________________
activation_1 (Activation)    (None, 64, 64, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 62, 62, 64)        18496     
_________________________________________________________________
activation_2 (Activation)    (None, 62, 62, 64)        0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 31, 31, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 31, 31, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 31, 31, 64)        36928     
__________

In [14]:
# Fit the model on the training iterator and validate on the validation
# iterator; the test iterator is not used in this cell.
# Floor division: a final partial batch is not counted in the step totals.
STEP_SIZE_TRAIN=train_gen.n//train_gen.batch_size
STEP_SIZE_VALID=valid_gen.n//valid_gen.batch_size
model.fit_generator(generator=train_gen,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_gen,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=20
)
# Last expression of the cell: the validation loss followed by the compiled
# metric (accuracy) is displayed as the cell output.
model.evaluate_generator(generator=valid_gen, steps=STEP_SIZE_VALID
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.9427346475422382, 0.79296875]

In [15]:
# Test images get the same 1/255 rescaling as the training images, without a
# validation split.
test_data_gen = ImageDataGenerator(rescale=1./255.)
# shuffle=False keeps the batches in dataframe order so that predictions can
# be matched against test_gen.classes row by row.
# NOTE(review): the class-name -> index mapping is derived from this frame;
# it matches the training mapping only when both contain the same classes.
test_gen = test_data_gen.flow_from_dataframe(
    dataframe=test_file_df,
    directory=opts.test_data_path,
    x_col="file_names",
    y_col="class_names",
    batch_size=32,
    seed=42,
    shuffle=False,
    class_mode="categorical",
    target_size=(64, 64))
# Ceiling division: the final partial batch must be included, otherwise the
# last `n % batch_size` images are never predicted and the prediction array
# is shorter than test_gen.classes.
STEP_SIZE_TEST = (test_gen.n + test_gen.batch_size - 1) // test_gen.batch_size
test_gen.reset()  # resets batch index to 0

Found 925 images belonging to 10 classes.


In [16]:
# Display the integer class index stored for every test image (the full list).
test_gen.classes

[9,
 1,
 5,
 5,
 3,
 6,
 2,
 5,
 9,
 3,
 7,
 4,
 1,
 4,
 9,
 7,
 7,
 9,
 6,
 4,
 5,
 5,
 0,
 8,
 8,
 8,
 0,
 3,
 4,
 7,
 7,
 7,
 2,
 0,
 8,
 8,
 8,
 0,
 5,
 4,
 5,
 7,
 7,
 3,
 4,
 0,
 4,
 0,
 5,
 4,
 9,
 4,
 5,
 5,
 3,
 1,
 5,
 2,
 3,
 5,
 5,
 3,
 4,
 3,
 4,
 5,
 4,
 5,
 4,
 0,
 2,
 7,
 8,
 8,
 8,
 7,
 0,
 3,
 5,
 7,
 7,
 0,
 8,
 8,
 6,
 8,
 8,
 7,
 0,
 0,
 4,
 5,
 5,
 4,
 4,
 0,
 4,
 0,
 6,
 5,
 9,
 4,
 2,
 4,
 2,
 1,
 2,
 5,
 6,
 7,
 5,
 3,
 9,
 9,
 4,
 2,
 4,
 5,
 4,
 5,
 2,
 7,
 4,
 2,
 4,
 5,
 3,
 7,
 8,
 3,
 4,
 0,
 1,
 5,
 5,
 8,
 8,
 0,
 3,
 8,
 8,
 3,
 0,
 8,
 8,
 2,
 5,
 0,
 6,
 3,
 8,
 9,
 6,
 4,
 3,
 5,
 4,
 0,
 0,
 6,
 4,
 8,
 2,
 4,
 5,
 4,
 9,
 3,
 7,
 5,
 1,
 4,
 2,
 4,
 1,
 9,
 5,
 7,
 9,
 6,
 0,
 9,
 5,
 5,
 4,
 4,
 9,
 4,
 3,
 4,
 9,
 5,
 3,
 4,
 0,
 0,
 5,
 0,
 3,
 8,
 0,
 2,
 6,
 8,
 3,
 7,
 7,
 1,
 7,
 5,
 7,
 4,
 8,
 7,
 0,
 8,
 5,
 3,
 4,
 3,
 5,
 3,
 2,
 4,
 3,
 0,
 4,
 2,
 5,
 5,
 2,
 3,
 4,
 2,
 9,
 7,
 5,
 6,
 2,
 8,
 2,
 7,
 9,
 8,
 9,
 7,
 5,
 8,
 6,
 7,


In [17]:
# Rewind the iterator right before predicting so that the batches start at
# the first test row, also when this cell is re-run.
test_gen.reset()
pred = model.predict_generator(test_gen,
                               steps=STEP_SIZE_TEST,
                               verbose=1)
# Row-wise argmax turns the softmax outputs into predicted class indices.
predicted_class_indices = np.argmax(pred, axis=1)
predicted_class_indices
# To turn the indices back into class names, invert the training mapping:
# labels = (train_gen.class_indices)
# labels = dict((v,k) for k,v in labels.items())
# predictions = [labels[k] for k in predicted_class_indices]
# print(predictions[0:6])




array([3, 1, 5, 9, 3, 3, 3, 5, 9, 2, 4, 0, 4, 4, 9, 9, 9, 9, 6, 0, 2, 5,
       8, 8, 8, 8, 7, 3, 4, 9, 4, 7, 2, 7, 9, 8, 8, 8, 5, 7, 2, 4, 1, 3,
       4, 3, 0, 0, 5, 7, 4, 4, 9, 5, 3, 1, 2, 2, 3, 5, 5, 2, 0, 9, 4, 2,
       0, 5, 4, 8, 2, 5, 8, 8, 8, 4, 7, 3, 2, 9, 9, 7, 9, 8, 6, 8, 8, 9,
       8, 8, 4, 5, 2, 0, 4, 3, 0, 3, 6, 9, 2, 7, 3, 4, 3, 1, 2, 5, 6, 7,
       3, 2, 2, 3, 3, 3, 4, 2, 7, 5, 3, 2, 4, 2, 0, 2, 8, 7, 8, 2, 4, 8,
       4, 5, 2, 8, 8, 7, 9, 8, 8, 2, 7, 8, 1, 6, 5, 8, 9, 2, 8, 1, 6, 7,
       8, 2, 0, 0, 3, 6, 4, 8, 3, 5, 9, 7, 9, 2, 5, 2, 1, 7, 2, 4, 1, 9,
       3, 4, 2, 6, 5, 2, 9, 9, 7, 7, 2, 4, 3, 0, 9, 2, 8, 4, 8, 8, 5, 9,
       3, 1, 7, 2, 6, 8, 2, 9, 0, 1, 0, 2, 9, 4, 8, 7, 7, 1, 5, 3, 4, 8,
       2, 3, 9, 0, 3, 3, 4, 8, 9, 2, 2, 3, 4, 2, 3, 4, 2, 6, 2, 8, 2, 1,
       7, 8, 2, 9, 2, 8, 1, 4, 4, 0, 9, 2, 8, 8, 2, 7, 7, 2, 4, 3, 3, 7,
       9, 7, 4, 2, 7, 2, 4, 0, 8, 1, 8, 7, 7, 7, 7, 9, 2, 3, 3, 8, 9, 9,
       9, 2, 8, 2, 4, 0, 9, 8, 3, 2, 7, 8, 4, 6, 5,

In [18]:
# Displays only the repr of the test iterator object.
test_gen

<keras_preprocessing.image.dataframe_iterator.DataFrameIterator at 0x1c36767f28>

In [19]:
# Displays only the repr of the training iterator object.
train_gen

<keras_preprocessing.image.dataframe_iterator.DataFrameIterator at 0x10c2c5080>

In [20]:
# makedirs(exist_ok=True) also creates missing parent folders and is a no-op
# when the folder already exists, so no separate existence check is needed.
os.makedirs(opts.model_save_path, exist_ok=True)
model.save(os.path.join(opts.model_save_path, opts.model_name))

In [21]:
# Rebind `model` to the copy loaded back from the saved model file.
model = load_model(os.path.join(opts.model_save_path, opts.model_name))

In [22]:
import coremltools
# Convert the Keras model to Core ML with an image input named "image" and a
# class-probability output named "probability".
# image_scale=1/255 makes the converted model apply the same pixel rescaling
# the Keras generators applied during training (rescale=1./255.); without it
# the Core ML model would be fed raw 0-255 pixel values.
# NOTE(review): "labels.txt" is not created by this notebook -- confirm it
# lists the ten class names in the generators' class-index order.
coreml_model = coremltools.converters.keras.convert(model,
                                                    input_names="image",
                                                    output_names="probability",
                                                    class_labels="labels.txt",
                                                    image_input_names="image",
                                                    image_scale=1. / 255.)
coreml_model.author = "Candy Dong"
coreml_model.short_description = "model for sound recognition"
coreml_model.save(os.path.join(opts.model_save_path, "CNN_v2.mlmodel"))


0 : conv2d_1_input, <keras.engine.input_layer.InputLayer object at 0x1c36f3ca90>
1 : conv2d_1, <keras.layers.convolutional.Conv2D object at 0x1c36f3ccf8>
2 : activation_1, <keras.layers.core.Activation object at 0x1c36f3cda0>
3 : conv2d_2, <keras.layers.convolutional.Conv2D object at 0x1c36f3ce80>
4 : activation_2, <keras.layers.core.Activation object at 0x1c36789208>
5 : max_pooling2d_1, <keras.layers.pooling.MaxPooling2D object at 0x1c36789fd0>
6 : conv2d_3, <keras.layers.convolutional.Conv2D object at 0x1c36f60e80>
7 : activation_3, <keras.layers.core.Activation object at 0x1c36789908>
8 : conv2d_4, <keras.layers.convolutional.Conv2D object at 0x1c36ee1a90>
9 : activation_4, <keras.layers.core.Activation object at 0x1c36efeb00>
10 : max_pooling2d_2, <keras.layers.pooling.MaxPooling2D object at 0x1c36efe1d0>
11 : conv2d_5, <keras.layers.convolutional.Conv2D object at 0x1c36951eb8>
12 : activation_5, <keras.layers.core.Activation object at 0x1c36963940>
13 : conv2d_6, <keras.layers.co

In [23]:
# Print the layer-by-layer summary of the model currently bound to `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 64, 64, 32)        896       
_________________________________________________________________
activation_1 (Activation)    (None, 64, 64, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 62, 62, 64)        18496     
_________________________________________________________________
activation_2 (Activation)    (None, 62, 62, 64)        0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 31, 31, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 31, 31, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 31, 31, 64)        36928     
__________