In [8]:
import librosa 
import os
import librosa  # for sound processing.
import numpy as np
import pandas as pd
from keras.utils import np_utils
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [81]:
def calculate_mel_spec(audio, transpose=True, expand_dims=(0,)):
    """
    Calculate a mal spectrogram from raw audio waveform
    Note: The parameters of the spectrograms are in the config.py file.
    Args:
        audio : numpy.array, raw waveform to compute the spectrogram

    Returns:
        numpy.array
        containing the mel spectrogram
    """
    # Compute spectrogram
    ham_win = np.hamming(n_window)
    # audio, _ = librosa.load(filename, sr=sample_rate)
    #     print(type(audio))

    audio = np.array(audio)
    spec = librosa.stft(
        audio,
        n_fft=n_window,
        hop_length=hop_length,
        window=ham_win,
        center=True,
        pad_mode='reflect'
    )

    mel_spec = librosa.feature.melspectrogram(
        S=np.abs(spec),  # amplitude, for energy: spec**2 but don't forget to change amplitude_to_db.
        sr=sr,
        n_mels=n_mels,
        fmin=f_min, fmax=f_max,
        htk=False, norm=None)

    if save_log_feature:
        mel_spec = librosa.amplitude_to_db(mel_spec)  # 10 * log10(S**2 / ref), ref default is 1

    if transpose:
        mel_spec = mel_spec.T

    if len(expand_dims) > 0:
        mel_spec = np.expand_dims(mel_spec, axis=expand_dims)

    return mel_spec


def pad_zeros(array, size, axis):
    shape = list(array.shape)

    if shape[axis] > size:
        shape[axis] = size
        shape = tuple(slice(0, i) for i in shape)
        return array[shape]

    shape[axis] = size - shape[axis]
    return np.concatenate((array, np.zeros(shape)), axis=axis)


def train(cfg):
    csv_path = data_path + "/metadata/UrbanSound8K.csv"
    data = pd.read_csv(csv_path)
    train_idx, test_idx = train_test_split(data.index, test_size=0.3, stratify=data['class'])
    valid_idx, test_idx = train_test_split(test_idx, test_size=0.33, stratify=data.loc[test_idx]['class'])

    # Split data on train / valid / test
    data.at[train_idx, 'split'] = 'train'
    data.at[valid_idx, 'split'] = 'valid'
    data.at[test_idx, 'split'] = 'test'

    if '.' in csv_path:
        data.to_csv(csv_path[:-4] + '_split' + '.csv')
    else:
        data.to_csv(csv_path + '_split' + '.csv')

    # Read audio files and generate features
    if 'fold' in data.columns and \
            sum([a.startswith("fold") for a in next(os.walk(data_path))[1]]) > 2:
        data['audio'] = data[['slice_file_name', 'fold']].apply(lambda x: librosa.load(data_path + "/fold{}/".format(x.fold) + x.slice_file_name, sr=sr), axis=1)

    else:
        data['audio'] = data['slice_file_name'].apply(lambda x: librosa.load(data_path + '/' + x, sr=sr))

    data['features'] = data['audio'].apply(calculate_mel_spec)

    # Pad features to the same shape
    if pad_zero is None:
        pad_size = data['features'].apply(lambda x: x.shape[1]).max()
    else:
        pad_size = pad_size

    data['features'] = data['features'].apply(lambda x: pad_zeros(x, pad_size, axis=1))
    print("Here")
    X_train, X_valid, X_test = (np.array(data[data.split == sp]['features'].tolist())
                                for sp in ['train', 'valid', 'test'])

    lb = LabelEncoder().fit(classes)
    y_train, y_valid, y_test = (
        np_utils.to_categorical(lb.transform(np.array(data[data.split == sp]['class'].tolist())))
        for sp in ['train', 'valid', 'test'])

    # Train model
    model = CNN()
    model.compile(optimizers.RMSprop(lr=0.0005, decay=1e-6), loss="categorical_crossentropy", metrics=["accuracy"])
    model.summary()
    model.fit(X_train, y_train, batch_size=64, epochs=150, validation_data=(X_valid, y_valid))

    if save_path:
        model.save(save_path)

    # Test model quality
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=lb.classes_))


def eval(cfg):
    if os.path.exists(cfg.data_path[:-4] + '_split_' + '.csv'):
        data = pd.read_csv(cfg.data_path[:-4] + '_split_' + '.csv')
        data = data[data.split == 'test']
    elif os.path.exists(cfg.data_path + '_split_' + '.csv'):
        data = pd.read_csv(cfg.data_path + '_split_' + '.csv')
        data = data[data.split == 'test']
    else:
        data = pd.read_csv(cfg.data_path)

    # Read audio files and generate features
    if 'fold' in data.columns() and \
            sum([a.startswith("fold") for a in next(os.walk("UrbanSound8K"))[1]]) > 2:
        data['audio'] = data[['slice_file_name', 'fold']].apply(
            lambda x: librosa.load("UrbanSound8K/" + "fold{}/".format(x.fold) + x.slice_file_name, sr=cfg.sr))

    else:
        data['audio'] = data['slice_file_name'].apply(lambda x: librosa.load(x, sr=cfg.sr))

    data['features'] = data['audio'].apply(calculate_mel_spec)
    X = np.array(data['features'].apply(lambda x: pad_zeros(x, pad_size, axis=1)).tolist())

    lb = LabelEncoder().fit(cfg.classes)
    y = np_utils.to_categorical(lb.transform(np.array(data['class'].tolist())))

    model = keras.load(cfg.model_path)
    print(classification_report(y, model.predict(X), target_names=lb.classes_))


def predict(cfg, return_audio=False):
    audio, _ = librosa.load(cfg.filename, sr=cfg.sr)
    features = calculate_mel_spec(audio)
    features = pad_zeros(features, cfg.pad_size, cfg.pad_axis)
    model = keras.load(cfg.model_path)
    pred_label = lb.inverse_transform([np.argmax(model(features))])
    if return_audio:
        return pred_label, audio
    else:
        return pred_label

In [39]:
import math


data_path = "UrbanSound8K"
sr = 44100
model_path = ''

n_window = 2048
hop_length = 511
n_mels = 64
max_len_seconds = 10.
max_frames = math.ceil(max_len_seconds * sr / hop_length)
pooling_time_ratio = 8

f_min = 0.
f_max = 22050.
save_log_feature = False

classes = ['air_conditioner', 'car_horn', 'children_playing', 'dog_bark', 'drilling',
           'engine_idling', 'gun_shot', 'jackhammer', 'siren', 'street_music']

In [30]:
data = pd.read_csv("UrbanSound8K/metadata/UrbanSound8K.csv")

Available objects for config:
     AliasManager
     DisplayFormatter
     HistoryManager
     IPCompleter
     IPKernelApp
     LoggingMagics
     MagicsManager
     OSMagics
     PrefilterManager
     ScriptMagics
     StoreMagics
     ZMQInteractiveShell


In [43]:
train(cfg)

KeyboardInterrupt: 

In [82]:
csv_path = data_path + "/metadata/UrbanSound8K.csv"
data = pd.read_csv(csv_path)
train_idx, test_idx = train_test_split(data.index, test_size=0.3, stratify=data['class'])
valid_idx, test_idx = train_test_split(test_idx, test_size=0.33, stratify=data.loc[test_idx]['class'])

# Split data on train / valid / test
data.at[train_idx, 'split'] = 'train'
data.at[valid_idx, 'split'] = 'valid'
data.at[test_idx, 'split'] = 'test'

if '.' in csv_path:
    data.to_csv(csv_path[:-4] + '_split' + '.csv')
else:
    data.to_csv(csv_path + '_split' + '.csv')

In [83]:
data = data.iloc[:100]

In [84]:
 # Read audio files and generate features
if 'fold' in data.columns and \
        sum([a.startswith("fold") for a in next(os.walk(data_path))[1]]) > 2:
    data['audio'] = data[['slice_file_name', 'fold']].apply(lambda x: librosa.load(data_path + "/fold{}/".format(x.fold) + x.slice_file_name, sr=sr)[0], axis=1)

else:
    data['audio'] = data['slice_file_name'].apply(lambda x: librosa.load(data_path + '/' + x, sr=sr)[0])

data['features'] = data['audio'].apply(calculate_mel_spec)

In [92]:
data['features'].apply(lambda x:x.shape)

0     (1, 346, 64)
1     (1, 346, 64)
2     (1, 346, 64)
3     (1, 346, 64)
4     (1, 346, 64)
          ...     
95    (1, 346, 64)
96    (1, 346, 64)
97    (1, 346, 64)
98    (1, 346, 64)
99    (1, 346, 64)
Name: features, Length: 100, dtype: object

In [87]:
pad_size = None

In [113]:
# Pad features to the same shape
if pad_size is None:
    pad_size = data['features'].apply(lambda x: x.shape[1]).max()
else:
    pad_size = pad_size

data['features'] = data['features'].apply(lambda x: pad_zeros(x, pad_size, axis=1))

X_train, X_valid, X_test = (np.array(data[data.split == sp]['features'].tolist())
                            for sp in ['train', 'valid', 'test'])

lb = LabelEncoder().fit(classes)
y_train, y_valid, y_test = (
    np_utils.to_categorical(lb.transform(data[data.split == sp]['class'].tolist()), num_classes=len(lb.classes_))
    for sp in ['train', 'valid', 'test'])

In [114]:
y_train, y_valid, y_test

(array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],


In [118]:
X_train.shape[1:]

(1, 346, 64)

In [120]:
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
model = CNN(X_train.shape[1:])
model.compile(optimizers.RMSprop(lr=0.0005, decay=1e-6), loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()
model.fit(X_train, y_train, batch_size=64, epochs=150, validation_data=(X_valid, y_valid))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 1, 346, 64)        36928     
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 1, 86, 64)         0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 1, 86, 32)         18464     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 1, 43, 32)         0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 1, 43, 32)         0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 1, 43, 16)         4624      
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 1, 21, 16)        

Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150


Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<tensorflow.python.keras.callbacks.History at 0x227b8fecb50>

In [116]:
def CNN(input_shape):
    from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
    from keras.models import Sequential, Model
    from keras.layers import Conv2D, MaxPooling2D
    from keras import regularizers, optimizers

    model = Sequential()
    model.add(Conv2D(64, (3, 3), padding='same',
                     input_shape=input_shape, activation='relu'))
    model.add(MaxPooling2D(pool_size=(1, 4)))
    model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(1, 2)))
    model.add(Dropout(0.25))
    model.add(Conv2D(16, (3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(1, 2)))
    model.add(Conv2D(4, (3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(1, 2)))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(10, activation='softmax'))
    return model

In [140]:
# if save_path:
#     with open(save_path + "/model.json", "w") as json_file:
#         json_file.write(model.to_json())
#     model.save_weights(save_path + "/model.h5")

    # json_file = open('model.json', 'r')
    # loaded_model_json = json_file.read()

# Test model quality
y_pred = np.argmax(model.predict(X_test), axis=-1)
print(classification_report(np.argmax(model.predict(X_test), axis=-1), y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         1

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



In [134]:
np.argmax(model.predict(X_test), axis=-1)

array([2, 3, 0, 0, 0, 9], dtype=int64)

In [138]:
np.argmax(model.predict(X_test), axis=-1), y_pred

(array([2, 3, 0, 0, 0, 9], dtype=int64),
 array([2, 3, 0, 0, 0, 9], dtype=int64))