In [2]:
import os
import mutagen
import mutagen.wave
import numpy as np
import pandas as pd
import librosa.display

In [3]:
def create_dataset_df(csv_file, audio_dir='UrbanSound8K/audio'):
    """Load the dataset metadata CSV and add the path of every audio clip.

    :param csv_file: path to the metadata CSV; it must provide the ``fold`` and
        ``slice_file_name`` columns
    :param audio_dir: directory that contains the ``fold<N>`` sub-directories
        (defaults to the location used throughout this notebook)
    :return: the metadata dataframe with an extra ``filepath`` column built as
        ``<audio_dir>/fold<fold>/<slice_file_name>``
    """
    dataset_df = pd.read_csv(csv_file)
    # Walk the two required columns directly: DataFrame.iterrows() materialises a
    # Series per row, which is slow and can change value dtypes along the way.
    dataset_df['filepath'] = [
        os.path.join(audio_dir, 'fold' + str(fold), file_name)
        for fold, file_name in zip(dataset_df['fold'], dataset_df['slice_file_name'])
    ]
    return dataset_df

def compute_audio_statistics(dataset_df):
    """Attach per-file audio properties to the dataset dataframe.

    For every entry of the ``filepath`` column the file metadata is read with
    ``get_audio_metadata_mutagen`` and stored in the new columns ``length``,
    ``bitrate``, ``channels``, ``sample_rate`` and ``bits_per_sample``.

    :param dataset_df: dataframe with a ``filepath`` column; it is modified in place
    :return: the same dataframe object, with the metadata columns added
    """
    columns = ('length', 'bitrate', 'channels', 'sample_rate', 'bits_per_sample')

    # Read the metadata of every file once, in dataframe order
    records = [get_audio_metadata_mutagen(path) for path in dataset_df['filepath']]

    # Gather all column values first so the dataframe is only touched once
    # every value has been collected
    extracted = {column: [record[column] for record in records] for column in columns}
    for column, values in extracted.items():
        dataset_df[column] = values

    return dataset_df

def get_audio_metadata_mutagen(filepath):
    """Read the stream properties of a WAVE file with mutagen.

    :param filepath: path of the WAVE file to inspect
    :return: dict holding the ``length``, ``bitrate``, ``channels``,
        ``sample_rate`` and ``bits_per_sample`` attributes of the file's
        ``info`` object
    """
    stream_info = mutagen.wave.WAVE(filepath).info
    fields = ('length', 'bitrate', 'channels', 'sample_rate', 'bits_per_sample')
    return {field: getattr(stream_info, field) for field in fields}

In [4]:
# Build the metadata table and enrich it with the audio properties of every file
dataset_df = compute_audio_statistics(
    create_dataset_df('UrbanSound8K/metadata/UrbanSound8K.csv')
)
dataset_df.describe()

Unnamed: 0,fsID,start,end,salience,fold,classID,length,bitrate,channels,sample_rate,bits_per_sample
count,8732.0,8732.0,8732.0,8732.0,8732.0,8732.0,8732.0,8732.0,8732.0,8732.0,8732.0
mean,116033.493816,38.645409,42.253312,1.347,5.385937,4.592877,3.603644,449531.1,1.915369,48456.979272,18.780119
std,57991.017218,74.292126,74.369669,0.476043,2.84682,2.894544,0.980913,548081.3,0.278348,15300.080707,4.227168
min,344.0,0.0,0.105962,1.0,1.0,0.0,0.000816,11025.0,1.0,8000.0,4.0
25%,69942.25,3.0,6.839398,1.0,3.0,2.0,4.0,352800.0,2.0,44100.0,16.0
50%,118279.0,10.376492,14.0,1.0,5.0,4.0,4.0,352800.0,2.0,44100.0,16.0
75%,166942.0,35.131372,38.866979,2.0,8.0,7.0,4.0,529200.0,2.0,48000.0,24.0
max,209992.0,600.125356,604.125356,2.0,10.0,9.0,4.0,45158400.0,2.0,192000.0,32.0


In [5]:
# Distribution of sampling rates and clip lengths, shown together as a tuple
tuple(dataset_df[column].value_counts() for column in ('sample_rate', 'length'))

(44100     5370
 48000     2502
 96000      610
 24000       82
 16000       45
 22050       44
 11025       39
 192000      17
 8000        12
 11024        7
 32000        4
 Name: sample_rate, dtype: int64,
 4.000000    7325
 1.440000      10
 1.330000      10
 1.090000       9
 1.020000       7
             ... 
 1.085397       1
 1.039546       1
 2.088503       1
 3.220000       1
 3.505986       1
 Name: length, Length: 973, dtype: int64)

As we can see, most recordings are 4 seconds long and were recorded at a sampling rate of 44.1 kHz.
To make the data easier to work with, we will use only those recordings.

## Data simplification
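Reduce the dataset to speed up and simplify the process: keep only the 4-second, 44.1 kHz recordings and drop the `gun_shot`, `car_horn`, `children_playing` and `street_music` classes.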

In [6]:
# Keep only 4-second clips sampled at 44.1 kHz and drop four of the classes
reduced_dataset_df = dataset_df[
    dataset_df["sample_rate"].eq(44100)
    & dataset_df["length"].eq(4.0)
    & ~dataset_df["class"].isin(["gun_shot", "car_horn", "children_playing", "street_music"])
]
reduced_dataset_df.describe()

Unnamed: 0,fsID,start,end,salience,fold,classID,length,bitrate,channels,sample_rate,bits_per_sample
count,3079.0,3079.0,3079.0,3079.0,3079.0,3079.0,3079.0,3079.0,3079.0,3079.0,3079.0
mean,107075.758363,41.469669,45.469669,1.314063,5.623904,4.238389,4.0,363613.738227,1.916856,44100.0,17.564144
std,54543.168993,77.985483,77.985483,0.464217,2.791456,2.769169,0.0,115129.056055,0.276144,0.0,3.821347
min,518.0,0.0,4.0,1.0,1.0,0.0,4.0,88200.0,1.0,44100.0,16.0
25%,62837.0,3.134062,7.134062,1.0,3.0,3.0,4.0,352800.0,2.0,44100.0,16.0
50%,102871.0,10.5,14.5,1.0,6.0,4.0,4.0,352800.0,2.0,44100.0,16.0
75%,156869.0,39.878609,43.878609,2.0,8.0,7.0,4.0,352800.0,2.0,44100.0,16.0
max,209864.0,534.628805,538.628805,2.0,10.0,8.0,4.0,705600.0,2.0,44100.0,32.0


In [7]:
# Summary statistics of every numeric column, computed separately for each class
reduced_dataset_df.groupby('class').describe()

Unnamed: 0_level_0,fsID,fsID,fsID,fsID,fsID,fsID,fsID,fsID,start,start,...,sample_rate,sample_rate,bits_per_sample,bits_per_sample,bits_per_sample,bits_per_sample,bits_per_sample,bits_per_sample,bits_per_sample,bits_per_sample
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
air_conditioner,674.0,118710.246291,54970.593077,13230.0,74507.0,134717.0,162103.0,204240.0,674.0,15.247326,...,44100.0,44100.0,674.0,18.872404,4.45933,16.0,16.0,16.0,24.0,32.0
dog_bark,435.0,83270.374713,49666.57385,7383.0,50223.0,72221.0,116483.0,207124.0,435.0,24.383341,...,44100.0,44100.0,435.0,17.195402,3.242286,16.0,16.0,16.0,16.0,32.0
drilling,515.0,106917.248544,55564.143122,518.0,58937.0,99192.0,166931.0,205610.0,515.0,38.262867,...,44100.0,44100.0,515.0,16.885437,2.51232,16.0,16.0,16.0,16.0,24.0
engine_idling,494.0,97593.639676,59899.387258,6988.0,39856.25,94632.0,152908.0,209864.0,494.0,17.967637,...,44100.0,44100.0,494.0,16.842105,2.457626,16.0,16.0,16.0,16.0,24.0
jackhammer,473.0,115350.369979,54206.656511,24728.0,62837.0,105029.0,165039.0,203929.0,473.0,124.597281,...,44100.0,44100.0,473.0,18.875264,5.855904,16.0,16.0,16.0,16.0,32.0
siren,488.0,113972.489754,42321.007181,30823.0,74726.0,107357.0,157866.0,208652.0,488.0,39.519991,...,44100.0,44100.0,488.0,16.262295,1.599811,16.0,16.0,16.0,16.0,32.0


## Feature Extraction
We will use the Mel spectrogram for audio classification.
The computed spectrum is already one-sided, so it does not contain duplicate information.

In [19]:
from tensorflow.keras.utils import Sequence

class MelDataGenerator(Sequence):
    """Keras ``Sequence`` that turns audio files into batches of mel-spectrogram features.

    Every item is a tuple ``(X, y)``: ``X`` has shape ``(batch_size, *output_size)``
    and ``y`` has shape ``(batch_size, 1)`` and holds the raw ``classID`` values of
    the dataframe (they are neither remapped nor one-hot encoded).
    """

    # STFT window length and hop length (in samples) used for every spectrogram
    N_FFT = 2048
    HOP_LENGTH = 512

    def __init__(self, dataframe, output_size, n_mels=128, shuffle=False, batch_size=10):
        """
        Initializes a data generator object
            :param dataframe: dataframe containing data such as class and file path
                (the ``filepath`` and ``classID`` columns are read)
            :param output_size: shape of a single sample after preprocessing
            :param n_mels: number of mel banks
            :param shuffle: shuffle the data after each epoch
            :param batch_size: The size of each batch returned by __getitem__
        """
        # Let the Keras base class set up its own state (newer Keras versions
        # warn when the parent initializer is skipped).
        super().__init__()
        self.df = dataframe
        self.output_size = output_size
        self.n_mels = n_mels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.on_epoch_end()

    def on_epoch_end(self):
        """Rebuild the sample order and reshuffle it when shuffling is enabled."""
        self.indices = np.arange(len(self.df))
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __len__(self):
        """Number of full batches; a trailing partial batch is dropped."""
        return len(self.df) // self.batch_size

    def __getitem__(self, idx):
        """
        Builds one batch
            :param idx: batch number, ``0 <= idx < len(self)``
            :return: tuple ``(X, y)`` of features and class ids
            :raises IndexError: if ``idx`` does not address a full batch
        """
        # Without this guard an out-of-range index would return the
        # uninitialised np.empty buffers below.
        if not 0 <= idx < len(self):
            raise IndexError(f"batch index {idx} out of range for {len(self)} batches")

        ## Initializing Batch
        X = np.empty((self.batch_size, *self.output_size))
        y = np.empty((self.batch_size, 1))

        # get the indices of the requested batch
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]

        for i, data_index in enumerate(batch_indices):
            sample = self.df.iloc[data_index]

            features = self.extract_features(sample["filepath"])

            X[i] = self._to_output_layout(features)
            y[i] = sample["classID"]

        return X, y

    def _to_output_layout(self, features):
        """Arrange channel-first features so that they match ``output_size``."""
        target_shape = tuple(self.output_size)
        if features.shape == target_shape:
            return features
        # Channel-last target: the channel axis has to be *moved*. A plain
        # reshape keeps the memory order and would mix values of both channels.
        channels_last = np.moveaxis(features, 0, -1)
        if channels_last.shape == target_shape:
            return channels_last
        # Any other layout: keep the historical behaviour (plain reshape).
        return np.reshape(features, target_shape)

    def extract_features(self, file_path):
        """
        Computes the features of one audio file
            :param file_path: path of the audio file
            :return: array of shape ``(2, n_mels, frames)``: dB-scaled mel
                spectrogram followed by the ``np.angle`` channel
        """
        # With no explicit ``sr`` librosa resamples to its default rate and mixes
        # down to mono, so the frame count depends on that rate, not on the file's.
        audio, sample_rate = librosa.load(file_path)
        mel = librosa.feature.melspectrogram(y=audio,
                                             sr=sample_rate,
                                             n_fft=self.N_FFT,
                                             hop_length=self.HOP_LENGTH,
                                             n_mels=self.n_mels)

        # melspectrogram yields a power spectrogram by default (power=2.0), so
        # the matching dB conversion is power_to_db, not amplitude_to_db.
        mel_db = librosa.power_to_db(mel)
        # NOTE(review): ``mel`` is real and non-negative, so this channel is
        # identically zero; it is kept only to preserve the 2-channel output
        # shape. Replace it with an informative feature if one is needed.
        angle_mel = np.angle(mel)

        return np.stack((mel_db, angle_mel))


In [24]:
# Use the public Keras API: the private ``tensorflow.python.keras`` namespace is
# not a stable interface and should not be mixed with the ``tensorflow.keras``
# objects used by the data generator.
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

n_mels = 128
# (mel bands, time frames, channels) of a single sample
features_size = (n_mels, 173, 2)
batch_size = 16
# Number of distinct classes kept in the reduced dataset
num_of_classes = 6

# The generator yields the raw integer ``classID`` values, and the classes that
# remain are not numbered contiguously (the ids reach 8). The softmax therefore
# needs one unit per possible id, otherwise the highest labels are out of range.
num_output_units = int(reduced_dataset_df["classID"].max()) + 1

# Creating CNN leNet based model
model = Sequential()

model.add(Conv2D(32, (5, 5), input_shape=features_size, activation='relu'))
model.add(Conv2D(32, (5, 5), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(16, (3, 3), activation='relu'))
model.add(Conv2D(16, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_output_units, activation='softmax'))

optimizer = Adam(learning_rate=0.001)

# Labels are integer class ids of shape (batch, 1), not one-hot vectors, so the
# sparse variant of the cross-entropy loss is the matching one.
model.compile(optimizer, loss='sparse_categorical_crossentropy', metrics=["accuracy"])
# summary() prints by itself and returns None, so it must not be wrapped in print()
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_17 (Conv2D)           (None, 124, 169, 32)      1632      
_________________________________________________________________
conv2d_18 (Conv2D)           (None, 120, 165, 32)      25632     
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 60, 82, 32)        0         
_________________________________________________________________
conv2d_19 (Conv2D)           (None, 58, 80, 16)        4624      
_________________________________________________________________
conv2d_20 (Conv2D)           (None, 56, 78, 16)        2320      
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 28, 39, 16)        0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 17472)            

In [27]:
# Fold-based split: folds below 9 for training, fold 9 for validation, fold 10 for testing
train_df = reduced_dataset_df[reduced_dataset_df["fold"].lt(9)]
valid_df = reduced_dataset_df[reduced_dataset_df["fold"].eq(9)]
test_df = reduced_dataset_df[reduced_dataset_df["fold"].eq(10)]

# Both generators share the same feature shape, mel resolution and batch size
train_gen = MelDataGenerator(train_df, features_size,
                             n_mels=n_mels, shuffle=True, batch_size=batch_size)

valid_gen = MelDataGenerator(valid_df, features_size,
                             n_mels=n_mels, shuffle=True, batch_size=batch_size)


In [29]:
# Train for 5 epochs on train_gen, using valid_gen as validation data
history = model.fit(train_gen, validation_data=valid_gen, epochs=5)

Epoch 1/5

KeyboardInterrupt: 