In [1]:
# Basic Libraries
import tensorflow as tf
import pandas as pd
import numpy as np
import scipy.signal

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


In [2]:
# Libraries for Classification and building Models
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical 
from keras.preprocessing.image import ImageDataGenerator
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [3]:
# Project Specific Libraries

import os
import librosa
import librosa.display
import glob 

###  MFCC

In [4]:
# Extracting MFCC's For every audio file
import pandas as pd
import os
import librosa

audio_dataset_path='audio/'
metadata=pd.read_csv('metadata/ff1010bird_metadata_2018.csv')
metadata.head()

Unnamed: 0,itemid,datasetid,fold,hasbird
0,55.wav,ff1010bird,1,0
1,87.wav,ff1010bird,1,0
2,99.wav,ff1010bird,1,0
3,100.wav,ff1010bird,1,1
4,104.wav,ff1010bird,1,0


In [5]:
N_MELS = 40
FFT_POINTS = 882 * 2
SR = 44100
HAMMING_SIGNAL = scipy.signal.hamming
FRAMES = 500


In [6]:
# creating a function that extracts and lable the file 
# normalising the audio 

def feature_extract(audio_dataset_path, data_fold, data_id, data_labels):
    feature_file = []
    label_file = []

    for i in range(len(data_id)):
        #file_name = os.path.join(os.path.abspath(audio_dataset_path), 'fold'+str(row["fold"])+'/',str(row["itemid"]))
        file_name = audio_dataset_path + 'fold' + str(data_fold[i]) + '/' + str(data_id[i])
        [wave, _] = librosa.core.load(file_name ,sr=SR)  # read wav file (fs = 44.1 kHz)
        wave = librosa.stft(wave, FFT_POINTS, win_length=FFT_POINTS,
                            window=HAMMING_SIGNAL)  # STFT computation (fft_points = 882*2, overlap= 50%, analysis_window=40ms)
        wave = np.abs(wave) ** 2
        spectrogram = librosa.feature.melspectrogram(S=wave, n_mels=N_MELS)  # mel bands (40)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
        norm_spectrogram = spectrogram - np.amin(spectrogram)
        norm_spectrogram = norm_spectrogram / float(np.amax(norm_spectrogram))

        if int(norm_spectrogram.shape[1]) < FRAMES:  # 10 sec samples gives 500 frames
            z_pad = np.zeros((N_MELS, FRAMES))
            z_pad[:, :-(FRAMES - norm_spectrogram.shape[1])] = norm_spectrogram
            feature_file.append(z_pad)
        else:
            img = norm_spectrogram[:, np.r_[0:FRAMES]]  # final_shape = 40*500
            feature_file.append(img)

        if len(data_labels) > 0:
            label_file.append(data_labels[i])

    feature_file = np.array(feature_file)
    feature_file = np.reshape(feature_file, (len(data_id), N_MELS, FRAMES, 1))

    if len(data_labels) > 0:  # In case of training data
        label_file = np.array(label_file)
        return feature_file, label_file
    else:  # In case of testing data
        return feature_file

In [7]:
#load the extracted feautures
X= np.load('features_data_X.npy')
y= np.load('features_data_y.npy')

In [8]:
X.shape

(15690, 40, 500, 1)

In [9]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
y=to_categorical(labelencoder.fit_transform(y))

In [10]:
y.shape

(15690, 2)

### Splitting dataset 

In [11]:
#splitting the dataset to training and validation
X_trainset, X_val, y_trainset, y_val = train_test_split(X, y, random_state = 1)

In [12]:
X_trainset.shape

(11767, 40, 500, 1)

In [13]:
X_val.shape

(3923, 40, 500, 1)

In [14]:
# splitting the training dataset to obtain some test set for the testing phase
X_train, X_test, y_train, y_test = train_test_split(X_trainset, y_trainset, random_state = 1)

In [15]:
X_train.shape

(8825, 40, 500, 1)

In [16]:
X_test.shape

(2942, 40, 500, 1)

In [17]:
X_val.shape

(3923, 40, 500, 1)

In [18]:
input_dim = (40, 500, 1)

### CNN Model

In [19]:
model = Sequential()
model.add(Conv2D(48, kernel_size=3, activation='relu', input_shape=input_dim))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Conv2D(48, kernel_size=5, strides=2, padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))

model.add(Conv2D(96, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Conv2D(96, kernel_size=5, strides=2, padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Flatten())

model.add(Dense(384, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Dense(2, activation='sigmoid'))

In [20]:
optimizer = tf.keras.optimizers.Adam(lr=3e-4)
model.compile(optimizer = optimizer, loss = 'categorical_crossentropy', metrics = ['accuracy'])



In [21]:
model.load_weights('saved_models/audio_classification2.hdf5')

In [22]:
model.predict(X_test[:10], verbose=2)

1/1 - 0s


array([[0.7232486 , 0.39200416],
       [0.40062755, 0.50363076],
       [0.14526865, 0.8324777 ],
       [0.78451556, 0.55488473],
       [0.998427  , 0.79706776],
       [0.9064108 , 0.08116361],
       [0.07993132, 0.67066914],
       [0.6509493 , 0.10312608],
       [0.12525183, 0.793732  ],
       [0.27268595, 0.70655274]], dtype=float32)