In [None]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
from tensorflow import keras
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Flatten, Dropout, Activation
from tensorflow.keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tqdm
import sklearn
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
import pandas as pd
import librosa
import warnings
warnings.filterwarnings("ignore") # Ignore All Warnings

In [None]:
# Root directory under which load_audio_data looks for the `audio_dataset`
# folder (split CSV files plus one sub-folder of audio clips per split).
working_dir = "/kaggle/input/audiodata/"

def load_audio_data(split, random_state=None):
    """Load one dataset split and turn every clip into MFCC and mel features.

    Reads ``<working_dir>audio_dataset/<split>.csv``, loads each file named in
    its ``Name`` column from ``<working_dir>audio_dataset/<split>/``, forces
    the clip to exactly four seconds at 44.1 kHz (short clips are looped, long
    clips are truncated) and extracts two feature vectors per clip.

    Both feature matrices are averaged over axis 0, i.e. across the MFCC
    coefficients / mel bands, which leaves one value per analysis frame.

    Parameters
    ----------
    split : str
        Name of the split; selects both the CSV file and the audio sub-folder.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``sklearn.utils.shuffle``. The default ``None`` keeps the
        non-deterministic row order; pass an int for a reproducible shuffle.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(mfcc_frame, mel_frame)``. Each has one row per clip, the feature
        values in the leading columns and the label (second CSV column) as
        the last column. Rows are shuffled, independently for each frame.

    Raises
    ------
    ValueError
        If an audio file decodes to zero samples (it cannot be looped to the
        target length).
    """
    target_sr = 22050 * 2
    clip_seconds = 4
    num_samples = clip_seconds * target_sr

    # Tolerate ``working_dir`` with or without a trailing slash so the CSV and
    # the audio files are always resolved against the same root.
    dataset_root = working_dir.rstrip("/") + "/audio_dataset"

    mylist = pd.read_csv(f"{dataset_root}/{split}.csv")
    # Positional index so the labels line up with the feature rows below.
    labels = mylist.iloc[:, 1].reset_index(drop=True)

    # Collect features in plain lists; appending to a DataFrame row by row
    # re-allocates on every insert.
    mfcc_rows = []
    mel_rows = []

    for _, row in tqdm.tqdm(mylist.iterrows(), total=mylist.shape[0]):
        audio_path = f"{dataset_root}/{split}/" + row["Name"]
        signal, sample_rate = librosa.load(
            audio_path,
            mono=True,
            duration=4.5,
            sr=target_sr,
        )

        if len(signal) == 0:
            raise ValueError(f"Audio file contains no samples: {audio_path}")

        # Loop short clips until they cover the target length ...
        if len(signal) < num_samples:
            num_repeats = (num_samples // len(signal)) + 1
            signal = np.tile(signal, num_repeats)
        # ... then cut everything to exactly ``num_samples`` samples.
        signal = signal[:num_samples]

        # Mean over axis 0 collapses the 13 coefficients into one value per frame.
        mfcc_rows.append(
            np.mean(
                librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13),
                axis=0,
            )
        )

        # Same reduction for the mel spectrogram (mean over the mel bands).
        mel_rows.append(
            np.mean(
                librosa.feature.melspectrogram(y=signal, sr=sample_rate),
                axis=0,
            )
        )

    df_mfcc_labels = pd.concat([pd.DataFrame(mfcc_rows), labels], axis=1)
    df_mel_labels = pd.concat([pd.DataFrame(mel_rows), labels], axis=1)

    df_mfcc_labels = sklearn.utils.shuffle(df_mfcc_labels, random_state=random_state)
    df_mel_labels = sklearn.utils.shuffle(df_mel_labels, random_state=random_state)

    # Guard against ragged rows: any missing cell becomes 0.
    df_mfcc_labels = df_mfcc_labels.fillna(0)
    df_mel_labels = df_mel_labels.fillna(0)

    return df_mfcc_labels, df_mel_labels

In [None]:
# Extract MFCC and mel features for the "train" split and for the "val" split;
# the "val" frames are kept under the test_* names used by the following cells.
train_mfcc, train_mel = load_audio_data("train")
test_mfcc, test_mel = load_audio_data("val")

In [None]:
# Display the mel-feature frame returned for the "val" split.
test_mel

In [None]:
# Split every frame into its feature columns (all but the last) and the
# label column (the last one, kept as a single-column DataFrame).
train_mfcc_features = train_mfcc.iloc[:, :-1]
train_mfcc_labels = train_mfcc.iloc[:, -1:]

test_mfcc_features = test_mfcc.iloc[:, :-1]
test_mfcc_labels = test_mfcc.iloc[:, -1:]

train_mel_features = train_mel.iloc[:, :-1]
train_mel_labels = train_mel.iloc[:, -1:]

test_mel_features = test_mel.iloc[:, :-1]
test_mel_labels = test_mel.iloc[:, -1:]

lb = LabelEncoder()

# Fit the encoder on the TRAINING labels only and reuse that mapping for the
# validation labels. Re-fitting on the validation labels would assign indices
# from whatever classes happen to be present there, so the one-hot columns
# could mean different classes (or have a different width) than in training.
# `transform` raises ValueError if validation contains a label unseen in
# training, which is the desired signal. Labels are flattened to 1-D because
# LabelEncoder expects a 1-D array, not a column vector.
y_train_mfcc = to_categorical(lb.fit_transform(train_mfcc_labels.values.ravel()))
y_test_mfcc = to_categorical(
    lb.transform(test_mfcc_labels.values.ravel()),
    num_classes=len(lb.classes_),
)

y_train_mel = to_categorical(lb.fit_transform(train_mel_labels.values.ravel()))
y_test_mel = to_categorical(
    lb.transform(test_mel_labels.values.ravel()),
    num_classes=len(lb.classes_),
)

# Conv1D expects (samples, steps, channels): add a trailing channel axis.
x_train_mfcc = np.expand_dims(np.array(train_mfcc_features), axis=2)
x_test_mfcc = np.expand_dims(np.array(test_mfcc_features), axis=2)

x_train_mel = np.expand_dims(np.array(train_mel_features), axis=2)
x_test_mel = np.expand_dims(np.array(test_mel_features), axis=2)

In [33]:
def _build_conv1d_classifier(input_shape, num_classes):
    """Return the (uncompiled) Conv1D classifier used for both feature types.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, ``(steps, channels)``.
    num_classes : int
        Width of the final softmax layer.

    Returns
    -------
    tensorflow.keras.models.Sequential
        Six Conv1D/ReLU stages with one max-pooling step and two dropout
        layers, followed by Flatten -> Dense -> softmax.
    """
    return Sequential([
        Input(shape=input_shape),
        Conv1D(256, 5, padding='same'),
        Activation('relu'),
        Conv1D(128, 5, padding='same'),
        Activation('relu'),
        Dropout(0.1),
        MaxPooling1D(pool_size=8),
        Conv1D(128, 5, padding='same'),
        Activation('relu'),
        Conv1D(128, 5, padding='same'),
        Activation('relu'),
        Conv1D(128, 5, padding='same'),
        Activation('relu'),
        Dropout(0.2),
        Conv1D(128, 5, padding='same'),
        Activation('relu'),
        Flatten(),
        Dense(num_classes),
        Activation('softmax'),
    ])


# Shapes come from the prepared arrays instead of hard-coded literals, so the
# networks keep matching the data if the clip length or class set changes.
# Model for MFCC
model_mfcc = _build_conv1d_classifier(x_train_mfcc.shape[1:], y_train_mfcc.shape[1])
# Model for Mel Spectrogram
model_mel = _build_conv1d_classifier(x_train_mel.shape[1:], y_train_mel.shape[1])

# No `decay` argument: current Keras optimizers either ignore it with a
# warning or reject it outright, so it never had an effect worth keeping.
opt_mfcc = keras.optimizers.RMSprop(learning_rate=0.00001)
opt_mel = keras.optimizers.RMSprop(learning_rate=0.00001)

# Compile models
model_mfcc.compile(loss='categorical_crossentropy', optimizer=opt_mfcc, metrics=['accuracy'])
model_mel.compile(loss='categorical_crossentropy', optimizer=opt_mel, metrics=['accuracy'])

# Fit models
cnnhistory_mfcc = model_mfcc.fit(
    x_train_mfcc, y_train_mfcc,
    batch_size=128, epochs=300,
    validation_data=(x_test_mfcc, y_test_mfcc),
)
cnnhistory_mel = model_mel.fit(
    x_train_mel, y_train_mel,
    batch_size=128, epochs=300,
    validation_data=(x_test_mel, y_test_mel),
)

Epoch 1/300
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 128ms/step - accuracy: 0.1402 - loss: 2.5295 - val_accuracy: 0.1290 - val_loss: 2.4637
Epoch 2/300
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.1523 - loss: 2.4695 - val_accuracy: 0.1935 - val_loss: 2.4358
Epoch 3/300
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.1886 - loss: 2.4283 - val_accuracy: 0.2448 - val_loss: 2.4069
Epoch 4/300
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.2196 - loss: 2.4070 - val_accuracy: 0.2415 - val_loss: 2.3786
Epoch 5/300
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.2225 - loss: 2.3784 - val_accuracy: 0.2672 - val_loss: 2.3431
Epoch 6/300
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.2447 - loss: 2.3458 - val_accuracy: 0.2696 - val_loss: 2.3061
Epoch 7/300
[1m38/38[0m 

In [34]:
# Score the MFCC model on the held-out arrays; Keras returns the loss followed
# by the compiled metrics (accuracy here).
model_mfcc.evaluate(x_test_mfcc, y_test_mfcc)

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6026 - loss: 1.1256


[1.150840163230896, 0.5996691584587097]

In [35]:
# Score the mel-spectrogram model on the held-out arrays; Keras returns the
# loss followed by the compiled metrics (accuracy here).
model_mel.evaluate(x_test_mel, y_test_mel)

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5862 - loss: 1.4769


[1.4774755239486694, 0.5814722776412964]