In [None]:
import librosa
from tqdm import tqdm
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
import os

In [None]:
# Working directory that holds the Raga_2.0 dataset and the test recording.
# The previous hard-coded location stays the default; set RAGA_WORKDIR to run
# the notebook on another machine without editing this cell.
os.chdir(os.environ.get('RAGA_WORKDIR', '/Users/macbook/Desktop/'))

In [None]:
# Collect the audio files of each class. glob() returns paths in arbitrary,
# filesystem-dependent order, so sort them to keep positional lookups and the
# sample order stable across runs and machines.
bhoop_sm = sorted(glob('./Raga_2.0/Bhoop/*'))
malhr_sm = sorted(glob('./Raga_2.0/Malhar/*'))
ngtiv_ng = sorted(glob('./Raga_2.0/Negative/*'))

In [None]:
#malhr_fm = glob('./dataset/Malhar/Female/*')
#malhr_ml = glob('./dataset/Malhar/Male/*')
#malhr_ng = glob('./dataset/Malhar/Negative/*')

In [None]:
# Working aliases for the three per-class file lists.
bhoop, malhr, ngtiv = bhoop_sm, malhr_sm, ngtiv_ng

In [None]:
# Report how many files were found for each class.
print("Size of Classes:")
for class_label, class_files in (("Bhoop:", bhoop), ("Malhar:", malhr), ("negative:", ngtiv)):
    print(class_label, len(class_files))

# Data Pre-Processing

In [None]:
# Audio / spectrogram settings shared by the feature-extraction cells.
# NOTE(review): N_MELS, F_MIN and F_MAX are declared here but are not passed
# to the melspectrogram calls in this notebook - confirm whether they should be.
LENGTH = 5       # clip length in seconds
SR = 44100       # sample rate handed to librosa.load
N_FFT = 2048     # FFT window width
HOP_LG = 512     # hop length between FFT frames
N_MELS = 128     # number of mel filters
F_MIN = 20       # minimum frequency
F_MAX = 8000     # maximum frequency

In [None]:
def get_mel_features(file_name):
    """
    Return the normalised log-mel spectrogram of a fixed-length clip.

    The file is loaded at SR, forced to exactly LENGTH seconds (zero-padded
    at the front when shorter, truncated when longer), converted to a mel
    spectrogram in dB and standardised with its own mean and standard
    deviation.

    Parameters
    ----------
    file_name : str
        Path of the audio file to load.

    Returns
    -------
    numpy.ndarray
        2-D spectrogram as produced by librosa.feature.melspectrogram
        (mel bands first, frames second), in dB and standardised.
    """
    import warnings

    # Suppress warnings only while loading/transforming this file. A bare
    # warnings.filterwarnings('ignore') would silence every warning for the
    # rest of the kernel session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # open file
        wav, sr = librosa.load(file_name, sr=SR)

        # fix length
        target_len = LENGTH * sr
        if len(wav) < target_len:
            # pad at the beginning, so the audio sits at the end of the clip
            wav = np.pad(wav, pad_width=(target_len - len(wav), 0))
        else:
            wav = wav[:target_len]

        # get mel spectrogram; the signal and sample rate are passed by
        # keyword because librosa >= 0.10 no longer accepts them positionally
        mel_spgr = librosa.feature.melspectrogram(
            y=wav,
            sr=sr,
            n_fft=N_FFT,
            hop_length=HOP_LG,
        )
        mel_spgr = librosa.power_to_db(mel_spgr)

    # normalise; eps (10e-6, i.e. 1e-5) avoids division by zero when the
    # spectrogram is constant
    eps = 10e-6
    mean = mel_spgr.mean()
    std = mel_spgr.std()
    mel_spgr = (mel_spgr - mean) / (std + eps)

    return mel_spgr

In [None]:
# Smoke-test the feature extractor on a single Bhoop file.
# NOTE(review): index 51 raises IndexError when fewer than 52 Bhoop files
# were found, and the cell displays the array itself - confirm this is intended.
get_mel_features(bhoop[51])

In [None]:
# supervised learning X, y 
# [[1, 2, 3], ...] -> [label, ..]

In [None]:
def get_data(file_list, target):
    """
    Build the feature and label arrays for one class.

    Each path in ``file_list`` is converted with ``get_mel_features`` (a tqdm
    progress bar tracks the loop) and paired with ``target``.

    Returns a tuple ``(X_data, y_data)`` of numpy arrays with one entry per
    file.
    """
    features = [get_mel_features(path) for path in tqdm(file_list)]
    labels = [target] * len(features)
    return np.array(features), np.array(labels)

In [None]:
# The class order fixes both the one-hot position and the index -> name lookup.
_class_order = ('ngtiv', 'bhoop', 'malhr')

# class name -> one-hot target vector
label_names = {
    name: np.array([int(pos == idx) for pos in range(len(_class_order))])
    for idx, name in enumerate(_class_order)
}

# predicted (argmax) index -> class name
label_indx2name = dict(enumerate(_class_order))

In [None]:
# Extract features and one-hot labels class by class.
x1, y1 = get_data(file_list=bhoop, target=label_names['bhoop'])
x2, y2 = get_data(file_list=malhr, target=label_names['malhr'])
x3, y3 = get_data(file_list=ngtiv, target=label_names['ngtiv'])

In [None]:
# Stack the per-class arrays along the first (sample) axis, keeping the
# bhoop -> malhr -> ngtiv order in both features and labels.
X_data_dash = np.concatenate([x1, x2, x3])
y_data = np.concatenate([y1, y2, y3])

In [None]:
# Swap the last two axes so that time is the leading axis after the batch axis.
X_data = np.transpose(X_data_dash, axes=(0, 2, 1))
X_data.shape

In [None]:
# Sanity check: shape of the stacked label array.
y_data.shape

# Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Input, Dense, GlobalMaxPool1D
from tensorflow.keras.optimizers import Adam

In [None]:
# Layers of the classifier, created up front and assembled into the model later.
input_ = Input(shape=X_data.shape[1:])  # per-sample shape, batch axis dropped
lstm = LSTM(
    units=20,
    return_sequences=True,  # keep every time step for the pooling layer
    dropout=0.2,
    recurrent_dropout=0.1,
)
maxPool = GlobalMaxPool1D()
dense = Dense(units=3, activation='softmax')  # one output per class

In [None]:
# Start from an empty Sequential container; the layers are added afterwards.
model = Sequential()

In [None]:
# Stack the layers in order: input -> LSTM -> global max pooling -> softmax.
for layer in (input_, lstm, maxPool, dense):
    model.add(layer)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Configure training for the one-hot targets.
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
model.compile(
    loss='CategoricalCrossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [None]:
# Keras takes the validation split from the *last* 20% of the arrays, before
# any shuffling. X_data / y_data are concatenated class by class, so without a
# permutation the validation set would come only from the tail of the data
# and would not contain every class. Shuffle once, with a fixed seed so the
# split is reproducible.
shuffle_idx = np.random.default_rng(seed=42).permutation(len(X_data))

history = model.fit(
    X_data[shuffle_idx], y_data[shuffle_idx],
    batch_size=8,
    epochs=5,
    validation_split=0.2,
)

In [None]:
# Forward pass over the whole feature array in a single call.
# NOTE(review): calling the model directly processes all samples as one batch;
# model.predict(X_data) may be preferable for larger datasets - confirm.
y_pred = model(X_data)

In [None]:
# Re-check the shape of the feature array fed to the model.
X_data.shape

In [None]:
def get_features(wav, sr):
    """
    Return the normalised log-mel spectrogram of an in-memory audio signal.

    Applies the same mel spectrogram -> dB -> standardisation steps as
    ``get_mel_features``, but takes an already loaded signal and does not
    pad or truncate it.

    Parameters
    ----------
    wav : numpy.ndarray
        Audio samples, e.g. as returned by ``librosa.load``.
    sr : int
        Sample rate of ``wav``.

    Returns
    -------
    numpy.ndarray
        2-D spectrogram as produced by librosa.feature.melspectrogram
        (mel bands first, frames second), in dB and standardised.
    """
    # The signal and sample rate are passed by keyword because
    # librosa >= 0.10 no longer accepts them positionally.
    mel_spgr = librosa.feature.melspectrogram(
        y=wav,
        sr=sr,
        n_fft=N_FFT,
        hop_length=HOP_LG,
    )
    mel_spgr = librosa.power_to_db(mel_spgr)

    # normalise; eps (10e-6, i.e. 1e-5) avoids division by zero when the
    # spectrogram is constant
    eps = 10e-6
    mean = mel_spgr.mean()
    std = mel_spgr.std()
    mel_spgr = (mel_spgr - mean) / (std + eps)

    return mel_spgr

In [None]:
# Recording used for the inference demo (relative to the working directory).
test_file = './Bhoop_mk.mp3'

In [None]:
# Load the test recording at the same sample rate (SR) used for training.
wav, sr = librosa.load(test_file, sr=SR)

In [None]:
# Sliding-window inference over the test recording.
# Each window is LENGTH seconds long, so it yields the same number of frames
# as the clips the model was trained on (the model's Input shape is fixed to
# that size). Only complete windows are scored: a fixed iteration count would
# run past the end of shorter recordings and hand empty or partial slices to
# the feature extractor.
win_len = LENGTH * sr
for i in range(len(wav) // win_len):
    wav_win = wav[i * win_len : (i + 1) * win_len]
    features = get_features(wav_win, sr).T   # time axis first, as in training
    features = np.array([features])          # add the batch axis
    output_tensor = model(features)
    max_index = np.argmax(output_tensor.numpy(), axis=1)
    print(LENGTH*i,'\t->\t',LENGTH*(i+1),'\t', label_indx2name[max_index[0]])