In [1]:
import os
from keras.models import Sequential,Model
from keras.layers import Conv2D,MaxPool2D,GlobalMaxPool2D,Flatten,Dense,Dropout,Input,Lambda,BatchNormalization
from keras.callbacks import ModelCheckpoint,EarlyStopping, ReduceLROnPlateau
import keras.backend as K
import librosa
import numpy as np
import random
import string
import matplotlib.pyplot as plt
import librosa.display
from sklearn.utils import shuffle
import cv2

2023-04-04 23:53:30.774154: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#convert song to mel spectogram as siamese network doesn't work on sound directly
def create_spectrogram(clip,sample_rate,save_path):
    plt.interactive(False)
    fig=plt.figure(figsize=[0.72,0.72])
    ax=fig.add_subplot(111)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    ax.set_frame_on(False)
    S=librosa.feature.melspectrogram(y=clip,sr=sample_rate)
    librosa.display.specshow(librosa.power_to_db(S,ref=np.max))
    fig.savefig(save_path,dpi=400,bbox_inches='tight',pad_inches=0)
    plt.close()
    fig.clf()
    plt.close(fig)
    plt.close('all')
    del save_path,clip,sample_rate,fig,ax,S

In [3]:
#encoder of siamese network
# def get_encoder(input_size):
#     #convolutional neural network layers
#     model=Sequential()
#     model.add(Conv2D(32,(3,3),input_shape=(150,150,3),activation='relu'))
#     model.add(BatchNormalization())
#     model.add(Dropout(0.5))
#     model.add(Conv2D(64,(3,3),activation='relu'))
#     model.add(MaxPool2D(2,2))
#     model.add(Dropout(0.5))

#     model.add(Conv2D(64,(3,3),activation='relu'))
#     model.add(BatchNormalization())
#     model.add(Dropout(0.5))
#     model.add(Conv2D(64,(3,3),activation='relu'))
#     model.add(MaxPool2D(2,2))
#     model.add(Dropout(0.5))


#     model.add(GlobalMaxPool2D())

#     return model

In [4]:
from keras.applications import VGG16
def get_encoder(input_size):
    # Use VGG16 as the encoder
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=input_size)
    for layer in base_model.layers:
        layer.trainable = False
    x = base_model.output
    x = GlobalMaxPool2D()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    encoder = Model(inputs=base_model.input, outputs=x)
    return encoder

In [5]:
def get_siamese_network(encoder,input_size):
    
    #define tensors of 2 input
    input1=Input(input_size)
    input2=Input(input_size)

    #generate encoding i.e (feature vector) of the 2 imgs
    encoder_l=encoder(input1)
    encoder_r=encoder(input2)
    
    #add customized layer to compute absolute diff between encoding
    L1_layer = Lambda(lambda tensors:K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoder_l, encoder_r])
        
    # Add a dense layer with a sigmoid unit to generate the similarity score
    output=Dense(1,activation='sigmoid')(L1_distance)
    
    #connect inputs with output
    siam_model=Model(inputs=[input1,input2],outputs=output)
    return siam_model

def accuracy(y_true, y_pred):
    """
    Custom metric function to calculate accuracy.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    false_negatives = K.sum(K.round(K.clip(y_true * (1-y_pred), 0, 1)))
    true_negatives = K.sum(K.round(K.clip((1-y_true) * (1-y_pred), 0, 1)))
    false_positives = K.sum(K.round(K.clip((1-y_true) * y_pred, 0, 1)))

    accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives + K.epsilon())
    return accuracy

encoder=get_encoder((150,150,3))
siamese_net=get_siamese_network(encoder,(150,150,3))
siamese_net.compile(loss='binary_crossentropy',optimizer='adam',metrics=[accuracy])

2023-04-04 23:53:42.451264: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from keras.preprocessing.image import ImageDataGenerator
# Use data augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)

In [7]:
#tar= target var
def different_label_index(X):
    idx1=0
    idx2=0
    while idx1==idx2:
        idx1=np.random.randint(0,len(X))
        idx2=np.random.randint(0,len(X))
    return idx1,idx2

def load_img(path):
    img=cv2.imread(path)
    img=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    img=cv2.resize(img,(150,150))
    return img


def batch_generator(X,batch_size):
    while True:
        data=[np.zeros((batch_size,150,150,3)) for i in range(2)]
        tar=[np.zeros(batch_size,)]

        #Generating same pairs.
        for i in range(0,batch_size//2):
            idx1=np.random.randint(0,len(X))
            img1=load_img(X[idx1])
            img1=img1/255

            data[0][i,:,:,:]=img1
            data[1][i,:,:,:]=img1
            tar[0][i]=1

        #Generating different pairs.
        for k in range(batch_size//2,batch_size):
            idx1,idx2=different_label_index(X)
            img1=load_img(X[idx1])
            img1=img1/255
            img2=load_img(X[idx2])
            img2=img2/255

            data[0][k,:,:,:]=img1
            data[1][k,:,:,:]=img2
            tar[0][k]=0
        np.delete(data[0],np.where(~data[0].any(axis=1))[0], axis=0) #Remove the data points in case they have zero value.
        np.delete(data[1],np.where(~data[1].any(axis=1))[0], axis=0) 
        yield data,tar

In [8]:
import scipy
# Lists all the files in the folder.
songs_list = [f for f in os.listdir('/Users/prernachheda/Desktop/chord/seismese_net_songs') if not f.startswith('.')]



counter = 1
def get_spec_name(song_name):
    global counter
    spec_name = f"{song_name}_{counter}.png"
    counter += 1
    return spec_name

for song in songs_list:
    print(song)
    songfile, sr = librosa.load('/Users/prernachheda/Desktop/chord/seismese_net_songs/'+song)
    duration = librosa.get_duration(y=songfile, sr=sr)
    # Apply pre-emphasis filter
    preemphasis_coeff = 0.07
    preemphasis_filter = np.array([1, -preemphasis_coeff])
    songfile = scipy.signal.lfilter(preemphasis_filter, [1], songfile.ravel())
    song_name = os.path.splitext(song)[0]
    prev = 0
    for i in range(1, int((duration // 10) + 1)):
        if i == int((duration // 10)):
            """Since we are dividing the song in 10s segment there might be case that after taking 10
            fragments also few more seconds are left so in this case extra becomes extra=extra+(10-extra) 
            from the previous segment."""
            extra = int((int(duration) / 10 - int(int(duration) / 10)) * 10) 
            st = (sr * i * 10) - (10 - extra)
            end = st + 10
            songfrag = np.copy(songfile[st:end])
        else:
            songfrag = np.copy(songfile[prev:(sr * i * 10)])
        
            specname = get_spec_name(song_name)
            create_spectrogram(songfrag, sr, '/Users/prernachheda/Desktop/chord/test_spect/' + specname)
        
        prev = sr * i * 10



Imagine Dragons-Bones.mp3
A.R. Rahman,Arijit Singh - Enna Sona.mp3
Shankar-Ehsaan-Loy,Shankar Mahadevan - Aaj Kal Zindagi.mp3
Sohail Sen,Benny Dayal,Aditi Singh Sharma,Irshad Kamil - Choomantar.mp3
#Eminem, Royce Da 5'9##, Black Thought, Q-Tip, Denaun# - #Yah Yah (feat. Royce Da 5'9##, Black Thought, Q-Tip & Denaun)#.mp3
Vishal Dadlani,Shilpa Rao - I Feel Good.mp3
Aditi Singh Sharma,Amitabh Bhattacharya - Offo.mp3
Mohit Chauhan - Masakali.mp3
#Eminem, Royce Da 5'9##, White Gold# - #You Gon’ Learn (feat. Royce Da 5'9## & White Gold)#.mp3
Anuv Jain - Alag Aasmaan.mp3
#Eminem, KXNG Crooked, Royce Da 5'9##, Joell Ortiz# - #I Will (feat. KXNG Crooked, Royce Da 5'9## & Joell Ortiz)#.mp3
Sonu Nigam,Jayesh Gandhi,Amrita Kak - Just Chill.mp3
Sanam - Gulabi Aankhen.mp3
stephen sanchez, em beihold-until i found you.mp3
AP Dhillon,Gurinder Gill,Intense - Excuses.mp3
Sohail Sen,Rahat Fateh Ali Khan,Irshad Kamil - Isq Risk.mp3
Eminem, Juice WRLD - Godzilla (feat. Juice WRLD).mp3
Shaan,KK - Dus Bahan

In [9]:
batch_size=10
specfilelist=os.listdir('/Users/prernachheda/Desktop/chord/test_spect/')
specfilelist=['/Users/prernachheda/Desktop/chord/test_spect/'+filename for filename in specfilelist]
specfilelist=shuffle(specfilelist)

X_train=specfilelist[0:int(0.80*len(specfilelist))]
X_test=specfilelist[int(0.80*len(specfilelist)):]
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=1e-6)
es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
mc = ModelCheckpoint('embdmodel_1.hdf5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
history=siamese_net.fit_generator(batch_generator(X_train,batch_size),steps_per_epoch=len(X_train)//batch_size,epochs=80,validation_data=batch_generator(X_test,batch_size),
                            validation_steps=len(X_test)//batch_size,callbacks=[reduce_lr,es,mc],shuffle=True)

Epoch 1/80


  history=siamese_net.fit_generator(batch_generator(X_train,batch_size),steps_per_epoch=len(X_train)//batch_size,epochs=80,validation_data=batch_generator(X_test,batch_size),


Epoch 1: val_loss improved from inf to 0.56712, saving model to embdmodel_1.hdf5
Epoch 2/80
Epoch 2: val_loss improved from 0.56712 to 0.51526, saving model to embdmodel_1.hdf5
Epoch 3/80
Epoch 3: val_loss improved from 0.51526 to 0.47779, saving model to embdmodel_1.hdf5
Epoch 4/80
Epoch 4: val_loss improved from 0.47779 to 0.45403, saving model to embdmodel_1.hdf5
Epoch 5/80
Epoch 5: val_loss improved from 0.45403 to 0.44650, saving model to embdmodel_1.hdf5
Epoch 6/80
Epoch 6: val_loss improved from 0.44650 to 0.43397, saving model to embdmodel_1.hdf5
Epoch 7/80
Epoch 7: val_loss improved from 0.43397 to 0.41485, saving model to embdmodel_1.hdf5
Epoch 8/80
Epoch 8: val_loss improved from 0.41485 to 0.40611, saving model to embdmodel_1.hdf5
Epoch 9/80
Epoch 9: val_loss improved from 0.40611 to 0.39131, saving model to embdmodel_1.hdf5
Epoch 10/80
Epoch 10: val_loss improved from 0.39131 to 0.38863, saving model to embdmodel_1.hdf5
Epoch 11/80
Epoch 11: val_loss improved from 0.38863 

Epoch 27: val_loss improved from 0.30642 to 0.30580, saving model to embdmodel_1.hdf5
Epoch 28/80
Epoch 28: val_loss improved from 0.30580 to 0.29080, saving model to embdmodel_1.hdf5
Epoch 29/80
Epoch 29: val_loss did not improve from 0.29080
Epoch 30/80
Epoch 30: val_loss did not improve from 0.29080
Epoch 31/80
Epoch 31: val_loss improved from 0.29080 to 0.28937, saving model to embdmodel_1.hdf5
Epoch 32/80
Epoch 32: val_loss improved from 0.28937 to 0.28275, saving model to embdmodel_1.hdf5
Epoch 33/80
Epoch 33: val_loss did not improve from 0.28275
Epoch 34/80
Epoch 34: val_loss did not improve from 0.28275
Epoch 35/80
Epoch 35: val_loss improved from 0.28275 to 0.27666, saving model to embdmodel_1.hdf5
Epoch 36/80
Epoch 36: val_loss improved from 0.27666 to 0.27550, saving model to embdmodel_1.hdf5
Epoch 37/80
Epoch 37: val_loss did not improve from 0.27550
Epoch 38/80
Epoch 38: val_loss did not improve from 0.27550
Epoch 39/80
Epoch 39: val_loss improved from 0.27550 to 0.27215,

Epoch 54: val_loss improved from 0.24987 to 0.24116, saving model to embdmodel_1.hdf5
Epoch 55/80
Epoch 55: val_loss improved from 0.24116 to 0.23931, saving model to embdmodel_1.hdf5
Epoch 56/80
Epoch 56: val_loss did not improve from 0.23931
Epoch 57/80
Epoch 57: val_loss improved from 0.23931 to 0.23667, saving model to embdmodel_1.hdf5
Epoch 58/80
Epoch 58: val_loss did not improve from 0.23667
Epoch 59/80
Epoch 59: val_loss did not improve from 0.23667
Epoch 60/80
Epoch 60: val_loss did not improve from 0.23667
Epoch 61/80
Epoch 61: val_loss did not improve from 0.23667
Epoch 62/80
Epoch 62: val_loss improved from 0.23667 to 0.22106, saving model to embdmodel_1.hdf5
Epoch 63/80
Epoch 63: val_loss improved from 0.22106 to 0.22041, saving model to embdmodel_1.hdf5
Epoch 64/80
Epoch 64: val_loss did not improve from 0.22041
Epoch 65/80
Epoch 65: val_loss improved from 0.22041 to 0.22009, saving model to embdmodel_1.hdf5
Epoch 66/80
Epoch 66: val_loss did not improve from 0.22009
Epoc