In [2]:
!pip install ijson
!pip install tensorflow



In [3]:
import json
import os
import math
import librosa
import numpy as np
import ijson
from tensorflow import keras
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [4]:
DATASET_PATH = r"D:\Data\process2"
JSON_PATH = r"D:/pythonProject/data/data_all2-3.json"
SAMPLE_RATE = 22050
TRACK_DURATION = 30  # measured in seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION

In [6]:
def save_mfcc(dataset_path, json_path, num_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Extracts MFCCs from music dataset and saves them into a json file along witgh genre labels.

        :param dataset_path (str): Path to dataset
        :param json_path (str): Path to json file used to save MFCCs
        :param num_mfcc (int): Number of coefficients to extract
        :param n_fft (int): Interval we consider to apply FFT. Measured in # of samples
        :param hop_length (int): Sliding window for FFT. Measured in # of samples
        :param: num_segments (int): Number of segments we want to divide sample tracks into
        :return:
        """

    # dictionary to store mapping, labels, and MFCCs
    data = {
        "mapping": [],
        "labels": [],
        "mfcc": [],
        "tempo":[],
        "spectral_contrast": [],
        "chroma_cqt":[],
        "beat_dispersion":[],
        "beat_hist":[],
        "zero_cross_rate":[]     
    }
    
#     data = {
#         "mapping": [],
#         "labels": [],
#         "feature": []
#     }
    
    samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)
        # loop through all genre sub-folder
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
        fmin = 27.5
        fmax = 16000
        # ensure we're processing a genre sub-folder level
        if dirpath is not dataset_path:

            # save genre label (i.e., sub-folder name) in the mapping
            semantic_label = dirpath.split("\\")[-1]
            data["mapping"].append(semantic_label)
            print("\nProcessing: {}".format(semantic_label))

            # process all audio files in genre sub-dir
            for f in filenames:

                # load audio file
                file_path = os.path.join(dirpath, f)
                signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)
                
                # Extract form and structure features
                tempo, beat_frames = librosa.beat.beat_track(y=signal, sr=sample_rate, start_bpm=100)
                beat_times = librosa.frames_to_time(beat_frames, sr=sample_rate)
                
                
                # Calculate beat dispersion
                ibis=np.diff(beat_times)
                beat_dispersion = np.std(ibis) / np.mean(ibis)
                
                # Compute beat times using beat tracking algorithm
                # Compute histogram of beat times
                n_bins = 100  # Number of histogram bins
                hist, bin_edges = np.histogram(beat_times, bins=n_bins)

                # Normalize the histogram to have unit area
                beat_hist = hist / np.sum(hist)
                                
                # Extract harmonic features
                chroma_cqt = librosa.feature.chroma_cqt(y=signal, sr=sample_rate)
                chroma_cqt=chroma_cqt.T

                
                # process all segments of audio file
                for d in range(num_segments):
                  try:
                    # calculate start and finish sample for current segment
                    start = samples_per_segment * d
                    finish = start + samples_per_segment
                    
                    
                    # Extract melodic features
                    chroma_stft = librosa.feature.chroma_stft(y=signal[start:finish], sr=sample_rate, hop_length=hop_length)
                    chroma_stft=chroma_stft.T
                    
                    S = librosa.feature.melspectrogram(y= signal[start:finish], sr=sample_rate, n_fft=n_fft,
                                   hop_length=hop_length,
                                   fmin=fmin,
                                   fmax=fmax,
                                   n_mels=138)
                    
                    # Zero-crossing rate
                    zero_cross = librosa.zero_crossings(y=signal[start:finish])
                    zero_cross_rate = len(zero_cross) / len(signal[start:finish])

                    # Spectral contrast
                    S = librosa.feature.melspectrogram(y= signal[start:finish], sr=sample_rate, n_fft=n_fft,
                                   hop_length=hop_length)
                    spectral_contrast = librosa.feature.spectral_contrast(S=S)
                    spectral_contrast=spectral_contrast.T
                    
                    
                    # extract mfcc
                    mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft,
                                                hop_length=hop_length)
                    mfcc = mfcc.T

                    # store only mfcc feature with expected number of vectors
                    if len(mfcc) == num_mfcc_vectors_per_segment:
                        data["mfcc"].append(mfcc.tolist())
                        data["spectral_contrast"].append(spectral_contrast.tolist())
                        data["chroma_cqt"].append(chroma_cqt.tolist())
                        data["beat_dispersion"].append(beat_dispersion)
                        data["tempo"].append(tempo)
                        data["zero_cross_rate"].append(zero_cross_rate)
                        data["beat_hist"].append(beat_hist.tolist())
                        
#                         data["feature"].append(x)
                        data["labels"].append(i - 1)
                        # print("{}, segment:{}".format(file_path, d + 1))
                  except EOFError as e:
                      print(e)

   # save MFCCs to json file
    with open(json_path, "w") as fp:
      json.dump(data, fp, indent=4)



In [7]:
if __name__ == "__main__":
    save_mfcc(DATASET_PATH, JSON_PATH, num_segments=2)


Processing: rap


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)



Processing: rnb


In [11]:
def extract_json(filename, key):
    with open(filename, 'rb') as input_file:
         return np.array(list(ijson.items(input_file, key+'.item', use_float=True)))


In [22]:
def prepare(X):
    """Loads data and splits it into train, validation and test sets.
    :param test_size (float): Value in [0, 1] indicating percentage of data set to allocate to test split
    """
    X = X[..., np.newaxis] 
    X_tensor= tf.convert_to_tensor(X)

    return X_tensor


In [27]:
def load_data():
    """Loads training dataset from json file.
        :param data_path (str): Path to json file containing data
        :return X (ndarray): Inputs
        :return y (ndarray): Targets
    """
    mfcc= extract_json(JSON_PATH, 'mfcc')
    tempo= extract_json(JSON_PATH, 'tempo')
    beat_hist = extract_json(JSON_PATH, 'beat_hist')
    zero_cross_rate = extract_json(JSON_PATH, 'zero_cross_rate')
    chroma_cqt = extract_json(JSON_PATH, 'chroma_cqt')
    beat_dispersion = extract_json(JSON_PATH, 'beat_dispersion')
    spectral_contrast = extract_json(JSON_PATH, 'spectral_contrast')
    y = extract_json(JSON_PATH, 'labels')
    
    
    mfcc= mfcc[..., np.newaxis] 
    chroma_cqt= chroma_cqt[..., np.newaxis] 
    spectral_contrast= spectral_contrast[..., np.newaxis]
    
    mfcc= tf.convert_to_tensor(mfcc)
    chroma_cqt= tf.convert_to_tensor(chroma_cqt)
    spectral_contrast= tf.convert_to_tensor(spectral_contrast)
    tempo= tf.convert_to_tensor(tempo)
    beat_hist= tf.convert_to_tensor(beat_hist)
    zero_cross_rate= tf.convert_to_tensor(zero_cross_rate)
    beat_dispersion= tf.convert_to_tensor(beat_dispersion)
    
    
#     X= extract_json(JSON_PATH, 'feature')
    return mfcc,tempo,beat_hist,zero_cross_rate,chroma_cqt,beat_dispersion,spectral_contrast,y

In [28]:
mfcc,tempo,beat_hist,zero_cross_rate,chroma_cqt,beat_dispersion,spectral_contrast, y = load_data()

In [29]:
print("tempo shape:",tempo.shape)
print("beat_hist shape:",beat_hist.shape)
print("spectral_contrast shape:",spectral_contrast.shape)
print("chroma_cqt shape:",chroma_cqt.shape)
print("beat_dispersion shape:",beat_dispersion.shape)
print("zero_cross_rate shape:",zero_cross_rate.shape)
print("mfcc shape:",mfcc.shape)

tempo shape: (374,)
beat_hist shape: (374, 100)
spectral_contrast shape: (374, 646, 7, 1)
chroma_cqt shape: (374, 1292, 12, 1)
beat_dispersion shape: (374,)
zero_cross_rate shape: (374,)
mfcc shape: (374, 646, 13, 1)


In [46]:
def build_model():
    """Generates CNN model

    :param input_shape (tuple): Shape of input set
    :return model: CNN model
    """

    # # build network topology
#     model = keras.Sequential()
#     model.add(keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
#     model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
#     model.add(keras.layers.BatchNormalization())

#     # 2nd conv layer
#     model.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))
#     model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
#     model.add(keras.layers.BatchNormalization())

#     # 3rd conv layer
#     model.add(keras.layers.Conv2D(32, (2, 2), activation='relu'))
#     model.add(keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'))
#     model.add(keras.layers.BatchNormalization())
    
#     # 2 LSTM layers
#     model.add(keras.layers.TimeDistributed(keras.layers.LSTM(256, return_sequences=True)))
#     model.add(keras.layers.TimeDistributed(keras.layers.LSTM(128, return_sequences=True)))
#     model.add(keras.layers.Flatten())
    
#     keras.layers.TimeDistributed(
    # Define the input layers
    mfcc_input = keras.layers.Input(shape=(646,13,1))
    tempo_input = keras.layers.Input(shape=(1,))
    beat_hist_input = keras.layers.Input(shape=(100,))
    spectral_contrast_input = keras.layers.Input(shape=(646, 7,1))  
    chroma_cqt_input = keras.layers.Input(shape=(1292, 12,1))  
    beat_dispersion_input = keras.layers.Input(shape=(1,))
    zero_cross_rate_input = keras.layers.Input(shape=(1,))

    
    # Add MFCCs 
    mfcc_branch= keras.Sequential()
    mfcc_branch.add(keras.layers.Conv2D(32, (3, 3), activation='relu'))
    mfcc_branch.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    mfcc_branch.add(keras.layers.BatchNormalization())

    mfcc_branch.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))
    mfcc_branch.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    mfcc_branch.add(keras.layers.BatchNormalization())
                    
    mfcc_branch.add(keras.layers.Conv2D(32, (2, 2), activation='relu'))
    mfcc_branch.add(keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'))
    mfcc_branch.add(keras.layers.BatchNormalization())
    
    
    mfcc_branch.add(keras.layers.TimeDistributed(keras.layers.LSTM(256, return_sequences=True)))
    mfcc_branch.add(keras.layers.TimeDistributed(keras.layers.LSTM(128, return_sequences=True)))
    mfcc_branch.add(keras.layers.Flatten())

    mfcc_output= mfcc_branch(mfcc_input)
    
    
    # Add spectral_contrast features 
    spectral_contrast_branch= keras.Sequential()
    spectral_contrast_branch.add(keras.layers.Conv2D(32, (3, 3), activation='relu'))
    spectral_contrast_branch.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    spectral_contrast_branch.add(keras.layers.BatchNormalization())

    spectral_contrast_branch.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))
    spectral_contrast_branch.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    spectral_contrast_branch.add(keras.layers.BatchNormalization())
    
    spectral_contrast_branch.add(keras.layers.TimeDistributed(keras.layers.LSTM(256, return_sequences=True)))
    spectral_contrast_branch.add(keras.layers.TimeDistributed(keras.layers.LSTM(128, return_sequences=True)))
    spectral_contrast_branch.add(keras.layers.Flatten())

    spectral_contrast_output=spectral_contrast_branch(spectral_contrast_input)
    
    
    # Add chroma_cqt contrast 
    chroma_cqt_branch= keras.Sequential()
    chroma_cqt_branch.add(keras.layers.Conv2D(32, (3, 3), activation='relu'))
    chroma_cqt_branch.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    chroma_cqt_branch.add(keras.layers.BatchNormalization())

    chroma_cqt_branch.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))
    chroma_cqt_branch.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    chroma_cqt_branch.add(keras.layers.BatchNormalization())

    chroma_cqt_branch.add(keras.layers.Conv2D(32, (2, 2), activation='relu'))
    chroma_cqt_branch.add(keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'))
    chroma_cqt_branch.add(keras.layers.BatchNormalization())
    
    chroma_cqt_branch.add(keras.layers.TimeDistributed(keras.layers.LSTM(256, return_sequences=True)))
    chroma_cqt_branch.add(keras.layers.TimeDistributed(keras.layers.LSTM(128, return_sequences=True)))
    chroma_cqt_branch.add(keras.layers.Flatten())

    chroma_cqt_output = chroma_cqt_branch(chroma_cqt_input)    

       
    # Add rhythmic features as input
    tempo_branch= keras.Sequential()
    tempo_branch.add(keras.layers.Dense(64, activation='relu'))
    tempo_output = tempo_branch(tempo_input)
    
    
    zero_cross_rate_branch= keras.Sequential()
    zero_cross_rate_branch.add(keras.layers.Dense(64, activation='relu'))
    zero_cross_rate_output=zero_cross_rate_branch(zero_cross_rate_input)
                    
    beat_hist_branch= keras.Sequential()
    beat_hist_branch.add(keras.layers.Dense(64, activation='relu'))
    beat_hist_output=beat_hist_branch(beat_hist_input)
                    
    beat_dispersion_branch= keras.Sequential()
    beat_dispersion_branch.add(keras.layers.Dense(64, activation='relu'))
    beat_dispersion_output=beat_dispersion_branch(beat_dispersion_input)
    
    
#     beat_times_branch= keras.Sequential()
#     beat_times_branch.add(keras.layers.Dense(64, activation='relu'))
#     beat_times_output=beat_times_branch(beat_dispersion_input)
    
    # Concatenate
    concat = keras.layers.Concatenate()([tempo_output, zero_cross_rate_output, chroma_cqt_output, beat_dispersion_output, spectral_contrast_output, mfcc_output, beat_hist_output])
#     model = keras.models.Model(inputs=[mfcc_input,chroma_stft_input,tonnetz_input,chroma_cqt_input,tempo_input], outputs=concat)

    # LSTM layer        
    x = keras.layers.Dense(128, activation='relu', kernel_regularizer= keras.regularizers.l2(0.001))(concat)

# Dense layer    
    x = keras.layers.Dense(64, activation='relu', kernel_regularizer= keras.regularizers.l2(0.001))(x)

# Output layer
    outputs = keras.layers.Dense(2, activation='softmax')(x)

# Define model
    model = keras.models.Model(inputs=[mfcc_input , tempo_input , beat_hist_input , spectral_contrast_input, chroma_cqt_input , beat_dispersion_input, zero_cross_rate_input  ], outputs=outputs)

#     # dense layer
#     model.add(keras.layers.Dense(128, activation='relu', kernel_regularizer= keras.regularizers.l2(0.001)))
#     model.add(keras.layers.Dropout(0.2))

#     model.add(keras.layers.Dense(64, activation='relu', kernel_regularizer= keras.regularizers.l2(0.001)))
#     model.add(keras.layers.Dropout(0.2))
#     # output layer
#     model.add(keras.layers.Dense(2, activation='softmax'))
    
    return model

In [16]:
tempo.shape[2]

IndexError: tuple index out of range

In [47]:
if __name__ == "__main__":

    # get train, validation, test splits
#     X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(0.25, 0.2)

    # create network
#     input_shape = (None,)
    # print(X_train.shape)
    model = build_model()

    # compile model
    optimiser = keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimiser,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    
    
    model.summary()

    # train model
    history = model.fit([mfcc,tempo,beat_hist, spectral_contrast,chroma_cqt,beat_dispersion, zero_cross_rate], y, validation_data=([mfcc,tempo,beat_hist, spectral_contrast,chroma_cqt,beat_dispersion, zero_cross_rate], y), batch_size=64, epochs=30)
    
    test_loss, test_acc = model.evaluate([mfcc,tempo,beat_hist, spectral_contrast,chroma_cqt,beat_dispersion, zero_cross_rate], y, verbose=2)
    print('\nTest accuracy:', test_acc)

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_51 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_56 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_54 (InputLayer)          [(None, 1292, 12, 1  0           []                               
                                )]                                                                
                                                                                                  
 input_55 (InputLayer)          [(None, 1)]          0           []                         

Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
12/12 - 2s - loss: 0.3352 - accuracy: 0.9064 - 2s/epoch - 190ms/step

Test accuracy: 0.9064171314239502


array([list([2.786394557823129, 3.4829931972789114, 4.20281179138322, 4.899410430839002, 5.61922902494331, 6.315827664399093, 7.035646258503402, 7.732244897959184, 8.428843537414966, 9.125442176870749, 9.845260770975056, 10.541859410430838, 11.261678004535147, 11.95827664399093, 12.678095238095239, 13.374693877551021, 14.071292517006803, 14.79111111111111, 15.51092970521542, 16.207528344671204, 16.904126984126986, 17.600725623582765, 18.320544217687075, 19.017142857142858, 19.736961451247165, 20.433560090702947, 21.153378684807254, 21.849977324263037, 22.54657596371882, 23.26639455782313, 23.962993197278912, 24.659591836734695, 25.379410430839002, 26.076009070294784, 26.79582766439909, 27.492426303854874, 28.212244897959184, 28.908843537414967]),
       list([2.786394557823129, 3.4829931972789114, 4.20281179138322, 4.899410430839002, 5.61922902494331, 6.315827664399093, 7.035646258503402, 7.732244897959184, 8.428843537414966, 9.125442176870749, 9.845260770975056, 10.541859410430838, 11

In [75]:
a=beat_times.flatten()
a=np.array(beat_times)
max_len = min(len(l) for l in a)
max_len

12