<h1>Speech Rap Singing Classification</h1>

<b>data source</b>: Provided by Speech Graphics & Rapport, not publicly available

<h2>Data Preprocesing</h2>
<h3>Initial Feature Extraction<h3>

In [221]:
import os
import librosa
import math
import json

In [232]:

"""
@param dataset_path: relative location of the dataset.
@param json_path:
@param n_mfcc: number of mfcc variables to receive
@param hop_length: how far along to move the window
@param num_segments: how many parts to break each file into
"""
def save_mfcc(dataset_path,json_path, n_mfcc=13, n_fft=2048, hop_length=512, num_segments=5, sample_rate=20000):
    #dictionary to store data
    # - mapping: mapp labels to values
    # - labels: target
    data={
        "mapping":[],
        "mfcc":[],
        "labels":[]
    }
    #the length of each sample, we know this for this dataset. Could be calculated dynamically
    DURATION = 3
    samples_per_track = sample_rate * DURATION
    #how many data points we expect to appear in each segment we break our track into
    num_samples_per_segment = int(samples_per_track/num_segments)
    expected_num_mfcc_vectors_per_segment = math.ceil(num_samples_per_segment/ hop_length)
    
    print ("num_samples_per_segment",num_samples_per_segment)
    #loop through all the genres
    #dirpath: folder we are currently in
    #dirnames: all the names of the subfolders
    #filenames: all the filenames
    #i: the count. It MUST be included
    #os.walk returns a generator, that creates a tuple of values: 
    #(current_path, directories in current_path, files in current_path).
    # - each iteration is a different genre
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
        #ensure we're not yet at the dataset level
        if dirpath is not dataset_path:
            #save the semantic label (the mappings etc)
            #dirpath_components: the individual folder names that make up the full path
            dirpath_components = dirpath.split("/") #genre/blues => ["genre","blues"]
            semantic_label = dirpath_components[-1]
            #use the parent directory of a sound file as its label
            data["mapping"].append(semantic_label)
            print ("\nProcessing", semantic_label)
            for f in filenames:
                # load audio file : the path of the file is just its directory path plus its name
                file_path = os.path.join(dirpath,f)
                signal, sr = librosa.load(file_path,sr=sample_rate)
                #process segments to extract MFCC and store data
                for s in range (0, num_segments):
                    start_sample = num_samples_per_segment * s
                    finish_sample = start_sample + num_samples_per_segment
                    
                    #the mfcc for data points between start_sample and finish_sample
                    mfcc = librosa.feature.mfcc(signal[start_sample:finish_sample],sr=sr,n_fft=n_fft,n_mfcc=n_mfcc, hop_length = hop_length)
                    mfcc=mfcc.T
                    
                    #store mfcc for segement if it has the expected length
                    if len(mfcc)==expected_num_mfcc_vectors_per_segment:
                        data["mfcc"].append(mfcc.tolist())
                        # first iteration was the dataset path
                        data["labels"].append(i-1)
                        print ("{},segment{}".format(file_path,s))
    #save what we ahve created as a jason file
    with open(json_path,"w") as fp:
        json.dump(data,fp,indent=4)
        


In [233]:
DATASET_PATH="./data/"
JSON_PATH="data.json"
SAMPLE_RATE = 16000
N_MFCC = 13
N_FFT = 2048
HOP_LENGTH=512
NUM_SEGMENTS = 1
import time
startTime = time.perf_counter()
save_mfcc(DATASET_PATH,JSON_PATH, N_MFCC, N_FFT, HOP_LENGTH, NUM_SEGMENTS, SAMPLE_RATE)
endTime = time.perf_counter()

print ("Run Complete. Time taken:",endTime-startTime)

num_samples_per_segment 48000

Processing rap

Processing rap\sing002
./data/rap\sing002\sing002_1.wav,segment0
./data/rap\sing002\sing002_2.wav,segment0
./data/rap\sing002\sing002_3.wav,segment0
./data/rap\sing002\sing002_4.wav,segment0
./data/rap\sing002\sing002_5.wav,segment0
./data/rap\sing002\sing002_6.wav,segment0
./data/rap\sing002\sing002_7.wav,segment0

Processing rap\sing006
./data/rap\sing006\sing006_1.wav,segment0
./data/rap\sing006\sing006_2.wav,segment0
./data/rap\sing006\sing006_3.wav,segment0
./data/rap\sing006\sing006_4.wav,segment0
./data/rap\sing006\sing006_5.wav,segment0
./data/rap\sing006\sing006_6.wav,segment0
./data/rap\sing006\sing006_7.wav,segment0
./data/rap\sing006\sing006_8.wav,segment0

Processing rap\sing025
./data/rap\sing025\sing025_1.wav,segment0
./data/rap\sing025\sing025_10.wav,segment0
./data/rap\sing025\sing025_2.wav,segment0
./data/rap\sing025\sing025_3.wav,segment0
./data/rap\sing025\sing025_4.wav,segment0
./data/rap\sing025\sing025_5.wav,segment0

./data/rap\sing060\sing060_7.wav,segment0
./data/rap\sing060\sing060_8.wav,segment0
./data/rap\sing060\sing060_9.wav,segment0

Processing rap\sing061
./data/rap\sing061\sing061_1.wav,segment0
./data/rap\sing061\sing061_11.wav,segment0
./data/rap\sing061\sing061_12.wav,segment0
./data/rap\sing061\sing061_13.wav,segment0
./data/rap\sing061\sing061_15.wav,segment0
./data/rap\sing061\sing061_17.wav,segment0
./data/rap\sing061\sing061_18.wav,segment0
./data/rap\sing061\sing061_19.wav,segment0
./data/rap\sing061\sing061_2.wav,segment0
./data/rap\sing061\sing061_20.wav,segment0
./data/rap\sing061\sing061_21.wav,segment0
./data/rap\sing061\sing061_22.wav,segment0
./data/rap\sing061\sing061_23.wav,segment0
./data/rap\sing061\sing061_24.wav,segment0
./data/rap\sing061\sing061_26.wav,segment0
./data/rap\sing061\sing061_27.wav,segment0
./data/rap\sing061\sing061_28.wav,segment0
./data/rap\sing061\sing061_29.wav,segment0
./data/rap\sing061\sing061_3.wav,segment0
./data/rap\sing061\sing061_30.wav,se

./data/rap\sing092\sing092_20.wav,segment0
./data/rap\sing092\sing092_22.wav,segment0
./data/rap\sing092\sing092_23.wav,segment0
./data/rap\sing092\sing092_24.wav,segment0
./data/rap\sing092\sing092_26.wav,segment0
./data/rap\sing092\sing092_28.wav,segment0
./data/rap\sing092\sing092_29.wav,segment0
./data/rap\sing092\sing092_3.wav,segment0
./data/rap\sing092\sing092_30.wav,segment0
./data/rap\sing092\sing092_31.wav,segment0
./data/rap\sing092\sing092_32.wav,segment0
./data/rap\sing092\sing092_33.wav,segment0
./data/rap\sing092\sing092_34.wav,segment0
./data/rap\sing092\sing092_35.wav,segment0
./data/rap\sing092\sing092_36.wav,segment0
./data/rap\sing092\sing092_37.wav,segment0
./data/rap\sing092\sing092_38.wav,segment0
./data/rap\sing092\sing092_39.wav,segment0
./data/rap\sing092\sing092_4.wav,segment0
./data/rap\sing092\sing092_42.wav,segment0
./data/rap\sing092\sing092_44.wav,segment0
./data/rap\sing092\sing092_45.wav,segment0
./data/rap\sing092\sing092_46.wav,segment0
./data/rap\si

./data/singing\sing011\sing011_9.wav,segment0

Processing singing\sing012
./data/singing\sing012\sing012_1.wav,segment0
./data/singing\sing012\sing012_10.wav,segment0
./data/singing\sing012\sing012_11.wav,segment0
./data/singing\sing012\sing012_12.wav,segment0
./data/singing\sing012\sing012_13.wav,segment0
./data/singing\sing012\sing012_14.wav,segment0
./data/singing\sing012\sing012_15.wav,segment0
./data/singing\sing012\sing012_16.wav,segment0
./data/singing\sing012\sing012_17.wav,segment0
./data/singing\sing012\sing012_18.wav,segment0
./data/singing\sing012\sing012_19.wav,segment0
./data/singing\sing012\sing012_2.wav,segment0
./data/singing\sing012\sing012_20.wav,segment0
./data/singing\sing012\sing012_21.wav,segment0
./data/singing\sing012\sing012_22.wav,segment0
./data/singing\sing012\sing012_23.wav,segment0
./data/singing\sing012\sing012_24.wav,segment0
./data/singing\sing012\sing012_25.wav,segment0
./data/singing\sing012\sing012_3.wav,segment0
./data/singing\sing012\sing012_4.wav

./data/singing\sing020\sing020_13.wav,segment0
./data/singing\sing020\sing020_14.wav,segment0
./data/singing\sing020\sing020_2.wav,segment0
./data/singing\sing020\sing020_3.wav,segment0
./data/singing\sing020\sing020_4.wav,segment0
./data/singing\sing020\sing020_5.wav,segment0
./data/singing\sing020\sing020_6.wav,segment0
./data/singing\sing020\sing020_7.wav,segment0
./data/singing\sing020\sing020_8.wav,segment0
./data/singing\sing020\sing020_9.wav,segment0

Processing singing\sing021
./data/singing\sing021\sing021_1.wav,segment0
./data/singing\sing021\sing021_10.wav,segment0
./data/singing\sing021\sing021_11.wav,segment0
./data/singing\sing021\sing021_12.wav,segment0
./data/singing\sing021\sing021_13.wav,segment0
./data/singing\sing021\sing021_14.wav,segment0
./data/singing\sing021\sing021_15.wav,segment0
./data/singing\sing021\sing021_16.wav,segment0
./data/singing\sing021\sing021_17.wav,segment0
./data/singing\sing021\sing021_18.wav,segment0
./data/singing\sing021\sing021_19.wav,seg

./data/speech\01b\01bc0214.wav,segment0
./data/speech\01b\01bc0215.wav,segment0
./data/speech\01b\01bc0216.wav,segment0
./data/speech\01b\01bc0217.wav,segment0
./data/speech\01b\01bc0218.wav,segment0
./data/speech\01b\01bc0219.wav,segment0
./data/speech\01b\01bc021a.wav,segment0
./data/speech\01b\01bc021b.wav,segment0
./data/speech\01b\01bc021c.wav,segment0
./data/speech\01b\01bc021e.wav,segment0
./data/speech\01b\01bc021f.wav,segment0
./data/speech\01b\01bo0302.wav,segment0
./data/speech\01b\01bo0303.wav,segment0
./data/speech\01b\01bo0304.wav,segment0
./data/speech\01b\01bo0305.wav,segment0
./data/speech\01b\01bo0306.wav,segment0
./data/speech\01b\01bo0307.wav,segment0
./data/speech\01b\01bo0308.wav,segment0
./data/speech\01b\01bo0309.wav,segment0
./data/speech\01b\01bo030b.wav,segment0
./data/speech\01b\01bo030c.wav,segment0
./data/speech\01b\01bo030d.wav,segment0
./data/speech\01b\01bo030e.wav,segment0
./data/speech\01b\01bo030f.wav,segment0
./data/speech\01b\01bo030g.wav,segment0


./data/speech\01e\01eo0310.wav,segment0
./data/speech\01e\01eo0311.wav,segment0
./data/speech\01e\01eo0312.wav,segment0
./data/speech\01e\01eo0313.wav,segment0
./data/speech\01e\01eo0314.wav,segment0
./data/speech\01e\01eo0315.wav,segment0
./data/speech\01e\01eo0316.wav,segment0
./data/speech\01e\01eo0317.wav,segment0
./data/speech\01e\01eo0318.wav,segment0
./data/speech\01e\01eo0319.wav,segment0
./data/speech\01e\01eo031a.wav,segment0
./data/speech\01e\01eo031b.wav,segment0
./data/speech\01e\01eo031c.wav,segment0
./data/speech\01e\01eo031d.wav,segment0
./data/speech\01e\01eo031e.wav,segment0

Processing speech\01f
./data/speech\01f\01fc0203.wav,segment0
./data/speech\01f\01fc0204.wav,segment0
./data/speech\01f\01fc0205.wav,segment0
./data/speech\01f\01fc0206.wav,segment0
./data/speech\01f\01fc0207.wav,segment0
./data/speech\01f\01fc0208.wav,segment0
./data/speech\01f\01fc0209.wav,segment0
./data/speech\01f\01fc020a.wav,segment0
./data/speech\01f\01fc020b.wav,segment0
./data/speech\01f

Run Complete. Time taken: 9.644381600000088


<h3>Further Preprocessing<h3>

In [234]:
import numpy as np
#load data

# split data into training and test

def load_data(dataset_path):
    with open(dataset_path, "r") as fp:
        data = json.load(fp)

        #convert lists into numpy arrays
        # x
        inputs=np.array(data["mfcc"])
        targets=np.array(data["labels"])
        return inputs,targets

In [235]:
inputs,targets = load_data("data.json")
#inputs.shape[0] num samples
#inputs.shape[1] num time readings
#inputs.shape[2] num values per time interval
print ("input shape", inputs.shape)

unique_elements, counts_elements = np.unique(targets, return_counts=True)
print("Frequency of unique values of the targets:")
print(np.asarray((unique_elements, counts_elements)))

input shape (1440, 94, 13)
Frequency of unique values of the targets:
[[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
   19  20  21  22  23  24  25  26  28  29  30  31  32  33  34  35  36  37
   38  39  40  41  42  43  44  45  46  47  48  49  50  51  53  54  55  56
   57]
 [  7   8  10  21  16  12  23  40  12  11  11   9  15  29  22  38  28   6
   13   8  12  57  10  43  12   7  11  26  26  12  12  22  21   9  25  23
   20  22  33  27  15  17  14  24  29  10  27  13  12  30  91  98 100  95
   96]]


In [236]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(inputs,targets,test_size=0.3)

print ("x_train",x_train.shape)
print ("x_test",x_test.shape)
print ("y_train",y_train.shape)
print ("y_test",y_test.shape)


x_train (1008, 94, 13)
x_test (432, 94, 13)
y_train (1008,)
y_test (432,)


In [237]:
print(x)
print(x.shape)

0       [[-259.7991333008, 52.0887832642, 14.845832824...
1       [[-51.5158119202, 84.026473999, 13.5537509918,...
2       [[-24.3105449677, 90.9373779297, -54.699748992...
3       [[-218.6664428711, 47.8084869385, 37.528297424...
4       [[-202.5080108643, 117.9148178101, 50.77450180...
                              ...                        
2875    [[-278.1077270508, 88.3571166992, -9.653070449...
2876    [[-271.6434936523, 73.972366333, -1.8853588104...
2877    [[-431.7222290039, 123.4328384399, -15.5955619...
2878    [[-351.3446655273, 134.8014984131, 14.16081047...
2879    [[-397.8897094727, 140.8511505127, -19.8133964...
Name: mfcc, Length: 2880, dtype: object
(2880,)


In [238]:
print(y)

0          rap
1          rap
2          rap
3          rap
4          rap
         ...  
2875    speech
2876    speech
2877    speech
2878    speech
2879    speech
Name: label, Length: 2880, dtype: object


In [249]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1)

print ("x_train",x_train.shape)
print ("x_test",x_test.shape)
print ("y_train",y_train.shape)
print ("y_test",y_test.shape)

x_train (2592,)
x_test (288,)
y_train (2592,)
y_test (288,)


In [250]:
import json
def load_data(dataset_path):
    with open(dataset_path, "r") as fp:
        data = json.load(fp)

        #convert lists into numpy arrays
        # x
        inputs=np.array(data["mfcc"])
        targets=np.array(data["labels"])
        return inputs,targets
inputs,targets = load_data("data.json")
print ("input shape", inputs.shape)

input shape (1440, 94, 13)


<h1> Traditional Dense Network </h1>
    @todo delete this later, it is purely illustrative of why the pure mlp is insufficient

In [251]:
from tensorflow import keras
#we will use only a traditional dense mlp for this first test, but CNNs would be far more accurate
#takes multidimensional input and treats it as a vector
#mfcc data is mfcc values taken at itervals. Second dimension is values for that interval.
flattenLayer = keras.layers.Flatten(input_shape=(inputs.shape[1],inputs.shape[2]))
#relu: activation function outputs 0 if net input is less than 0. else outputs h.
#relu is very very effective for training (much faster convergence)
# reduced probability of vanishing gradient: derivative of sigmoid can't be higher than 0,25 so it shrinks and becomes tiny
# relu does not have this problem. So relu allows us to have much deeper networks
#3 hidden layers
denseLayer1 = keras.layers.Dense(512,activation="relu")
denseLayer2 = keras.layers.Dense(256,activation="relu")
denseLayer3 = keras.layers.Dense(64,activation="relu")

#we have 10 categories
#softmax is an activation function that normalised the output (so total is 1)
outputLayer = keras.layers.Dense(10,activation="softmax")
model=keras.Sequential([flattenLayer,denseLayer1,denseLayer2,denseLayer3,outputLayer])

#Adam is a very very effecting sgd variant for deep learning
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
#put all the components together
model.compile(optimizer=optimizer,
              loss="sparse_categorical_crossentropy", 
              metrics=["accuracy"]
              )
#describe our network
model.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 1222)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 512)               626176    
_________________________________________________________________
dense_9 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_10 (Dense)             (None, 64)                16448     
_________________________________________________________________
dense_11 (Dense)             (None, 10)                650       
Total params: 774,602
Trainable params: 774,602
Non-trainable params: 0
_________________________________________________________________


In [252]:
#train the network
#batch-size usually 16-128 samples, quick, memory light and fairly accurate
#note the very higha ccuract of the test set and low accuracy of the training set (~98% vs ~58%)
# we are overfitting
model.fit(x_train,y_train, validation_data=(x_test,y_test),epochs=50,batch_size=32)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [None]:
import matplotlib.pyplot as plt
def plot_history(history):
    fig,axs = plt.subplots(2)
    axs[0].plot(history.history["accuracy"],label="train accuracy")
    axs[0].plot(history.history["val_accuracy"],label="test accuracy")
    axs[0].set_ylabel("accuracy")
    #loc sets location
    axs[0].legend(loc="lower right")
    axs[0].set_title("accuracy eval")
    
    axs[1].plot(history.history["loss"],label="train error")
    axs[1].plot(history.history["val_loss"],label="test error")
    axs[1].set_ylabel("error")
    #loc sets location
    axs[1].legend(loc="upper right")
    axs[1].set_title("error eval")
    axs[1].set_xlabel("error")
    axs[1].set_xlabel("epoch")
    
    #just keeps the images from overlapping
    fig.tight_layout() 
    #plt.show()

In [248]:
#mode.fit returns "A History object. Its History.history attribute is a record of training loss values and metrics
#values at successive epochs, as well as validation loss values and validation metrics values (if applicable)"
history = model.fit(x_train, y_train, validation_data=(x_test, y_test),epochs=50, batch_size=32)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [None]:
#we will use only a traditional dense mlp for this first test, but CNNs would be far more accurate
#takes multidimensional input and treats it as a vector
#mfcc data is mfcc values taken at itervals. Second dimension is values for that interval.
flattenLayer = keras.layers.Flatten(input_shape=(inputs.shape[1],inputs.shape[2]))
#relu: activation function outputs 0 if net input is less than 0. else outputs h.
#relu is very very effective for training (much faster convergence)
# reduced probability of vanishing gradient: derivative of sigmoid can't be higher than 0,25 so it shrinks and becomes tiny
# relu does not have this problem. So relu allows us to have much deeper networks
#3 hidden layers
#IMPORTANT:: we are also adding REGULARIZER as well as dropout
denseLayer1 = keras.layers.Dense(512,activation="relu", kernel_regularizer=keras.regularizers.l2(0.0001))
#param is dropout probability
dropoutLayer1 = keras.layers.Dropout(0.3)
denseLayer2 = keras.layers.Dense(256,activation="relu",kernel_regularizer=keras.regularizers.l2(0.0001))
dropoutLayer2 = keras.layers.Dropout(0.3)
denseLayer3 = keras.layers.Dense(64,activation="relu",kernel_regularizer=keras.regularizers.l2(0.0001))
dropoutLayer3 = keras.layers.Dropout(0.3)
#we have 10 categories
#softmax is an activation function that normalised the output (so total is 1)
outputLayer = keras.layers.Dense(10,activation="softmax")
model_with_dropout=keras.Sequential([flattenLayer,denseLayer1,dropoutLayer1,denseLayer2,dropoutLayer2,
                                     denseLayer3,dropoutLayer3,outputLayer])

#Adam is a very very effecting sgd variant for deep learning
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
#put all the components together
model_with_dropout.compile(optimizer=optimizer,
              loss="sparse_categorical_crossentropy", 
              metrics=["accuracy"]
              )
#describe our network
model_with_dropout.summary()