Aim: classification of Mbay songs: given a Mbay song, can we predict its genre?

In [0]:
# loading packages

%tensorflow_version 1.x
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from PIL import Image
import pathlib
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import keras
from keras import layers
import keras
from keras.models import Sequential
import warnings
from sklearn.metrics import classification_report,confusion_matrix
from numpy import argmax
import librosa.display
import IPython.display
import random
from keras.layers import Activation, Dense, Dropout, Conv2D, Flatten, MaxPooling2D, GlobalMaxPooling2D, GlobalAveragePooling1D, AveragePooling2D, Input, Add
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from google.colab import drive
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import neighbors, datasets
from keras.preprocessing.image import ImageDataGenerator
!pip install split_folders
import split_folders
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from random import sample

# Silence every Python warning raised by the cells below
warnings.filterwarnings('ignore')

In [0]:
# Work directory
# Mount Google Drive under /content/gdrive; the audio folders are read from this mount point

drive.mount('/content/gdrive', force_remount=True)

In [0]:
# Analysis settings shared by every cell below
duree = 15  # length, in seconds, of the excerpt analysed in each audio file
genres = ['Bang', 'Bordero', 'Klag', 'Terta']  # one Drive sub-folder per genre
MaxNbFiles = 75  # upper bound on the number of songs used per genre

Waveplots and spectrograms of a sample of each genre

In [0]:
# Help: https://www.kdnuggets.com/2020/02/audio-data-analysis-deep-learning-python-part-1.html

# Files: https://drive.google.com/drive/folders/0B-znYQjmJbqKTWVGUG9mQVNPZTQ

In [0]:
# Images for poster: for each genre, pick one song at random and save the
# waveform and the spectrogram of a `duree`-second excerpt starting mid-song

pathlib.Path('img_data/').mkdir(parents=True, exist_ok=True)

cmap = plt.get_cmap('inferno')
plt.figure(figsize=(14,5))

for g in genres:
  genre_dir = f'/content/gdrive/My Drive/Discothèque Mbaye/{g}'
  filename = sample(os.listdir(genre_dir),1)[0]
  songname = f'{genre_dir}/{filename}'

  # First pass decodes the whole song only to find its midpoint
  x, srx = librosa.load(songname, mono=True)
  middle = librosa.get_duration(x)/2
  y, sr = librosa.load(songname, mono=True, offset=middle, duration=duree)

  # Waveform of the excerpt, with the time axis shifted to the excerpt's position in the song
  librosa.display.waveplot(y, sr=sr, offset=middle)
  plt.savefig(f'img_data/wave_{g}.png')
  plt.clf()

  # Spectrogram of the same excerpt, drawn on the reused (cleared) figure
  plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB')
  plt.savefig(f'img_data/spect_{g}.png')
  plt.clf()

Working on features from the spectrogram (stored in a CSV file)

In [0]:
# Read the songs, extract the features and write the data in a csv file
# Skip this cell if data are already stored in a csv file

# Header for our CSV file: song id, six spectral summaries, 20 MFCC means, genre label
header = (['filename', 'chroma_stft', 'rmse', 'spectral_centroid',
           'spectral_bandwidth', 'rolloff', 'zero_crossing_rate']
          + [f'mfcc{k}' for k in range(1, 21)]
          + ['label'])

# librosa renamed feature.rmse to feature.rms (rmse is gone in recent releases);
# use whichever one the installed version provides
rms_feature = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse

# Extracting features from Spectrogram: Mel-frequency cepstral coefficients (MFCC), Spectral Centroid, Zero Crossing Rate, Chroma Frequencies, and Spectral Roll-off

# The file is opened once ('w' truncates any previous run) and closed by the `with` block
with open('dataset_'+str(duree)+'s.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=';')
    writer.writerow(header)

    for g in genres:
        genre_dir = f'/content/gdrive/My Drive/Discothèque Mbaye/{g}'
        # List the folder once per genre; sorting makes the selection of the
        # first MaxNbFiles songs reproducible (os.listdir order is unspecified)
        song_files = sorted(os.listdir(genre_dir))[:MaxNbFiles]

        for i, song_file in enumerate(song_files, start=1):
            songname = f'{genre_dir}/{song_file}'
            # First pass decodes the whole song only to locate its midpoint,
            # second pass loads the `duree`-second excerpt starting there
            x, srx = librosa.load(songname, mono=True)
            y, sr = librosa.load(songname, mono=True, offset=librosa.get_duration(x)/2, duration=duree)

            rmse = rms_feature(y=y)
            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
            spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
            spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
            rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            zcr = librosa.feature.zero_crossing_rate(y)
            mfcc = librosa.feature.mfcc(y=y, sr=sr)

            # Build the row as a list (one cell per value) so that a genre name
            # containing whitespace cannot be split into several columns
            row = [f'song{i}']
            row += [str(np.mean(feature)) for feature in (chroma_stft, rmse, spec_cent, spec_bw, rolloff, zcr)]
            row += [str(np.mean(coefficient)) for coefficient in mfcc]  # one mean per MFCC row
            row.append(g)
            writer.writerow(row)

In [0]:
# Reading the csv file written by the feature-extraction cell for the current `duree`
data = pd.read_csv(f'/content/dataset_{duree}s.csv', sep=';')

# Dropping unnecessary columns
data = data.drop(['filename'],axis=1)

# Converting to numeric type (every column except the last one, which holds the genre label)
numeric = list(data.columns)
del numeric[-1]
data[numeric] = data[numeric].apply(pd.to_numeric, errors='coerce')

#data['label'].unique() # checking we have all the genres

# Encoding the Labels
genre_list = data.iloc[:, -1]
encoder = LabelEncoder()
y = encoder.fit_transform(genre_list)

# Dividing data into training and Testing set (done on the raw features, before any scaling)
features = np.array(data.iloc[:, :-1], dtype = float)
features_train, features_test, y_train, y_test = train_test_split(features, y, random_state = 0,test_size=0.3)

# Scaling the Feature columns: the scaler is fitted on the training rows only,
# so that no statistic of the test rows leaks into the models
scaler = StandardScaler().fit(features_train)
X_train = scaler.transform(features_train)
X_test = scaler.transform(features_test)
X = scaler.transform(features)  # full scaled matrix, kept under its original name

print(X_train.shape); print(X_test.shape)

In [0]:
# Table for poster: number of songs per genre in the train and test sets, plus a 'Total' row

def _genre_counts(encoded_labels):
  """Count songs per genre name for an array of encoded labels, with a 'Total' margin."""
  return pd.crosstab(index=encoder.inverse_transform(encoded_labels), columns="number",
                     margins=True, margins_name = 'Total')

train = _genre_counts(y_train)
test = _genre_counts(y_test)
pd.DataFrame({'train': train['number'], 'test': test['number']});

Multi-Layer Perceptron Classifier

In [0]:
# From https://www.pluralsight.com/guides/machine-learning-neural-networks-scikit-learn

# random_state pins the weight initialisation so the scores are reproducible between runs
mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500, random_state=0)
y_pred = mlp.fit(X_train,y_train).predict(X_test)

# classification_report expects (y_true, y_pred): passing them the other way
# round exchanges the precision and recall columns
MLPreport = classification_report(y_test,y_pred,output_dict=True)
MLPscores = pd.DataFrame(MLPreport).transpose()
MLPprecision = MLPscores.precision[0:4]  # first four rows: one per genre
MLPrecall = MLPscores.recall[0:4]

# Overall accuracy, read from the report dict instead of being sliced out of its printed text
MLPperf = MLPreport['accuracy']

Artificial Neural Networks

In [0]:
# Fully connected network: three ReLU hidden layers and one softmax unit per genre
ANN = Sequential()
ANN.add(layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
ANN.add(layers.Dense(128, activation='relu'))
ANN.add(layers.Dense(64, activation='relu'))
# Output width follows the number of encoded genres, so the network cannot
# predict a class index that does not exist in the data
ANN.add(layers.Dense(len(encoder.classes_), activation='softmax'))
ANN.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # labels are integer codes, not one-hot
              metrics=['accuracy'])

ANN.fit(X_train,y_train,epochs=100,batch_size=128)

y_pred = ANN.predict_classes(X_test)

# classification_report expects (y_true, y_pred): passing them the other way
# round exchanges the precision and recall columns
ANNreport = classification_report(y_test,y_pred,output_dict=True)
ANNscores = pd.DataFrame(ANNreport).transpose()
ANNprecision = ANNscores.precision[0:4]  # first four rows: one per genre
ANNrecall = ANNscores.recall[0:4]

# Overall accuracy, read from the report dict instead of being sliced out of its printed text
ANNperf = ANNreport['accuracy']

Multinomial regression

In [0]:
# Multinomial logistic regression on the scaled features
multi = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = multi.predict(X_test)

# classification_report expects (y_true, y_pred): passing them the other way
# round exchanges the precision and recall columns
MNRreport = classification_report(y_test,y_pred,output_dict=True)
MNRscores = pd.DataFrame(MNRreport).transpose()
MNRprecision = MNRscores.precision[0:4]  # first four rows: one per genre
MNRrecall = MNRscores.recall[0:4]

# Overall accuracy, read from the report dict instead of being sliced out of its printed text
MNRperf = MNRreport['accuracy']

K-Nearest Neighbors

In [0]:
# https://scikit-learn.org/stable/modules/neighbors.html#classification
# https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html#sphx-glr-auto-examples-neighbors-plot-classification-py

n_neighbors = 15
nghbs = neighbors.KNeighborsClassifier(n_neighbors).fit(X_train, y_train)
y_pred = nghbs.predict(X_test)

# classification_report expects (y_true, y_pred): passing them the other way
# round exchanges the precision and recall columns
KNNreport = classification_report(y_test,y_pred,output_dict=True)
KNNscores = pd.DataFrame(KNNreport).transpose()
KNNprecision = KNNscores.precision[0:4]  # first four rows: one per genre
KNNrecall = KNNscores.recall[0:4]

# Overall accuracy, read from the report dict instead of being sliced out of its printed text
KNNperf = KNNreport['accuracy']

Support Vector Machines

In [0]:
# Support vector classifier with scikit-learn's default kernel
SVM_model = svm.SVC()#(kernel='linear')
SVM_model.fit(X_train,y_train)
y_pred = SVM_model.predict(X_test)

# classification_report expects (y_true, y_pred): passing them the other way
# round exchanges the precision and recall columns
SVMreport = classification_report(y_test,y_pred,output_dict=True)
SVMscores = pd.DataFrame(SVMreport).transpose()
SVMprecision = SVMscores.precision[0:4]  # first four rows: one per genre
SVMrecall = SVMscores.recall[0:4]

# Overall accuracy, read from the report dict instead of being sliced out of its printed text
SVMperf = SVMreport['accuracy']

Random Forests

In [0]:
# Random forest with trees limited to depth 5
ForAlea = RandomForestClassifier(max_depth=5, random_state=0)
y_pred = ForAlea.fit(X_train, y_train).predict(X_test)

# classification_report expects (y_true, y_pred): passing them the other way
# round exchanges the precision and recall columns
RFreport = classification_report(y_test,y_pred,output_dict=True)
RFscores = pd.DataFrame(RFreport).transpose()
RFprecision = RFscores.precision[0:4]  # first four rows: one per genre
RFrecall = RFscores.recall[0:4]

# Overall accuracy, read from the report dict instead of being sliced out of its printed text
RFperf = RFreport['accuracy']

Naive Bayes

In [0]:
# Gaussian naive Bayes on the scaled features
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

# classification_report expects (y_true, y_pred): passing them the other way
# round exchanges the precision and recall columns
NBreport = classification_report(y_test,y_pred,output_dict=True)
NBscores = pd.DataFrame(NBreport).transpose()
NBprecision = NBscores.precision[0:4]  # first four rows: one per genre
NBrecall = NBscores.recall[0:4]

# Overall accuracy, read from the report dict instead of being sliced out of its printed text
NBperf = NBreport['accuracy']

Linear Discriminant Analysis

In [0]:
# Linear discriminant analysis on the scaled features
LinDis = LinearDiscriminantAnalysis()
y_pred = LinDis.fit(X_train, y_train).predict(X_test)

# classification_report expects (y_true, y_pred): passing them the other way
# round exchanges the precision and recall columns
LDAreport = classification_report(y_test,y_pred,output_dict=True)
LDAscores = pd.DataFrame(LDAreport).transpose()
LDAprecision = LDAscores.precision[0:4]  # first four rows: one per genre
LDArecall = LDAscores.recall[0:4]

# Overall accuracy, read from the report dict instead of being sliced out of its printed text
LDAperf = LDAreport['accuracy']

Convolutional Neural Networks

In [0]:
# https://levelup.gitconnected.com/audio-data-analysis-using-deep-learning-with-python-part-2-4a1f40d3708d
# https://missinglink.ai/guides/convolutional-neural-networks/python-convolutional-neural-network-creating-cnn-keras-tensorflow-plain-python/

# Extracting spectrograms: one axis-free PNG per song, stored in a sub-folder per genre

cmap = plt.get_cmap('inferno')
plt.figure(figsize=(8,8))

for g in genres:
    pathlib.Path(f'img_data/{g}').mkdir(parents=True, exist_ok=True)
    genre_dir = f'/content/gdrive/My Drive/Discothèque Mbaye/{g}'
    # List the folder once per genre; sorting makes the selection of the
    # first MaxNbFiles songs reproducible (os.listdir order is unspecified)
    song_files = sorted(os.listdir(genre_dir))[:MaxNbFiles]

    for i, song_file in enumerate(song_files, start=1):
        songname = f'{genre_dir}/{song_file}'
        # First pass decodes the whole song only to locate its midpoint,
        # second pass loads the `duree`-second excerpt starting there
        x, srx = librosa.load(songname, mono=True)
        y, sr = librosa.load(songname, mono=True,offset=librosa.get_duration(x)/2, duration=duree)
        plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB')
        plt.axis('off')  # must be repeated: plt.clf() below resets the axes
        filename = 'chanson' + str(i)
        plt.savefig(f'img_data/{g}/{filename}.png')
        plt.clf()

In [0]:
# Split the spectrogram images into a train set (70%) and a test set (30%),
# copying them from /content/img_data into /content/data

split_folders.ratio(
    '/content/img_data',
    output="/content/data",
    seed=0,
    ratio=(.7, .3),
)

In [0]:
# Both generators only rescale pixel values from 0-255 to the range (0,1).
# Augmentation options left out on purpose: shear_range, zoom_range, horizontal_flip
train_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# Training images are shuffled (with a fixed seed): the directory iterator
# otherwise yields the files class by class, so every batch of 25 would hold
# a single genre and SGD would see the genres one after the other
training_set = train_datagen.flow_from_directory(
        '/content/data/train',
        target_size=(64, 64),
        batch_size=25,
        classes = genres,  # class index k corresponds to genres[k]
        class_mode='categorical',
        shuffle = True,
        seed = 0)

# Test images keep their directory order so that predictions line up with test_set.classes
test_set = test_datagen.flow_from_directory(
        '/content/data/val',
        target_size=(64, 64),
        batch_size=25,
        classes = genres,
        class_mode='categorical',
        shuffle = False )

In [0]:
# Table for poster: number of images per genre in the CNN train and test sets, plus a 'Total' row

# The generators were built with classes=genres, so class index k means genres[k];
# decoding with that list avoids depending on the LabelEncoder fitted on the CSV labels
genre_names = np.array(genres)
train = pd.crosstab(index=genre_names[training_set.classes], columns="number", margins=True, margins_name = 'Total')
test = pd.crosstab(index=genre_names[test_set.classes], columns="number", margins=True, margins_name = 'Total')
pd.DataFrame({'train': train['number'], 'test': test['number']})

In [0]:
# CNN on 64x64 RGB spectrogram images: three convolution / average-pooling / ReLU
# stages, followed by a dropout-regularised dense layer and a softmax output
model = Sequential()
input_shape=(64, 64, 3)

#1st hidden layer (strided convolution, no padding)
model.add(Conv2D(64, (3, 3), strides=(2, 2), input_shape=input_shape))
model.add(AveragePooling2D((2, 2), strides=(2,2)))
model.add(Activation('relu'))

#2nd hidden layer
model.add(Conv2D(64, (3, 3), padding="same"))
model.add(AveragePooling2D((2, 2), strides=(2,2)))
model.add(Activation('relu'))

#3rd hidden layer
model.add(Conv2D(64, (3, 3), padding="same"))
model.add(AveragePooling2D((2, 2), strides=(2,2)))
model.add(Activation('relu'))

#Flatten
model.add(Flatten())
model.add(Dropout(rate=0.5))

#Add fully connected layer.
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(rate=0.5))

#Output layer: one unit per genre, matching the `classes = genres` used by the image generators
model.add(Dense(len(genres)))
model.add(Activation('softmax'))

# Keras' default SGD settings are used
model.compile(optimizer="sgd", loss="categorical_crossentropy", metrics=['accuracy'])

# Keep the training history so that the loss curve can be plotted afterwards
history = model.fit_generator(training_set, epochs=100)

# patient early stopping
# NOTE(review): this callback is not passed to fit_generator, and training uses no
# validation data, so 'val_loss' is never computed and the callback has no effect.
# To enable it, pass validation_data=... and callbacks=[es] to fit_generator
# (with patience=200 it could not trigger within 100 epochs anyway).
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=200)

In [0]:
# Predict the test images in directory order (the test generator is not shuffled),
# so that row k of the predictions corresponds to test_set.classes[k]
test_set.reset()
y_pred = model.predict_generator(test_set)

# Predicted genre = index of the largest softmax output
predicted_class_indices=np.argmax(y_pred,axis=1)

# The report is computed once; precision, recall and accuracy are all read from it
CNNreport = classification_report(test_set.classes,predicted_class_indices,output_dict=True)
CNNscores = pd.DataFrame(CNNreport).transpose()
CNNprecision = CNNscores.precision[0:4]  # first four rows: one per genre
CNNrecall = CNNscores.recall[0:4]

# Overall accuracy, read from the report dict instead of being sliced out of its printed text
CNNperf = CNNreport['accuracy']

Recapitulating performances

In [0]:
# One accuracy per method, indexed by the method's short name (the CNN is left out of this table)
method_names = ['MLP','ANN','MNR','KNN','SVM','RF','NB','LDA']
method_scores = [MLPperf, ANNperf, MNRperf, KNNperf, SVMperf, RFperf, NBperf, LDAperf]
performances = pd.DataFrame({'accuracy': method_scores}, index=method_names)

In [0]:
# Arrow for overall performances: each method is pinned on a horizontal arrow
# at the abscissa given by its accuracy

# Help with annotations: https://www.oreilly.com/library/view/python-data-science/9781491912126/ch04.html

plt.figure(figsize=(14,5))
plt.arrow(0.15, 0.5, 0.75, 0, head_width=0.05, head_length=0.03, linewidth=4, color='g', length_includes_head=True,overhang=-0.25)

# Labels drawn above the arrow (y=0.8) first, then those drawn below it (y=0.2)
label_heights = [(name, 0.8) for name in ['MLP','ANN','MNR','KNN','SVM']]
label_heights += [(name, 0.2) for name in ['RF','NB','LDA']]

pointer_style = dict(arrowstyle="->",connectionstyle="angle3,angleA=50,angleB=-75")
for m, text_y in label_heights:
  score = performances.accuracy[m]
  plt.annotate(m, xy=(score, 0.5), xytext=(score, text_y), arrowprops=pointer_style);

In [0]:
# Long-format table with one row per (method, genre) pair, used by the radar charts
precision = [] # Accuracy of positive predictions: Precision = TP/(TP + FP)
recall = [] # Fraction of positives that were correctly identified: Recall = TP/(TP+FN)
method = []
audio_genre = []

for m in ['MLP','ANN','MNR','KNN','SVM','RF','NB','LDA','CNN']:
  # Each model cell stores its scores as <method>precision / <method>recall;
  # look these notebook variables up by name instead of eval-ing a code string
  method_precision = globals()[m + 'precision']
  method_recall = globals()[m + 'recall']
  for i in range(4):  # the model cells keep four rows per method ([0:4]), one per genre
    # .iloc is positional: the Series are labelled with strings ('0', '1', ...)
    precision.append(method_precision.iloc[i])
    recall.append(method_recall.iloc[i])
    audio_genre.append(encoder.inverse_transform([i])[0])
    method.append(m)

recap = pd.DataFrame({'genre':audio_genre,'precision':precision,'recall':recall,'method':method})

Spider chart

In [0]:
# Help: https://plotly.com/python/radar-chart/

import plotly.express as px

# Radar chart of the per-genre precision: one closed line per method
fig = px.line_polar(
    recap,
    r="precision",
    theta="genre",
    color="method",
    line_close=True,
    color_discrete_sequence=px.colors.sequential.Plasma_r,
)
fig.show()

In [0]:
# Radar chart of the per-genre recall: one closed line per method
fig = px.line_polar(
    recap,
    r="recall",
    theta="genre",
    color="method",
    line_close=True,
    color_discrete_sequence=px.colors.sequential.Plasma_r,
)
fig.show()