In [1]:
%matplotlib inline
import sys
import matplotlib.pyplot as plt
import librosa
import numpy as np
import librosa.display
import re
import glob
import time
import itertools
from multiprocessing import Pool
import tensorflow as tf

In [2]:
class MusicFile():
    def __init__(self, file_path):
        self.file_path = file_path
        self.file_name = file_path.split('/')[-1]
        self.audio, _ = librosa.core.load(file_path, duration=25)
        self.genre = file_path.split('/')[-2]

    def flourier_transform(self):
        return librosa.core.stft(self.audio)

    def mel_spectrogram(self):
        return librosa.feature.melspectrogram(self.audio)

    def mel_chroma(self):
        return librosa.feature.chroma_stft(self.audio)

class MusicFileGrapher():
    def __init__(self, music_file):
        self.music_file = music_file

    def graph_ft(self):
        self.graph(self.music_file.flourier_transform())
        
    def graph_ms(self):
        self.graph(self.music_file.mel_spectrogram())
        
    def graph_chroma(self):
        self.graph(self.music_file.mel_chroma)
        
    def graph(self, data):
        Xdb = librosa.amplitude_to_db(data)
        plt.figure(figsize=(14, 5))
        librosa.display.specshow(Xdb, x_axis='time', y_axis='hz')
        plt.xlabel(self.music_file.file_name)

In [3]:
all_file_paths = glob.glob('genres/*/*.au')

def add_music_file(audio_file_path):
    return MusicFile(audio_file_path)

start_time = time.time()

p = Pool(20)
music_files = p.map(add_music_file, all_file_paths)

print("Took", time.time() - start_time)

Took 46.46477675437927


In [4]:
all_genres = sorted(set(map(lambda music_file: music_file.genre, music_files)))

In [5]:
def genre_to_int(genre):
    return all_genres.index(genre)

In [6]:
def genre_to_onehot(genre):
    array = np.zeros(len(all_genres))
    array[genre_to_int(genre)] = 1
    return array

In [7]:
genre_to_onehot('reggae')

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.])

In [8]:
music_files = sorted(music_files, key=lambda music_file: music_file.file_path)

mel_x_train = []
stft_x_train = []
y_train = []

mel_x_test = []
stft_x_test = []
y_test = []

for genre, genre_music_file_group in itertools.groupby(music_files, key=lambda music_file: music_file.genre):
    one_hot_genre = genre_to_onehot(genre)
    for i, music_file in enumerate(genre_music_file_group):
        if i < 70:
            mel_x_train.append(music_file.mel_spectrogram())
            stft_x_train.append(music_file.flourier_transform())

            y_train.append(one_hot_genre)
        else:
            mel_x_test.append(music_file.mel_spectrogram())
            stft_x_test.append(music_file.flourier_transform())

            y_test.append(one_hot_genre)

In [9]:
mel_total_max = max(np.max(mel_x_train), np.max(mel_x_test))

mel_normalized_x_train = np.array(mel_x_train) / mel_total_max
mel_normalized_x_test = np.array(mel_x_test) / mel_total_max

mel_normalized_x_train = mel_normalized_x_train.reshape((*mel_normalized_x_train.shape, 1))
mel_normalized_x_test = mel_normalized_x_test.reshape((*mel_normalized_x_test.shape, 1))

stft_total_max = max(np.max(stft_x_train), np.max(stft_x_test))

stft_normalized_x_train = np.array(stft_x_train) / stft_total_max
stft_normalized_x_test = np.array(stft_x_test) / stft_total_max

stft_normalized_x_train = stft_normalized_x_train.reshape((*stft_normalized_x_train.shape, 1))
stft_normalized_x_test = stft_normalized_x_test.reshape((*stft_normalized_x_test.shape, 1))

In [10]:
mel_model = tf.keras.models.Sequential()
# Must define the input shape in the first layer of the neural network
mel_model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=mel_normalized_x_train[0].shape)) 
mel_model.add(tf.keras.layers.MaxPooling2D(pool_size=2))
mel_model.add(tf.keras.layers.Dropout(0.3))
mel_model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=2, padding='same', activation='relu'))
mel_model.add(tf.keras.layers.MaxPooling2D(pool_size=2))
mel_model.add(tf.keras.layers.Dropout(0.5))
mel_model.add(tf.keras.layers.Flatten())
mel_model.add(tf.keras.layers.Dense(256, activation='relu'))
mel_model.add(tf.keras.layers.Dropout(0.5))
mel_model.add(tf.keras.layers.Dense(10, activation='softmax'))
# Take a look at the model summary
mel_model.summary()

W0622 16:48:04.719112 139829181630208 deprecation.py:506] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 128, 1077, 64)     320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 64, 538, 64)       0         
_________________________________________________________________
dropout (Dropout)            (None, 64, 538, 64)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 538, 32)       8224      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 269, 32)       0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 269, 32)       0         
_________________________________________________________________
flatten (Flatten)            (None, 275456)            0

In [11]:
mel_model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [12]:
mel_model.fit(mel_normalized_x_train,
         np.array(y_train),
         batch_size=100,
         epochs=10,
         validation_split=0.1)

Train on 630 samples, validate on 70 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2bf840a4e0>

In [13]:
# Evaluate the model on test set
score = mel_model.evaluate(mel_normalized_x_test, np.array(y_test), verbose=0)
# Print test accuracy
print('\n', 'Test accuracy:', score[1])


 Test accuracy: 0.31


In [10]:
stft_model = tf.keras.models.Sequential()
# Must define the input shape in the first layer of the neural network
stft_model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=stft_normalized_x_train[0].shape)) 
stft_model.add(tf.keras.layers.MaxPooling2D(pool_size=2))
stft_model.add(tf.keras.layers.Dropout(0.3))
stft_model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=2, padding='same', activation='relu'))
stft_model.add(tf.keras.layers.MaxPooling2D(pool_size=2))
stft_model.add(tf.keras.layers.Dropout(0.5))
stft_model.add(tf.keras.layers.Flatten())
stft_model.add(tf.keras.layers.Dense(256, activation='relu'))
stft_model.add(tf.keras.layers.Dropout(0.5))
stft_model.add(tf.keras.layers.Dense(10, activation='softmax'))
# Take a look at the model summary
stft_model.summary()

W0622 17:02:58.283594 139829553825536 deprecation.py:506] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 1025, 1077, 64)    320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 512, 538, 64)      0         
_________________________________________________________________
dropout (Dropout)            (None, 512, 538, 64)      0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 512, 538, 32)      8224      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 256, 269, 32)      0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256, 269, 32)      0         
_________________________________________________________________
flatten (Flatten)            (None, 2203648)           0

In [11]:
stft_model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [None]:
stft_model.fit(abs(stft_normalized_x_train),
         np.array(y_train),
         batch_size=60,
         epochs=10,
         validation_split=0.1)

In [None]:
# Evaluate the model on test set
score = stft_model.evaluate(stft_normalized_x_test, np.array(y_test), verbose=0)
# Print test accuracy
print('\n', 'Test accuracy:', score[1])