In [1]:
import IPython.display as ipd
import matplotlib.pyplot as plt
import librosa.display
import time
import warnings; warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import youtube_dl
import librosa
import os
import cv2
import re
import tensorflow as tf

from tensorflow.keras import optimizers
from tensorflow.keras.layers import Flatten, Dense, Conv2D, MaxPool2D, Dropout
from tensorflow.keras.models import Sequential
from __future__ import unicode_literals
from os import path, listdir 
from os.path import isfile, join
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder



# Song divided every 5s concatenate with features

In [None]:
def music_path(path):
    directories = [i for i in listdir(path) if not i.startswith(".")]
    for genre in directories:
        for song in listdir(f"{path}/{genre}"):
            if not song.startswith("."):
                yield f"{path}/{genre}/{song}", genre
                
def add_features(song, sr):
    res = []
    for part in song:
        union = part
        
        mfcc = librosa.feature.mfcc(part, sr)
        print(mfcc.shape)
        for element in mfcc:
            union = np.concatenate((union, element), axis=None)
        
        chroma_stft = librosa.feature.chroma_stft(part, sr)
        for element in chroma_stft:
            union = np.concatenate((union, element), axis=None)
        
        spectral_centroid = librosa.feature.spectral_centroid(part, sr)
        union = np.concatenate((union, spectral_centroid), axis=None)
        
        zero_crossing_rate = librosa.feature.zero_crossing_rate(part, sr)
        union = np.concatenate((union, zero_crossing_rate), axis=None)
        res.append(union)
    return np.array(res)
        
def split_song(song, sr, seconds = 5):
    res = []
    for i in range(1, len(song)//(sr*seconds)):
        res.append(song[(i-1)*sr*seconds : i*sr*seconds])
    return np.array(res)
                
def load_song(path):
    x , sr = librosa.load(path, mono=True, sr=44100)
    splited_song = split_song(x,sr,5)
    for song in add_features(splited_song, sr):
        yield song

In [None]:
def songs_array(default = 'music'):
    data = []
    y = []
    for song_path, genre in music_path(default): 
        for part_song in load_song(song_path):
            data.append(part_song)
            y.append(genre)
    return np.array(data), np.array(y)

In [None]:
%%time
x,y = songs_array()

In [None]:
x.shape, y.shape

# Song divided every 5s, only mean features

In [None]:
def add_features(song, sr):
    data = []
    
    for part in song:
        #s_part = np.array(part)
        res = []
        union = 0
        mfcc = librosa.feature.mfcc(part, sr)
        for element in mfcc:
            #union = sum(element)
            res.append((sum(element)/mfcc.shape[1]))
        #res.append((union/mfcc.shape[1]))
        
        union = 0
        rms = librosa.feature.rms(part, sr)
        union = sum(rms[0])
        res.append((union/rms.shape[1]))
        
        union = 0
        chroma_stft = librosa.feature.chroma_stft(part, sr)
        for element in chroma_stft:
            #union = sum(element)
            res.append((sum(element)/chroma_stft.shape[1]))
        #res.append((union/chroma_stft.shape[1]))
        
        union = 0
        spectral_bandwidth = librosa.feature.spectral_bandwidth(part, sr)
        union = sum(spectral_bandwidth[0])
        res.append((union/spectral_bandwidth.shape[1]))
        
        union = 0
        spectral_contrast = librosa.feature.spectral_contrast(part, sr)
        for element in spectral_contrast:
            #union = sum(element)
            res.append((sum(element)/spectral_contrast.shape[1]))
        #res.append((union/spectral_contrast.shape[1]))
        
        union = 0
        spectral_flatness = librosa.feature.spectral_flatness(part)
        union = sum(spectral_flatness[0])
        res.append((union/spectral_flatness.shape[1]))
        
        union = 0
        spectral_rolloff = librosa.feature.spectral_rolloff(part, sr)
        union = sum(spectral_rolloff[0])
        res.append((union/spectral_rolloff.shape[1]))
        
        union = 0
        spectral_centroid = librosa.feature.spectral_centroid(part, sr)
        union = sum(spectral_centroid[0])
        res.append((union/spectral_centroid.shape[1]))
        
        union = 0
        tonnetz = librosa.feature.tonnetz(part, sr)
        for element in tonnetz:
            #union = sum(element)
            res.append((sum(element)/tonnetz.shape[1]))
        #res.append((union/tonnetz.shape[1]))
        
        union = 0
        zero_crossing_rate = librosa.feature.zero_crossing_rate(part, sr)
        union = sum(zero_crossing_rate[0])
        
        res.append((union/zero_crossing_rate.shape[1]))
        
        union = 0
        tempogram = librosa.feature.tempogram(part, sr)
        for element in tempogram:
            #union = sum(element)
            res.append((sum(element)/tempogram.shape[1]))
        #res.append((union/tempogram.shape[1]))
        
        union = 0
        fourier_tempogram = librosa.feature.fourier_tempogram(part, sr)
        for element in fourier_tempogram:
            #union = abs(sum(element))
            res.append((sum(element).real/fourier_tempogram.shape[1]))
        #res.append((union/fourier_tempogram.shape[1]))
        
        tempo = librosa.beat.tempo(part, sr)
        res.append(tempo[0])
        
        beat_track = librosa.beat.beat_track(part, sr)
        res.append(beat_track[0])
        
        union = 0
        plp = librosa.beat.plp(part, sr)
        for element in plp:
            union += element
        res.append((union/plp.shape[0]))
        
        data.append(res)

    return np.array(data)

In [None]:
%%time
x,y = songs_array()

In [None]:
x.shape, y.shape

# Song divided every 5s, only features

In [None]:
def add_features_full_song(song, sr):
    res = []
    
    union = song
    mfcc = librosa.feature.mfcc(song, sr)
    for element in mfcc:
        union = np.concatenate((union, element), axis=None)
    res.append(union)

    union = song
    chroma_stft = librosa.feature.chroma_stft(song, sr)
    for element in chroma_stft:
        union = np.concatenate((union, element), axis=None)
    res.append(union)

    union = song
    spectral_centroid = librosa.feature.spectral_centroid(song, sr)
    union = np.concatenate((union, spectral_centroid), axis=None)
    res.append(union)

    union = song
    zero_crossing_rate = librosa.feature.zero_crossing_rate(song, sr)
    union = np.concatenate((union, zero_crossing_rate), axis=None)
    res.append(union)
    
    return np.array(res)

def load_full_song(path):
    x , sr = librosa.load(path, mono=True, sr=44100)
    song_features = add_features_full_song(x, sr)
    song = np.array([song_features])
    return song

In [None]:
def full_songs_array(default = 'music'):
    data = []
    y = []
    for song_path, genre in music_path(default): 
        for part_song in load_full_song(song_path):
            data.append(part_song)
            y.append(genre)
    return np.array(data), np.array(y)

In [None]:
%%time
x,y = full_songs_array()

In [None]:
x.shape, y.shape

# Image song

In [None]:
def music_path(path, to_predict):
    directories = [i for i in listdir(path) if not i.startswith(".")]
    
    if to_predict:
        directories.remove('Image')
        
    for genre in directories:
        for song in listdir(f"{path}/{genre}"):
            if not song.startswith("."):
                yield f"{path}/{genre}/{song}", genre
                
def check_folder(path):
    path_folder = path.split('/')
    if not os.path.isdir(f'{path_folder[0]}/{path_folder[1]}'):
        os.mkdir(f'{path_folder[0]}/{path_folder[1]}')
        
def save_image(song, path):
    check_folder(path)
    stft = librosa.stft(song)
    song_db = librosa.amplitude_to_db(stft)
    librosa.display.specshow(song_db)
    plt.savefig(f'{path}.png')
    plt.close()
    
def create_image(default = 'music', to_predict=False):
    data = []
    y = []
    for song_path, genre in music_path(default, to_predict):
        tree_dir = song_path.split("/")
        x , sr = librosa.load(song_path, mono=True, sr=44100)
        if to_predict:
            save_image(x, f"one/Image/{tree_dir[-1].replace('.wav','').replace('.mp3','')}")
        else:
            save_image(x, f"music_image/{genre}/{tree_dir[-1].replace('.wav','')}")

In [None]:
create_image()

## Leemos las imagenes

In [2]:
img_size = 255

In [3]:
def read_img(path):
    img = cv2.imread(path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_resized = cv2.resize(img, (img_size, img_size))
    return img_resized

def img_path(path, to_predict):
    directories = [i for i in listdir(path) if not i.startswith(".")]
    
    if to_predict:
        directories.remove('Music')
    
    for genre in directories:
        for song_img in listdir(f"{path}/{genre}"):
            if not song_img.startswith("."):
                yield f"{path}/{genre}/{song_img}", genre

def get_img_data(path = 'music_image', to_predict=False):
    data = []
    y = []
    for img_p, genre in img_path(path, to_predict):
        data.append(read_img(img_p))
        y.append(genre)
    return np.array(data), np.array(y)

In [4]:
x,y = get_img_data()

In [5]:
x.shape

(599, 255, 255, 3)

# Data Preparation

In [6]:
genre = pd.DataFrame(data=y, columns=["Genero"])
ohe = OneHotEncoder()
ohe.fit(genre[["Genero"]])
genre_ohe = ohe.transform(genre[["Genero"]]).todense()
#genre_ohe

In [7]:
directories = sorted([i for i in listdir("music_image") if not i.startswith(".")])
df_ohe = pd.DataFrame(data=genre_ohe, columns=directories)
df_ohe

Unnamed: 0,blues,classical,country,disco,hiphop,jazz,metal,pop,reggae,rock
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
X_train, X_val, y_train, y_val = train_test_split(x, genre_ohe, train_size=0.7, random_state=42)

In [9]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((419, 255, 255, 3), (180, 255, 255, 3), (419, 10), (180, 10))

In [10]:
y_val.sum(axis=-2).mean()

18.0

In [11]:
y_val.sum(axis=-2)

matrix([[16., 28., 12., 15., 17., 18., 19., 17., 22., 16.]])

In [12]:
X_val.max()

255

In [13]:
X_train = X_train / 255
X_val = X_val / 255

X_train.reshape(-1, img_size, img_size, 1)
X_val.reshape(-1, img_size, img_size, 1)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((419, 255, 255, 3), (180, 255, 255, 3), (419, 10), (180, 10))

In [14]:
model = Sequential([
    Conv2D(128,3,padding='same', activation='relu', input_shape= (img_size, img_size, 3)),
    MaxPool2D(),
    
    Conv2D(256, 3, padding='same', activation='relu'),
    MaxPool2D(),
    
    Conv2D(512, 3, padding='same', activation='relu'),
    MaxPool2D(),
    
    Conv2D(1024, 3, padding='same', activation='relu'),
    MaxPool2D(),
    Dropout(0.4),
    
    Flatten(),
    Dense(128,activation='relu'),
    Dense(y_train.shape[1], activation='softmax')
])

2021-10-26 23:29:19.253659: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-10-26 23:29:19.255853: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-26 23:29:19.259061: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [15]:
'''model = Sequential([
    Dense(512, activation="relu", input_shape = (img_size, img_size, 3)),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(1_000, activation="relu"),
    Dense(3, activation="softmax")
])'''

'model = Sequential([\n    Dense(512, activation="relu", input_shape = (img_size, img_size, 3)),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(1_000, activation="relu"),\n    Dense(3, activation="softmax")\n])'

In [16]:
optimizer = optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer,
              loss="categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 255, 255, 128)     3584      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 127, 127, 128)     0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 127, 127, 256)     295168    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 63, 63, 256)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 63, 63, 512)       1180160   
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 31, 31, 512)       0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 31, 31, 1024)      4

In [17]:
X_train.shape, y_train.shape

((419, 255, 255, 3), (419, 10))

In [18]:
history = model.fit(X_train, 
                    y_train,
         validation_data=(X_val, 
                          y_val),
         epochs=500,
         verbose=1,
         batch_size=32)

2021-10-26 23:29:25.251569: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-10-26 23:29:25.254371: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3800020000 Hz


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500


Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
 1/14 [=>............................] - ETA: 1:10 - loss: 0.0124 - accuracy: 1.0000

KeyboardInterrupt: 

In [19]:
#model.save('saved_model/test_full_bad.h5')

# Predict data

In [None]:
#model = models.load_model('saved_model/test.h5')

In [None]:
create_image('one', True)

In [20]:
x,y = get_img_data('one', True)

In [21]:
x.shape

(1, 255, 255, 3)

In [22]:
y.shape

(1,)

In [23]:
y_pred = model.predict(x)

In [24]:
directories = [i for i in listdir("music_image") if not i.startswith(".")]
res = {}
for d in directories:
    res[d] = 0

for part_song in range(len(y_pred)):
    for percent_predict in range(len(y_pred[part_song])):
        res[directories[percent_predict]] += y_pred[part_song][percent_predict]

for i in res.items():
    print(f"{i[0]} -> {(i[1]/x.shape[0])*100}")

hiphop -> 0.0
classical -> 0.0
blues -> 0.0
metal -> 0.0
jazz -> 0.0
country -> 100.0
pop -> 0.0
rock -> 0.0
disco -> 0.0
reggae -> 0.0
