In [1]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import pickle
import librosa
import cv2
import os

In [2]:
import tensorflow as tf
import tensorflow.keras.layers as layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [4]:
def load_data(resize_w=250, resize_h=250, n_mfcc=13, max_len=500):
    X_img = []
    X_mfcc = []
    y = []
    for genre in os.listdir("images"):
        for img_file, audio_file in zip(os.listdir("images/{}".format(genre)), os.listdir("audio/{}".format(genre))):
            img = cv2.imread("images/{}/{}".format(genre, img_file))
            if img is not None:
                img = cv2.resize(img, (resize_w, resize_h))
                X_img.append(img)
                
                audio, sr = librosa.load("audio/{}/{}".format(genre, audio_file))
                mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
                mfcc = np.transpose(mfcc)
                mfcc = pad_sequences([mfcc], maxlen=max_len, padding='post', truncating='post')[0]
                
                X_mfcc.append(mfcc)
                
                y.append(genre)
                
        print("Fin {}".format(genre))
                                        
    return np.array(X_img), np.array(X_mfcc), np.array(y),

In [5]:
X_image, X_mfcc, y, = load_data(75, 75, 13, 500)
print(X_image.shape, X_mfcc.shape, y.shape)

Fin dubstep
Fin jazz
Fin rap
Fin rnb
Fin rock
(1028, 75, 75, 3) (1028, 500, 13) (1028,)


In [6]:
# encoder = LabelEncoder()
# y = encoder.fit_transform(y)
# y = tf.keras.utils.to_categorical(y, num_classes=len(encoder.classes_))
# pickle.dump(encoder, open("encoder.pkl", "wb"))

In [21]:
encoder = pickle.load(open("encoder.pkl", "rb"))
y = encoder.transform(y)
y = tf.keras.utils.to_categorical(y, num_classes=len(encoder.classes_))

In [23]:
def partition_dataset(X1, X2, y, train_split=0.7, val_split=0.15, test_split=0.15):
    # Calculez les indices de séparation en fonction des proportions
    num_samples = len(X1)
    num_train = int(train_split * num_samples)
    num_val = int(val_split * num_samples)

    # Générez un indice aléatoire de permutation
    permutation_indices = np.random.permutation(num_samples)

    # Appliquer la permutation aux deux jeux de données et aux étiquettes
    X1_permuted = [X1[i] for i in permutation_indices]
    X2_permuted = [X2[i] for i in permutation_indices]
    y_permuted = [y[i] for i in permutation_indices]

    # Divisez les données en ensembles d'entraînement, de validation et de test
    X1_train = X1_permuted[:num_train]
    X2_train = X2_permuted[:num_train]
    y_train = y_permuted[:num_train]

    X1_val = X1_permuted[num_train:num_train + num_val]
    X2_val = X2_permuted[num_train:num_train + num_val]
    y_val = y_permuted[num_train:num_train + num_val]

    X1_test = X1_permuted[num_train + num_val:]
    X2_test = X2_permuted[num_train + num_val:]
    y_test = y_permuted[num_train + num_val:]
    
    X1_train = np.array(X1_train)
    X2_train = np.array(X2_train)
    y_train = np.array(y_train)
    
    X1_val = np.array(X1_val)
    X2_val = np.array(X2_val)
    y_val = np.array(y_val)
    
    X1_test = np.array(X1_test)
    X2_test = np.array(X2_test)
    y_test = np.array(y_test)
    
    return [X1_train, X2_train, y_train], [X1_val, X2_val, y_val], [X1_test, X2_test, y_test]

In [24]:
train, val, test = partition_dataset(X_image, X_mfcc, y)

In [25]:
X_train_img, X_train_mfcc, y_train = train
X_test_img, X_test_mfcc, y_test = test
X_val_img, X_val_mfcc, y_val = val

In [71]:
print(X_train_img.shape, X_test_img.shape, X_val_img.shape)
print(X_train_mfcc.shape, X_test_mfcc.shape, X_val_mfcc.shape)

(719, 75, 75, 3) (155, 75, 75, 3) (154, 75, 75, 3)
(719, 500, 13) (155, 500, 13) (154, 500, 13)


In [27]:
y_train.shape, y_test.shape, y_val.shape

((719, 5), (155, 5), (154, 5))

# Fusion

In [90]:
input_img = layers.Input(shape=X_train_img[0].shape)
img_model = layers.Conv2D(64, 3, activation="relu")(input_img)
img_model = layers.MaxPooling2D(2)(img_model)
img_model = layers.Conv2D(128, 4, activation="relu")(img_model)
img_model = layers.MaxPooling2D(2)(img_model)
img_model = layers.Conv2D(128, 4, activation="relu")(img_model)
img_model = layers.MaxPooling2D(2)(img_model)
img_model = layers.Conv2D(64, 3, activation="relu")(img_model)
img_model = layers.MaxPooling2D(2)(img_model)
img_model = layers.GlobalAvgPool2D()(img_model)

input_mfcc = layers.Input(shape=X_train_mfcc[0].shape)
mfcc_model = layers.Conv1D(64, 3, activation="relu")(input_mfcc)
mfcc_model = layers.MaxPooling1D(2)(mfcc_model)
mfcc_model = layers.Conv1D(128, 3, activation="relu")(mfcc_model)
mfcc_model = layers.MaxPooling1D(2)(mfcc_model)
mfcc_model = layers.Conv1D(128, 3, activation="relu")(mfcc_model)
mfcc_model = layers.MaxPooling1D(2)(mfcc_model)
mfcc_model = layers.Conv1D(64, 3, activation="relu")(mfcc_model)
mfcc_model = layers.MaxPooling1D(2)(mfcc_model)
mfcc_model = layers.GlobalAvgPool1D()(mfcc_model)

merged = layers.concatenate([img_model, mfcc_model])
output = layers.Dense(64, activation="relu")(merged)
output = layers.Dropout(0.3)(output)
output = layers.Dense(len(encoder.classes_), activation="softmax")(output)
model = Model(inputs=[input_img, input_mfcc], outputs=output)


model.summary()

Model: "model_15"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_31 (InputLayer)          [(None, 75, 75, 3)]  0           []                               
                                                                                                  
 input_32 (InputLayer)          [(None, 500, 13)]    0           []                               
                                                                                                  
 conv2d_63 (Conv2D)             (None, 73, 73, 64)   1792        ['input_31[0][0]']               
                                                                                                  
 conv1d_60 (Conv1D)             (None, 498, 64)      2560        ['input_32[0][0]']               
                                                                                           

In [91]:
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath="model.h5",
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True
    )
    
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit([X_train_img, X_train_mfcc], 
          y_train, 
          validation_data=([X_val_img, X_val_mfcc], y_val), 
          epochs=10, 
          callbacks=[model_checkpoint_callback]
         )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x203d30fc970>

In [92]:
model.load_weights("model.h5")

In [93]:
model.evaluate([X_test_img, X_test_mfcc], y_test)



[0.8766364455223083, 0.6580645442008972]

In [94]:
y_pred = np.argmax(model.predict([X_test_img, X_test_mfcc]), axis=1)
y_true = np.argmax(y_test, axis=1)
report = classification_report(y_true, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.40      0.55        15
           1       0.88      0.85      0.86        33
           2       0.53      0.67      0.59        36
           3       0.49      0.50      0.49        38
           4       0.78      0.76      0.77        33

    accuracy                           0.66       155
   macro avg       0.71      0.63      0.65       155
weighted avg       0.68      0.66      0.66       155



In [35]:
model.save("best_model_fusion.h5")