In [3]:
import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

data_path = 'Data/genres_original/'

features = []
labels = []

def audio_to_spectrogram(file_path):
    try:
        y, sr = librosa.load(file_path, sr=None)
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
        S_db = librosa.power_to_db(S, ref=np.max)
        return S_db
    except Exception as e:
        print(f"Erreur lors de l'extraction du spectrogramme pour {file_path}: {e}")
        return None

for genre_folder in os.listdir(data_path):
    genre_path = os.path.join(data_path, genre_folder)
    if os.path.isdir(genre_path): 
        for file in os.listdir(genre_path):
            if file.endswith('.wav'):  
                file_path = os.path.join(genre_path, file)
                spectrogram = audio_to_spectrogram(file_path)
                
                if spectrogram is not None:
                   
                    spectrogram_resized = np.resize(spectrogram, (224, 224, 1))
                    spectrogram_resized = np.repeat(spectrogram_resized, 3, axis=-1)
                    
                    features.append(spectrogram_resized)
                    labels.append(genre_folder)  

print(f"Nombre de spectrogrammes extraits : {len(features)}")

features = np.array(features)
labels = np.array(labels)

features = features.astype('float32') / 255.0

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(256, activation='relu')(x)
x = Dense(128, activation='relu')(x)
predictions = Dense(len(np.unique(labels)), activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)

for layer in base_model.layers:
    layer.trainable = False

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)

from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

model.save('vgg_genre_model.h5')
print("Modèle VGG16 sauvegardé sous 'vgg_genre_model.h5'.")
joblib.dump(label_encoder, 'label_encoder.pkl')


  y, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Erreur lors de l'extraction du spectrogramme pour Data/genres_original/jazz\jazz.00054.wav: 
Nombre de spectrogrammes extraits : 999
Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 4s/step - accuracy: 0.1055 - loss: 2.3227 - val_accuracy: 0.1900 - val_loss: 2.2672
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 4s/step - accuracy: 0.2352 - loss: 2.1918 - val_accuracy: 0.1800 - val_loss: 2.1502
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 4s/step - accuracy: 0.2958 - loss: 2.0200 - val_accuracy: 0.3000 - val_loss: 2.0013
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 4s/step - accuracy: 0.3179 - loss: 1.9449 - val_accuracy: 0.2350 - val_loss: 1.9468
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 4s/step - accuracy: 0.3070 - loss: 1.8738 - val_accuracy: 0.3050 - val_loss: 1.8799
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.34
Classification Report:
              precision    recall  f1-score   support

       blues       0.00      0.00      0.00        21
   classical       0.43      1.00      0.60        12
     country       0.23      0.29      0.26        24
       disco       0.44      0.68      0.54        22
      hiphop       0.38      0.20      0.26        15
        jazz       0.00      0.00      0.00        27
       metal       0.76      0.72      0.74        18
         pop       0.25      0.05      0.09        19
      reggae       0.26      0.45      0.33        22
        rock       0.18      0.35      0.24        20

    accuracy                           0.34       200
   macro avg       0.29      0.38      0.31       200
weighted avg       0.27      0.34      0.28       200

Modèle VGG16 sauvegardé sous 'vgg_genre_model.h5'.


['label_encoder.pkl']