In [33]:
import tensorflow as tf
import tensorflow_hub as tfhub
import librosa 
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
from tensorflow.keras import layers, models

yamnet_model = tfhub.load("https://tfhub.dev/google/yamnet/1")

In [None]:
dataset_path = "../librosa/genres_original"
dataset_csv_path = "song_embeddings.csv"

X = []
y = []

genre_map = {"blues": 0,
             "classical": 1,
             "country": 2,
             "disco": 3,
             "hiphop": 4,
             "jazz": 5,
             "metal": 6,
             "pop": 7,
             "reggae": 8,
             "rock": 9}

if not os.path.exists(dataset_csv_path):
    for genre in os.listdir(dataset_path):
        genre_path = os.path.join(dataset_path, genre)
        for file in tqdm(os.listdir(genre_path), desc=f"Extracting the songs segment in folder {genre}"):
            file_path = os.path.join(genre_path, file)
            try:
                waveform, _ = librosa.load(file_path, sr=16000, mono=True)
                waveform = waveform.astype(np.float32)

                _, embeddings, _ = yamnet_model(waveform)
                song_embeddings = tf.reduce_mean(embeddings, axis=0)

                X.append(song_embeddings.numpy())
                y.append(genre_map[genre]) 

            except:
                print(f"Error has occured in file path {file_path}")

Extracting the songs segment in folder reggae: 100%|██████████| 100/100 [00:20<00:00,  4.91it/s]
  waveform, _ = librosa.load(file_path, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Extracting the songs segment in folder jazz:  30%|███       | 30/100 [00:06<00:18,  3.88it/s]

Error has occured in file path ../librosa/genres_original/jazz/jazz.00054.wav


Extracting the songs segment in folder jazz: 100%|██████████| 100/100 [00:31<00:00,  3.19it/s]
Extracting the songs segment in folder pop: 100%|██████████| 100/100 [00:25<00:00,  3.87it/s]
Extracting the songs segment in folder rock: 100%|██████████| 100/100 [00:20<00:00,  4.79it/s]
Extracting the songs segment in folder country: 100%|██████████| 100/100 [00:20<00:00,  4.92it/s]
Extracting the songs segment in folder hiphop: 100%|██████████| 100/100 [00:20<00:00,  4.84it/s]
Extracting the songs segment in folder blues: 100%|██████████| 100/100 [00:20<00:00,  4.78it/s]
Extracting the songs segment in folder metal: 100%|██████████| 100/100 [00:20<00:00,  4.81it/s]
Extracting the songs segment in folder classical: 100%|██████████| 100/100 [00:20<00:00,  4.85it/s]
Extracting the songs segment in folder disco: 100%|██████████| 100/100 [00:20<00:00,  4.93it/s]


In [None]:
if not os.path.exists(dataset_csv_path):
    df = pd.DataFrame(X)
    df["label"] = y

    df.to_csv(dataset_csv_path)

In [None]:
df = pd.read_csv(dataset_csv_path, index_col=0)

X = np.array(df.drop(columns=["label"]))
y = np.array(df["label"])

genre_label = np.array(["blues",
               "classical",
               "country",
               "disco",
               "hiphop",
               "jazz",
               "metal",
               "pop",
               "reggae",
               "rock"])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42,
                                                    test_size=0.2,
                                                    stratify=y)

model = models.Sequential([
    layers.Input(shape=(1024,)),
    layers.Dense(256, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dense(len(genre_label), activation="softmax")
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)


In [None]:
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            patience=5,
            restore_best_weights=True
        )
    ]
)

ValueError: Invalid dtype: object

In [None]:
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)

print(classification_report(
    y_test,
    y_pred_labels,
    target_names=genre_label
))

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
              precision    recall  f1-score   support

       blues       0.90      0.95      0.93        20
   classical       1.00      0.95      0.97        20
     country       1.00      0.85      0.92        20
       disco       0.94      0.80      0.86        20
      hiphop       0.86      0.90      0.88        20
        jazz       1.00      0.95      0.97        20
       metal       1.00      0.85      0.92        20
         pop       0.81      0.85      0.83        20
      reggae       0.77      0.85      0.81        20
        rock       0.58      0.75      0.65        20

    accuracy                           0.87       200
   macro avg       0.89      0.87      0.87       200
weighted avg       0.89      0.87      0.87       200



In [None]:
waveform, _ = librosa.load("../librosa/rock.991.mp3", sr=16000, mono=True)
waveform = waveform.astype(np.float32)

_, embeddings, _ = yamnet_model(waveform)
song_embeddings = tf.reduce_mean(embeddings, axis=0)

pred = model.predict(song_embeddings.numpy().reshape(1, -1))
predicted_class = tf.argmax(pred, axis=1).numpy()[0]

print(genre_label[predicted_class])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
classical


In [None]:
X.shape

(1022976, 1)