In [1]:
import tensorflow as tf
import tensorflow_hub as tfhub
import librosa 
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
from tensorflow.keras import layers, models

yamnet_model = tfhub.load("https://tfhub.dev/google/yamnet/1")

2026-01-06 23:12:11.521967: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2026-01-06 23:12:13.183617: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-06 23:12:16.477044: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
  if not hasattr(np, "object"):
  from pkg_resources import parse_version
2026-01-06 23:12:22.740005: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [2]:
dataset_path = "../librosa/vibes"
dataset_csv_path = "songs_embedding.csv"

X = []
y = []

vibe_map = {"calm": 0,
            "chaotic": 1,
            "chill": 2,
            "energetic": 3}

if not os.path.exists(dataset_csv_path):
    for vibe in os.listdir(dataset_path):
        vibe_path = os.path.join(dataset_path, vibe)
        for file in tqdm(os.listdir(vibe_path), desc=f"Extracting the songs embbeding in folder {vibe}"):
            file_path = os.path.join(vibe_path, file)
            try:
                waveform, _ = librosa.load(file_path, sr=16000, mono=True)
                waveform = waveform.astype(np.float32)

                _, embeddings, _ = yamnet_model(waveform)
                song_embeddings = tf.reduce_mean(embeddings, axis=0)

                X.append(song_embeddings.numpy())
                y.append(vibe_map[vibe]) 

            except:
                print(f"Error has occured in file path {file_path}")

Extracting the songs embbeding in folder calm:   0%|          | 0/300 [00:00<?, ?it/s]2026-01-06 23:12:30.725373: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 24379392 exceeds 10% of free system memory.
Extracting the songs embbeding in folder calm:   0%|          | 1/300 [00:04<23:33,  4.73s/it]2026-01-06 23:12:31.125688: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 24379392 exceeds 10% of free system memory.
Extracting the songs embbeding in folder calm:   1%|          | 2/300 [00:04<10:25,  2.10s/it]2026-01-06 23:12:31.485865: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 24379392 exceeds 10% of free system memory.
Extracting the songs embbeding in folder calm:   1%|          | 3/300 [00:05<06:25,  1.30s/it]2026-01-06 23:12:31.697308: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 24379392 exceeds 10% of free system memory.
Extracting the songs embbedi

Error has occured in file path ../librosa/vibes/calm/jazz.00054.wav


Extracting the songs embbeding in folder calm: 100%|██████████| 300/300 [01:20<00:00,  3.73it/s]
Extracting the songs embbeding in folder chill: 100%|██████████| 200/200 [00:46<00:00,  4.32it/s]
Extracting the songs embbeding in folder chaotic: 100%|██████████| 200/200 [00:43<00:00,  4.57it/s]
Extracting the songs embbeding in folder energetic: 100%|██████████| 300/300 [01:07<00:00,  4.46it/s]


In [3]:
if not os.path.exists(dataset_csv_path):
    df = pd.DataFrame(X)
    df["label"] = y

    df.to_csv(dataset_csv_path)

In [4]:
df = pd.read_csv(dataset_csv_path, index_col=0)

X = np.array(df.drop(columns=["label"]))
y = np.array(df["label"])

vibe_label = np.array(["calm",
                       "chaotic",
                       "chill",
                       "energetic"])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42,
                                                    test_size=0.2,
                                                    stratify=y)

model = models.Sequential([
    layers.Input(shape=(1024,)),
    layers.Dense(256, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dense(len(vibe_label), activation="softmax")
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)


In [5]:
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            patience=5,
            restore_best_weights=True
        )
    ]
)

Epoch 1/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.7574 - loss: 0.6040 - val_accuracy: 0.8188 - val_loss: 0.8071
Epoch 2/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8967 - loss: 0.3127 - val_accuracy: 0.8500 - val_loss: 0.7273
Epoch 3/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9186 - loss: 0.2550 - val_accuracy: 0.8687 - val_loss: 0.6577
Epoch 4/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9155 - loss: 0.2421 - val_accuracy: 0.8750 - val_loss: 0.6600
Epoch 5/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9327 - loss: 0.1799 - val_accuracy: 0.8875 - val_loss: 0.5529
Epoch 6/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9468 - loss: 0.1781 - val_accuracy: 0.8938 - val_loss: 0.6080
Epoch 7/50
[1m20/20[0m [32m━━━━

In [6]:
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)

print(classification_report(
    y_test,
    y_pred_labels,
    target_names=vibe_label
))

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
              precision    recall  f1-score   support

        calm       0.90      0.90      0.90        60
     chaotic       0.88      0.90      0.89        40
       chill       0.89      0.85      0.87        40
   energetic       0.85      0.87      0.86        60

    accuracy                           0.88       200
   macro avg       0.88      0.88      0.88       200
weighted avg       0.88      0.88      0.88       200



In [18]:
waveform, _ = librosa.load("../librosa/energetic.2.mp3", sr=16000, mono=True)
waveform = waveform.astype(np.float32)

_, embeddings, _ = yamnet_model(waveform)
song_embeddings = tf.reduce_mean(embeddings, axis=0)

pred = model.predict(song_embeddings.numpy().reshape(1, -1))
predicted_class = tf.argmax(pred, axis=1).numpy()[0]

print(vibe_label[predicted_class])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
energetic
