In [1]:
import numpy as np
import pandas as pd
import os
import librosa
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers, models
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1")

# def extract_song_embeddings(file_path):
#     try:
#         waveform, _ = librosa.load(file_path, sr=16000, mono=True)
#         waveform = waveform.astype(np.float32)

#         _, embbedings, _ = yamnet_model(waveform)
#         song_embeddings = tf.reduce_mean(embbedings, axis=0)
        
#         return song_embeddings.numpy()
#     except:
#         print(f"Error has occured in file path {file_path}")
#         return None
    
def extract_song_embeddings(segment):
    _, embbedings, _ = yamnet_model(segment)
    song_embeddings = tf.reduce_mean(embbedings, axis=0)
        
    return song_embeddings.numpy()

def extract_song_segments(file_path):
    segment_seconds = 5
    overlap_seconds = 0 
    segments = []

    try:
        waveform, sr = librosa.load(file_path, sr=16000, mono=True)

        segment_samples = segment_seconds * sr
        hop_samples = segment_samples - int(overlap_seconds * sr)

        for start in range(0, 
                           len(waveform) - segment_samples + 1, 
                           hop_samples):
                           
            segment = waveform[start: start + segment_samples]
            segments.append(segment)

        return segments
    
    except:
        print(f"Error has occured in file path {file_path}")
        return None

2026-01-07 22:02:13.483389: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2026-01-07 22:02:14.019463: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-07 22:02:16.319585: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
  if not hasattr(np, "object"):
  from pkg_resources import parse_version
2026-01-07 22:02:18.809078: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [2]:
dataset_path = "vibes"
dataset_csv_path = "song_embeddings.csv"

X = []
y = []
vibe_map = {"relaxing": 0,
             "chaotic": 1,
             "chill": 2,
             "energetic": 3}

if not os.path.exists(dataset_csv_path):
    for vibe in os.listdir(dataset_path):
        vibe_path = os.path.join(dataset_path, vibe)
        for file in tqdm(os.listdir(vibe_path), desc=f"Extracting songs embedding in folder {vibe}"):
            file_path = os.path.join(vibe_path, file)
            segments = extract_song_segments(file_path)
            if segments is not None:
                for segment in segments:
                    embeddings = extract_song_embeddings(segment)
                    X.append(embeddings)
                    y.append(vibe_map[vibe])

Extracting songs embedding in folder chill: 100%|██████████| 200/200 [01:20<00:00,  2.48it/s]
Extracting songs embedding in folder chaotic: 100%|██████████| 200/200 [00:54<00:00,  3.70it/s]
Extracting songs embedding in folder energetic: 100%|██████████| 300/300 [01:12<00:00,  4.12it/s]
  waveform, sr = librosa.load(file_path, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Extracting songs embedding in folder relaxing:  39%|███▉      | 118/300 [00:32<00:40,  4.53it/s]

Error has occured in file path vibes/relaxing/jazz.00054.wav


Extracting songs embedding in folder relaxing: 100%|██████████| 300/300 [01:18<00:00,  3.84it/s]


In [3]:
if not os.path.exists(dataset_csv_path):
    df = pd.DataFrame(X)
    df["label"] = y

    df.to_csv(dataset_csv_path)

In [4]:
df = pd.read_csv(dataset_csv_path,
                 index_col=0)

X = np.array(df.drop(columns=["label"]))
y = np.array(df["label"])

vibe_labels = ["Relaxing", 
               "Chaotic", 
               "Chill", 
               "Energetic"]

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=42, 
                                                    test_size=0.20,
                                                    stratify=y)

model = models.Sequential([
    layers.Input(shape=(1024,)),
    layers.Dense(256, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dense(len(vibe_labels), activation="softmax")
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            patience=5,
            restore_best_weights=True
        )
    ]
)

Epoch 1/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.8151 - loss: 0.5108 - val_accuracy: 0.8789 - val_loss: 0.6103
Epoch 2/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8658 - loss: 0.3515 - val_accuracy: 0.8820 - val_loss: 0.4148
Epoch 3/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8815 - loss: 0.3124 - val_accuracy: 0.8967 - val_loss: 0.3309
Epoch 4/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8966 - loss: 0.2786 - val_accuracy: 0.8914 - val_loss: 0.3073
Epoch 5/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8979 - loss: 0.2690 - val_accuracy: 0.8789 - val_loss: 0.3367
Epoch 6/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9060 - loss: 0.2457 - val_accuracy: 0.9061 - val_loss: 0.2719
Epoch 7/50
[1m120/120[0m

In [5]:
y_preds = model.predict(X_test)
y_pred_labels = np.argmax(y_preds, axis=1)

print(classification_report(y_test,
                            y_pred_labels,
                            target_names=vibe_labels))

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
              precision    recall  f1-score   support

    Relaxing       0.95      0.97      0.96       358
     Chaotic       0.85      0.90      0.87       240
       Chill       0.89      0.90      0.90       240
   Energetic       0.91      0.86      0.89       359

    accuracy                           0.91      1197
   macro avg       0.90      0.91      0.90      1197
weighted avg       0.91      0.91      0.91      1197



In [17]:
z = []
song_segments = extract_song_segments("testing_dataset/kehlani.mp3")

for song_segment in song_segments:
    song_embeddings = extract_song_embeddings(song_segment)
    z.append(song_embeddings)

z = np.array(z)

y_pred = model.predict(z)

song_pred = np.mean(y_pred, axis=0)
y_pred_label = np.argmax(song_pred)

print("Predicted Class:", vibe_labels[y_pred_label])

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Predicted Class: Energetic
