In [2]:
from src.music_recommender.models.hybrid import HybridModel
import joblib
from src.music_recommender.config import Config
import os
import warnings

import joblib
import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split

from src.music_recommender.config import Config
from src.music_recommender.data.pipeline import create_extraction_pipeline
from src.music_recommender.evaluation.evaluator import get_best_models, get_top_3_models
from src.music_recommender.models.hybrid import HybridModel
from src.music_recommender.models.model_registry import (
    MODEL_CLASS_LOOKUP,
    TARGET_GROUPS,
)
from src.music_recommender.training.trainer import train_models
from src.music_recommender.utils.helpers import tree
from src.music_recommender.utils.logger import get_logger

warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="Objective did not converge")


In [3]:

logger = get_logger(context="training")
cfg = Config()

logger.info("Loading dataset...")
audio_data = pd.read_csv(cfg.paths.processed / "matched_metadata.csv")

bins = [0, 80, 100, 120, 140, 170, float("inf")]
numeric_labels = [0, 1, 2, 3, 4, 5]
audio_data["tempo_bins"] = pd.cut(
    audio_data["tempo"], bins=bins, labels=numeric_labels, right=False
)

X = audio_data["track_id"].map(
    lambda id: cfg.paths.processed / "audio" / f"{str(id).zfill(6)}.mp3"
)
y = audio_data[
    [
        "danceability",
        "energy",
        "key",
        "loudness",
        "mode",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo_bins",
    ]
]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
logger.info(f"Train: {len(X_train)} samples, Test: {len(X_test)} samples")

logger.info("Extracting audio features...")
extraction_pipeline = create_extraction_pipeline(cfg)
X_train_extracted = extraction_pipeline.fit_transform(X_train)
X_test_extracted = extraction_pipeline.transform(X_test)
logger.info(f"Feature extraction complete. Shape: {X_train_extracted.shape}")




[32m2025-11-09 17:10:14.499[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mLoading dataset...[0m
[32m2025-11-09 17:10:14.542[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1mTrain: 1383 samples, Test: 346 samples[0m
[32m2025-11-09 17:10:14.543[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mExtracting audio features...[0m
Loading audio: 100%|██████████| 1383/1383 [01:04<00:00, 21.34it/s]
Computing spectrograms: 100%|██████████| 1383/1383 [01:01<00:00, 22.51it/s]
Extracting features: 100%|██████████| 1383/1383 [00:00<00:00, 3234.55it/s]
Loading audio: 100%|██████████| 346/346 [00:15<00:00, 21.82it/s]
Computing spectrograms: 100%|██████████| 346/346 [00:15<00:00, 22.35it/s]
Extracting features: 100%|██████████| 346/346 [00:00<00:00, 3733.80it/s]
[32m2025-11-09 17:12:53.083[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m41[0m - [1mFeature extraction complete. Shape: (138

In [4]:
cfg = Config()

In [5]:
hybrid_model = joblib.load(cfg.paths.models/"hybrid_model.joblib")

In [6]:
hybrid_model

0,1,2
,top_models,"defaultdict(<...-05)])""}})})})"
,best_models,"defaultdict(<...79844)])""}})})"
,skip_stacking,{'regression': ['structure']}
,target_groups,"{'classification': {'key': ['key'], 'mode': ['mode'], 'tempo_bins': ['tempo_bins']}, 'regression': {'energy_mood': ['energy', 'valence', ...], 'production': ['loudness', 'acousticness', ...], 'structure': ['speechiness']}}"
,lookup_table,"{'classification': {'Random Forest': <class 'sklea...stClassifier'>, 'SVM': <class 'sklea..._classes.SVC'>, 'XGBoost': <class 'xgboo...GBClassifier'>}, 'regression': {'ElasticNet': <class 'sklea...t.ElasticNet'>, 'Lasso': <class 'sklea...escent.Lasso'>, 'Random Forest': <class 'sklea...estRegressor'>, 'Ridge': <class 'sklea..._ridge.Ridge'>, ...}}"
,random_state,42
,cv,4
,final_estimator_reg,Ridge()
,final_estimator_class,LogisticRegre...ndom_state=42)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,5000

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [7]:
prediction = hybrid_model.predict(X_train_extracted.iloc[[0]])
prediction

Unnamed: 0,energy,valence,danceability,loudness,acousticness,instrumentalness,liveness,speechiness,key,mode,tempo_bins
0,0.854202,0.434965,0.497951,-6.734079,0.063845,0.839302,0.20591,0.088295,11,1,2


In [8]:
feature_cols = prediction.columns.tolist()

In [9]:
feature_cols[:-3]

['energy',
 'valence',
 'danceability',
 'loudness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'speechiness']

In [10]:
spotify_dataset = pd.read_csv(cfg.paths.data / "raw/spotify-12m-songs/tracks_features.csv")

In [11]:
print(audio_data["tempo"].min(), audio_data["tempo"].max())
print(audio_data["tempo"].isna().sum())
print((audio_data["tempo"] < 0).sum())


0.0 220.165
0
0


In [12]:
bins = [0, 80, 100, 120, 140, 170, float("inf")]
numeric_labels = [0, 1, 2, 3, 4, 5]
spotify_dataset["tempo_bins"] = pd.cut(
    spotify_dataset["tempo"], bins=bins, labels=numeric_labels, right=False
)

In [13]:
print(spotify_dataset['tempo'].max(),spotify_dataset['tempo'].min())

248.93400000000003 0.0


In [14]:
spotify_dataset.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,tempo_bins
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,2
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,...,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02,2
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,...,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02,4
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,...,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02,1
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,...,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02,3


In [15]:
spotify_dataset.isna().sum()

id                   0
name                 3
album               11
album_id             0
artists              0
artist_ids           0
track_number         0
disc_number          0
explicit             0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
duration_ms          0
time_signature       0
year                 0
release_date         0
tempo_bins           0
dtype: int64

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
spotify_features_scaled = scaler.fit_transform(spotify_dataset[feature_cols])

In [17]:
import numpy as np

In [18]:
np.isnan(spotify_features_scaled).sum() /len(spotify_features_scaled)

np.float64(0.0)

In [25]:
similarities = cosine_similarity(prediction,spotify_features_scaled)[0]

top_indices = np.argsort(similarities)[-10:][::-1]

recommendations = spotify_dataset.iloc[top_indices].copy()
recommendations['similarity_score'] = similarities[top_indices]


In [26]:
recommendations = spotify_dataset.iloc[top_indices].copy()
recommendations['similarity_score'] = similarities[top_indices]

In [27]:
recommendations

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,tempo_bins,similarity_score
869583,2fYPYaRlwpRurSVocP57qR,Semi Stereo,Autoepy,5ls8ZFTbOGzqFcRPSHFE7L,['Epy'],['3ligaExqbyo0NTJq2wgLn9'],7,1,False,0.399,...,0.122,0.102,0.563,128.84,150366,4.0,2019,2019-03-31,3,0.850109
694436,0v9M80vJob9QT0Utw065xh,Vocals & Hand Clapping - Remastered,Songs for Heroes,6s9WCxK9oqkQ1n3xHlQ4Nt,"['Phil Dadson', 'From Scratch']","['7vdrVrksJJYdYulingIwBw', '4a8I5bCVU2LJp8Sydw...",3,1,False,0.562,...,0.00242,0.0868,0.417,157.723,357933,4.0,1991,1991-10-15,4,0.808992
861939,7qB8p8hMkaUZB7Ei3TTnDw,Get Happy,Plays The Music Of Harold Arlen,3b8hcyHkJS56xQjuG58eUT,['Tommy Flanagan'],['6dUZplQfg5blo0h9HiJ94d'],7,1,False,0.453,...,0.362,0.0842,0.567,122.672,312187,4.0,2016,2016-07-08,3,0.801018
1200611,1fGBEGpELqPxu1ycokLAAy,Toyland,Dark Xmas,7rFJUWV6c961lfGViA7DnV,"['KRSR', 'SWIM']","['0yYoe9glX1NBRybKRXpQR4', '2u96w3T0EikMkttEFz...",2,1,False,0.551,...,0.241,0.259,0.319,136.02,229412,4.0,2014,2014,3,0.798253
987542,535qfjYWFpQBjvIpWNG1ZG,"Sunflower, Vol. 6",...but i'd rather be with you,1Misn812Z5W216eNI7NuIy,['Molly Tuttle'],['4LX0KCPnH7gvxEbVXqXmAE'],9,1,False,0.579,...,0.0187,0.103,0.459,137.766,235418,4.0,2020,2020-08-28,3,0.794237
26891,5XEy6kbQsZfr1In2xpUU1j,Autumn Leaves,Bobby Campo,2auWhhrlto8QHZC3lkhugp,['Bobby Campo'],['4GSi7NUcGP1LYjVui2FFbP'],2,1,False,0.478,...,0.514,0.165,0.596,123.463,294933,4.0,2000,2000,3,0.791441
660688,1UNMuMRoAjMet1koig3Wux,"L’Incoronazione di Dario, RV 719, Sinfonia: I....",Vivaldi & Handel: Arias,7KAJrBecmXSV3vorRaxIjg,"['Antonio Vivaldi', 'Clavecin en Concert', 'Lu...","['2QOIawHpSlOwXDvSqQ9YJR', '6BpexBKOksYLliherk...",9,1,False,0.419,...,0.454,0.107,0.429,131.891,148507,4.0,2017,2017-01-27,3,0.789073
188394,0YuP44jZHVo2I0V6uvDXAh,Tycho,Robert Dick - Third Stone From the Sun,2AbsItfkj8XLO8YQneoevr,"['Robert Dick', 'Jerome Harris', 'Jim Black']","['3Ij67v8rlRqZoxi5ysssrm', '0BLhUsMlOfOupCibR5...",5,1,False,0.506,...,0.395,0.298,0.588,102.249,331667,4.0,1993,1993-01-01,2,0.787258
288914,3q0c9gm7FORbHbhRtnXQvL,His Love Just Washed Away,Blonder Tongue Audio Baton,0bOwhaZsDgLWVIUd4mU7U5,['Swirlies'],['0Zk7We5BZoDR40himAI86D'],4,1,False,0.551,...,0.385,0.142,0.397,135.961,324320,4.0,1993,1993,3,0.786259
169447,5My700tMomi3DWqZVlrmyC,Rapture,Nightscape,2lU9BSGrAbOyNUVpJkCF4T,"['Jon Mayer', 'Rufus Reid', 'Roy McCurdy']","['7LAhrNaA7XqM2ZVnlIRnjV', '7mcseJPsc7KCQrITzr...",5,1,False,0.453,...,0.328,0.0803,0.432,128.18,309373,4.0,2009,2009-01-01,3,0.782779


In [31]:
recommendations[["name","album","artists","similarity_score"]]

Unnamed: 0,name,album,artists,similarity_score
869583,Semi Stereo,Autoepy,['Epy'],0.850109
694436,Vocals & Hand Clapping - Remastered,Songs for Heroes,"['Phil Dadson', 'From Scratch']",0.808992
861939,Get Happy,Plays The Music Of Harold Arlen,['Tommy Flanagan'],0.801018
1200611,Toyland,Dark Xmas,"['KRSR', 'SWIM']",0.798253
987542,"Sunflower, Vol. 6",...but i'd rather be with you,['Molly Tuttle'],0.794237
26891,Autumn Leaves,Bobby Campo,['Bobby Campo'],0.791441
660688,"L’Incoronazione di Dario, RV 719, Sinfonia: I....",Vivaldi & Handel: Arias,"['Antonio Vivaldi', 'Clavecin en Concert', 'Lu...",0.789073
188394,Tycho,Robert Dick - Third Stone From the Sun,"['Robert Dick', 'Jerome Harris', 'Jim Black']",0.787258
288914,His Love Just Washed Away,Blonder Tongue Audio Baton,['Swirlies'],0.786259
169447,Rapture,Nightscape,"['Jon Mayer', 'Rufus Reid', 'Roy McCurdy']",0.782779
