In [3]:
from src.music_recommender.models.hybrid import HybridModel
import joblib
from src.music_recommender.config import Config

In [4]:
import os
import warnings

import joblib
import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split

from src.music_recommender.config import Config
from src.music_recommender.data.pipeline import create_extraction_pipeline
from src.music_recommender.evaluation.evaluator import get_best_models, get_top_3_models
from src.music_recommender.models.hybrid import HybridModel
from src.music_recommender.models.model_registry import (
    MODEL_CLASS_LOOKUP,
    TARGET_GROUPS,
)
from src.music_recommender.training.trainer import train_models
from src.music_recommender.utils.helpers import tree
from src.music_recommender.utils.logger import get_logger

warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="Objective did not converge")

logger = get_logger(context="training")
cfg = Config()

logger.info("Loading dataset...")
audio_data = pd.read_csv(cfg.paths.processed / "matched_metadata.csv")

bins = [0, 80, 100, 120, 140, 170, float("inf")]
numeric_labels = [0, 1, 2, 3, 4, 5]
audio_data["tempo_bins"] = pd.cut(
    audio_data["tempo"], bins=bins, labels=numeric_labels, right=False
)

X = audio_data["track_id"].map(
    lambda id: cfg.paths.processed / "audio" / f"{str(id).zfill(6)}.mp3"
)
y = audio_data[
    [
        "danceability",
        "energy",
        "key",
        "loudness",
        "mode",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo_bins",
    ]
]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
logger.info(f"Train: {len(X_train)} samples, Test: {len(X_test)} samples")

logger.info("Extracting audio features...")
extraction_pipeline = create_extraction_pipeline(cfg)
X_train_extracted = extraction_pipeline.fit_transform(X_train)
X_test_extracted = extraction_pipeline.transform(X_test)
logger.info(f"Feature extraction complete. Shape: {X_train_extracted.shape}")




[32m2025-11-04 18:00:46.303[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mLoading dataset...[0m


[32m2025-11-04 18:00:46.330[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m59[0m - [1mTrain: 1383 samples, Test: 346 samples[0m
[32m2025-11-04 18:00:46.331[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m61[0m - [1mExtracting audio features...[0m
Loading audio: 100%|██████████| 1383/1383 [00:55<00:00, 24.99it/s]
Computing spectrograms: 100%|██████████| 1383/1383 [01:00<00:00, 22.72it/s]
Extracting features: 100%|██████████| 1383/1383 [00:00<00:00, 3032.69it/s]
Loading audio: 100%|██████████| 346/346 [00:15<00:00, 21.75it/s]
Computing spectrograms: 100%|██████████| 346/346 [00:19<00:00, 17.76it/s]
Extracting features: 100%|██████████| 346/346 [00:00<00:00, 4589.04it/s]
[32m2025-11-04 18:03:18.908[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m65[0m - [1mFeature extraction complete. Shape: (1383, 246)[0m


In [5]:
cfg = Config()

In [6]:
hybrid_model = joblib.load(cfg.paths.models/"hybrid_model.joblib")

In [7]:
hybrid_model

0,1,2
,top_models,"defaultdict(<...-05)])""}})})})"
,best_models,"defaultdict(<...79844)])""}})})"
,skip_stacking,{'regression': ['structure']}
,target_groups,"{'classification': {'key': ['key'], 'mode': ['mode'], 'tempo_bins': ['tempo_bins']}, 'regression': {'energy_mood': ['energy', 'valence', ...], 'production': ['loudness', 'acousticness', ...], 'structure': ['speechiness']}}"
,lookup_table,"{'classification': {'Random Forest': <class 'sklea...stClassifier'>, 'SVM': <class 'sklea..._classes.SVC'>, 'XGBoost': <class 'xgboo...GBClassifier'>}, 'regression': {'ElasticNet': <class 'sklea...t.ElasticNet'>, 'Lasso': <class 'sklea...escent.Lasso'>, 'Random Forest': <class 'sklea...estRegressor'>, 'Ridge': <class 'sklea..._ridge.Ridge'>, ...}}"
,random_state,42
,cv,4
,final_estimator_reg,Ridge()
,final_estimator_class,LogisticRegre...ndom_state=42)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,5000

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [8]:
prediction = hybrid_model.predict(X_train_extracted.iloc[[0]])
prediction

Unnamed: 0,energy,valence,danceability,loudness,acousticness,instrumentalness,liveness,speechiness,key,mode,tempo_bins
0,0.854202,0.434965,0.497951,-6.734079,0.063845,0.839302,0.20591,0.088295,11,1,2


In [12]:
feature_cols = prediction.columns.tolist()

In [26]:
feature_cols[:-3]

['energy',
 'valence',
 'danceability',
 'loudness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'speechiness']

In [51]:
spotify_dataset = pd.read_csv(cfg.paths.data / "raw/spotify-12m-songs/tracks_features.csv")

In [52]:
print(audio_data["tempo"].min(), audio_data["tempo"].max())
print(audio_data["tempo"].isna().sum())
print((audio_data["tempo"] < 0).sum())


0.0 220.165
0
0


In [53]:
bins = [0, 80, 100, 120, 140, 170, float("inf")]
numeric_labels = [0, 1, 2, 3, 4, 5]
spotify_dataset["tempo_bins"] = pd.cut(
    spotify_dataset["tempo"], bins=bins, labels=numeric_labels, right=False
)

In [54]:
print(spotify_dataset['tempo'].max(),spotify_dataset['tempo'].min())

248.93400000000003 0.0


In [55]:
spotify_dataset.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,tempo_bins
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,2
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,...,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02,2
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,...,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02,4
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,...,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02,1
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,...,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02,3


In [56]:
spotify_dataset.isna().sum()

id                   0
name                 3
album               11
album_id             0
artists              0
artist_ids           0
track_number         0
disc_number          0
explicit             0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
duration_ms          0
time_signature       0
year                 0
release_date         0
tempo_bins           0
dtype: int64

In [57]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
spotify_features_scaled = scaler.fit_transform(spotify_dataset[feature_cols])

In [58]:
import numpy as np

In [59]:
np.isnan(spotify_features_scaled).sum() /len(spotify_features_scaled)

np.float64(0.0)

In [60]:
similarities = cosine_similarity(spotify_features_scaled[0].reshape(1, -1),spotify_features_scaled)[0]

top_indices = np.argsort(similarities)[-10:][::-1]

recommendations = spotify_dataset.iloc[top_indices].copy()
recommendations['similarity_score'] = similarities[top_indices]


In [61]:
recommendations = spotify_dataset.iloc[top_indices].copy()
recommendations['similarity_score'] = similarities[top_indices]

In [62]:
recommendations

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,tempo_bins,similarity_score
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,2,1.0
27396,0l6NGrW2fe15fCfa8LM5i8,Go Away,Total,2G5Ictjj38CjYvpk8KDHGP,['Teenage Bottlerocket'],['0mGyXXKzoR5KAAh4Mkef2W'],7,1,False,0.499,...,0.0,0.335,0.476,102.02,134853,4.0,2005,2005-03-30,2,0.994947
88612,1Udy7blrWwKMOlpQKlQS82,Digital On,Neve,0Be2aYSLePwJkWJu4ZTY8D,['Neve'],['0VrYYSLc5DsP2rlIzTGy0H'],1,1,False,0.511,...,0.0,0.37,0.521,109.657,197960,4.0,2000,2000,2,0.993242
775625,6dFJetjPrOCBqnheeqXyhF,Freedom From Pain,Sacrifice,6yP83yNncW0oMo0ggjyNX6,['Solid Vision'],['4QDopoox1N55SEyyvT5aZ7'],2,1,False,0.498,...,0.0139,0.399,0.495,102.485,355400,3.0,2010,2010-07-20,2,0.992772
549958,3iXKSnl6hmYyn89qUFuiLz,If Love's the Answer,Transformer,3GR62Tqkrb4FvOZd6CQVrb,['Bruce Kulick'],['30TJoJIn79ZTDn7J93ymi8'],3,1,False,0.454,...,0.000972,0.328,0.429,114.039,257440,4.0,2014,2014-07-24,2,0.991581
400978,4CWCg4CRfRCv5uUqv3oWaq,Scissors,Black Sheep,7iJkW1Di5NcFTwNb8eUcXm,['Calmando Qual'],['3omVg4YzQzcWH2R7SHcLYh'],3,1,False,0.519,...,6e-06,0.325,0.53,100.013,222293,4.0,2012,2012-06-27,2,0.991096
869326,2yLFKU0rLvvIOIF0aQmKfS,Sleeping Somewhere Else,Born Ugly Got Worse,15jrF6WDr00AgAJ8mJk8Jb,['The Slow Death'],['4CenwegWYcU6hqBZ4meSia'],10,1,False,0.434,...,7.3e-05,0.322,0.551,115.542,191612,4.0,2018,2018-03-30,2,0.990985
394033,235ejgFkDSYLLAhag9xXKm,This Is War,Doxology,2W9PfC5s71nghWWHJK7DjD,['Core Zero'],['2E5VdUAKZCsB532zIBBUEE'],4,1,False,0.486,...,7.2e-05,0.359,0.511,100.051,263653,4.0,2010,2010-06-15,2,0.990778
466818,1u4FtCxuOcsXStGZwP3Nk6,Olympia WA,Hooligans United a Tribute to Rancid,3fNR9RPA8gWdl8DHyEylPv,"[""The Reverend Peyton's Big Damn Band""]",['6Ec85Uubz3lO7BIEvcBfZK'],16,1,False,0.444,...,0.000297,0.323,0.504,104.642,171979,4.0,2015,2015-04-14,2,0.989965
175090,0pKofzltD43HLvue8mprWc,Maybe I'm Wrong,Friend Or Foe,0D0FK6rtxn0SxAh3QS8QaP,['The Forces Of Evil'],['1aRbVOxYq8WaO2sf8vhF9s'],9,1,False,0.461,...,0.0,0.313,0.451,104.754,200733,4.0,2003,2003-10-19,2,0.989922
