In [None]:
from src.music_recommender.models.hybrid import HybridModel
import joblib
from src.music_recommender.config import Config
import os
import warnings
import ast
import joblib
import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split

from src.music_recommender.config import Config
from src.music_recommender.data.pipeline import create_extraction_pipeline
from src.music_recommender.evaluation.evaluator import get_best_models, get_top_3_models
from src.music_recommender.models.hybrid import HybridModel
from src.music_recommender.models.model_registry import (
    MODEL_CLASS_LOOKUP,
    TARGET_GROUPS,
)
from src.music_recommender.training.trainer import train_models
from src.music_recommender.utils.helpers import tree
from src.music_recommender.utils.logger import get_logger
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from pathlib import Path
from typing import List, Optional

# Suppress convergence and FutureWarning noise for the whole notebook session.
# The message-based filter catches "Objective did not converge" warnings that
# are not raised under the ConvergenceWarning category.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="Objective did not converge")


In [None]:
# Shared logger and project configuration used by the rest of the notebook.
logger = get_logger(context="training")
cfg = Config()

logger.info("Loading dataset...")
audio_data = pd.read_csv(cfg.paths.processed / "matched_metadata.csv")

# Bucket tempo into six left-closed intervals ([0, 80), [80, 100), ...,
# [170, inf)) labelled 0-5, so tempo can be treated as a classification target.
numeric_labels = list(range(6))
bins = [0, 80, 100, 120, 140, 170, float("inf")]
audio_data["tempo_bins"] = pd.cut(
    audio_data["tempo"], right=False, labels=numeric_labels, bins=bins
)

# Model inputs: one audio-file path per track, named by the track id
# left-padded with zeros to six characters.
audio_dir = cfg.paths.processed / "audio"
X = audio_data["track_id"].map(
    lambda track_id: audio_dir / f"{str(track_id).zfill(6)}.mp3"
)

# Model targets: the audio descriptors plus the derived tempo bin.
target_columns = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo_bins",
]
y = audio_data[target_columns]

# Fixed seed keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
logger.info(f"Train: {len(X_train)} samples, Test: {len(X_test)} samples")

# The pipeline is fitted on the training paths only and then re-applied to the
# held-out paths.
logger.info("Extracting audio features...")
extraction_pipeline = create_extraction_pipeline(cfg)
X_train_extracted = extraction_pipeline.fit_transform(X_train)
X_test_extracted = extraction_pipeline.transform(X_test)
logger.info(f"Feature extraction complete. Shape: {X_train_extracted.shape}")


[32m2025-11-14 11:58:52.405[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mLoading dataset...[0m
[32m2025-11-14 11:58:52.429[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1mTrain: 1383 samples, Test: 346 samples[0m
[32m2025-11-14 11:58:52.429[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mExtracting audio features...[0m
Loading audio: 100%|██████████| 1383/1383 [01:10<00:00, 19.65it/s]
Computing spectrograms: 100%|██████████| 1383/1383 [01:02<00:00, 22.06it/s]
Extracting features: 100%|██████████| 1383/1383 [00:00<00:00, 3321.41it/s]
Loading audio: 100%|██████████| 346/346 [00:16<00:00, 21.31it/s]
Computing spectrograms: 100%|██████████| 346/346 [00:16<00:00, 20.83it/s]
Extracting features: 100%|██████████| 346/346 [00:00<00:00, 4439.61it/s]
[32m2025-11-14 12:01:39.238[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m41[0m - [1mFeature extraction complete. Shape: (138

In [3]:
# `cfg` is already created in the data-loading cell above; this re-creates it
# (only needed if that cell was skipped).
cfg = Config()

In [None]:
# Load the persisted hybrid model from the configured models directory.
# NOTE: joblib.load unpickles arbitrary Python objects, so only load model
# files that come from a trusted source.
hybrid_model = joblib.load(cfg.paths.models / "hybrid_model.joblib")

In [5]:
# Display the loaded model's rich repr (its constructor parameters).
hybrid_model

0,1,2
,top_models,"defaultdict(<...-05)])""}})})})"
,best_models,"defaultdict(<...79844)])""}})})"
,skip_stacking,{'regression': ['structure']}
,target_groups,"{'classification': {'key': ['key'], 'mode': ['mode'], 'tempo_bins': ['tempo_bins']}, 'regression': {'energy_mood': ['energy', 'valence', ...], 'production': ['loudness', 'acousticness', ...], 'structure': ['speechiness']}}"
,lookup_table,"{'classification': {'Random Forest': <class 'sklea...stClassifier'>, 'SVM': <class 'sklea..._classes.SVC'>, 'XGBoost': <class 'xgboo...GBClassifier'>}, 'regression': {'ElasticNet': <class 'sklea...t.ElasticNet'>, 'Lasso': <class 'sklea...escent.Lasso'>, 'Random Forest': <class 'sklea...estRegressor'>, 'Ridge': <class 'sklea..._ridge.Ridge'>, ...}}"
,random_state,42
,cv,4
,final_estimator_reg,Ridge()
,final_estimator_class,LogisticRegre...ndom_state=42)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,5000

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [6]:
# Predict all targets for the first training sample. Indexing with a list
# (`iloc[[0]]`) keeps the input two-dimensional (one row) instead of collapsing
# it to a Series.
prediction = hybrid_model.predict(X_train_extracted.iloc[[0]])
prediction

Unnamed: 0,energy,valence,danceability,loudness,acousticness,instrumentalness,liveness,speechiness,key,mode,tempo_bins
0,0.854202,0.434965,0.497951,-6.734079,0.063845,0.839302,0.20591,0.088295,11,1,2


In [7]:
# Reuse the model's output columns (in the model's output order) as the feature
# set for matching against the Spotify catalogue.
feature_cols = prediction.columns.tolist()

In [36]:
# Inspect column dtypes of the predicted targets.
prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   energy            1 non-null      float64
 1   valence           1 non-null      float64
 2   danceability      1 non-null      float64
 3   loudness          1 non-null      float64
 4   acousticness      1 non-null      float64
 5   instrumentalness  1 non-null      float64
 6   liveness          1 non-null      float64
 7   speechiness       1 non-null      float64
 8   key               1 non-null      int64  
 9   mode              1 non-null      int64  
 10  tempo_bins        1 non-null      int64  
dtypes: float64(8), int64(3)
memory usage: 220.0 bytes


In [8]:
# Everything except the last three columns (key, mode, tempo_bins), i.e. the
# continuous targets. This relies on the model's column order.
feature_cols[:-3]

['energy',
 'valence',
 'danceability',
 'loudness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'speechiness']

In [None]:
# Load the Spotify track-feature catalogue that recommendations are drawn from.
spotify_dataset = pd.read_csv(
    cfg.paths.data / "raw/spotify-12m-songs/tracks_features.csv"
)

In [10]:
# Sanity-check the training tempo column against the bin edges used above:
# value range, number of missing values, number of negative values.
print(audio_data["tempo"].min(), audio_data["tempo"].max())
print(audio_data["tempo"].isna().sum())
print((audio_data["tempo"] < 0).sum())


0.0 220.165
0
0


In [11]:
# Apply the same left-closed tempo bins used for the training data so the
# catalogue exposes a `tempo_bins` column comparable to the model's output.
# pd.cut with labels yields a categorical column, whereas the model's predicted
# `tempo_bins` is a plain integer column.
bins = [0, 80, 100, 120, 140, 170, float("inf")]
numeric_labels = [0, 1, 2, 3, 4, 5]
spotify_dataset["tempo_bins"] = pd.cut(
    spotify_dataset["tempo"], bins=bins, labels=numeric_labels, right=False
)

In [None]:
# Confirm the catalogue's tempo range is covered by the bins (lowest edge 0,
# open-ended top bin).
print(spotify_dataset["tempo"].max(), spotify_dataset["tempo"].min())

248.93400000000003 0.0


In [13]:
# Preview the first catalogue rows, including the new `tempo_bins` column.
spotify_dataset.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,tempo_bins
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,2
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,...,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02,2
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,...,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02,4
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,...,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02,1
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,...,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02,3


In [14]:
# Count missing values per column.
spotify_dataset.isna().sum()

id                   0
name                 3
album               11
album_id             0
artists              0
artist_ids           0
track_number         0
disc_number          0
explicit             0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
duration_ms          0
time_signature       0
year                 0
release_date         0
tempo_bins           0
dtype: int64

In [35]:
# Inspect dtypes of the catalogue columns that correspond to the model outputs.
spotify_dataset[feature_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1204025 entries, 0 to 1204024
Data columns (total 11 columns):
 #   Column            Non-Null Count    Dtype   
---  ------            --------------    -----   
 0   energy            1204025 non-null  float64 
 1   valence           1204025 non-null  float64 
 2   danceability      1204025 non-null  float64 
 3   loudness          1204025 non-null  float64 
 4   acousticness      1204025 non-null  float64 
 5   instrumentalness  1204025 non-null  float64 
 6   liveness          1204025 non-null  float64 
 7   speechiness       1204025 non-null  float64 
 8   key               1204025 non-null  int64   
 9   mode              1204025 non-null  int64   
 10  tempo_bins        1204025 non-null  category
dtypes: category(1), float64(8), int64(2)
memory usage: 93.0 MB


In [32]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Standardise every matched feature column. `feature_cols` also contains key,
# mode and tempo_bins, so those are standardised as numbers too.
# NOTE(review): the following cell rebinds `spotify_features_scaled` to the raw
# values, so this scaled matrix is discarded when the notebook runs top to
# bottom.
scaler = StandardScaler()
spotify_features_scaled = scaler.fit_transform(spotify_dataset[feature_cols])

In [23]:
# NOTE(review): despite the name, this binds `spotify_features_scaled` to the
# raw, unscaled feature values and replaces the StandardScaler output computed
# in the previous cell.
spotify_features_scaled = np.array(spotify_dataset[feature_cols])

In [24]:
import numpy as np

In [25]:
# Total number of NaN entries in the feature matrix divided by the number of
# rows; 0.0 means there are no missing values.
np.isnan(spotify_features_scaled).sum() / len(spotify_features_scaled)

np.float64(0.0)

In [26]:
# Cosine similarity between the predicted row and every catalogue row; `[0]`
# selects the similarities for the first (here: only) prediction row.
# NOTE(review): `prediction` is passed exactly as returned by the model (no
# scaling), so the comparison is only consistent when
# `spotify_features_scaled` also holds unscaled values.
similarities = cosine_similarity(prediction, spotify_features_scaled)[0]

# Positions of the 10 largest similarities, highest first.
top_indices = np.argsort(similarities)[-10:][::-1]

# Copy the matching catalogue rows and attach their scores.
recommendations = spotify_dataset.iloc[top_indices].copy()
recommendations["similarity_score"] = similarities[top_indices]


In [27]:
# Rebuilds `recommendations` with the same two statements that end the
# previous cell.
recommendations = spotify_dataset.iloc[top_indices].copy()
recommendations["similarity_score"] = similarities[top_indices]

In [28]:
# Display the full recommendation rows.
recommendations

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,tempo_bins,similarity_score
1061905,2wlwAMHh71lqqeObNL9Erh,Breakthrough & Conquer,2020,3KmiATbuQ8B7vvGYMJi3q5,['Zombi'],['5wb5QmufzBWk1wkmw2qtDJ'],1,1,False,0.561,...,0.889,0.12,0.373,100.015,226709,4.0,2020,2020-07-17,2,0.999923
455266,7yJk1pY9P3hxthLdrTWlEi,Vihmax - (Vihma remix),Vihma,4FF6T9sHJIT2Juuz2cS7oE,['Värttinä'],['4sn9fZDCAH3l0Pb0lGW1wt'],13,1,False,0.62,...,0.741,0.137,0.42,112.605,211747,4.0,1998,1998-08-10,2,0.999821
17261,7gAgxLiZAANNbGpnwDWVAr,Tomes of Acrimony,Souls To Deny,07V2cwHhr3JKC9hnLsMrI2,['Suffocation'],['4ItRDIouodpnW6nm4TYDk1'],8,1,False,0.256,...,0.871,0.139,0.513,108.993,270320,4.0,2004,2004-04-27,2,0.999785
1070567,41CVBs59XnZz9nQ2W1e4V1,Uncrossed,Uncrossed,4gSseutxROlL4SdFxHqBY2,['Korine'],['2kkHe1abWV084RUXhSEf2q'],1,1,False,0.589,...,0.838,0.102,0.571,111.021,225463,4.0,2019,2019-09-20,2,0.999776
17257,5d2ADtt2FJCwqvM21KY4pR,Surgery of Impalement,Souls To Deny,07V2cwHhr3JKC9hnLsMrI2,['Suffocation'],['4ItRDIouodpnW6nm4TYDk1'],4,1,False,0.244,...,0.851,0.163,0.515,107.293,230987,4.0,2004,2004-04-27,2,0.999775
355004,2jPyuZIMQKuFLHNk89rSJW,Flannery Said,Dark Snack,7Lxze5C6LUpvON0uUDE189,['The Moaners'],['3PWJ0xfnhzbndqYFPLbNAX'],5,1,False,0.538,...,0.612,0.156,0.502,119.648,163400,4.0,2005,2005-01-25,2,0.999751
1165681,4RYeyNZ7ZlMJqVhoJTvMgp,dominique,acts of rebellion,6PhhsnbohAEOXBdqsURBeh,['Ela Minus'],['4rdJkXHNrMgowlwUdQAg8T'],6,1,False,0.703,...,0.782,0.0712,0.468,100.997,234383,4.0,2020,2020-10-23,2,0.999747
111369,0555EmqQK12Tbjm3M7PFwg,Big W,High Five,5NATsTgBjgj5RNBVbRqit9,['Halfway to Gone'],['6oRW92RUVb0d6r75RscdjA'],2,1,False,0.455,...,0.882,0.0728,0.636,118.842,158067,4.0,2001,2001-05-01,2,0.999747
296533,1ucXhlPxA9gS4CUVoT3gip,If I Was A Scientist,The Lights Went Up,14GDJq0u1AhnWKIGfeQVxm,"['Anagram', 'Bill Gould']","['0cmNUXXnPdK6gZyDBRGizs', '2aBabrQjMzI5FUAssd...",7,1,False,0.487,...,0.807,0.0858,0.361,101.505,220120,4.0,2006,2006-11-07,2,0.999741
1144282,10LEH5Wa7dC3DPhUKeOSZY,Four (interlude),Trails,0YAcoBKKqhov6rYn50zwe5,['pronouncedyea'],['1gydLLHpbmraVz1LVZCW83'],4,1,False,0.583,...,0.927,0.162,0.228,118.042,195254,4.0,2018,2018-10-04,2,0.999729


In [29]:
# Compact view: track identity plus similarity score.
recommendations[["name", "album", "artists", "similarity_score"]]

Unnamed: 0,name,album,artists,similarity_score
1061905,Breakthrough & Conquer,2020,['Zombi'],0.999923
455266,Vihmax - (Vihma remix),Vihma,['Värttinä'],0.999821
17261,Tomes of Acrimony,Souls To Deny,['Suffocation'],0.999785
1070567,Uncrossed,Uncrossed,['Korine'],0.999776
17257,Surgery of Impalement,Souls To Deny,['Suffocation'],0.999775
355004,Flannery Said,Dark Snack,['The Moaners'],0.999751
1165681,dominique,acts of rebellion,['Ela Minus'],0.999747
111369,Big W,High Five,['Halfway to Gone'],0.999747
296533,If I Was A Scientist,The Lights Went Up,"['Anagram', 'Bill Gould']",0.999741
1144282,Four (interlude),Trails,['pronouncedyea'],0.999729


In [44]:
from pathlib import Path
from typing import List, Optional,Dict
import logging

# Rebinds the notebook-level `logger` (previously the project logger returned
# by get_logger) to a standard-library logger.
# NOTE(review): unless the logging module is configured elsewhere, a stdlib
# logger does not emit INFO-level messages by default. Confirm that dropping
# the INFO output of the class below is intended.
logger = logging.getLogger(__name__)

In [None]:
class MusicRecommender:
    """Recommend catalogue tracks whose features resemble a model prediction.

    On construction the recommender loads a fitted hybrid model and a Spotify
    track-feature CSV, derives a ``tempo_bins`` column for the catalogue, and
    builds one feature matrix for all tracks: standardised continuous columns
    followed by one-hot encoded categorical columns. Recommendations are the
    catalogue rows with the highest cosine similarity to a prediction encoded
    the same way.

    Attributes set during construction:
        top_n: default number of recommendations.
        hybrid_model: the object loaded from ``hybrid_model_path``.
        spotify_dataset: catalogue DataFrame (with ``tempo_bins`` added).
        feature_cols: ``{"continuous": [...], "categorical": [...]}``.
        scaler: StandardScaler fitted on the catalogue's continuous columns.
        categorical_columns: ordered names of the one-hot columns.
        spotify_features_scaled: the catalogue feature matrix.
    """

    def __init__(
        self,
        hybrid_model_path: Path,
        spotify_dataset_path: Path,
        feature_cols: Optional[Dict[str, List[str]]] = None,
        top_n: int = 10,
    ):
        """Load the model and catalogue and precompute the catalogue features.

        Args:
            hybrid_model_path: joblib file containing the fitted model.
            spotify_dataset_path: CSV with one row per track; must contain a
                ``tempo`` column and every column named in ``feature_cols``.
            feature_cols: mapping with ``"continuous"`` and ``"categorical"``
                column lists; a built-in default is used when ``None``.
            top_n: default number of recommendations to return.
        """
        logger.info("Initializing MusicRecommender...")

        self.top_n = top_n
        # joblib.load unpickles arbitrary Python objects: only point this at
        # model files that come from a trusted source.
        self.hybrid_model = joblib.load(hybrid_model_path)
        logger.info(f"Loaded hybrid model from {hybrid_model_path}")

        self.spotify_dataset = pd.read_csv(spotify_dataset_path)
        logger.info(f"Loaded {len(self.spotify_dataset)} tracks from Spotify dataset")

        self._add_tempo_bins()

        if feature_cols is None:
            self.feature_cols = {
                "continuous": [
                    "danceability", "energy", "loudness",
                    "speechiness", "acousticness", "instrumentalness",
                    "liveness", "valence"
                ],
                "categorical": ["key", "mode", "tempo_bins"]
            }
        else:
            self.feature_cols = feature_cols

        self._prepare_spotify_features()
        logger.info("MusicRecommender initialized successfully")

    def _add_tempo_bins(self):
        """Add a categorical ``tempo_bins`` column (labels 0-5) to the catalogue.

        Bins are left-closed: [0, 80), [80, 100), [100, 120), [120, 140),
        [140, 170), [170, inf). Tempos outside these bins (negative or NaN)
        become NaN.
        """
        bins = [0, 80, 100, 120, 140, 170, float("inf")]
        numeric_labels = [0, 1, 2, 3, 4, 5]
        self.spotify_dataset["tempo_bins"] = pd.cut(
            self.spotify_dataset["tempo"],
            bins=bins,
            labels=numeric_labels,
            right=False
        )

    def _prepare_spotify_features(self):
        """Build the catalogue feature matrix and remember how it was encoded.

        Fits ``self.scaler`` on the continuous columns, one-hot encodes each
        categorical column, and stores the horizontally stacked result in
        ``self.spotify_features_scaled``. The distinct levels of every
        categorical column are recorded so predictions can later be encoded by
        value rather than by matching stringified column names.
        """
        self.scaler = StandardScaler()
        spotify_cont_scaled = self.scaler.fit_transform(
            self.spotify_dataset[self.feature_cols["continuous"]]
        )

        self._category_levels = {}
        categ_dfs = []
        for col in self.feature_cols["categorical"]:
            column = self.spotify_dataset[col]
            # pd.Categorical exposes the same levels get_dummies derives its
            # "<col>_<level>" column names from.
            self._category_levels[col] = list(pd.Categorical(column).categories)
            categ_dfs.append(pd.get_dummies(column, prefix=col, drop_first=False))

        if categ_dfs:
            spotify_categ_encoded = pd.concat(categ_dfs, axis=1)
        else:
            # pd.concat rejects an empty list; with no categorical columns the
            # one-hot part is simply zero columns wide.
            spotify_categ_encoded = pd.DataFrame(index=self.spotify_dataset.index)

        self.categorical_columns = spotify_categ_encoded.columns.tolist()

        self.spotify_features_scaled = np.concatenate(
            [spotify_cont_scaled, spotify_categ_encoded.to_numpy(dtype=float)],
            axis=1
        )

        logger.info(
            f"Prepared {self.spotify_features_scaled.shape[1]} features "
            f"for {self.spotify_features_scaled.shape[0]} tracks"
        )

    def _prepare_prediction_vector(self, predicted_features: pd.DataFrame) -> np.ndarray:
        """Encode predictions with the same layout as the catalogue matrix.

        Args:
            predicted_features: DataFrame containing every continuous and
                categorical column named in ``self.feature_cols``.

        Returns:
            Float array of shape ``(len(predicted_features), n_features)``.
        """
        cont_cols = self.feature_cols["continuous"]
        cont_features = predicted_features[cont_cols]
        cont_features = cont_features.fillna(cont_features.median())
        if cont_features.isna().to_numpy().any():
            # The median of the predictions cannot fill a column that is
            # missing in every row (always the case for a single-row
            # prediction). Fall back to the catalogue mean, which becomes 0
            # after standardisation.
            cont_features = cont_features.fillna(
                pd.Series(self.scaler.mean_, index=cont_cols)
            )
        cont_scaled = self.scaler.transform(cont_features)

        # Encode by comparing values against the catalogue's levels, so that a
        # float-typed prediction such as 11.0 still lands in "key_11" instead
        # of producing an unmatched "key_11.0" column.
        indicators = {}
        for col in self.feature_cols["categorical"]:
            values = predicted_features[col].astype(object)
            for level in self._category_levels[col]:
                indicators[f"{col}_{level}"] = values.eq(level).to_numpy(dtype=float)

        # Reindexing pins the column order to the catalogue's and keeps the
        # block purely float (no bool/int mix that would yield object arrays).
        categ_encoded = pd.DataFrame(
            indicators, index=predicted_features.index
        ).reindex(columns=self.categorical_columns, fill_value=0.0)

        return np.concatenate(
            [np.asarray(cont_scaled, dtype=float), categ_encoded.to_numpy(dtype=float)],
            axis=1
        )

    def predict_features(self, audio_features: pd.DataFrame) -> pd.DataFrame:
        """Predict music features from extracted audio features."""
        return self.hybrid_model.predict(audio_features)

    def get_recommendations_from_audio(
        self,
        audio_features: pd.DataFrame,
        top_n: Optional[int] = None,
        return_scores: bool = True
    ) -> pd.DataFrame:
        """Predict features from extracted audio features, then recommend.

        Arguments and return value match
        ``get_recommendations_from_predictions``, except that
        ``audio_features`` is the model's input rather than its output.
        """
        predicted_features = self.predict_features(audio_features)

        return self.get_recommendations_from_predictions(
            predicted_features,
            top_n=top_n,
            return_scores=return_scores
        )

    def get_recommendations_from_predictions(
        self,
        predicted_features: pd.DataFrame,
        top_n: Optional[int] = None,
        return_scores: bool = True
    ) -> pd.DataFrame:
        """Return the catalogue rows most similar to a predicted feature row.

        Args:
            predicted_features: model output; only the first row is used.
            top_n: number of rows to return (``self.top_n`` when ``None``).
            return_scores: add a ``similarity_score`` column when true.

        Returns:
            Up to ``top_n`` catalogue rows ordered from most to least similar.

        Raises:
            ValueError: if ``top_n`` is negative or ``predicted_features`` has
                no rows.
        """
        if top_n is None:
            top_n = self.top_n
        if top_n < 0:
            raise ValueError(f"top_n must be non-negative, got {top_n}")
        if top_n == 0:
            # A slice of [-0:] would select the whole catalogue, so the empty
            # request is answered explicitly.
            empty = self.spotify_dataset.iloc[:0].copy()
            if return_scores:
                empty['similarity_score'] = np.array([], dtype=float)
            return empty

        if len(predicted_features) == 0:
            raise ValueError("predicted_features must contain at least one row")

        if len(predicted_features) > 1:
            logger.warning("Multiple predictions provided, using only the first one")
            predicted_features = predicted_features.iloc[[0]]

        prediction_vector = self._prepare_prediction_vector(predicted_features)

        similarities = cosine_similarity(
            prediction_vector,
            self.spotify_features_scaled
        )[0]

        # Ascending argsort: take the tail and reverse it for best-first order.
        top_indices = np.argsort(similarities)[-top_n:][::-1]
        recommendations = self.spotify_dataset.iloc[top_indices].copy()

        if return_scores:
            recommendations['similarity_score'] = similarities[top_indices]

        return recommendations

    def _is_audio_input(self, features_type: str) -> bool:
        """Return True for ``"audio"``; warn when the value is unrecognised.

        Any value other than ``"audio"`` is handled as predicted features, as
        before, but unknown values are now logged instead of passing silently.
        """
        if features_type == "audio":
            return True
        if features_type != "predicted":
            logger.warning(
                f"Unknown features_type {features_type!r}; "
                "treating input as predicted features"
            )
        return False

    def get_recommendations(
        self,
        features: pd.DataFrame,
        top_n: Optional[int] = None,
        return_scores: bool = True,
        features_type: str = "predicted"
    ) -> pd.DataFrame:
        """Recommend from either audio features or already-predicted features.

        Args:
            features: input DataFrame, interpreted according to
                ``features_type``.
            top_n: number of rows to return (``self.top_n`` when ``None``).
            return_scores: add a ``similarity_score`` column when true.
            features_type: ``"audio"`` to run the model first; anything else is
                treated as predicted features.
        """
        if self._is_audio_input(features_type):
            return self.get_recommendations_from_audio(features, top_n, return_scores)
        return self.get_recommendations_from_predictions(features, top_n, return_scores)

    def get_recommendations_simple(
        self,
        predicted_features: pd.DataFrame,
    ) -> List[dict]:
        """Return the default ``top_n`` recommendations as plain records.

        Each record holds ``name``, ``album``, ``artists`` and
        ``similarity_score``.
        """
        recommendations = self.get_recommendations_from_predictions(
            predicted_features,
            top_n=self.top_n,
            return_scores=True
        )

        return recommendations[
            ["name", "album", "artists", "similarity_score"]
        ].to_dict('records')

    def batch_recommendations(
        self,
        features_list: List[pd.DataFrame],
        top_n: Optional[int] = None,
        features_type: str = "predicted"
    ) -> List[pd.DataFrame]:
        """Compute recommendations for each DataFrame in ``features_list``.

        Args:
            features_list: one DataFrame per query.
            top_n: number of rows per query (``self.top_n`` when ``None``).
            features_type: ``"audio"`` or ``"predicted"``, applied to every
                item.

        Returns:
            One recommendations DataFrame (with scores) per input, in order.
        """
        if top_n is None:
            top_n = self.top_n

        # Resolve the input kind once so an unknown value is reported a single
        # time rather than once per item.
        if self._is_audio_input(features_type):
            recommend = self.get_recommendations_from_audio
        else:
            recommend = self.get_recommendations_from_predictions

        return [recommend(features, top_n=top_n) for features in features_list]

In [47]:
# Build the recommender from the persisted model and the Spotify catalogue CSV.
# Construction reads the CSV and precomputes the catalogue feature matrix.
recommender = MusicRecommender(
    hybrid_model_path=cfg.paths.models / "hybrid_model.joblib",
    spotify_dataset_path=cfg.paths.data / "raw/spotify-12m-songs/tracks_features.csv"
)

In [None]:
# Path 1: start from extracted audio features; the recommender runs the model
# itself. The result bound here is overwritten a few lines below.
recommendations = recommender.get_recommendations_from_audio(
    X_train_extracted.iloc[[0]]
)

# Path 2: run the model explicitly, then recommend from its prediction.
prediction = hybrid_model.predict(X_train_extracted.iloc[[0]])
recommendations = recommender.get_recommendations_from_predictions(prediction)

# Plain-dict form of the same recommendations (name, album, artists, score).
simple_recs = recommender.get_recommendations_simple(prediction)

In [50]:
# Display the list of recommendation records.
simple_recs

[{'name': 'Suffocate',
  'album': 'Dysfunction',
  'artists': "['Staind']",
  'similarity_score': 0.9790538909952035},
 {'name': 'Return to Nebulon V',
  'album': 'Vomitron 2',
  'artists': "['Vomitron']",
  'similarity_score': 0.9783188146425124},
 {'name': 'Multiverso - Instrumental',
  'album': 'Ente (Instrumental)',
  'artists': "['Descarnado']",
  'similarity_score': 0.9781424851247322},
 {'name': 'Great White Lie',
  'album': 'Moloko & Ultraviolence',
  'artists': "['Alice Sweet Alice']",
  'similarity_score': 0.9746787596992393},
 {'name': 'Away with Words (Instrumental)',
  'album': 'Non Sequitur (Instrumental)',
  'artists': "['Raiju']",
  'similarity_score': 0.9746678180496112},
 {'name': 'Sky (feat. C-Lekktor) - C-Lekktor Remix',
  'album': 'Syndicus',
  'artists': "['Haushetaere', 'C-Lekktor']",
  'similarity_score': 0.9700433842524934},
 {'name': 'Lush',
  'album': 'Overnight',
  'artists': "['Heat']",
  'similarity_score': 0.9670660197781964},
 {'name': 'Breakthrough & Co