In [2]:
import sys
from pathlib import Path
# Add the src folder to sys.path
sys.path.append(str(Path().resolve().parent / "src"))
from src.music_recommender.models.hybrid import HybridModel 
import joblib 
from src.music_recommender.config import Config

import os 
import warnings 
import pandas as pd
from sklearn.exceptions import ConergenceWarnings
from sklearn.model_selection import train_test_split


from src.music_recommender.config import Config
from src.music_recommender.data.pipeline import create_extraction_pipeline
from src.music_recommender.evaluation.evaluator import get_best_models, get_top_3_models
from src.music_recommender.models.hybrid import HybridModel
from src.music_recommender.models.model_registry import (
    MODEL_CLASS_LOOKUP,
    TARGET_GROUPS,
)
from src.music_recommender.training.trainer import train_models
from src.music_recommender.utils.helpers import tree
from src.music_recommender.utils.logger import get_logger


warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="Objective did not converge")

logger = get_logger(context="training")
cfg = Config()

ModuleNotFoundError: No module named 'src'

In [None]:
logger.info("Loading dataset...")
audio_data = pd.read_csv(cfg.paths.processed / "matched_metadata.csv")

bins = [0, 80, 100, 120, 140, 170, float("inf")]
numeric_labels = [0, 1, 2, 3, 4, 5]
audio_data["tempo_bins"] = pd.cut(
    audio_data["tempo"], bins=bins, labels=numeric_labels, right=False
)

X = audio_data["track_id"].map(
    lambda id: cfg.paths.processed / "audio" / f"{str(id).zfill(6)}.mp3"
)
y = audio_data[
    [
        "danceability",
        "energy",
        "key",
        "loudness",
        "mode",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo_bins",
    ]
]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
logger.info(f"Train: {len(X_train)} samples, Test: {len(X_test)} samples")

logger.info("Extracting audio features...")
extraction_pipeline = create_extraction_pipeline(cfg)
X_train_extracted = extraction_pipeline.fit_transform(X_train)
X_test_extracted = extraction_pipeline.transform(X_test)
logger.info(f"Feature extraction complete. Shape: {X_train_extracted.shape}")

In [None]:
cfg = Config()

In [None]:
hybrid_model = joblib.load(cfg.paths.models/"hybrid_model.joblib")


hybrid_model

In [None]:
prediction = hybrid_model.predict(X_train_extracted.iloc[[0]])
prediction

In [None]:
feature_cols = prediction.columns.tolist()
feature_cols[:-3]

In [None]:
spotify_dataset = pd.read_csv(cfg.paths.data / "raw/spotify-12m-songs/tracks_features.csv")

In [None]:
print(audio_data["tempo"].min(), audio_data["tempo"].max())
print(audio_data["tempo"].isna().sum())
print((audio_data["tempo"] < 0).sum())

In [None]:
bins = [0, 80, 100, 120, 140, 170, float("inf")]
numeric_labels = [0, 1, 2, 3, 4, 5]
spotify_dataset["tempo_bins"] = pd.cut(
    spotify_dataset["tempo"], bins=bins, labels=numeric_labels, right=False
)

In [None]:
print(spotify_dataset['tempo'].max(),spotify_dataset['tempo'].min())

In [None]:
spotify_dataset.head()

In [None]:
spotify_dataset.isna().sum()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
spotify_features_scaled= scaler.fit_transform(spotify_dataset[feature_cols])

In [None]:
import numpy as np

np.isnan(spotify_features_scaled).sum() / len(spotify_features_scaled)

In [None]:
similarities = cosine_similarity(spotify_features_scaled[0].reshape(1, -1), spotify_features_scaled)[0]

# get the 10 largest similarities ()songs are sorted in acsnding order
top_indices = np.argsort(similarities)[-10:][::-1]

recommendations = spotify_dataset.iloc[top_indices].copy()
recommendations['similarity_score'] = similarities[top_indices]

In [None]:
recommendations