In [10]:
import sys
import io
import hashlib
import pickle
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import librosa
import joblib
from tqdm import tqdm

# Add the src folder to sys.path
sys.path.append(str(Path().resolve().parent / "src"))
from music_recommender.config import Config

# Scikit-learn base classes and utilities
from sklearn.base import BaseEstimator, TransformerMixin, clone

# Preprocessing & feature selection
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_regression, VarianceThreshold
from sklearn.decomposition import PCA

# Models
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier

# Model selection
from sklearn.model_selection import train_test_split, GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

# Metrics
from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error,
    explained_variance_score, mean_absolute_percentage_error,
    accuracy_score, f1_score, precision_score, recall_score,
    balanced_accuracy_score, roc_auc_score
)

# Pandas display options
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [11]:
# Empty relative path (equivalent to "."); kept as a module-level name for other cells.
root = Path()

# Project configuration object; the three names below are shortcuts to its
# `paths.audio_spotify`, `paths.processed` and `paths.interim` entries.
cfg = Config()
asp, prc, intr = cfg.paths.audio_spotify, cfg.paths.processed, cfg.paths.interim

# Show the resolved locations so a reader can confirm which data tree is in use.
print(asp, prc, intr)

\\wsl.localhost\Ubuntu-22.04\home\rime\music-recom\data\processed\audio \\wsl.localhost\Ubuntu-22.04\home\rime\music-recom\data\processed \\wsl.localhost\Ubuntu-22.04\home\rime\music-recom\data\interim


In [12]:
class AudioLoader(BaseEstimator, TransformerMixin):
    """Load audio files from paths or bytes with caching support.

    Decoded audio is cached at two levels: a per-instance in-memory dict and
    one pickle file per item under ``cache_dir``. A pickled index
    (``cache_index.pkl`` inside ``cache_dir``) records the size and access
    times of the disk entries so the least recently used files can be evicted
    once ``max_cache_size_mb`` is exceeded.

    Parameters
    ----------
    sr : int
        Target sampling rate handed to ``librosa.load``.
    use_cache : bool
        Enable the memory + disk cache.
    cache_dir : str or Path, optional
        Directory for cache files; falls back to ``./audio_cache``.
    max_cache_size_mb : int
        Budget for the disk cache in megabytes. The in-memory cache is only
        trimmed when the matching disk entry is evicted or cleared.
    """

    def __init__(
        self,
        sr: int = 22050,
        use_cache: bool = True,
        cache_dir: Optional[Union[str, Path]] = None,
        max_cache_size_mb: int = 1000
    ):
        self.sr = sr
        self.use_cache = use_cache
        # NOTE(review): the constructor argument is converted to a Path here rather
        # than stored unmodified; this may conflict with sklearn's clone()/get_params
        # contract - confirm this estimator is never cloned before relying on it.
        self.cache_dir = Path(cache_dir) if cache_dir else Path('./audio_cache')
        self.max_cache_size_mb = max_cache_size_mb

        # In-memory cache for current session
        self._memory_cache = {}

        # Always define the index attributes so that enabling the cache later
        # (e.g. via set_params(use_cache=True)) does not hit a missing attribute.
        self._cache_index = {}
        self._index_dirty = False
        self._cache_index_path = self.cache_dir / "cache_index.pkl"

        # Setup cache directory
        if self.use_cache:
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            self._load_cache_index()

    def _cache_file(self, cache_key: str) -> Path:
        """Return the disk location of the pickle belonging to ``cache_key``."""
        return self.cache_dir / f"{cache_key}.pkl"

    def _load_cache_index(self):
        """Load cache index (tracks what's cached and access times).

        A missing, unreadable or malformed index results in an empty index.
        """
        self._cache_index = {}
        self._index_dirty = False
        if not self._cache_index_path.exists():
            return

        try:
            # SECURITY: pickle.load can execute arbitrary code; only point
            # cache_dir at directories this process (or a trusted user) wrote.
            with open(self._cache_index_path, 'rb') as f:
                loaded = pickle.load(f)
        except Exception as e:
            print(f"Warning: Could not load cache index: {e}")
            return

        if isinstance(loaded, dict):
            self._cache_index = loaded

    def _save_cache_index(self):
        """Save cache index to disk (best effort; failures only print a warning)."""
        try:
            with open(self._cache_index_path, 'wb') as f:
                pickle.dump(self._cache_index, f)
            self._index_dirty = False
        except Exception as e:
            print(f"Warning: Could not save cache index: {e}")

    def _register(self, cache_key: str, cache_file: Path):
        """Create (or overwrite) the index entry for a file that exists on disk."""
        now = np.datetime64('now')
        self._cache_index[cache_key] = {
            'file': cache_file,
            'size_mb': cache_file.stat().st_size / (1024 * 1024),  # MB
            'last_access': now,
            'created': now
        }
        self._index_dirty = True

    def _touch(self, cache_key: str):
        """Refresh the last-access time of an indexed entry, if there is one."""
        entry = self._cache_index.get(cache_key)
        if entry is not None:
            entry['last_access'] = np.datetime64('now')
            self._index_dirty = True

    def _get_cache_key(self, item: Union[str, Path, bytes]) -> str:
        """Generate unique cache key for an item.

        Paths are keyed by absolute path, modification time (when the file
        exists) and ``sr``; bytes are keyed by an MD5 of the full content and
        ``sr``. The key string is finally hashed with SHA-256.

        Raises
        ------
        ValueError
            If ``item`` is neither ``str``/``Path`` nor ``bytes``.
        """
        if isinstance(item, (str, Path)):
            path = Path(item)
            if path.exists():
                mtime = path.stat().st_mtime
                key_string = f"{path.absolute()}_{mtime}_{self.sr}"
            else:
                key_string = f"{path.absolute()}_{self.sr}"
        elif isinstance(item, bytes):
            content_hash = hashlib.md5(item).hexdigest()
            key_string = f"bytes_{content_hash}_{self.sr}"
        else:
            raise ValueError(f"Unsupported input type: {type(item)}")

        return hashlib.sha256(key_string.encode()).hexdigest()

    def _get_from_cache(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """Retrieve item from cache (memory or disk); ``None`` on a miss."""
        # Check memory cache first (fastest). The access time is refreshed here
        # too, otherwise frequently used entries would look old to the eviction.
        if cache_key in self._memory_cache:
            self._touch(cache_key)
            return self._memory_cache[cache_key]

        # Check disk cache
        cache_file = self._cache_file(cache_key)
        if not cache_file.exists():
            return None

        try:
            # SECURITY: pickle.load can execute arbitrary code; the cache
            # directory must not be writable by untrusted parties.
            with open(cache_file, 'rb') as f:
                data = pickle.load(f)
        except Exception as e:
            print(f"Warning: Could not load cache file {cache_file}: {e}")
            return None

        self._memory_cache[cache_key] = data

        if cache_key in self._cache_index:
            self._touch(cache_key)
        else:
            # File written in an earlier session whose index entry was never
            # persisted: adopt it so it counts towards the size limit.
            try:
                self._register(cache_key, cache_file)
            except OSError as e:
                print(f"Warning: Could not index cache file {cache_file}: {e}")

        return data

    def _save_to_cache(self, cache_key: str, data: Dict[str, Any]):
        """Save item to cache (memory and disk); disk failures only warn."""
        self._memory_cache[cache_key] = data

        cache_file = self._cache_file(cache_key)
        try:
            # The directory may not exist yet if caching was enabled after __init__.
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            with open(cache_file, 'wb') as f:
                pickle.dump(data, f)

            self._register(cache_key, cache_file)

            # Check cache size and cleanup if needed
            self._cleanup_cache_if_needed()

        except Exception as e:
            print(f"Warning: Could not save to cache: {e}")

    def _cleanup_cache_if_needed(self):
        """Remove old cache entries if cache size exceeds limit.

        Entries are removed in order of ascending ``last_access`` until the
        indexed total drops to 90% of ``max_cache_size_mb``; the index is then
        written to disk.
        """
        total_size = sum(info['size_mb'] for info in self._cache_index.values())
        if total_size <= self.max_cache_size_mb:
            return

        target_size = self.max_cache_size_mb * 0.9  # 90% of limit
        oldest_first = sorted(
            self._cache_index.items(),
            key=lambda x: x[1]['last_access']
        )

        for cache_key, info in oldest_first:
            if total_size <= target_size:
                break

            try:
                # Derive the location from the key instead of trusting the
                # pickled path, which is stale if the directory was moved.
                cache_file = self._cache_file(cache_key)
                if cache_file.exists():
                    cache_file.unlink()

                total_size -= info['size_mb']
                del self._cache_index[cache_key]
                self._memory_cache.pop(cache_key, None)

            except Exception as e:
                print(f"Warning: Could not remove cache file: {e}")

        # Save updated index
        self._save_cache_index()

    def fit(self, X, y=None):
        """No-op; present for the transformer interface. Returns ``self``."""
        return self

    def transform(self, X) -> np.ndarray:
        """
        Transform paths/bytes to audio data

        Parameters
        ----------
        X : str, Path, bytes, or list of these
            Audio file paths or raw audio bytes

        Returns
        -------
        np.ndarray
            Object array of dictionaries with keys ``audio``, ``sr``, ``path``
            and ``source_type``
        """
        # Normalize input
        if isinstance(X, (str, Path, bytes)):
            X = [X]
        elif hasattr(X, "tolist"):
            X = X.tolist()

        results = []
        cache_hits = 0
        cache_misses = 0

        for item in tqdm(X, desc="Loading audio"):
            if self.use_cache:
                cache_key = self._get_cache_key(item)
                cached_data = self._get_from_cache(cache_key)

                if cached_data is not None:
                    results.append(cached_data)
                    cache_hits += 1
                    continue

            # Load from source
            loaded = self._load_single(item)
            results.append(loaded)
            cache_misses += 1

            # Save to cache
            if self.use_cache:
                self._save_to_cache(cache_key, loaded)

        # Persist the index once per call so sizes and access times survive the
        # session; otherwise the disk limit is not enforced across sessions.
        if self.use_cache and self._index_dirty:
            self._save_cache_index()

        # Print cache statistics
        if self.use_cache and (cache_hits + cache_misses) > 0:
            hit_rate = cache_hits / (cache_hits + cache_misses) * 100
            print(f"Cache: {cache_hits} hits, {cache_misses} misses ({hit_rate:.1f}% hit rate)")

        return np.array(results, dtype=object)

    def _load_single(self, item: Union[str, Path, bytes]) -> Dict[str, Any]:
        """Load a single audio item with ``librosa.load`` at ``self.sr``.

        Raises
        ------
        ValueError
            If ``item`` is neither ``str``/``Path`` nor ``bytes``.
        """
        if isinstance(item, (str, Path)):
            audio, sr = librosa.load(item, sr=self.sr)
            return {
                "audio": audio,
                "sr": sr,
                "path": Path(item),
                "source_type": "path"
            }

        elif isinstance(item, bytes):
            audio, sr = librosa.load(io.BytesIO(item), sr=self.sr)
            # Only the first 10000 bytes feed this label, so uploads sharing
            # that prefix get the same pseudo path (the cache key is unaffected).
            audio_hash = hashlib.md5(item[:10000]).hexdigest()
            pseudo_path = Path(f"uploaded_{audio_hash}")

            return {
                "audio": audio,
                "sr": sr,
                "path": pseudo_path,
                "source_type": "bytes"
            }

        else:
            raise ValueError(f"Unsupported input type: {type(item)}")

    def clear_cache(self, older_than_days: Optional[int] = None):
        """
        Clear cache files

        Parameters
        ----------
        older_than_days : int, optional
            If specified, only clear cache entries older than this many days.
            If None, clears entire cache (every ``*.pkl`` file in ``cache_dir``).
        """
        if not self.use_cache:
            return

        if older_than_days is None:
            # Clear everything
            for cache_file in self.cache_dir.glob("*.pkl"):
                try:
                    cache_file.unlink()
                except Exception as e:
                    print(f"Warning: Could not remove cache file {cache_file}: {e}")
            self._cache_index = {}
            self._memory_cache = {}
            print(f"Cleared all cache from {self.cache_dir}")
        else:
            # Clear old entries
            cutoff = np.datetime64('now') - np.timedelta64(older_than_days, 'D')
            removed_count = 0

            for cache_key, info in list(self._cache_index.items()):
                if info['last_access'] >= cutoff:
                    continue
                try:
                    cache_file = self._cache_file(cache_key)
                    if cache_file.exists():
                        cache_file.unlink()
                    del self._cache_index[cache_key]
                    self._memory_cache.pop(cache_key, None)
                    removed_count += 1
                except Exception as e:
                    print(f"Warning: Could not remove cache file: {e}")

            print(f"Removed {removed_count} cache entries older than {older_than_days} days")

        self._save_cache_index()

    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics (sizes are taken from the index, not re-measured)."""
        if not self.use_cache:
            return {"cache_enabled": False}

        total_size = sum(info['size_mb'] for info in self._cache_index.values())

        # Guard against a zero (or otherwise falsy) budget to avoid ZeroDivisionError.
        if self.max_cache_size_mb:
            utilization = round(total_size / self.max_cache_size_mb * 100, 1)
        else:
            utilization = 0.0

        return {
            "cache_enabled": True,
            "cache_dir": str(self.cache_dir),
            "num_cached_items": len(self._cache_index),
            "total_size_mb": round(total_size, 2),
            "max_size_mb": self.max_cache_size_mb,
            "utilization_pct": utilization,
            "memory_cache_items": len(self._memory_cache)
        }

In [13]:
class MFCCExtractor(BaseEstimator, TransformerMixin):
    """Extract MFCC features with statistics (mean, std, min, max, median, quartiles).

    For every input item the MFCC matrix and its first and second order deltas
    are computed and each coefficient row is summarised by seven statistics.
    Results for path-based inputs can be cached as pickles under ``cache_dir``.

    Parameters
    ----------
    sr : int
        Sampling rate passed to ``librosa.feature.mfcc``.
    n_mfcc : int
        Number of MFCC coefficients.
    n_fft : int
        FFT window size.
    hop_length : int
        Hop length between frames.
    cache_dir : str or Path, optional
        Directory for cached feature dicts. Caching is skipped when unset.
    enable_cache : bool
        Master switch for the feature cache.
    """

    def __init__(self,
                 sr: int = 22050,
                 n_mfcc: int = 13,
                 n_fft: int = 2048,
                 hop_length: int = 512,
                 cache_dir: Optional[Union[str, Path]] = None,
                 enable_cache: bool = True
                 ):
        self.sr = sr
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.cache_dir = cache_dir
        self.enable_cache = enable_cache

        # Accept plain strings as well as Path objects for the directory.
        if enable_cache and cache_dir:
            Path(cache_dir).mkdir(parents=True, exist_ok=True)

    def fit(self, X, y=None):
        """No-op; present for the transformer interface. Returns ``self``."""
        return self

    def transform(self, X) -> pd.DataFrame:
        """
        Extract mfcc features from audio data

        Parameters
        ----------
        X : iterable of dict
            Items with keys ``audio``, ``path`` and ``source_type`` (the dicts
            produced by AudioLoader).

        Returns
        -------
        pd.DataFrame
            One row per input item, one column per statistic.
        """
        feature_vectors = []
        # Without a directory there is nothing to read or write, so skip the
        # key computation and lookups entirely in that case.
        caching_active = bool(self.enable_cache and self.cache_dir)

        for audio_dict in tqdm(X, desc='Extracting MFCC features'):
            audio = audio_dict["audio"]

            # Only path-based inputs are cached; the key is computed once and
            # reused for both the lookup and the later save.
            cache_key = None
            if caching_active and audio_dict["source_type"] == "path":
                cache_key = self._get_cache_key(audio_dict["path"])
                cached = self._load_from_cache(cache_key)
                if cached is not None:
                    feature_vectors.append(cached)
                    continue

            # NOTE(review): features use self.sr rather than audio_dict["sr"];
            # presumably both are configured identically - confirm with caller.
            features = self._extract_mfcc_stats(audio)

            if cache_key is not None:
                self._save_to_cache(cache_key, features)

            feature_vectors.append(features)

        return pd.DataFrame(feature_vectors)

    def _extract_mfcc_stats(self, audio: np.ndarray) -> Dict[str, float]:
        """Compute MFCCs, delta and delta-delta, and summarise each coefficient row.

        Keys are emitted as ``mfcc_<i>_*``, then ``delta_<i>_*``, then
        ``delta2_<i>_*`` for ``i`` in ``range(n_mfcc)``.
        """
        mfccs = librosa.feature.mfcc(
            y=audio,
            sr=self.sr,
            n_mfcc=self.n_mfcc,
            n_fft=self.n_fft,
            hop_length=self.hop_length
        )
        delta_mfccs = librosa.feature.delta(mfccs)
        delta2_mfccs = librosa.feature.delta(mfccs, order=2)

        stats = {}
        for prefix, matrix in (
            ("mfcc", mfccs),
            ("delta", delta_mfccs),
            ("delta2", delta2_mfccs),
        ):
            for i in range(self.n_mfcc):
                stats.update(self._make_stats(matrix[i, :], f"{prefix}_{i}"))

        return stats

    @staticmethod
    def _make_stats(feature_array: np.ndarray, name: str) -> Dict[str, float]:
        """Compute statistics for a feature array, keyed as ``<name>_<stat>``."""
        return {
            f"{name}_mean": float(np.mean(feature_array)),
            f"{name}_std": float(np.std(feature_array)),
            f"{name}_min": float(np.min(feature_array)),
            f"{name}_max": float(np.max(feature_array)),
            f"{name}_median": float(np.median(feature_array)),
            f"{name}_q25": float(np.percentile(feature_array, 25)),
            f"{name}_q75": float(np.percentile(feature_array, 75))
        }

    def _get_cache_key(self, audio_path: Path) -> str:
        """Generate cache key from the path and the extraction parameters.

        The file's modification time is not part of the key, so changes to a
        file at the same path are not detected.
        """
        params = f"{audio_path}_{self.sr}_{self.n_mfcc}_{self.n_fft}_{self.hop_length}"
        return hashlib.md5(params.encode()).hexdigest()

    def _cache_file(self, cache_key: str) -> Path:
        """Return the pickle location for ``cache_key`` (``cache_dir`` must be set)."""
        return Path(self.cache_dir) / f"{cache_key}.pkl"

    def _load_from_cache(self, cache_key: str) -> Optional[Dict[str, float]]:
        """Load from cache; ``None`` when unset, missing or unreadable."""
        if not self.cache_dir:
            return None

        cache_file = self._cache_file(cache_key)
        if not cache_file.exists():
            return None

        try:
            # SECURITY: pickle.load can execute arbitrary code; only use a
            # cache directory written by trusted code.
            with open(cache_file, 'rb') as f:
                return pickle.load(f)
        except Exception as e:
            # Treat an unreadable entry as a miss; it is overwritten on save.
            print(f"Warning: Could not load cache file {cache_file}: {e}")
            return None

    def _save_to_cache(self, cache_key: str, features: Dict[str, float]):
        """Save to cache (best effort; failures print a warning instead of raising)."""
        if not self.cache_dir:
            return

        cache_file = self._cache_file(cache_key)
        try:
            # Create the directory lazily: cache_dir may have been set after
            # construction, in which case __init__ never created it.
            cache_file.parent.mkdir(parents=True, exist_ok=True)
            with open(cache_file, 'wb') as f:
                pickle.dump(features, f)
        except Exception as e:
            print(f"Warning: Could not save to cache: {e}")

In [14]:
class PerFeaturePredictor:
    """Train multiple models for each individual feature (not grouped).

    Every target column listed in ``target_groups`` gets its own
    hyper-parameter search per configured model; the model with the highest
    cross-validation score is kept as that feature's "best" model.

    Parameters
    ----------
    target_groups : dict
        ``{task_type: {group_name: [feature_name, ...]}}`` with ``task_type``
        being ``'regression'`` and/or ``'classification'``.
    model_configs : dict
        ``{task_type: {model_name: {'base_model': estimator, 'param_grid': dict}}}``.
    n_iter_config : dict
        Search iterations per model name (30 when a model is not listed).
    use_scaler : bool
        Standardise the inputs with a ``StandardScaler`` fitted in ``fit``.
    """

    TASK_TYPES = ('regression', 'classification')

    def __init__(
        self,
        target_groups: Dict[str, Dict[str, List[str]]],
        model_configs: Dict[str, Dict[str, Dict]],
        n_iter_config: Dict[str, int],
        use_scaler: bool = True
    ):

        self.target_groups = target_groups
        self.model_configs = model_configs
        self.n_iter_config = n_iter_config
        self.use_scaler = use_scaler
        self.scaler = StandardScaler() if use_scaler else None

        # {task_type: {feature_name: {model_name: model}}}
        self.trained_models = {'regression': {}, 'classification': {}}

        # {task_type: {feature_name: {'model_name': str, 'model': obj, 'score': float, 'group': str}}}
        self.best_models = {'regression': {}, 'classification': {}}

        # {task_type: {feature_name: {model_name: {'best_score': ..., 'best_params': ..., 'cv_results': ...}}}}
        self.training_results = {'regression': {}, 'classification': {}}

        # Keep track of which group each feature belongs to
        self.feature_to_group = {'regression': {}, 'classification': {}}
        self._build_feature_mapping()

    def _build_feature_mapping(self):
        """Create mapping of feature -> group for reference"""
        for task_type in self.TASK_TYPES:
            for group_name, features in self.target_groups.get(task_type, {}).items():
                for feature in features:
                    self.feature_to_group[task_type][feature] = group_name

    @staticmethod
    def _strip_estimator_prefix(param_grid: Dict[str, Any]) -> Dict[str, Any]:
        """Drop a leading ``estimator__`` from search-space keys.

        Only the prefix is removed; a blanket ``str.replace`` would also mangle
        nested names such as ``estimator__base_estimator__C``.
        """
        prefix = 'estimator__'
        return {
            (key[len(prefix):] if key.startswith(prefix) else key): value
            for key, value in param_grid.items()
        }

    @staticmethod
    def _safe_name(feature_name: str) -> str:
        """Return ``feature_name`` with ``/`` and spaces replaced for use in filenames."""
        return feature_name.replace('/', '_').replace(' ', '_')

    def _scale(self, X: pd.DataFrame, fit: bool = False) -> pd.DataFrame:
        """Apply the scaler (fitting it first when ``fit``), keeping columns and index."""
        if not self.use_scaler:
            return X
        values = self.scaler.fit_transform(X) if fit else self.scaler.transform(X)
        return pd.DataFrame(values, columns=X.columns, index=X.index)

    def fit(self, X: pd.DataFrame, y: pd.DataFrame, cv: int = 4, n_jobs: int = -1):
        """Train all models for all features.

        Parameters
        ----------
        X : pd.DataFrame
            Input features.
        y : pd.DataFrame
            Targets; must contain every feature named in ``target_groups``.
        cv : int
            Cross-validation setting forwarded to the search.
        n_jobs : int
            Parallelism forwarded to the search.

        Returns
        -------
        PerFeaturePredictor
            ``self``, to allow call chaining.
        """
        X_scaled = self._scale(X, fit=True)

        for task_type in self.TASK_TYPES:
            if task_type in self.target_groups:
                self._fit_task(
                    X_scaled, y,
                    task_type=task_type,
                    cv=cv,
                    n_jobs=n_jobs
                )

        return self

    def _fit_task(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        task_type: str,
        cv: int,
        n_jobs: int
    ):
        """Train all models for each individual feature in task type.

        A model that raises during its search is reported and skipped; the
        remaining models for that feature are still trained.
        """

        print(f"TRAINING {task_type.upper()} MODELS - PER FEATURE")

        groups = self.target_groups[task_type]
        models = self.model_configs[task_type]
        scoring = 'r2' if task_type == 'regression' else 'balanced_accuracy'

        # Iterate through groups (for organization)
        for group_name, target_cols in groups.items():

            print(f"GROUP: {group_name.upper()}")

            # Train each feature individually
            for feature_name in target_cols:
                print(f"\n▶ FEATURE: {feature_name}")
                print(f"  Training {len(models)} models...\n")

                y_feature = y[feature_name]

                # Reset everything stored for this feature. Dropping the old
                # best model prevents a stale entry from an earlier fit from
                # surviving when every model fails this time.
                self.trained_models[task_type][feature_name] = {}
                self.training_results[task_type][feature_name] = {}
                self.best_models[task_type].pop(feature_name, None)

                best_score = -np.inf
                best_model_name = None

                # Train each model type on this single feature
                for model_name, config in models.items():
                    try:
                        # Always single output (one feature at a time)
                        base = clone(config['base_model'])
                        param_grid = self._strip_estimator_prefix(config['param_grid'])
                        n_iter = self.n_iter_config.get(model_name, 30)

                        search = BayesSearchCV(
                            estimator=base,
                            search_spaces=param_grid,
                            n_iter=n_iter,
                            cv=cv,
                            scoring=scoring,
                            n_jobs=n_jobs,
                            random_state=42,
                            verbose=0
                        )

                        # Fit on single feature
                        search.fit(X, y_feature.values.ravel())

                        self.trained_models[task_type][feature_name][model_name] = search.best_estimator_
                        self.training_results[task_type][feature_name][model_name] = {
                            'best_score': search.best_score_,
                            'best_params': search.best_params_,
                            'cv_results': search.cv_results_,
                            'group': group_name
                        }

                        print(f"  ✓ {model_name:20s} {scoring}={search.best_score_:7.4f}")

                        # Track best model for this feature
                        if search.best_score_ > best_score:
                            best_score = search.best_score_
                            best_model_name = model_name

                    except Exception as e:
                        # Deliberate best effort: one failing model must not
                        # abort the remaining searches.
                        print(f"  ✗ {model_name:20s} FAILED: {str(e)}")

                # Set best model for this feature
                if best_model_name is not None:
                    self.best_models[task_type][feature_name] = {
                        'model_name': best_model_name,
                        'model': self.trained_models[task_type][feature_name][best_model_name],
                        'score': best_score,
                        'group': group_name
                    }
                    print(f"\n  - BEST for '{feature_name}': {best_model_name} (score={best_score:.4f})")

    def predict(
        self,
        X: pd.DataFrame,
        use_best: bool = True,
        specific_models: Dict[str, Dict[str, str]] = None
    ) -> Dict[str, np.ndarray]:
        """Predict every feature that has a selected model.

        Parameters
        ----------
        X : pd.DataFrame
            Input features (scaled internally when ``use_scaler`` is set).
        use_best : bool
            Use the best model per feature. When False, only features listed
            in ``specific_models`` are predicted.
        specific_models : dict, optional
            ``{task_type: {feature_name: model_name}}`` picking a trained model.

        Returns
        -------
        dict
            ``{feature_name: predictions}``; features without a model are omitted.
        """
        X_scaled = self._scale(X)
        predictions = {}

        for task_type in self.TASK_TYPES:
            if task_type not in self.target_groups:
                continue

            chosen = (specific_models or {}).get(task_type, {})

            for group_name, features in self.target_groups[task_type].items():
                for feature_name in features:
                    if use_best:
                        if feature_name not in self.best_models[task_type]:
                            continue
                        model = self.best_models[task_type][feature_name]['model']
                    else:
                        if feature_name not in chosen:
                            continue
                        model = self.trained_models[task_type][feature_name][chosen[feature_name]]

                    # Predict for this single feature
                    predictions[feature_name] = model.predict(X_scaled)

        return predictions

    def evaluate(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        use_best: bool = True,
        specific_models: Optional[Dict[str, Dict[str, str]]] = None
    ) -> Dict[str, Dict[str, Dict]]:
        """
        Evaluate all models

        Parameters
        ----------
        X, y : pd.DataFrame
            Inputs and true targets.
        use_best : bool
            Forwarded to ``predict``.
        specific_models : dict, optional
            Forwarded to ``predict``; needed to obtain any result when
            ``use_best`` is False.

        Returns:
            {task_type: {feature_name: {metric: value}}}
        """
        predictions = self.predict(X, use_best=use_best, specific_models=specific_models)
        results = {'regression': {}, 'classification': {}}

        # Evaluate regression
        for group_name, features in self.target_groups.get('regression', {}).items():
            for feature_name in features:
                if feature_name not in predictions:
                    continue
                y_true = y[feature_name].values
                y_pred = predictions[feature_name]
                results['regression'][feature_name] = {
                    'r2': r2_score(y_true, y_pred),
                    'mae': mean_absolute_error(y_true, y_pred),
                    'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
                    'group': self.feature_to_group['regression'][feature_name]
                }

        # Evaluate classification
        for group_name, features in self.target_groups.get('classification', {}).items():
            for feature_name in features:
                if feature_name not in predictions:
                    continue
                y_true = y[feature_name].values
                y_pred = predictions[feature_name]
                results['classification'][feature_name] = {
                    'accuracy': accuracy_score(y_true, y_pred),
                    'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
                    'f1_weighted': f1_score(y_true, y_pred, average='weighted'),
                    'group': self.feature_to_group['classification'][feature_name]
                }

        return results

    def get_results_dataframe(self, eval_results: Dict[str, Dict[str, Dict]]) -> pd.DataFrame:
        """Flatten the output of ``evaluate`` into one row per feature.

        Rows are sorted by task type, target group and then descending primary
        metric (``r2_score`` for regression, ``balanced_accuracy`` for
        classification).
        """
        # Output column -> key in the per-feature metrics dict.
        metric_columns = {
            'regression': {'r2_score': 'r2', 'mae': 'mae', 'rmse': 'rmse'},
            'classification': {
                'accuracy': 'accuracy',
                'balanced_accuracy': 'balanced_accuracy',
                'f1_weighted': 'f1_weighted'
            }
        }

        rows = []
        for task_type, columns in metric_columns.items():
            for feature_name, metrics in eval_results.get(task_type, {}).items():
                best = self.best_models[task_type].get(feature_name)
                row = {
                    'task_type': task_type,
                    'feature': feature_name,
                    'target_group': metrics.get('group', 'Unknown'),
                    'model_name': best['model_name'] if best else 'Unknown',
                    'cv_score': best['score'] if best else None
                }
                for column, key in columns.items():
                    row[column] = metrics[key]
                rows.append(row)

        df = pd.DataFrame(rows)
        if df.empty:
            return df

        # Build one sort key covering both task types; sorting on r2_score
        # alone leaves classification rows (NaN there) in arbitrary order.
        primary = pd.Series(np.nan, index=df.index, dtype=float)
        for column in ('r2_score', 'balanced_accuracy'):
            if column in df.columns:
                primary = primary.fillna(df[column])

        return (
            df.assign(_primary=primary)
            .sort_values(['task_type', 'target_group', '_primary'], ascending=[True, True, False])
            .drop(columns='_primary')
            .reset_index(drop=True)
        )

    def get_all_feature_results(self) -> pd.DataFrame:
        """
        Get training results for ALL models trained on each feature
        (not just best model)

        Returns:
            DataFrame with all model results per feature, sorted by feature
            and descending CV score
        """
        rows = []

        for task_type in self.TASK_TYPES:
            for feature_name, models_dict in self.training_results.get(task_type, {}).items():
                best = self.best_models[task_type].get(feature_name)
                for model_name, results in models_dict.items():
                    rows.append({
                        'task_type': task_type,
                        'feature': feature_name,
                        'group': results.get('group', 'Unknown'),
                        'model_name': model_name,
                        'cv_score': results['best_score'],
                        'best_params': str(results['best_params']),
                        'is_best': best is not None and best['model_name'] == model_name
                    })

        df = pd.DataFrame(rows)
        if not df.empty:
            df = df.sort_values(['feature', 'cv_score'], ascending=[True, False])
            df = df.reset_index(drop=True)

        return df

    def save_models(self, save_dir: Path):
        """Save the scaler, the best model per feature and a metadata file to ``save_dir``."""
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)

        # Save scaler
        if self.use_scaler and self.scaler is not None:
            joblib.dump(self.scaler, save_dir / "scaler.pkl")
            print("✓ Saved scaler")

        # Save best models (per feature)
        for task_type in self.TASK_TYPES:
            for feature_name, info in self.best_models[task_type].items():
                path = save_dir / f"{task_type}_{self._safe_name(feature_name)}_best.pkl"
                joblib.dump(info, path)
                print(f"✓ Saved {task_type}/{feature_name}: {info['model_name']}")

        # Save metadata (everything about the best models except the estimators)
        metadata = {
            'target_groups': self.target_groups,
            'feature_to_group': self.feature_to_group,
            'best_models': {
                task: {
                    feature: {
                        'model_name': info['model_name'],
                        'score': info['score'],
                        'group': info['group']
                    }
                    for feature, info in features.items()
                }
                for task, features in self.best_models.items()
            }
        }
        joblib.dump(metadata, save_dir / "metadata.pkl")
        print("✓ Saved metadata")

    def load_models(self, load_dir: Path):
        """Load the scaler, metadata and best models written by ``save_models``.

        Only ``best_models`` is restored; ``trained_models`` and
        ``training_results`` are left untouched.
        """
        # SECURITY: joblib.load unpickles data and can execute arbitrary code;
        # only load directories produced by trusted code.
        load_dir = Path(load_dir)

        # Load scaler
        if self.use_scaler:
            scaler_path = load_dir / "scaler.pkl"
            if scaler_path.exists():
                self.scaler = joblib.load(scaler_path)
                print("✓ Loaded scaler")
            else:
                # Make the otherwise silent mismatch visible: the current
                # scaler object stays in place and may never have been fitted.
                print(f"Warning: use_scaler=True but no scaler found at {scaler_path}")

        # Load metadata
        metadata = joblib.load(load_dir / "metadata.pkl")
        self.feature_to_group = metadata.get('feature_to_group', self.feature_to_group)
        saved_best = metadata.get('best_models', {})

        # Load best models (per feature)
        for task_type in self.TASK_TYPES:
            for feature_name in saved_best.get(task_type, {}):
                path = load_dir / f"{task_type}_{self._safe_name(feature_name)}_best.pkl"
                if path.exists():
                    self.best_models[task_type][feature_name] = joblib.load(path)
                    print(f"✓ Loaded {task_type}/{feature_name}")


In [15]:
class MusicFeaturePipeline:
    """
    Complete pipeline: Audio → MFCC → (Optional PCA) → Multi-Model Predictions

    Wires an audio loader, an MFCC extractor, an optional PCA step and a
    ``PerFeaturePredictor`` together. When ``model_dir`` is set, ``train()``
    persists the fitted state there, and both ``predict()`` and ``evaluate()``
    restore it on demand when no models are held in memory.
    """

    def __init__(
        self,
        audio_loader,  # AudioLoader instance
        mfcc_extractor,  # MFCCExtractor instance
        target_groups: Dict[str, Dict[str, List[str]]],
        model_configs: Dict[str, Dict[str, Dict]],
        n_iter_config: Dict[str, int],
        model_dir: Optional[Path] = None,
        use_scaler: bool = True,
        use_pca: bool = False,
        n_components: Optional[Union[int, float]] = None
    ):
        """
        Args:
            audio_loader: Object whose ``transform(audio_paths)`` returns the loaded audio.
            mfcc_extractor: Object whose ``transform(audio)`` returns a feature DataFrame.
            target_groups: Forwarded unchanged to ``PerFeaturePredictor``.
            model_configs: Forwarded unchanged to ``PerFeaturePredictor``.
            n_iter_config: Forwarded unchanged to ``PerFeaturePredictor``.
            model_dir: Directory used to save/load the pipeline; ``None`` disables persistence.
            use_scaler: Forwarded unchanged to ``PerFeaturePredictor``.
            use_pca: Whether features go through PCA before reaching the predictor.
            n_components: Passed to ``PCA(n_components=...)`` when PCA is fitted.
        """
        self.audio_loader = audio_loader
        self.mfcc_extractor = mfcc_extractor
        self.use_pca = use_pca
        self.n_components = n_components
        self.pca = None  # set by _apply_pca(fit=True) or _load_pipeline()

        # Unified Predictor
        self.predictor = PerFeaturePredictor(
            target_groups=target_groups,
            model_configs=model_configs,
            n_iter_config=n_iter_config,
            use_scaler=use_scaler
        )
        self.model_dir = Path(model_dir) if model_dir else None

    def _apply_pca(self, features: pd.DataFrame, fit: bool = False) -> pd.DataFrame:
        """
        Apply PCA transformation to features

        Args:
            features: Feature DataFrame to project.
            fit: When True a new PCA is fitted on ``features`` (replacing any
                previous one); otherwise the already fitted/loaded PCA is reused.

        Returns:
            DataFrame with columns ``pca_0 .. pca_{k-1}`` and the index of ``features``.

        Raises:
            ValueError: If ``fit`` is False and no PCA has been fitted or loaded.
        """
        if fit:
            self.pca = PCA(n_components=self.n_components, random_state=42)
            features_transformed = self.pca.fit_transform(features)
            print(f"  PCA fitted: {features.shape[1]} → {self.pca.n_components_} components")
            print(f"  Explained variance: {self.pca.explained_variance_ratio_.sum():.4f}")
        else:
            if self.pca is None:
                raise ValueError("PCA not fitted. Call with fit=True first.")
            features_transformed = self.pca.transform(features)

        # Convert back to DataFrame, keeping the caller's row index
        return pd.DataFrame(
            features_transformed,
            columns=[f'pca_{i}' for i in range(features_transformed.shape[1])],
            index=features.index
        )

    def extract_features(
        self,
        audio_paths: List[Union[str, Path]],
        apply_pca: Optional[bool] = None,
        fit_pca: bool = False
    ) -> pd.DataFrame:
        """
        Turn audio paths into the feature matrix consumed by the predictor.

        Args:
            audio_paths: Paths handed to ``audio_loader.transform``.
            apply_pca: Whether to run the PCA step; ``None`` falls back to ``self.use_pca``.
            fit_pca: When PCA is applied, fit a new PCA on these features
                instead of reusing the existing one.

        Returns:
            The extractor's features, PCA-projected when requested.

        Raises:
            ValueError: If PCA is applied without fitting and none is available.
        """
        if apply_pca is None:
            apply_pca = self.use_pca

        print("Loading audio files...")
        audio_data = self.audio_loader.transform(audio_paths)

        print("Extracting MFCC features...")
        features = self.mfcc_extractor.transform(audio_data)

        if apply_pca:
            print("Applying PCA...")
            features = self._apply_pca(features, fit=fit_pca)

        return features

    def _ensure_models_loaded(self) -> None:
        """
        Make sure the predictor holds models, restoring them from ``model_dir`` if needed.

        Raises:
            ValueError: If no models are in memory and ``model_dir`` is unset or missing.
        """
        best_models = self.predictor.best_models
        if best_models['regression'] or best_models['classification']:
            return

        if self.model_dir and self.model_dir.exists():
            print("Loading models...")
            self._load_pipeline()
        else:
            raise ValueError("No models trained or loaded. Call train() first or provide valid model_dir")

    def train(
        self,
        audio_paths: List[Union[str, Path]],
        targets: pd.DataFrame,
        cv: int = 4,
        n_jobs: int = -1
    ):
        """
        Extract features, fit the predictor and (if ``model_dir`` is set) save everything.

        Args:
            audio_paths: Training audio paths.
            targets: Target DataFrame with one row per audio path.
            cv: Forwarded to ``predictor.fit``.
            n_jobs: Forwarded to ``predictor.fit``.

        Raises:
            ValueError: If the number of feature rows differs from ``len(targets)``.
        """
        # Extract features (with PCA if enabled); PCA is (re)fitted on the training data
        X = self.extract_features(audio_paths, fit_pca=self.use_pca)

        # Ensure alignment
        if len(X) != len(targets):
            raise ValueError(f"Mismatch: {len(X)} audio samples but {len(targets)} target rows")

        self.predictor.fit(X, targets, cv=cv, n_jobs=n_jobs)

        # Save models
        if self.model_dir:
            print("\n" + "="*60)
            print("SAVING MODELS")
            print("="*60)
            self._save_pipeline()

        print("\n" + "="*60)
        print("TRAINING COMPLETE")
        print("="*60)

    def predict(
        self,
        audio_paths: List[Union[str, Path]],
        use_best: bool = True
    ) -> Dict[str, np.ndarray]:
        """
        Predict all target features for the given audio files.

        Args:
            audio_paths: Audio paths to predict on.
            use_best: Forwarded to ``predictor.predict``.

        Returns:
            Whatever ``predictor.predict`` returns for the extracted features.

        Raises:
            ValueError: If no models are trained and none can be loaded from ``model_dir``.
        """
        # Load models if needed (this may also restore use_pca and the PCA transformer)
        self._ensure_models_loaded()

        # Extract features (with PCA if enabled, but don't fit)
        X = self.extract_features(audio_paths, apply_pca=self.use_pca, fit_pca=False)

        # Predict
        print("Generating predictions...")
        return self.predictor.predict(X, use_best=use_best)

    def evaluate(
        self,
        audio_paths: List[Union[str, Path]],
        targets: pd.DataFrame,
        use_best: bool = True
    ) -> pd.DataFrame:
        """
        Score the models against known targets.

        Args:
            audio_paths: Audio paths to evaluate on.
            targets: Ground-truth DataFrame with one row per audio path.
            use_best: Forwarded to ``predictor.evaluate``.

        Returns:
            The DataFrame built by ``predictor.get_results_dataframe``.

        Raises:
            ValueError: If no models are trained and none can be loaded from
                ``model_dir``, or if feature and target row counts differ.
        """
        # Same lazy restore as predict(): a fresh pipeline pointing at a saved
        # model_dir needs its models and PCA loaded before features are projected.
        self._ensure_models_loaded()

        # Extract features (with PCA if enabled, but don't fit)
        X = self.extract_features(audio_paths, apply_pca=self.use_pca, fit_pca=False)

        # Ensure alignment
        if len(X) != len(targets):
            raise ValueError(f"Mismatch: {len(X)} audio samples but {len(targets)} target rows")

        # Evaluate
        eval_results = self.predictor.evaluate(X, targets, use_best=use_best)

        # Convert to DataFrame
        return self.predictor.get_results_dataframe(eval_results)

    def get_training_summary(self) -> pd.DataFrame:
        """Return the predictor's ``get_all_feature_results()`` table."""
        return self.predictor.get_all_feature_results()

    def _save_pipeline(self):
        """
        Save complete pipeline including PCA

        Writes the predictor's models, ``pca.pkl`` (only when PCA is enabled and
        fitted) and ``pipeline_config.pkl`` into ``model_dir``, creating it if needed.

        Raises:
            ValueError: If ``model_dir`` was not specified.
        """
        if not self.model_dir:
            raise ValueError("model_dir not specified")

        self.model_dir.mkdir(parents=True, exist_ok=True)

        # Save predictor models
        self.predictor.save_models(self.model_dir)

        # Save PCA if used
        if self.use_pca and self.pca is not None:
            joblib.dump(self.pca, self.model_dir / "pca.pkl")
            print("✓ Saved PCA transformer")

        # Save pipeline config
        pipeline_config = {
            'use_pca': self.use_pca,
            'n_components': self.n_components
        }
        joblib.dump(pipeline_config, self.model_dir / "pipeline_config.pkl")
        print("✓ Saved pipeline configuration")

    def _load_pipeline(self):
        """
        Load complete pipeline including PCA

        A saved ``pipeline_config.pkl`` overrides the constructor's ``use_pca``
        and ``n_components``, so the restored pipeline matches how it was trained.

        Raises:
            ValueError: If ``model_dir`` was not specified, or PCA is enabled
                but ``pca.pkl`` is missing.
        """
        if not self.model_dir:
            raise ValueError("model_dir not specified")

        # Load predictor models
        self.predictor.load_models(self.model_dir)

        # Load pipeline config
        config_path = self.model_dir / "pipeline_config.pkl"
        if config_path.exists():
            pipeline_config = joblib.load(config_path)
            self.use_pca = pipeline_config.get('use_pca', False)
            self.n_components = pipeline_config.get('n_components', None)
            print("✓ Loaded pipeline configuration")

        # Load PCA if used
        if self.use_pca:
            pca_path = self.model_dir / "pca.pkl"
            if pca_path.exists():
                self.pca = joblib.load(pca_path)
                print("✓ Loaded PCA transformer")
            else:
                raise ValueError("PCA enabled but pca.pkl not found in model_dir")

In [16]:
# Targets to predict, organised as task type -> group name -> target column names.
# The column names match the columns selected into `y` in the data-prep cell.
target_groups = {
    "regression": {
        "energy_mood": ["energy", "valence", "danceability"],
        "production": ["loudness", "acousticness", "instrumentalness", "liveness"],
        "structure": ["speechiness"],
    },
    "classification": {
        "key": ["key"],
        "mode": ["mode"], 
        "tempo_bins": ["tempo_bins"]
    }
}

# Candidate models per task type. Each entry pairs an unfitted estimator
# ("base_model") with its hyper-parameter search space ("param_grid"), expressed
# as skopt dimensions (Real / Integer / Categorical).
# NOTE(review): the "estimator__" prefix presumably targets a wrapper that exposes
# the model as `.estimator` — confirm against PerFeaturePredictor.
model_configs = {
    "regression": {
        "Ridge": {
            "base_model": Ridge(),
            "param_grid": {
                "estimator__alpha": Real(0.1, 10.0, prior="log-uniform"),
                "estimator__solver": Categorical(["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
            },
        },
        "Lasso": {
            "base_model": Lasso(),
            "param_grid": {
                "estimator__alpha": Real(0.0001, 1.0, prior="log-uniform"),
                "estimator__selection": Categorical(["cyclic", "random"]),
            },
        },
        "ElasticNet": {
            "base_model": ElasticNet(),
            "param_grid": {
                "estimator__alpha": Real(0.0001, 1.0, prior="log-uniform"),
                "estimator__l1_ratio": Real(0.1, 1.0),
                "estimator__selection": Categorical(["cyclic", "random"]),
            },
        },
        "Random Forest": {
            "base_model": RandomForestRegressor(random_state=42),
            "param_grid": {
                "estimator__n_estimators": Integer(150, 350),
                "estimator__max_depth": Integer(10, 20),
                "estimator__min_samples_split": Integer(2, 8),
                "estimator__min_samples_leaf": Integer(3, 8),
                "estimator__max_features": Categorical(["sqrt", "log2", None]),
            },
        },
        "SVM": {
            "base_model": SVR(kernel="rbf"),
            "param_grid": {
                "estimator__C": Real(0.1, 100.0, prior="log-uniform"),
                "estimator__gamma": Real(0.00001, 0.01, prior="log-uniform"),
                "estimator__epsilon": Real(0.01, 0.2),
            },
        },
    },
    # Both classifiers use class_weight='balanced'
    "classification": {
        "Random Forest": {
            "base_model": RandomForestClassifier(random_state=42, class_weight='balanced'),
            "param_grid": {
                "estimator__n_estimators": Integer(150, 350),
                "estimator__max_depth": Integer(8, 20),
                "estimator__min_samples_split": Integer(2, 10),
                "estimator__min_samples_leaf": Integer(3, 6),
                "estimator__max_features": Categorical(["sqrt", "log2", None]),
                "estimator__bootstrap": Categorical([True, False]),
            },
        },
        "SVM": {
            "base_model": SVC(kernel="rbf", probability=True, random_state=42, class_weight='balanced'),
            "param_grid": {
                "estimator__C": Real(1.0, 20.0, prior="log-uniform"),
                "estimator__gamma": Real(0.001, 0.02, prior="log-uniform"),
            },
        },
    },
}

# Per-model integer budget, keyed by the model names used in model_configs
# (a name shared by both task types, e.g. "SVM", shares one entry).
# NOTE(review): presumably the number of search iterations per model — confirm
# against how PerFeaturePredictor consumes n_iter_config.
n_iter_config = {
    "Ridge": 30,
    "Lasso": 30,
    "ElasticNet": 30,
    "Random Forest": 50,  
    "SVM": 40,
}

In [17]:
# Quick path fix
def fix_path(path):
    """Return ``path`` as a string, with the WSL ``\\\\wsl.localhost\\`` UNC prefix rewritten to ``\\\\wsl$\\``.

    Paths that do not contain the ``wsl.localhost`` prefix come back unchanged
    (apart from being converted to ``str``).
    """
    localhost_prefix = '\\\\wsl.localhost\\'
    dollar_prefix = '\\\\wsl$\\'
    # str.replace is a no-op when the prefix is absent, so no membership test is needed
    return str(path).replace(localhost_prefix, dollar_prefix)

In [18]:
# Track metadata; the columns selected into `y` below are the prediction targets
audio_data = pd.read_csv(prc / "matched_metadata.csv")

# Discretise tempo into half-open bins [lo, hi) (right=False):
# <80, 80-100, 100-120, 120-140, 140-170, >=170
bins = [0, 80, 100, 120, 140, 170, float("inf")]
numeric_labels = list(range(len(bins) - 1))  # one integer label per bin: 0..5

audio_data["tempo_bins"] = pd.cut(
    audio_data["tempo"], bins=bins, labels=numeric_labels, right=False
)

# One audio file per track, named by the track id zero-padded to 6 digits
X = audio_data["track_id"].map(lambda track_id: asp / f"{str(track_id).zfill(6)}.mp3")
y = audio_data[
    [
        "danceability",
        "energy",
        "key",
        "loudness",
        "mode",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo_bins",
    ]
]

y.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo_bins
0,0.606,0.916,6,-8.162,1,0.0371,0.14,0.356,0.132,0.889,2
1,0.28,0.64,11,-7.799,0,0.123,0.349,0.675,0.136,0.0537,4
2,0.192,0.411,2,-9.445,1,0.0655,0.539,0.709,0.0909,0.139,0
3,0.584,0.918,7,-9.883,1,0.0345,0.0254,0.77,0.348,0.114,2
4,0.415,0.646,2,-12.022,1,0.0399,0.0189,0.948,0.0965,0.123,1


In [19]:
if __name__ == "__main__":
    # train_test_split is already provided by the notebook's import cell,
    # so it is not re-imported here.

    # Audio is loaded at 22.05 kHz with an on-disk cache; 13 MFCCs per frame
    audio_loader = AudioLoader(sr=22050, cache_dir=intr / "MFCC_cache")
    mfcc_extractor = MFCCExtractor(n_mfcc=13)

    # Split data (fixed seed so the train/test partition is reproducible)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")

    # Rewrite WSL UNC paths and turn the Series of paths into plain lists of str
    X_train_fixed = [fix_path(p) for p in X_train]
    X_test_fixed = [fix_path(p) for p in X_test]

    # Create complete pipeline with PCA
    pipeline = MusicFeaturePipeline(
        audio_loader=audio_loader,
        mfcc_extractor=mfcc_extractor,
        target_groups=target_groups,
        model_configs=model_configs,
        n_iter_config=n_iter_config,
        model_dir=Path("models/per_feature/"),
        use_scaler=True,
        use_pca=True,           # Enable PCA
        n_components=80         # 80 components (or use 0.95 for 95% variance)
    )

    # Train: Audio → MFCC → PCA → Models
    pipeline.train(X_train_fixed, y_train, cv=4, n_jobs=-1)

    # Evaluate: Audio → MFCC → PCA → Predictions → Metrics
    results_df = pipeline.evaluate(X_test_fixed, y_test)
    print("TEST SET RESULTS")
    print(results_df)

    # Get all training results
    all_results = pipeline.get_training_summary()
    print("ALL TRAINING RESULTS (CV SCORES)")

    print(all_results)

Training samples: 1383
Test samples: 346
Loading audio files...


Loading audio: 100%|██████████| 1383/1383 [1:25:24<00:00,  3.71s/it]   


Cache: 56 hits, 1327 misses (4.0% hit rate)
Extracting MFCC features...


Extracting MFCC features: 100%|██████████| 1383/1383 [00:50<00:00, 27.51it/s]


Applying PCA...
  PCA fitted: 273 → 80 components
  Explained variance: 0.9998
TRAINING REGRESSION MODELS - PER FEATURE
GROUP: ENERGY_MOOD

▶ FEATURE: energy
  Training 5 models...

  ✓ Ridge                r2= 0.5032
  ✓ Lasso                r2= 0.5071
  ✓ ElasticNet           r2= 0.5073
  ✓ Random Forest        r2= 0.4896
  ✓ SVM                  r2= 0.5372

  - BEST for 'energy': SVM (score=0.5372)

▶ FEATURE: valence
  Training 5 models...

  ✓ Ridge                r2= 0.2417
  ✓ Lasso                r2= 0.2577
  ✓ ElasticNet           r2= 0.2575
  ✓ Random Forest        r2= 0.2078
  ✓ SVM                  r2= 0.2852

  - BEST for 'valence': SVM (score=0.2852)

▶ FEATURE: danceability
  Training 5 models...

  ✓ Ridge                r2= 0.3989
  ✓ Lasso                r2= 0.4085
  ✓ ElasticNet           r2= 0.4086
  ✓ Random Forest        r2= 0.3497
  ✓ SVM                  r2= 0.4284

  - BEST for 'danceability': SVM (score=0.4284)
GROUP: PRODUCTION

▶ FEATURE: loudness
  Training



  ✓ Random Forest        r2= 0.1981
  ✓ SVM                  r2= 0.2105

  - BEST for 'instrumentalness': SVM (score=0.2105)

▶ FEATURE: liveness
  Training 5 models...

  ✓ Ridge                r2=-0.0408
  ✓ Lasso                r2= 0.0227
  ✓ ElasticNet           r2= 0.0217
  ✓ Random Forest        r2= 0.0183
  ✓ SVM                  r2= 0.0088

  - BEST for 'liveness': Lasso (score=0.0227)
GROUP: STRUCTURE

▶ FEATURE: speechiness
  Training 5 models...

  ✓ Ridge                r2= 0.0241
  ✓ Lasso                r2= 0.0765
  ✓ ElasticNet           r2= 0.0765
  ✓ Random Forest        r2= 0.0630
  ✓ SVM                  r2= 0.0896

  - BEST for 'speechiness': SVM (score=0.0896)
TRAINING CLASSIFICATION MODELS - PER FEATURE
GROUP: KEY

▶ FEATURE: key
  Training 2 models...

  ✓ Random Forest        balanced_accuracy= 0.1312
  ✓ SVM                  balanced_accuracy= 0.1375

  - BEST for 'key': SVM (score=0.1375)
GROUP: MODE

▶ FEATURE: mode
  Training 2 models...

  ✓ Random Forest  

Loading audio: 100%|██████████| 346/346 [04:37<00:00,  1.24it/s]


Cache: 14 hits, 332 misses (4.0% hit rate)
Extracting MFCC features...


Extracting MFCC features: 100%|██████████| 346/346 [00:11<00:00, 28.92it/s]


Applying PCA...
TEST SET RESULTS
         task_type           feature target_group     model_name  cv_score  \
0   classification               key          key            SVM  0.137535   
1   classification              mode         mode  Random Forest  0.547999   
2   classification        tempo_bins   tempo_bins            SVM  0.267584   
3       regression            energy  energy_mood            SVM  0.537209   
4       regression      danceability  energy_mood            SVM  0.428387   
5       regression           valence  energy_mood            SVM  0.285222   
6       regression      acousticness   production          Lasso  0.478154   
7       regression          loudness   production  Random Forest  0.424675   
8       regression  instrumentalness   production            SVM  0.210457   
9       regression          liveness   production          Lasso  0.022731   
10      regression       speechiness    structure            SVM  0.089577   

    r2_score       mae      rm

In [24]:
def format_predictions(predictions: Dict[str, np.ndarray], decimals: int = 2) -> Dict[str, Union[float, int]]:
    """Convert single-sample predictions into plain Python numbers.

    Args:
        predictions: Mapping of feature name to its prediction. Each value may be
            a one-element numpy array / numpy scalar, a plain Python scalar, or an
            indexable sequence (whose first element is used).
        decimals: Number of decimals that non-integer values are rounded to.

    Returns:
        Mapping of feature name to ``int`` (integer predictions) or rounded ``float``.

    Raises:
        ValueError: If an array-like value holds anything other than exactly one element.
    """
    formatted = {}

    for feature, value in predictions.items():
        if hasattr(value, 'item'):
            # numpy arrays / scalars: .item() only works for exactly one element,
            # so report batched input with a message that names the feature
            if np.size(value) != 1:
                raise ValueError(
                    f"format_predictions expects one prediction per feature, "
                    f"got {np.size(value)} values for '{feature}'"
                )
            scalar_value = value.item()
        elif np.isscalar(value):
            # plain Python numbers have neither .item nor indexing; use them as-is
            scalar_value = value
        else:
            # other sequences (e.g. lists): take the first element
            scalar_value = value[0]

        # Format based on type
        if isinstance(scalar_value, (np.integer, int)):
            formatted[feature] = int(scalar_value)
        else:
            formatted[feature] = round(float(scalar_value), decimals)

    return formatted

def print_predictions(predictions: Dict[str, np.ndarray], decimals: int = 2):
    """Print one ``name: value`` line per predicted feature.

    Values are first normalised with ``format_predictions``; integers are printed
    as-is, everything else with exactly ``decimals`` digits after the point.
    """
    rounded = format_predictions(predictions, decimals)

    print("\nPredicted Features:")
    for name in rounded:
        number = rounded[name]
        # Integer predictions carry no fractional part, so skip the fixed-point format
        rendered = f"{number}" if isinstance(number, int) else f"{number:.{decimals}f}"
        print(f"{name:20s}: {rendered}")

In [25]:
# Load and predict on new audio
# predict() is declared to take a list of paths, so the single file is wrapped in a list
prediction1 = pipeline.predict([prc / 'audio/000010.mp3'])
format_predictions(prediction1, decimals=2)

Loading audio files...


Loading audio: 100%|██████████| 1/1 [00:00<00:00, 167.12it/s]


Cache: 1 hits, 0 misses (100.0% hit rate)
Extracting MFCC features...


Extracting MFCC features: 100%|██████████| 1/1 [00:00<00:00, 27.39it/s]

Applying PCA...
Generating predictions...





{'energy': 0.97,
 'valence': 0.75,
 'danceability': 0.49,
 'loudness': -7.09,
 'acousticness': 0.15,
 'instrumentalness': 0.27,
 'liveness': 0.22,
 'speechiness': 0.07,
 'key': 6,
 'mode': 1,
 'tempo_bins': 4}