In [41]:
import pandas as pd
import numpy as np
from pathlib import  Path

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVR 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import joblib

In [42]:
# Get the current working directory (works in Jupyter)
root = Path.cwd().parent

In [43]:
# Build the full path
MFCC_df = pd.read_csv(root / 'data/processed/mfcc_features.csv')

MFCC_df.head()

Unnamed: 0,track_id,mfcc_mean_0,mfcc_mean_1,mfcc_mean_2,mfcc_mean_3,mfcc_mean_4,mfcc_mean_5,mfcc_mean_6,mfcc_mean_7,mfcc_mean_8,mfcc_mean_9,mfcc_mean_10,mfcc_mean_11,mfcc_mean_12,mfcc_std_0,mfcc_std_1,mfcc_std_2,mfcc_std_3,mfcc_std_4,mfcc_std_5,mfcc_std_6,mfcc_std_7,mfcc_std_8,mfcc_std_9,mfcc_std_10,mfcc_std_11,mfcc_std_12,delta_mean_0,delta_mean_1,delta_mean_2,delta_mean_3,delta_mean_4,delta_mean_5,delta_mean_6,delta_mean_7,delta_mean_8,delta_mean_9,delta_mean_10,delta_mean_11,delta_mean_12,delta_std_0,delta_std_1,delta_std_2,delta_std_3,delta_std_4,delta_std_5,delta_std_6,delta_std_7,delta_std_8,delta_std_9,delta_std_10,delta_std_11,delta_std_12,delta2_mean_0,delta2_mean_1,delta2_mean_2,delta2_mean_3,delta2_mean_4,delta2_mean_5,delta2_mean_6,delta2_mean_7,delta2_mean_8,delta2_mean_9,delta2_mean_10,delta2_mean_11,delta2_mean_12,delta2_std_0,delta2_std_1,delta2_std_2,delta2_std_3,delta2_std_4,delta2_std_5,delta2_std_6,delta2_std_7,delta2_std_8,delta2_std_9,delta2_std_10,delta2_std_11,delta2_std_12
0,109497,-14.46628,99.482414,-13.127948,43.313545,9.995256,5.39969,-5.793285,13.883069,-1.838129,2.187864,-2.674747,3.841628,-9.301169,26.336906,13.115074,12.082673,10.713166,7.633686,5.559512,5.984026,5.907621,5.963255,5.159916,5.725162,4.790164,5.414805,0.220803,0.009053,-0.002882,-0.003425,0.00075,0.0046,-0.003279,0.011742,0.006375,0.00165,0.000165,-0.000724,0.005746,4.41648,2.52771,1.879202,1.614143,1.372204,0.91686,1.072609,1.153425,1.141011,0.906527,1.142041,0.910949,1.01229,-0.135776,-0.035974,0.004075,-0.013851,-0.004475,-0.001072,-0.0025,-0.009171,-0.006136,-0.009151,-0.006148,-0.006779,-0.001375,2.662576,1.463119,1.183564,1.022441,0.859411,0.726815,0.79601,0.860587,0.880833,0.73046,0.814752,0.704768,0.741921
1,53666,-52.18589,99.23931,-19.81856,34.8412,-4.830251,7.452331,0.821418,-0.551589,0.541429,8.759988,-2.642872,6.46883,-2.554433,17.970594,16.446632,10.71229,8.822542,7.811877,5.619895,6.609158,6.775237,6.076496,6.250626,5.90914,5.741867,5.448841,0.012402,-0.007382,-0.005155,0.002076,-0.001866,-0.001652,-0.000474,-0.000553,-0.000199,0.011042,0.004957,-0.008891,-0.007915,3.199941,2.723585,2.00516,1.703189,1.259007,1.013505,1.200884,1.042044,1.084845,1.036201,1.091861,0.978752,1.000591,-0.011197,-0.015572,-0.007065,-0.002806,-0.001591,-0.006339,-0.008179,-0.007428,-0.002442,-0.002701,-0.006067,-0.005588,-0.005737,2.597663,1.668799,1.340824,1.059314,0.872083,0.693278,0.744236,0.724987,0.736523,0.742568,0.732206,0.654933,0.686628
2,55400,-37.400524,124.763824,-36.81307,21.995646,2.225886,21.032818,3.546124,10.978546,-5.663468,1.670306,-1.866043,4.414843,0.263866,25.074936,15.589418,11.059312,9.996412,8.603876,6.485661,7.064106,4.8184,4.59468,4.897986,4.508664,5.303911,5.510609,0.058065,-0.01056,0.011036,0.009491,-0.004898,0.002371,0.01321,0.016682,0.005274,0.004574,0.003347,0.005777,0.008513,4.374987,2.896857,1.517322,1.221072,1.538684,0.871887,1.157159,0.796285,0.784595,0.881469,0.744571,0.868556,0.73955,-0.057128,0.014514,-0.004043,0.006608,0.01231,-0.003013,0.000503,-0.001101,0.002705,-0.000753,-0.00103,0.000794,-0.001403,3.734957,1.728411,1.449256,1.055884,0.866639,0.662977,0.788613,0.562866,0.613871,0.658512,0.639172,0.61351,0.579289
3,10589,-380.00287,181.90225,-65.369286,10.870298,16.465267,-47.996254,-3.810154,-8.47399,-42.44927,-8.006559,-6.592747,-21.92328,-0.598965,65.70725,25.217339,45.143143,18.352695,13.137667,17.60754,8.177697,8.135131,13.13912,8.508199,9.304217,8.381541,8.219736,0.189026,0.083782,-0.052729,-0.005151,-0.000277,-0.026846,-0.010993,-0.0137,-0.027458,0.001743,0.007929,-0.008412,-0.008311,7.055398,3.073463,4.952223,2.348252,1.871211,1.966379,1.128496,1.124765,1.474231,1.269974,1.269533,1.053716,1.284437,-0.054324,-0.050451,0.013551,-0.010189,-0.012286,0.013427,-0.00487,-0.000656,0.008195,-2.5e-05,0.00531,0.010322,-0.001069,3.407578,1.517492,2.385228,1.016233,0.909341,0.9996,0.677607,0.646453,0.7891,0.651975,0.664161,0.649972,0.697728
4,55923,-18.598597,71.36076,-16.790304,51.837322,-2.433789,12.741719,6.08001,5.622876,0.253179,3.234265,-1.363661,1.0434,-7.887192,21.66691,18.164318,15.200961,13.062137,8.76108,9.015322,8.081576,7.085149,7.131298,6.38119,5.982719,5.748267,5.225229,0.066447,0.025715,-0.015879,0.012799,-0.001647,-0.011966,-0.006075,0.003678,0.003172,0.002557,-0.000214,-0.003642,-0.020094,4.150834,2.444548,2.64983,2.237288,1.532949,1.640232,1.431198,1.290325,1.357411,1.095927,1.101842,1.017437,0.991913,-0.040584,0.003627,0.011505,9.5e-05,0.00042,-0.011483,-0.005313,0.010726,0.000119,-0.002157,-5.5e-05,0.005269,-0.004375,3.154008,1.603595,1.603042,1.200993,0.932761,0.907508,0.857162,0.766474,0.761395,0.690974,0.697007,0.658661,0.626155


In [44]:
pd.set_option('display.max_columns', None)
matched_df = pd.read_csv(root / 'data/processed/matched_metadata.csv')

In [45]:
continuous_targets = ['energy',  
           'loudness', 'speechiness', 'acousticness', 
           'instrumentalness', 'liveness', 'valence', 'tempo']

In [46]:
matched_df_target = matched_df[['track_id'] + continuous_targets]
matched_df_target.head()

Unnamed: 0,track_id,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,10,0.916,-8.162,0.0371,0.14,0.356,0.132,0.889,111.563
1,237,0.64,-7.799,0.123,0.349,0.675,0.136,0.0537,140.368
2,238,0.411,-9.445,0.0655,0.539,0.709,0.0909,0.139,56.929
3,459,0.918,-9.883,0.0345,0.0254,0.77,0.348,0.114,108.305
4,459,0.646,-12.022,0.0399,0.0189,0.948,0.0965,0.123,93.887


In [47]:
c_targets_df = matched_df_target[["track_id"]+ continuous_targets]
merged_df = MFCC_df.merge(c_targets_df, on = 'track_id')

merged_df.head()

Unnamed: 0,track_id,mfcc_mean_0,mfcc_mean_1,mfcc_mean_2,mfcc_mean_3,mfcc_mean_4,mfcc_mean_5,mfcc_mean_6,mfcc_mean_7,mfcc_mean_8,mfcc_mean_9,mfcc_mean_10,mfcc_mean_11,mfcc_mean_12,mfcc_std_0,mfcc_std_1,mfcc_std_2,mfcc_std_3,mfcc_std_4,mfcc_std_5,mfcc_std_6,mfcc_std_7,mfcc_std_8,mfcc_std_9,mfcc_std_10,mfcc_std_11,mfcc_std_12,delta_mean_0,delta_mean_1,delta_mean_2,delta_mean_3,delta_mean_4,delta_mean_5,delta_mean_6,delta_mean_7,delta_mean_8,delta_mean_9,delta_mean_10,delta_mean_11,delta_mean_12,delta_std_0,delta_std_1,delta_std_2,delta_std_3,delta_std_4,delta_std_5,delta_std_6,delta_std_7,delta_std_8,delta_std_9,delta_std_10,delta_std_11,delta_std_12,delta2_mean_0,delta2_mean_1,delta2_mean_2,delta2_mean_3,delta2_mean_4,delta2_mean_5,delta2_mean_6,delta2_mean_7,delta2_mean_8,delta2_mean_9,delta2_mean_10,delta2_mean_11,delta2_mean_12,delta2_std_0,delta2_std_1,delta2_std_2,delta2_std_3,delta2_std_4,delta2_std_5,delta2_std_6,delta2_std_7,delta2_std_8,delta2_std_9,delta2_std_10,delta2_std_11,delta2_std_12,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,109497,-14.46628,99.482414,-13.127948,43.313545,9.995256,5.39969,-5.793285,13.883069,-1.838129,2.187864,-2.674747,3.841628,-9.301169,26.336906,13.115074,12.082673,10.713166,7.633686,5.559512,5.984026,5.907621,5.963255,5.159916,5.725162,4.790164,5.414805,0.220803,0.009053,-0.002882,-0.003425,0.00075,0.0046,-0.003279,0.011742,0.006375,0.00165,0.000165,-0.000724,0.005746,4.41648,2.52771,1.879202,1.614143,1.372204,0.91686,1.072609,1.153425,1.141011,0.906527,1.142041,0.910949,1.01229,-0.135776,-0.035974,0.004075,-0.013851,-0.004475,-0.001072,-0.0025,-0.009171,-0.006136,-0.009151,-0.006148,-0.006779,-0.001375,2.662576,1.463119,1.183564,1.022441,0.859411,0.726815,0.79601,0.860587,0.880833,0.73046,0.814752,0.704768,0.741921,0.897,-8.232,0.0874,0.00014,0.363,0.113,0.189,105.241
1,53666,-52.18589,99.23931,-19.81856,34.8412,-4.830251,7.452331,0.821418,-0.551589,0.541429,8.759988,-2.642872,6.46883,-2.554433,17.970594,16.446632,10.71229,8.822542,7.811877,5.619895,6.609158,6.775237,6.076496,6.250626,5.90914,5.741867,5.448841,0.012402,-0.007382,-0.005155,0.002076,-0.001866,-0.001652,-0.000474,-0.000553,-0.000199,0.011042,0.004957,-0.008891,-0.007915,3.199941,2.723585,2.00516,1.703189,1.259007,1.013505,1.200884,1.042044,1.084845,1.036201,1.091861,0.978752,1.000591,-0.011197,-0.015572,-0.007065,-0.002806,-0.001591,-0.006339,-0.008179,-0.007428,-0.002442,-0.002701,-0.006067,-0.005588,-0.005737,2.597663,1.668799,1.340824,1.059314,0.872083,0.693278,0.744236,0.724987,0.736523,0.742568,0.732206,0.654933,0.686628,0.874,-5.437,0.0447,1.9e-05,0.925,0.318,0.5,139.8
2,55400,-37.400524,124.763824,-36.81307,21.995646,2.225886,21.032818,3.546124,10.978546,-5.663468,1.670306,-1.866043,4.414843,0.263866,25.074936,15.589418,11.059312,9.996412,8.603876,6.485661,7.064106,4.8184,4.59468,4.897986,4.508664,5.303911,5.510609,0.058065,-0.01056,0.011036,0.009491,-0.004898,0.002371,0.01321,0.016682,0.005274,0.004574,0.003347,0.005777,0.008513,4.374987,2.896857,1.517322,1.221072,1.538684,0.871887,1.157159,0.796285,0.784595,0.881469,0.744571,0.868556,0.73955,-0.057128,0.014514,-0.004043,0.006608,0.01231,-0.003013,0.000503,-0.001101,0.002705,-0.000753,-0.00103,0.000794,-0.001403,3.734957,1.728411,1.449256,1.055884,0.866639,0.662977,0.788613,0.562866,0.613871,0.658512,0.639172,0.61351,0.579289,0.979,-4.001,0.0893,6.5e-05,0.00159,0.807,0.106,62.513
3,10589,-380.00287,181.90225,-65.369286,10.870298,16.465267,-47.996254,-3.810154,-8.47399,-42.44927,-8.006559,-6.592747,-21.92328,-0.598965,65.70725,25.217339,45.143143,18.352695,13.137667,17.60754,8.177697,8.135131,13.13912,8.508199,9.304217,8.381541,8.219736,0.189026,0.083782,-0.052729,-0.005151,-0.000277,-0.026846,-0.010993,-0.0137,-0.027458,0.001743,0.007929,-0.008412,-0.008311,7.055398,3.073463,4.952223,2.348252,1.871211,1.966379,1.128496,1.124765,1.474231,1.269974,1.269533,1.053716,1.284437,-0.054324,-0.050451,0.013551,-0.010189,-0.012286,0.013427,-0.00487,-0.000656,0.008195,-2.5e-05,0.00531,0.010322,-0.001069,3.407578,1.517492,2.385228,1.016233,0.909341,0.9996,0.677607,0.646453,0.7891,0.651975,0.664161,0.649972,0.697728,0.338,-7.432,0.0317,0.767,8e-06,0.113,0.247,94.139
4,55923,-18.598597,71.36076,-16.790304,51.837322,-2.433789,12.741719,6.08001,5.622876,0.253179,3.234265,-1.363661,1.0434,-7.887192,21.66691,18.164318,15.200961,13.062137,8.76108,9.015322,8.081576,7.085149,7.131298,6.38119,5.982719,5.748267,5.225229,0.066447,0.025715,-0.015879,0.012799,-0.001647,-0.011966,-0.006075,0.003678,0.003172,0.002557,-0.000214,-0.003642,-0.020094,4.150834,2.444548,2.64983,2.237288,1.532949,1.640232,1.431198,1.290325,1.357411,1.095927,1.101842,1.017437,0.991913,-0.040584,0.003627,0.011505,9.5e-05,0.00042,-0.011483,-0.005313,0.010726,0.000119,-0.002157,-5.5e-05,0.005269,-0.004375,3.154008,1.603595,1.603042,1.200993,0.932761,0.907508,0.857162,0.766474,0.761395,0.690974,0.697007,0.658661,0.626155,0.943,-5.427,0.09,0.121,0.0,0.34,0.566,79.396


**SVM**

In [48]:
X = merged_df.drop(['track_id'] + continuous_targets, axis=1)

# Define parameter grid optimized for RBF kernel
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],              
    'gamma': ['scale', 'auto', 0.0001, 0.001, 0.01, 0.1, 1, 10],  
    'epsilon': [0.001, 0.01, 0.05, 0.1, 0.15, 0.2]    
}

# SVR 
svr = SVR(kernel='rbf')

# Grid search (5-fold cross-validation)
grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1  # Use all CPU cores for faster processing
)


results =[]

for target in continuous_targets:
    
    y = merged_df[target]
    
    # Train test split 
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )
    
    # Create a NEW scaler for each target
    scaler = StandardScaler()
    
    # Scaling
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fitting the model
    grid_search.fit(X_train_scaled, y_train)
    
    # The best model
    best_model = grid_search.best_estimator_
    
    print(f"\nBest hyperparameters for {target}:")
    for param, value in grid_search.best_params_.items():
        print(f"  {param}: {value}")
    
    # Predictions
    y_train_pred = best_model.predict(X_train_scaled)
    y_test_pred = best_model.predict(X_test_scaled)
    
    # Performance metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)


    results.append({
        'target': target,
        'best_params': grid_search.best_params_,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'train_r2': train_r2,
        'test_r2': test_r2,
        'best_model': best_model,
        'scaler': scaler
    })

    
    print(f"\nPerformance Metrics for {target}:")
    print(f"\nTraining Set:")
    print(f"  RMSE: {train_rmse:.4f}")
    print(f"  MAE:  {train_mae:.4f}")
    print(f"  R²:   {train_r2:.4f}")
    print(f"\nTest Set:")
    print(f"  RMSE: {test_rmse:.4f}")
    print(f"  MAE:  {test_mae:.4f}")
    print(f"  R²:   {test_r2:.4f}")


# Convert to DataFrame for easy comparison
results_df = pd.DataFrame(results)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits

Best hyperparameters for energy:
  C: 0.1
  epsilon: 0.01
  gamma: 0.01

Performance Metrics for energy:

Training Set:
  RMSE: 0.1514
  MAE:  0.1093
  R²:   0.6759

Test Set:
  RMSE: 0.1627
  MAE:  0.1274
  R²:   0.6262
Fitting 5 folds for each of 288 candidates, totalling 1440 fits

Best hyperparameters for loudness:
  C: 100
  epsilon: 0.2
  gamma: 0.001

Performance Metrics for loudness:

Training Set:
  RMSE: 2.9976
  MAE:  1.9158
  R²:   0.5814

Test Set:
  RMSE: 3.2748
  MAE:  2.4861
  R²:   0.4638
Fitting 5 folds for each of 288 candidates, totalling 1440 fits

Best hyperparameters for speechiness:
  C: 0.1
  epsilon: 0.01
  gamma: scale

Performance Metrics for speechiness:

Training Set:
  RMSE: 0.0562
  MAE:  0.0196
  R²:   0.3780

Test Set:
  RMSE: 0.0678
  MAE:  0.0299
  R²:   0.1200
Fitting 5 folds for each of 288 candidates, totalling 1440 fits

Best hyperparameters for acousticness:
  C: 1
  epsilon: 0.15


In [49]:
results_df

Unnamed: 0,target,best_params,train_rmse,test_rmse,train_r2,test_r2,best_model,scaler
0,energy,"{'C': 0.1, 'epsilon': 0.01, 'gamma': 0.01}",0.151385,0.162687,0.675912,0.626211,"SVR(C=0.1, epsilon=0.01, gamma=0.01)",StandardScaler()
1,loudness,"{'C': 100, 'epsilon': 0.2, 'gamma': 0.001}",2.99763,3.274823,0.581409,0.463764,"SVR(C=100, epsilon=0.2, gamma=0.001)",StandardScaler()
2,speechiness,"{'C': 0.1, 'epsilon': 0.01, 'gamma': 'scale'}",0.056214,0.067831,0.378032,0.120037,"SVR(C=0.1, epsilon=0.01)",StandardScaler()
3,acousticness,"{'C': 1, 'epsilon': 0.15, 'gamma': 0.001}",0.242435,0.245616,0.530722,0.493493,"SVR(C=1, epsilon=0.15, gamma=0.001)",StandardScaler()
4,instrumentalness,"{'C': 1, 'epsilon': 0.2, 'gamma': 0.001}",0.315394,0.338894,0.274985,0.138757,"SVR(C=1, epsilon=0.2, gamma=0.001)",StandardScaler()
5,liveness,"{'C': 1, 'epsilon': 0.1, 'gamma': 0.001}",0.152968,0.148412,0.084517,0.032472,"SVR(C=1, gamma=0.001)",StandardScaler()
6,valence,"{'C': 0.1, 'epsilon': 0.1, 'gamma': 0.01}",0.197901,0.215388,0.432425,0.309995,"SVR(C=0.1, gamma=0.01)",StandardScaler()
7,tempo,"{'C': 10, 'epsilon': 0.2, 'gamma': 'auto'}",26.972359,27.424687,0.258293,0.099215,"SVR(C=10, epsilon=0.2, gamma='auto')",StandardScaler()


Ecellent genereralisation: Energy and Accousticness.

In [50]:
import joblib
import json
from datetime import datetime
from pathlib import Path


for target in ['energy', 'acousticness']:
    # Get model data from results_df
    row = results_df[results_df['target'] == target].iloc[0]
    
    # Create directory
    (root / 'models' / target).mkdir(parents=True, exist_ok=True)
    
    # Save model and scaler
    joblib.dump(row['best_model'], root / 'models' / target / f'svr_{target}_model.pkl')
    joblib.dump(row['scaler'], root / 'models' / target / f'svr_{target}_scaler.pkl')
    
    # Save metadata
    metadata = {
        'target': target,
        'model_type': 'SVR',
        'trained_date': datetime.now().strftime('%Y-%m-%d'),
        'hyperparameters': row['best_params'],
        'performance': {
            'test_r2': float(row['test_r2']),
            'test_rmse': float(row['test_rmse'])
        }
    }
    
    with open(root / 'models' / target / f'svr_{target}_metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
    
    print(f"✓ Saved {target} model")

✓ Saved energy model
✓ Saved acousticness model


**Random Forest Regresor**

In [51]:
not_well_svr = ['loudness', 'speechiness', 'instrumentalness', 'liveness', 'valence', 'tempo']

In [52]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd

# Prepare features
X = merged_df.drop(['track_id'] + continuous_targets, axis=1)


print("RANDOM FOREST WITH STRONGER REGULARIZATION")


# IMPROVED PARAMETER GRID - Focus on preventing overfitting
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],  # Limit tree depth more aggressively
    'min_samples_split': [10, 20, 30],  # Require more samples to split
    'min_samples_leaf': [5, 10, 15],  # Require more samples in leaf nodes
    'max_features': ['sqrt', 'log2'],  # Limit features considered
    'bootstrap': [True],  # Always use bootstrap for regularization
    'max_samples': [0.7, 0.8, 0.9],  # Subsample training data
    'min_impurity_decrease': [0.0001, 0.001, 0.01]  # Require minimum improvement
}

print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")


# Random Forest with OOB scoring
rf = RandomForestRegressor(random_state=42, n_jobs=-1, oob_score=True)

# Grid Search
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    return_train_score=True
)

results_rf2 = []

# Define targets that need extra regularization
high_overfitting_targets = ['loudness', 'speechiness', 'instrumentalness', 
                             'liveness', 'tempo', 'valence']

for target in high_overfitting_targets:

    print(f"Processing target: {target}")

    
    y = merged_df[target]
    
    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )
    
    # Scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fit Grid Search
    print(f"\nFitting Random Forest with regularization...")
    grid_search.fit(X_train_scaled, y_train)
    
    best_model = grid_search.best_estimator_
    
    print(f"\nBest hyperparameters for {target}:")
    for param, value in grid_search.best_params_.items():
        print(f"  {param}: {value}")
    
    # OOB Score
    if hasattr(best_model, 'oob_score_'):
        print(f"\nOut-of-Bag R² Score: {best_model.oob_score_:.4f}")
    
    # CV Results
    cv_results = grid_search.cv_results_
    best_idx = grid_search.best_index_
    cv_train_mse = -cv_results['mean_train_score'][best_idx]
    cv_val_mse = -cv_results['mean_test_score'][best_idx]
    
    print(f"\nCross-Validation:")
    print(f"  Train MSE: {cv_train_mse:.4f}")
    print(f"  Val MSE:   {cv_val_mse:.4f}")
    print(f"  Ratio:     {cv_val_mse/cv_train_mse:.2f}")
    
    # Predictions
    y_train_pred = best_model.predict(X_train_scaled)
    y_test_pred = best_model.predict(X_test_scaled)
    
    # Metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    overfitting_gap = train_r2 - test_r2
    
    
    # Store results
    results_rf2.append({
        'target': target,
        'model_type': 'Random Forest (Regularized)',
        'best_params': grid_search.best_params_,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'train_mae': train_mae,
        'test_mae': test_mae,
        'train_r2': train_r2,
        'test_r2': test_r2,
        'overfitting_gap': overfitting_gap,
        'oob_score': best_model.oob_score_ if hasattr(best_model, 'oob_score_') else None,
        'cv_train_mse': cv_train_mse,
        'cv_val_mse': cv_val_mse,
        'best_model': best_model,
        'scaler': scaler
    })
    
    print(f"\nPerformance Metrics for {target}:")
    print(f"\nTraining Set:")
    print(f"  RMSE: {train_rmse:.4f}")
    print(f"  MAE:  {train_mae:.4f}")
    print(f"  R²:   {train_r2:.4f}")
    print(f"\nTest Set:")
    print(f"  RMSE: {test_rmse:.4f}")
    print(f"  MAE:  {test_mae:.4f}")
    print(f"  R²:   {test_r2:.4f}")
    print(f"\nOverfitting Analysis:")
    print(f"  Gap: {overfitting_gap:.4f}")
    
    if overfitting_gap < 0.15:
        print("  ✓ Good generalization")
    elif overfitting_gap < 0.3:
        print("  Moderate overfitting")
    else:
        print("  High overfitting - features may not capture target well")

# Convert to DataFrame
results_rf2_df = pd.DataFrame(results_rf2)

RANDOM FOREST WITH STRONGER REGULARIZATION
Total combinations: 1458
Processing target: loudness

Fitting Random Forest with regularization...
Fitting 5 folds for each of 1458 candidates, totalling 7290 fits

Best hyperparameters for loudness:
  bootstrap: True
  max_depth: 15
  max_features: sqrt
  max_samples: 0.9
  min_impurity_decrease: 0.01
  min_samples_leaf: 5
  min_samples_split: 10
  n_estimators: 200

Out-of-Bag R² Score: 0.4151

Cross-Validation:
  Train MSE: 6.0484
  Val MSE:   12.5675
  Ratio:     2.08

Performance Metrics for loudness:

Training Set:
  RMSE: 2.4525
  MAE:  1.7280
  R²:   0.7198

Test Set:
  RMSE: 3.2103
  MAE:  2.4708
  R²:   0.4847

Overfitting Analysis:
  Gap: 0.2351
  Moderate overfitting
Processing target: speechiness

Fitting Random Forest with regularization...
Fitting 5 folds for each of 1458 candidates, totalling 7290 fits

Best hyperparameters for speechiness:
  bootstrap: True
  max_depth: 5
  max_features: sqrt
  max_samples: 0.7
  min_impurity_

In [53]:
results_rf2_df

Unnamed: 0,target,model_type,best_params,train_rmse,test_rmse,train_mae,test_mae,train_r2,test_r2,overfitting_gap,oob_score,cv_train_mse,cv_val_mse,best_model,scaler
0,loudness,Random Forest (Regularized),"{'bootstrap': True, 'max_depth': 15, 'max_feat...",2.452549,3.21025,1.72798,2.470821,0.719799,0.484702,0.235097,0.415126,6.048432,12.567539,"(DecisionTreeRegressor(max_depth=15, max_featu...",StandardScaler()
1,speechiness,Random Forest (Regularized),"{'bootstrap': True, 'max_depth': 5, 'max_featu...",0.065832,0.068637,0.035149,0.035114,0.147001,0.098986,0.048016,0.075526,0.004329,0.004701,"(DecisionTreeRegressor(max_depth=5, max_featur...",StandardScaler()
2,instrumentalness,Random Forest (Regularized),"{'bootstrap': True, 'max_depth': 15, 'max_feat...",0.227124,0.326108,0.196808,0.286401,0.624021,0.202519,0.421502,0.217767,0.051393,0.108064,"(DecisionTreeRegressor(max_depth=15, max_featu...",StandardScaler()
3,liveness,Random Forest (Regularized),"{'bootstrap': True, 'max_depth': 15, 'max_feat...",0.126381,0.148017,0.092742,0.113351,0.375095,0.037621,0.337475,0.036069,0.015379,0.024194,"(DecisionTreeRegressor(max_depth=15, max_featu...",StandardScaler()
4,tempo,Random Forest (Regularized),"{'bootstrap': True, 'max_depth': 15, 'max_feat...",20.043738,27.522049,15.479801,21.785006,0.590408,0.092808,0.4976,0.075053,401.213898,905.552731,"(DecisionTreeRegressor(max_depth=15, max_featu...",StandardScaler()
5,valence,Random Forest (Regularized),"{'bootstrap': True, 'max_depth': 15, 'max_feat...",0.150418,0.208919,0.12328,0.17259,0.672111,0.350824,0.321286,0.267868,0.022311,0.051362,"(DecisionTreeRegressor(max_depth=15, max_featu...",StandardScaler()


In [57]:
import joblib
import json
from datetime import datetime
from pathlib import Path

for target in ['loudness']:  
    # Get model data from results_df
    row = results_df[results_df['target'] == target].iloc[0]  # ← Add .iloc[0] to get the row as a Series
    
    # Create directory
    (root / 'models' / target).mkdir(parents=True, exist_ok=True)
    
    # Save model and scaler
    joblib.dump(row['best_model'], root / 'models' / target / f'rf_{target}_model.pkl')
    
    # Save metadata - convert all values to native Python types
    metadata = {
        'target': target,
        'model_type': 'RandomForest',
        'trained_date': datetime.now().strftime('%Y-%m-%d'),
        'hyperparameters': row['best_params'],  # This is already a dict
        'performance': {
            'test_r2': float(row['test_r2']),      # ← Convert to float
            'test_rmse': float(row['test_rmse'])   # ← Convert to float
        }
    }
    
    with open(root / 'models' / target / f'rf_{target}_metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
    
    print(f"✓ Saved {target} model")

✓ Saved loudness model


## **LightGBM**

In [55]:
!pip install lightgbm



In [56]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')


# CONFIGURATION


# Targets to train (the poor performers from RF/SVR)
targets_to_train = ['speechiness', 'instrumentalness', 'liveness', 'valence', 'tempo']


def get_lgbm_model():
    """Returns a configured LightGBM model"""
    return LGBMRegressor(
        n_estimators=1000,
        max_depth=8,
        learning_rate=0.01,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        verbose=-1  # Suppress training output
    )


# TRAINING AND EVALUATION


# Store results
lgbm_results = []


print("LIGHTGBM TRAINING FOR POOR-PERFORMING FEATURES")


for target in targets_to_train:
    print(f"Training LightGBM for: {target.upper()}")

    y = merged_df[target]
    
    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )
    
    # Scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    
    # Initialize model
    lgbm_model = get_lgbm_model()
    
    # Cross-validation
    print("\nPerforming 5-fold cross-validation...")
    cv_results = cross_validate(
        lgbm_model, 
        X_train_scaled, 
        y_train,
        cv=5,
        scoring=['neg_mean_squared_error', 'r2'],
        return_train_score=True,
        n_jobs=-1
    )
    
    cv_train_mse = -cv_results['train_neg_mean_squared_error'].mean()
    cv_val_mse = -cv_results['test_neg_mean_squared_error'].mean()
    cv_train_r2 = cv_results['train_r2'].mean()
    cv_val_r2 = cv_results['test_r2'].mean()
    
    print(f"CV Train MSE: {cv_train_mse:.4f}")
    print(f"CV Val MSE:   {cv_val_mse:.4f}")
    print(f"CV Train R²:  {cv_train_r2:.4f}")
    print(f"CV Val R²:    {cv_val_r2:.4f}")
    print(f"CV Ratio:     {cv_val_mse/cv_train_mse:.2f}")
    
    # Train final model
    print("\nTraining final model...")
    lgbm_model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_train_pred = lgbm_model.predict(X_train_scaled)
    y_test_pred = lgbm_model.predict(X_test_scaled)
    
    # Training metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    
    # Test metrics
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    # Overfitting analysis
    overfitting_gap = train_r2 - test_r2
    
    # Print results
    print("PERFORMANCE METRICS")
    print("\nTraining Set:")
    print(f"  RMSE: {train_rmse:.4f}")
    print(f"  MAE:  {train_mae:.4f}")
    print(f"  R²:   {train_r2:.4f}")
    
    print("\nTest Set:")
    print(f"  RMSE: {test_rmse:.4f}")
    print(f"  MAE:  {test_mae:.4f}")
    print(f"  R²:   {test_r2:.4f}")
    
    print(f"\nOverfitting Analysis:")
    print(f"  Gap: {overfitting_gap:.4f}")
    if overfitting_gap < 0.1:
        print("  ✓ Good generalization")
    elif overfitting_gap < 0.2:
        print("  Moderate overfitting")
    else:
        print("  High overfitting")
    

    # Store results
    lgbm_results.append({
        'target': target,
        'model_type': 'LightGBM',
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'train_mae': train_mae,
        'test_mae': test_mae,
        'train_r2': train_r2,
        'test_r2': test_r2,
        'overfitting_gap': overfitting_gap,
        'cv_train_mse': cv_train_mse,
        'cv_val_mse': cv_val_mse,
        'cv_train_r2': cv_train_r2,
        'cv_val_r2': cv_val_r2,
        'best_model': lgbm_model
    })

lgbm_results_df = pd.DataFrame(lgbm_results)

LIGHTGBM TRAINING FOR POOR-PERFORMING FEATURES
Training LightGBM for: SPEECHINESS

Performing 5-fold cross-validation...
CV Train MSE: 0.0007
CV Val MSE:   0.0050
CV Train R²:  0.8538
CV Val R²:    -0.0187
CV Ratio:     6.82

Training final model...
PERFORMANCE METRICS

Training Set:
  RMSE: 0.0275
  MAE:  0.0132
  R²:   0.8508

Test Set:
  RMSE: 0.0667
  MAE:  0.0331
  R²:   0.1487

Overfitting Analysis:
  Gap: 0.7020
  High overfitting
Training LightGBM for: INSTRUMENTALNESS

Performing 5-fold cross-validation...
CV Train MSE: 0.0085
CV Val MSE:   0.1099
CV Train R²:  0.9381
CV Val R²:    0.1945
CV Ratio:     12.95

Training final model...
PERFORMANCE METRICS

Training Set:
  RMSE: 0.1025
  MAE:  0.0694
  R²:   0.9234

Test Set:
  RMSE: 0.3246
  MAE:  0.2646
  R²:   0.2098

Overfitting Analysis:
  Gap: 0.7136
  High overfitting
Training LightGBM for: LIVENESS

Performing 5-fold cross-validation...
CV Train MSE: 0.0030
CV Val MSE:   0.0260
CV Train R²:  0.8831
CV Val R²:    -0.0179
CV

In [61]:
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score

# Focus on the two most promising targets
for target in ['valence', 'tempo']:
    print(f"Training Ensemble for: {target.upper()}")
    
    y = merged_df[target]
    
    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )


    # Scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create models
    rf = RandomForestRegressor(n_estimators=300, max_depth=12, random_state=42)
    xgb = XGBRegressor(n_estimators=400, max_depth=6, learning_rate=0.02, random_state=42)
    svr = SVR(C=1.0, gamma=0.01, epsilon=0.1)
    
    # Ensemble with weighted voting (favor XGBoost)
    ensemble = VotingRegressor([
        ('rf', rf),
        ('xgb', xgb),
        ('svr', svr)
    ], weights=[1, 3, 1])
    
    # Train
    ensemble.fit(X_train_scaled, y_train)
    
    # Evaluate
    y_pred = ensemble.predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Ensemble R²: {r2:.4f}")
    
    # Compare with individual models
    for name, model in [('RF', rf), ('XGB', xgb), ('SVR', svr)]:
        model.fit(X_train_scaled, y_train)
        r2_ind = r2_score(y_test, model.predict(X_test_scaled))
        print(f"{name} R²: {r2_ind:.4f}")

Training Ensemble for: VALENCE
Ensemble R²: 0.3680
RF R²: 0.3769
XGB R²: 0.3366
SVR R²: 0.3206
Training Ensemble for: TEMPO
Ensemble R²: 0.0726
RF R²: 0.0815
XGB R²: 0.0151
SVR R²: 0.0534
