In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [2]:
# Read the raw CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; point
# DATASET_PATH at your own copy of the file before running.
DATASET_PATH = r"D:\Study\ML\dataset-of-10s.csv"
df = pd.read_csv(DATASET_PATH)

In [3]:
# Columns kept out of the feature matrix: 'track', 'artist' and 'uri'
# (presumably text identifiers - confirm against the CSV), the dataset's own
# 'target' column, and 'danceability', which is the value being predicted.
NON_FEATURE_COLUMNS = ['track', 'artist', 'uri', 'target', 'danceability']

y = df['danceability']
X = df.drop(columns=NON_FEATURE_COLUMNS)

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Learn the scaling parameters from the training rows only, then apply that
# same fitted transform to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
def evaluate_regressor(model, X_test_scaled, y_test, model_name):
    """Score a fitted regressor on held-out data and print the results.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test_scaled: Feature matrix handed to ``model.predict``.
        y_test: True target values the predictions are compared against.
        model_name: Label shown in the printed header line.

    Returns:
        The R-squared score. The MSE is printed but not returned.
    """
    # Header goes out first so it is visible even if prediction fails.
    print(f"\n--- Evaluating: {model_name} ---")

    predictions = model.predict(X_test_scaled)
    r2 = r2_score(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)

    print(
        f"R-Squared (R²): {r2:.4f}",
        f"Mean Squared Error (MSE): {mse:.4f}",
        sep="\n",
    )
    return r2

In [7]:
# Model 1: Linear Regression (Baseline)
print("\nTraining Linear Regression...")
# fit() returns the estimator itself, so construction and training chain.
lr_model = LinearRegression().fit(X_train_scaled, y_train)
lr_r2 = evaluate_regressor(
    lr_model,
    X_test_scaled,
    y_test,
    "Linear Regression",
)


Training Linear Regression...

--- Evaluating: Linear Regression ---
R-Squared (R²): 0.4430
Mean Squared Error (MSE): 0.0216


In [8]:
# Model 2: Random Forest Regressor
print("\nTraining Random Forest Regressor...")
# 100 trees; the fixed seed keeps the fitted forest repeatable.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)
rf_r2 = evaluate_regressor(
    rf_model,
    X_test_scaled,
    y_test,
    "Random Forest Regressor",
)


Training Random Forest Regressor...

--- Evaluating: Random Forest Regressor ---
R-Squared (R²): 0.6185
Mean Squared Error (MSE): 0.0148


In [9]:
# Model 3: XGBoost Regressor
print("\nTraining XGBoost Regressor...")
# 100 boosting rounds with a fixed seed, matching the Random Forest settings.
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)
xgb_r2 = evaluate_regressor(
    xgb_model,
    X_test_scaled,
    y_test,
    "XGBoost Regressor",
)


Training XGBoost Regressor...

--- Evaluating: XGBoost Regressor ---
R-Squared (R²): 0.6106
Mean Squared Error (MSE): 0.0151


In [11]:
# Model 4: Support Vector Regressor (SVR)
print("\nTraining Support Vector Regressor (SVR)...")
# SVR is sensitive to feature scale, so it is trained on the standardized
# matrices like the other models; default hyperparameters are used.
svr_model = SVR()
svr_model.fit(X_train_scaled, y_train)
svr_r2 = evaluate_regressor(svr_model, X_test_scaled, y_test, "Support Vector Regressor (SVR)")


Training Support Vector Regressor (SVR)...

--- Evaluating: Support Vector Regressor (SVR) ---
R-Squared (R²): 0.5678
Mean Squared Error (MSE): 0.0168


In [13]:
# Every fitted candidate, keyed by display name -> (estimator, test-set R²).
models = {
    'Linear Regression': (lr_model, lr_r2),
    'Random Forest': (rf_model, rf_r2),
    'XGBoost': (xgb_model, xgb_r2),
    'SVR': (svr_model, svr_r2),
}

# Pick the entry with the highest R²; ties resolve to the earliest entry.
# NOTE(review): the scores come from the held-out test split, so the winning
# score is an optimistic estimate - consider a separate validation split.
best_model_name, best_entry = max(models.items(), key=lambda item: item[1][1])
best_model_obj = best_entry[0]

print(f"\n--- Best Model Selection ---")
print(f"The best model is: {best_model_name} with R² = {models[best_model_name][1]:.4f}")


--- Best Model Selection ---
The best model is: Random Forest with R² = 0.6185


In [14]:
best_model_to_test = best_model_obj
scaler_to_use = scaler

print(f"\n--- Testing the Best Model: {best_model_name} ---")

# Column label for each value in `sample_song_features`, in the same order.
feature_names = [
    'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence',
    'tempo', 'duration_ms', 'time_signature', 'chorus_hit', 'sections'
]

sample_song_features = [
    0.85,    # energy
    7,       # key
    -5.5,    # loudness
    1,       # mode
    0.08,    # speechiness
    0.15,    # acousticness
    0.0,     # instrumentalness
    0.12,    # liveness
    0.78,    # valence
    124.0,   # tempo
    210000,  # duration_ms
    4,       # time_signature
    45.0,    # chorus_hit
    11       # sections
]

# The scaler was fitted on a DataFrame, so give it a labelled one-row frame
# instead of a bare list. A bare list is matched purely by position (and
# recent scikit-learn versions warn about the missing feature names).
# pandas raises ValueError here if the two lists differ in length.
sample_song = pd.DataFrame([sample_song_features], columns=feature_names)

# Align to the exact column order seen during fitting, when the scaler
# recorded one. A feature missing from the sample raises KeyError rather than
# silently scaling the wrong column.
fitted_columns = getattr(scaler_to_use, "feature_names_in_", None)
if fitted_columns is not None:
    sample_song = sample_song[list(fitted_columns)]

sample_song_scaled = scaler_to_use.transform(sample_song)
print("Test song features scaled.")

# predict() returns one value per input row; the sample has a single row.
prediction = best_model_to_test.predict(sample_song_scaled)
predicted_danceability = prediction[0]

print("\n--- PREDICTION COMPLETE ---")
print(f"Predicted Danceability Score: {predicted_danceability:.4f}")
# Bucket the score into a human-readable verdict (cut-offs: 0.75 and 0.5).
if predicted_danceability > 0.75:
    print("This is a very danceable song!")
elif predicted_danceability > 0.5:
    print("This song is moderately danceable.")
else:
    print("This song is not very danceable.")


--- Testing the Best Model: Random Forest ---
Test song features scaled.

--- PREDICTION COMPLETE ---
Predicted Danceability Score: 0.6979
This song is moderately danceable.


