In [2]:
# import models and fit
import pandas as pd

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score

In [21]:
# Load data
X_train_scaled = pd.read_csv('../data/processed/X_train_scaled.csv')
X_test_scaled = pd.read_csv('../data/processed/X_test_scaled.csv')
y_train = pd.read_csv('../data/processed/y_train.csv')
y_test = pd.read_csv('../data/processed/y_test.csv')

X_train_scaled.describe()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
count,26266.0,26266.0,26266.0,26266.0,26266.0,26266.0,26266.0,26266.0,26266.0,26266.0,26266.0,26266.0
mean,9.955065e-17,-5.166895e-16,3.489683e-17,-3.2462170000000005e-17,5.302154e-17,1.527751e-16,8.034386000000001e-17,-7.236358e-18,-2.16685e-16,-4.217377e-16,4.450022e-16,-9.109696000000001e-17
std,1.000019,1.000019,1.000019,1.000019,1.000019,1.000019,1.000019,1.000019,1.000019,1.000019,1.000019,1.000019
min,-4.505887,-3.844018,-1.488402,-13.30553,-1.147447,-1.056623,-0.8004158,-0.3775346,-1.227342,-2.18337,-4.490573,-3.69134
25%,-0.6307899,-0.6473257,-0.9338486,-0.4884452,-1.147447,-0.6528069,-0.731501,-0.3775346,-0.6283951,-0.7681144,-0.7762985,-0.6363735
50%,0.1194509,0.1245226,0.1752589,0.1854683,0.8715,-0.4385574,-0.4340791,-0.3774654,-0.4058923,0.003843458,0.04243293,-0.1635064
75%,0.7303121,0.7805936,1.00709,0.6918454,0.8715,0.2466464,0.3647889,-0.3558461,0.3702802,0.7843786,0.4865662,0.4609262
max,2.260046,1.668219,1.561643,2.677165,0.8715,7.60222,3.706251,4.060626,5.214891,2.066686,4.406957,4.856351


In [12]:
# Ridge Regression (L2)
ridge_model = Ridge(alpha=1.0)  # alpha is the regularization strength
ridge_model.fit(X_train_scaled, y_train)

y_train_ridge = ridge_model.predict(X_train_scaled)
y_test_ridge = ridge_model.predict(X_test_scaled)

# Lasso Regression (L1)
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train_scaled, y_train)

y_train_lasso = lasso_model.predict(X_train_scaled)
y_test_lasso = lasso_model.predict(X_test_scaled)

In [14]:
train_mse_ridge = accuracy_score(y_train, y_train_ridge)
train_r2_ridge = precision_score(y_train, y_train_ridge)

mse_ridge = accuracy_score(y_test, y_test_ridge)
r2_ridge = precision_score(y_test, y_test_ridge)

ridge_metrics = {'MSE':mse_ridge, 'R2':r2_ridge}

print(f'Train MSE: \t {train_mse_ridge}')
print(f'Test MSE: \t {mse_ridge}')
print(f'Train R2: \t {train_r2_ridge}')
print(f'Test R2: \t {r2_ridge}')

Train MSE: 	 0.16756935391332417
Test MSE: 	 0.16429258534304636
Train R2: 	 0.8359779862515065
Test R2: 	 0.8567334823029755


In [5]:
#Logistic Regression
LR_model = LinearRegression()
LR_model.fit(X_train_scaled, y_train)

y_train_LR = LR_model.predict(X_train_scaled)
y_test_LR = LR_model.predict(X_test_scaled)

train_mse_LR = accuracy_score(y_train, y_train_LR)
train_r2_LR = precision_score(y_train, y_train_LR)

mse_LR = accuracy_score(y_test, y_test_LR)
r2_LR = precision_score(y_test, y_test_LR)

LR_metrics = {'MSE':mse_LR, 'R2':r2_LR}

print(f'Train MSE: \t {train_mse_LR}')
print(f'Test MSE: \t {mse_LR}')
print(f'Train R2: \t {train_r2_LR}')
print(f'Test R2: \t {r2_LR}')



Train MSE: 	 0.1621316220371805
Test MSE: 	 0.14469695049134182
Train R2: 	 0.841300604688114
Test R2: 	 0.8738212794266512


In [25]:
# XGBoots
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_scaled, y_train)

y_train_xgb = xgb_model.predict(X_train_scaled)
y_test_xgb = xgb_model.predict(X_test_scaled)

train_accuracy_xgb = accuracy_score(y_train, y_train_xgb)
train_precison_xgb = precision_score(y_train, y_train_xgb, average='macro')
train_recall_xgb = recall_score(y_train, y_train_xgb, average='macro')

accuracy_xgb = accuracy_score(y_test, y_test_xgb)
precison_xgb = precision_score(y_test, y_test_xgb, average='macro')
recall_xgb = recall_score(y_test, y_test_xgb, average='macro')

XGB_metrics = {'accuracy':accuracy_xgb, 'precision':precison_xgb, 'recall':recall_xgb}

print(f'Train accuracy: \t {train_accuracy_xgb}')
print(f'Test accuracy: \t {accuracy_xgb}')
print(f'Train precision: \t {train_precison_xgb}')
print(f'Test precision: \t {precison_xgb}')
print(f'Train recall: \t {train_recall_xgb}')
print(f'Test recall: \t {recall_xgb}')

Train accuracy: 	 0.8022157922789919
Test accuracy: 	 0.5545911375057103
Train precision: 	 0.8031288225784352
Test precision: 	 0.5525002839747288
Train recall: 	 0.8020504499967954
Test recall: 	 0.5535543739210276


In [26]:
# Decision Tree
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train_scaled, y_train)

y_train_tree = tree_model.predict(X_train_scaled)
y_test_tree = tree_model.predict(X_test_scaled)

train_accuracy_tree = accuracy_score(y_train, y_train_tree)
train_precision_tree = precision_score(y_train, y_train_tree, average='weighted')

accuracy_tree = accuracy_score(y_test, y_test_tree)
precision_tree = precision_score(y_test, y_test_tree, average='weighted')

DTree_metrics = {'MSE':accuracy_tree, 'R2':precision_tree}

print(f'Train MSE: \t {train_accuracy_tree}')
print(f'Test MSE: \t {accuracy_tree}')
print(f'Train R2: \t {train_precision_tree}')
print(f'Test R2: \t {precision_tree}')

Train MSE: 	 0.9390086042792964
Test MSE: 	 0.43535861123800823
Train R2: 	 0.9405669715549807
Test R2: 	 0.4377462249914943


In [23]:
# Random Forest
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train_scaled, y_train)

y_train_forest = forest_model.predict(X_train_scaled)
y_test_forest = forest_model.predict(X_test_scaled)

train_accuracy_forest = accuracy_score(y_train, y_train_forest)
train_precision_forest = precision_score(y_train, y_train_forest, average='macro')

accuracy_forest = accuracy_score(y_test, y_test_forest)
precision_forest = precision_score(y_test, y_test_forest, average='macro')

Forest_metrics = {'MSE':accuracy_forest, 'R2':precision_forest}

print(f'Train MSE: \t {train_accuracy_forest}')
print(f'Test MSE: \t {accuracy_forest}')
print(f'Train R2: \t {train_precision_forest}')
print(f'Test R2: \t {precision_forest}')

  forest_model.fit(X_train_scaled, y_train)


Train MSE: 	 0.9390086042792964
Test MSE: 	 0.5513933302878027
Train R2: 	 0.9392078085242707
Test R2: 	 0.5445726133076186


In [24]:
# Random Forest
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train_scaled, y_train)

y_train_forest = forest_model.predict(X_train_scaled)
y_test_forest = forest_model.predict(X_test_scaled)

train_accuracy_forest = accuracy_score(y_train, y_train_forest)
train_precision_forest = precision_score(y_train, y_train_forest, average='weighted')

accuracy_forest = accuracy_score(y_test, y_test_forest)
precision_forest = precision_score(y_test, y_test_forest, average='weighted')

Forest_metrics = {'MSE':accuracy_forest, 'R2':precision_forest}

print(f'Train MSE: \t {train_accuracy_forest}')
print(f'Test MSE: \t {accuracy_forest}')
print(f'Train R2: \t {train_precision_forest}')
print(f'Test R2: \t {precision_forest}')

  forest_model.fit(X_train_scaled, y_train)


Train MSE: 	 0.9390086042792964
Test MSE: 	 0.5513933302878027
Train R2: 	 0.9389949379060863
Test R2: 	 0.5450224124541274


In [9]:
# SVR
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train_scaled, y_train)

y_train_svr = svr_model.predict(X_train_scaled)
y_test_svr = svr_model.predict(X_test_scaled)

train_mse_svr = accuracy_score(y_train, y_train_svr)
train_r2_svr = precision_score(y_train, y_train_tree)

mse_svr = accuracy_score(y_test, y_test_svr)
r2_svr = precision_score(y_test, y_test_svr)

SVR_metrics = {'MSE':mse_svr, 'R2':r2_svr}

print(f'Train MSE: \t {train_mse_svr}')
print(f'Test MSE: \t {mse_svr}')
print(f'Train R2: \t {train_r2_svr}')
print(f'Test R2: \t {r2_svr}')

Train MSE: 	 0.15843753400973856
Test MSE: 	 0.2779567950484619
Train R2: 	 1.0
Test R2: 	 0.7576159507523135


  y = column_or_1d(y, warn=True)


#### Cross-Validation for Regularization Parameter Tuning

In [19]:
# Define a grid of alpha values for Ridge
param_grid = {'alpha': [0.01, 0.1, 1.0, 10, 100]}
ridge_cv = GridSearchCV(Ridge(), param_grid, cv=5)
ridge_cv.fit(X_train_scaled, y_train)

# Best alpha
print("Best alpha:", ridge_cv.best_params_)

# Evaluate the best model
best_ridge_model = ridge_cv.best_estimator_
print("Best Ridge Train R²:", best_ridge_model.score(X_train_scaled, y_train))
print("Best Ridge Test R²:", best_ridge_model.score(X_test_scaled, y_test))

Best alpha: {'alpha': 10}
Best Ridge Train R²: 0.8301067235378031
Best Ridge Test R²: 0.8452075328700599
