In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import xgboost as xgb

In [None]:
# Read the CSV file (path is relative to the notebook's working directory).
df = pd.read_csv('../tracks.csv')

# Every genre indicator column shares this prefix; collect the column names once
# instead of re-running the same comprehension for each use.
LABEL_PREFIX = 'label_'
label_columns = [col for col in df.columns if col.startswith(LABEL_PREFIX)]

# Model inputs: drop the label columns and the identifier/text columns.
features = df.drop(columns=label_columns)
features = features.drop(columns=['track_id', 'track_name', 'track_artist'])

# Extract labels (main + subgenre): per row, take the name of the label column holding
# the row maximum and strip ONLY the leading prefix. (str.replace would also delete
# any later 'label_' substring inside a genre name.)
# NOTE(review): assumes exactly one label column is set per row -- idxmax returns the
# first maximal column otherwise; confirm against the dataset.
labels_main_and_sub = df[label_columns].idxmax(axis=1).apply(
    lambda column_name: column_name[len(LABEL_PREFIX):]
)

# Extract labels (main genre only): the text before the first underscore of the
# combined label.
labels_main_only = labels_main_and_sub.apply(lambda genre: genre.split('_')[0])

In [None]:
# Feature matrix shared by both classification targets.
X = features.copy()

# Integer-encode the string labels. Each target keeps its own fitted encoder
# (both encoders are handed to train_and_evaluate further down).
main_genre_encoder = LabelEncoder()
sub_genre_encoder = LabelEncoder()
main_encoded = main_genre_encoder.fit_transform(labels_main_only)
sub_encoded = sub_genre_encoder.fit_transform(labels_main_and_sub)

# 80/20 hold-out split per target. Both calls use the same X and the same seed
# without stratification, so they select the same rows for train and test.
X_train_main, X_test_main, y_train_main, y_test_main = train_test_split(
    X,
    main_encoded,
    test_size=0.2,
    random_state=42,
)
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(
    X,
    sub_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from scipy.stats import uniform, randint

# The number of target classes decides which search settings are valid:
#   * scikit-learn's plain "roc_auc" scorer only accepts binary targets and raises
#     on multiclass ones, so multiclass runs use the one-vs-rest variant instead.
#   * XGBoost's scale_pos_weight only applies to binary objectives, so it is only
#     worth sampling when the target is binary.
n_classes = len(set(y_train_main))
is_binary = n_classes == 2

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_dist = {
    "n_estimators":      randint(150, 450),
    "max_depth":         randint(3, 9),
    "learning_rate":     uniform(0.03, 0.17),   # 0.03 .. 0.20
    "subsample":         uniform(0.7, 0.3),     # 0.7 .. 1.0
    "colsample_bytree":  uniform(0.7, 0.3),     # 0.7 .. 1.0
    "gamma":             uniform(0, 5),
    "min_child_weight":  uniform(0, 5),
    "reg_alpha":         uniform(0, 2),
    "reg_lambda":        uniform(0.5, 4.5),     # 0.5 .. 5.0
}
if is_binary:
    param_dist["scale_pos_weight"] = uniform(0.8, 4.2)   # 0.8 .. 5.0

# NOTE(review): tree_method="gpu_hist" is deprecated as of XGBoost 2.0 in favour of
# tree_method="hist" with device="cuda" -- switch once the installed version is confirmed.
clf = XGBClassifier(
    tree_method="gpu_hist",
    n_jobs=-1,
    random_state=42
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # keeps class balance

search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=25,
    cv=cv,
    scoring="roc_auc" if is_binary else "roc_auc_ovr",
    # The estimator already uses every core (n_jobs=-1) and a single GPU; running
    # candidate fits in parallel on top of that only creates thread/GPU contention.
    n_jobs=1,
    verbose=1,
    random_state=42,           # make the sampled candidates reproducible
    error_score="raise"        # optional: crash instead of filling nan
)

search.fit(X_train_main, y_train_main)


In [None]:
# One train_and_evaluate call per target: each receives that target's train/test split,
# its label encoder and a text title (presumably used for reporting -- confirm).
# NOTE(review): train_and_evaluate is not defined in the visible cells; make sure it is
# defined or imported before this cell runs on a fresh kernel.
model_main = train_and_evaluate(
    X_train_main,
    X_test_main,
    y_train_main,
    y_test_main,
    main_genre_encoder,
    "Main Genre (XGBoost)",
)

model_sub = train_and_evaluate(
    X_train_sub,
    X_test_sub,
    y_train_sub,
    y_test_sub,
    sub_genre_encoder,
    "Main + Subgenre (XGBoost)",
)

