In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import lightgbm as lgb

In [None]:
# Read the CSV file
df = pd.read_csv('../tracks.csv')

# Every genre indicator column shares this prefix; collect the columns once and reuse the list.
LABEL_PREFIX = 'label_'
label_cols = [col for col in df.columns if col.startswith(LABEL_PREFIX)]
if not label_cols:
    raise ValueError(f"No '{LABEL_PREFIX}*' columns found in ../tracks.csv; cannot derive genre labels.")

# Feature matrix: everything except the label columns and the track identifier columns
features = df.drop(columns=label_cols + ['track_id', 'track_name', 'track_artist'])

# For each row, the name of the label column holding the largest value
# (presumably the label columns are one-hot, making this the active genre -- TODO confirm)
dominant_label = df[label_cols].idxmax(axis=1)

# Extract labels (main + subgenre): strip only the LEADING prefix.
# A blanket str.replace would also delete any later 'label_' inside the genre name.
labels_main_and_sub = dominant_label.str[len(LABEL_PREFIX):]

# Extract labels (main genre only): the text before the first underscore
labels_main_only = labels_main_and_sub.str.split('_').str[0]

In [None]:
X = features.copy()

# Both targets are split with the same test fraction and the same seed.
_split_kwargs = {'test_size': 0.2, 'random_state': 42}

# Main-genre target: integer-encode the labels, then hold out a test set.
main_genre_encoder = LabelEncoder()
main_encoded = main_genre_encoder.fit_transform(labels_main_only)
(X_train_main, X_test_main,
 y_train_main, y_test_main) = train_test_split(X, main_encoded, **_split_kwargs)

# Main + subgenre target: separate encoder, same split settings.
sub_genre_encoder = LabelEncoder()
sub_encoded = sub_genre_encoder.fit_transform(labels_main_and_sub)
(X_train_sub, X_test_sub,
 y_train_sub, y_test_sub) = train_test_split(X, sub_encoded, **_split_kwargs)

In [None]:
# Hyperparameter search space; RandomizedSearchCV samples combinations from these lists.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200],
    'min_child_samples': [5, 10],
    'subsample': [0.6, 0.8, 1.0],
    # LightGBM only applies `subsample` (bagging_fraction) when bagging is enabled, i.e. when
    # subsample_freq (bagging_freq) is non-zero. Its default is 0, which would make the
    # `subsample` values above a no-op, so bagging is switched on at every iteration here.
    'subsample_freq': [1],
    'colsample_bytree': [0.6, 0.8],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [0.5, 1.0]
}

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

def train_and_evaluate(X_train, X_test, y_train, y_test, label_encoder, model_name,
                       device='gpu', n_iter=20, cv=3):
    """Tune a LightGBM multiclass classifier, report test-set metrics, and return the best model.

    Runs a randomized hyperparameter search over the module-level ``param_grid``,
    predicts on the held-out set with the best estimator, prints a classification
    report and the overall accuracy, and shows a confusion-matrix heatmap.

    Parameters
    ----------
    X_train, X_test : feature matrices for fitting and for evaluation.
    y_train, y_test : integer class ids. They must have been produced by
        ``label_encoder`` so that id ``i`` corresponds to ``label_encoder.classes_[i]``.
    label_encoder : fitted encoder whose ``classes_`` supply the class names
        used in the report and on the heatmap axes.
    model_name : str used in the printed header and the plot title.
    device : value forwarded to ``LGBMClassifier(device=...)``. Defaults to
        ``'gpu'`` (the original behavior); pass ``'cpu'`` when no GPU-enabled
        LightGBM build is available.
    n_iter : number of parameter settings sampled by the search (default 20).
    cv : number of cross-validation folds used by the search (default 3).

    Returns
    -------
    The ``best_estimator_`` found by the search.
    """
    # Train with hyperparameter search
    # NOTE(review): n_jobs=-1 runs several fits in parallel; with device='gpu' those fits
    # presumably compete for the same GPU -- confirm this is intended.
    model = lgb.LGBMClassifier(objective='multiclass', random_state=42, device=device)
    search = RandomizedSearchCV(model, param_grid, n_iter=n_iter, cv=cv, scoring='accuracy', n_jobs=-1, random_state=42)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    # Predict
    y_pred = best_model.predict(X_test)

    # Pin the full set of class ids explicitly. Without `labels=`, sklearn infers the classes
    # from y_test/y_pred only, so a class missing from the test fold would make the report
    # reject `target_names` (length mismatch) and would shift the heatmap tick labels.
    class_names = label_encoder.classes_
    class_ids = np.arange(len(class_names))

    # Print header
    print(f"\n===== {model_name} Tuned Classification Report =====")

    # Print scikit-learn classic report
    print(classification_report(
        y_test, y_pred, labels=class_ids, target_names=class_names, digits=2
    ))

    # Accuracy separately
    acc = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {acc:.4f}")

    # Confusion matrix (rows = true class, columns = predicted class, in class_ids order)
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f"{model_name} - Confusion Matrix")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    fig.tight_layout()
    plt.show()

    return best_model


In [None]:
# Tune, evaluate and keep one model per target: main genre only, then main + subgenre.
model_main = train_and_evaluate(
    X_train=X_train_main,
    X_test=X_test_main,
    y_train=y_train_main,
    y_test=y_test_main,
    label_encoder=main_genre_encoder,
    model_name="Main Genre (LightGBM)",
)

model_sub = train_and_evaluate(
    X_train=X_train_sub,
    X_test=X_test_sub,
    y_train=y_train_sub,
    y_test=y_test_sub,
    label_encoder=sub_genre_encoder,
    model_name="Main + Subgenre (LightGBM)",
)


In [None]:
# The models were fit on LabelEncoder integer ids, so their predictions are ids as well.
# Persist the fitted encoders next to the models so that whoever reloads a model can call
# inverse_transform to turn predicted ids back into genre names.
joblib.dump(main_genre_encoder, 'lightgbm_main_genre_encoder.pkl')
joblib.dump(sub_genre_encoder, 'lightgbm_main_and_sub_encoder.pkl')

# Persist the tuned models
joblib.dump(model_main, 'lightgbm_main_genre.pkl')
joblib.dump(model_sub, 'lightgbm_main_and_sub.pkl')