In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.18.4-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.46-cp312-cp312-win_amd64.whl.metadata (9.8 kB)
Collecting tqdm (from optuna)
  Downloading tqdm-4.67.3-py3-none-any.whl.metadata (57 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.3.1-cp312-cp312-win_amd64.whl.metadata (3.8 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
Downloading alembic-1.18.4-py3-none-any.whl (263 kB)
Downloading sqlalchemy-2.0.46-cp312-cp312-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------

In [6]:
import pandas as pd
import numpy as np
import optuna
import gc
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('../../data/processed/ost_list_endgame.csv')

float_cols = df.select_dtypes(include=['float64']).columns
df[float_cols] = df[float_cols].astype('float32')


target_genres = [
    'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign',
    'History', 'Horror', 'Music', 'Mystery', 'Romance',
    'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western'
]

numeric_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'duration_ms'
]

X = df[numeric_features]
y = df[target_genres]

del df
gc.collect()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


def top3_accuracy(y_true, y_pred_proba):
    """
    y_true: DataFrame (multi-label)
    y_pred_proba: numpy array (n_samples, n_classes)
    """
    top3 = np.argsort(y_pred_proba, axis=1)[:, -3:]
    correct = 0
    valid = 0

    for i in range(len(y_true)):
        true_idx = np.where(y_true.iloc[i].values == 1)[0]
        if len(true_idx) == 0:
            continue
        valid += 1
        if np.any(np.isin(true_idx, top3[i])):
            correct += 1

    return correct / valid if valid > 0 else 0


X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)


def rf_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 400, step=50),
        'max_depth': trial.suggest_int('max_depth', 8, 16),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 8),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 3),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'class_weight': 'balanced',
        'n_jobs': 1,
        'random_state': 42
    }

    model = RandomForestClassifier(**params)
    model.fit(X_tr, y_tr)

    proba_list = model.predict_proba(X_val)
    y_pred_proba = np.stack([p[:, 1] for p in proba_list], axis=1)

    score = top3_accuracy(y_val, y_pred_proba)

    del model, proba_list, y_pred_proba
    gc.collect()

    return score


study = optuna.create_study(direction='maximize')
study.optimize(rf_objective, n_trials=15)

print("Best Validation Top-3 Accuracy:", round(study.best_value, 4))
print("Best Params:", study.best_params)

final_model = RandomForestClassifier(
    **study.best_params,
    class_weight='balanced',
    n_jobs=1,
    random_state=42
)

final_model.fit(X_train, y_train)

test_proba_list = final_model.predict_proba(X_test)
y_test_proba = np.stack([p[:, 1] for p in test_proba_list], axis=1)

final_top3 = top3_accuracy(y_test, y_test_proba)
print(f"Final Test Top-3 Accuracy: {final_top3:.4f}")


[32m[I 2026-02-11 05:59:34,736][0m A new study created in memory with name: no-name-6d277af1-7a0d-48c4-bc6e-7ba4c6365a0b[0m
[32m[I 2026-02-11 05:59:49,125][0m Trial 0 finished with value: 0.5307406809746652 and parameters: {'n_estimators': 350, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.5307406809746652.[0m
[32m[I 2026-02-11 06:00:07,779][0m Trial 1 finished with value: 0.6816201387768275 and parameters: {'n_estimators': 400, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.6816201387768275.[0m
[32m[I 2026-02-11 06:00:15,880][0m Trial 2 finished with value: 0.28320154913667905 and parameters: {'n_estimators': 250, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.6816201387768275.[0m
[32m[I 2026-02-11 06:00:27,416][0m Trial 3 finished with value: 0.4253671131192

Best Validation Top-3 Accuracy: 0.747
Best Params: {'n_estimators': 400, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Final Test Top-3 Accuracy: 0.7378
