In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
import optuna
import xgboost as xgb

# ------------------ Load Data ------------------
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

X = train.drop(columns=["song_popularity"])
y = train["song_popularity"]
test_ids = test["id"]
X_test = test.drop(columns=["id"])

# ------------------ Feature Engineering ------------------
def feature_engineering(df):
    df = df.copy()
    df["energy_dance"] = df["energy"] * df["danceability"]
    df["speech_instr"] = df["speechiness"] * (1 - df["instrumentalness"])
    df["log_duration"] = np.log1p(df["song_duration_ms"].fillna(df["song_duration_ms"].median()))
    df["log_tempo"] = np.log1p(df["tempo"].fillna(df["tempo"].median()))
    return df

feat_engineer = FunctionTransformer(feature_engineering)

num_cols = ["song_duration_ms","acousticness","danceability","energy",
            "instrumentalness","liveness","loudness","speechiness",
            "tempo","audio_valence","energy_dance","speech_instr","log_duration","log_tempo"]
cat_cols = ["key","audio_mode","time_signature"]

num_transformer = Pipeline([
    ("imputer", KNNImputer(n_neighbors=5)),
    ("scaler", StandardScaler())
])
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols)
])

# ------------------ AutoXGBoost with Optuna ------------------
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "tree_method": "gpu_hist",    # GPU training
        "predictor": "gpu_predictor", # GPU inference
        "eval_metric": "logloss",
        "use_label_encoder": False
    }
    
    # Build pipeline
    model = Pipeline([
        ("features", feat_engineer),
        ("preprocessor", preprocessor),
        ("clf", xgb.XGBClassifier(**params))
    ])
    
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []
    for train_idx, valid_idx in cv.split(X, y):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        scores.append(model.score(X.iloc[valid_idx], y.iloc[valid_idx]))
    
    return np.mean(scores)

# Run optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)  # you can increase to 100+ if time permits
print("Best params:", study.best_params)

# ------------------ Train Final Model ------------------
best_params = study.best_params
best_params.update({
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "eval_metric": "logloss",
    "use_label_encoder": False
})

final_model = Pipeline([
    ("features", feat_engineer),
    ("preprocessor", preprocessor),
    ("clf", xgb.XGBClassifier(**best_params))
])



[I 2025-09-15 23:50:12,473] A new study created in memory with name: no-name-5c35d7e4-9c24-464c-8c80-0bd43586a79d

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, i

Best params: {'n_estimators': 259, 'max_depth': 3, 'learning_rate': 0.010183083254484622, 'subsample': 0.6064891012539207, 'colsample_bytree': 0.9089055214863802, 'gamma': 1.8277033804498453, 'reg_lambda': 1.93012522932704, 'reg_alpha': 4.967521448186203}


In [4]:

final_model.fit(X, y)



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,steps,"[('features', ...), ('preprocessor', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function fea...x7ea24d780ea0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9089055214863802
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
# ------------------ Predict Test ------------------
test_preds = final_model.predict(X_test)

submission = pd.DataFrame({
    "id": test_ids,
    "song_popularity": test_preds
})
submission.to_csv("submission_3.csv", index=False)
print("✅ submission.csv created with AutoXGBoost + GPU")


✅ submission.csv created with AutoXGBoost + GPU



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
