In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier

In [11]:
# Load data
df = pd.read_csv("./data/train.csv")

# Features and target
X = df.drop(columns=["song_popularity", "id"])
y = df["song_popularity"]

# Identify types
num_cols = ["song_duration_ms","acousticness","danceability","energy",
            "instrumentalness","liveness","loudness","speechiness",
            "tempo","audio_valence"]
cat_cols = ["key","audio_mode","time_signature"]

# Preprocessing
# --- Feature Engineering ---
def feature_engineering(df):
    df = df.copy()
    df["energy_dance"] = df["energy"] * df["danceability"]
    df["speech_instr"] = df["speechiness"] * (1 - df["instrumentalness"])
    df["log_duration"] = np.log1p(df["song_duration_ms"].fillna(df["song_duration_ms"].median()))
    df["log_tempo"] = np.log1p(df["tempo"].fillna(df["tempo"].median()))
    return df

feat_engineer = FunctionTransformer(feature_engineering)

# Update feature lists with engineered features
extra_num_cols = ["energy_dance","speech_instr","log_duration","log_tempo"]

num_transformer = Pipeline([
    ("imputer", KNNImputer(n_neighbors=5)),
    ("scaler", StandardScaler())
])
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_transformer, num_cols + extra_num_cols),
    ("cat", cat_transformer, cat_cols)
])




In [12]:
xgb_model = xgb.XGBClassifier(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="gpu_hist",      # GPU training
    predictor="gpu_predictor",   # GPU inference
    eval_metric="logloss"
)

lgb_model = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    device="gpu"                 # GPU mode
)

cat_model = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=10,
    task_type="GPU",             # GPU mode
    verbose=0
)

# --- Ensemble ---
ensemble = VotingClassifier(
    estimators=[("xgb", xgb_model), ("lgb", lgb_model), ("cat", cat_model)],
    voting="soft"
)

# --- Final Pipeline ---
clf = Pipeline([
    ("features", feat_engineer),
    ("preprocessor", preprocessor),
    ("model", ensemble)
])

# Cross-validation
scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
print("CV Accuracy:", scores.mean(), "±", scores.std())

# Train on full dataset
clf.fit(X, y)


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 8746, number of negative: 15254
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3606
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 32
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A4500, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 17 dense feature groups (0.46 MB) transferred to GPU in 0.001164 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364417 -> initscore=-0.556245
[LightGBM] [Info] Start training from score -0.556245



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 8746, number of negative: 15254
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3606
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 32
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A4500, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 17 dense feature groups (0.46 MB) transferred to GPU in 0.001056 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364417 -> initscore=-0.556245
[LightGBM] [Info] Start training from score -0.556245



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 8746, number of negative: 15254
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3606
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 32
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A4500, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 17 dense feature groups (0.46 MB) transferred to GPU in 0.001103 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364417 -> initscore=-0.556245
[LightGBM] [Info] Start training from score -0.556245



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 8745, number of negative: 15255
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3606
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 32
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A4500, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 17 dense feature groups (0.46 MB) transferred to GPU in 0.001088 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364375 -> initscore=-0.556425
[LightGBM] [Info] Start training from score -0.556425



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 8745, number of negative: 15255
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3606
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 32
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A4500, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 17 dense feature groups (0.46 MB) transferred to GPU in 0.001065 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364375 -> initscore=-0.556425
[LightGBM] [Info] Start training from score -0.556425



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


CV Accuracy: 0.6223666666666666 ± 0.003668787265568799



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 10932, number of negative: 19068
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3606
[LightGBM] [Info] Number of data points in the train set: 30000, number of used features: 32
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A4500, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 17 dense feature groups (0.57 MB) transferred to GPU in 0.001084 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364400 -> initscore=-0.556317
[LightGBM] [Info] Start training from score -0.556317


0,1,2
,steps,"[('features', ...), ('preprocessor', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function fea...x73587d931f80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimators,"[('xgb', ...), ('lgb', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.05
,n_estimators,800
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [13]:
# Predict on test
test = pd.read_csv("./data/test.csv")
test_ids = test["id"]
X_test = test.drop(columns=["id"])
test_preds = clf.predict(X_test)

# Save submission
submission = pd.DataFrame({
    "id": test_ids,
    "song_popularity": test_preds
})
submission.to_csv("submission_2.csv", index=False)

print("✅ submission.csv file created!")

✅ submission.csv file created!



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
