In [6]:
# polynomial_pipeline_cpu.py
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score
import joblib

# --- Tunable settings ---
# Degree of the polynomial expansion; the feature count grows quickly with it.
POLY_DEGREE = 2
# Candidate regularisation strengths handed to RidgeCV.
ALPHA_CV = [0.01, 0.1, 1.0, 10.0]
# Seed constant; not referenced anywhere in the visible cells.
RANDOM_STATE = 42

# --- Read the raw CSVs ---
# train.csv must carry the 'song_popularity' target; test.csv must carry 'id'.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

# --- Split target from features; keep test ids for the submission file ---
X = train.drop(columns=["song_popularity"])
y = train["song_popularity"].values
X_test = test.copy()
test_ids = test["id"].values

# --- Feature groups (edit to match the dataset's column names) ---
num_cols = [
    "song_duration_ms", "acousticness", "danceability", "energy",
    "instrumentalness", "liveness", "loudness", "speechiness",
    "tempo", "audio_valence",
]
# Integer-coded columns that are treated as categories.
cat_cols = ["key", "audio_mode", "time_signature"]

# Keep only the names that actually exist in the training frame, so a missing
# column shrinks the list instead of raising later inside the ColumnTransformer.
num_cols = list(filter(lambda col: col in X.columns, num_cols))
cat_cols = list(filter(lambda col: col in X.columns, cat_cols))

# --- Per-column-group preprocessing ---
# Numeric branch: fill gaps from the 5 nearest neighbours, then standardise.
num_transformer = Pipeline(steps=[
    ("imputer", KNNImputer(n_neighbors=5)),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode to a dense
# array; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column group to its branch; every column not listed in
# num_cols / cat_cols is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols),
    ],
    remainder="drop",
)

# --- End-to-end model: preprocess -> polynomial expansion -> ridge ---
# RidgeCV chooses alpha from ALPHA_CV using 5-fold CV scored by negative MSE.
full_pipeline = Pipeline(steps=[
    ("preproc", preprocessor),
    ("poly", PolynomialFeatures(degree=POLY_DEGREE, include_bias=False)),
    ("reg", RidgeCV(alphas=ALPHA_CV, cv=5, scoring="neg_mean_squared_error")),
])

# Train the whole pipeline, then score the same rows it was fitted on.
print("Fitting pipeline on training data...")
# fit() returns the fitted pipeline, so the in-sample continuous predictions
# can be taken directly from the chained call.
y_pred_cont = full_pipeline.fit(X, y).predict(X)

# y_pred_cont holds continuous ridge outputs, not class labels.

Fitting pipeline on training data...


In [9]:
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score

def find_best_threshold(y_true, y_scores, start=0.30, end=0.45, step=1e-6):
    """
    Find the accuracy-maximising decision threshold on a fixed grid.

    Candidate thresholds are ``np.arange(start, end, step)``. For a candidate
    ``t`` a sample is predicted 1 when ``score > t`` and 0 otherwise; accuracy
    is the fraction of predictions equal to ``y_true``.

    All candidates are evaluated in one vectorised pass (sort the scores once,
    then look up cumulative label counts with ``np.searchsorted``) instead of
    recomputing accuracy over every sample for every grid point.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; only values equal to 0 or 1 can be counted as
        correct, since predictions are always 0 or 1.
    y_scores : array-like
        Continuous scores, same length as ``y_true``. NaN scores are always
        predicted 0 because ``NaN > t`` is False.
    start, end, step : float
        Grid definition passed to ``np.arange``.

    Returns
    -------
    (best_threshold, best_accuracy)
        The first (smallest) threshold reaching the highest accuracy. If the
        grid is empty, the inputs are empty, or no candidate gets a single
        sample right, the fallback ``(0.5, 0)`` is returned.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_scores`` differ in length.
    """
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_scores, dtype=float).ravel()
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            f"y_true and y_scores must have the same length, "
            f"got {labels.shape[0]} and {scores.shape[0]}"
        )

    thresholds = np.arange(start, end, step)
    n_samples = scores.shape[0]
    if thresholds.size == 0 or n_samples == 0:
        return 0.5, 0

    # NaN never compares greater than a threshold, so such samples are always
    # predicted 0. Mapping NaN to -inf keeps that behaviour after sorting.
    scores = np.where(np.isnan(scores), -np.inf, scores)

    order = np.argsort(scores, kind="stable")
    sorted_scores = scores[order]
    sorted_labels = labels[order]

    # zeros_upto[k] / ones_upto[k]: number of 0- / 1-labels among the k lowest scores.
    zeros_upto = np.concatenate(([0], np.cumsum(sorted_labels == 0)))
    ones_upto = np.concatenate(([0], np.cumsum(sorted_labels == 1)))

    # For each threshold, how many scores are <= t (those are predicted 0).
    n_pred_zero = np.searchsorted(sorted_scores, thresholds, side="right")

    # Correct = true zeros predicted 0 + true ones predicted 1.
    correct = zeros_upto[n_pred_zero] + (ones_upto[-1] - ones_upto[n_pred_zero])

    # argmax returns the first maximum, i.e. the smallest best threshold.
    best_idx = int(np.argmax(correct))
    if correct[best_idx] == 0:
        return 0.5, 0

    return thresholds[best_idx], float(correct[best_idx] / n_samples)

# --- Usage ---
# Evaluate the in-sample predictions and choose the cut-off used for the test set.
#
# Fallback values are assigned BEFORE the try block so that THRESHOLD below is
# always defined by this run of the cell. Otherwise a failed evaluation would
# raise NameError on a fresh kernel, or silently reuse a stale best_thresh left
# over from an earlier run.
best_thresh, best_acc = 0.5, 0.0
try:
    auc = roc_auc_score(y, y_pred_cont)
    # NOTE(review): the search window is hard-coded to [0.35, 0.40). The recorded
    # run picked 0.399859, i.e. the upper edge of the window, so the true optimum
    # may lie above 0.40; consider widening the range.
    # NOTE(review): the threshold is tuned on the same rows the model was fitted
    # on, so the reported accuracy is an in-sample figure.
    best_thresh, best_acc = find_best_threshold(y, y_pred_cont, start=0.35, end=0.40, step=1e-6)
    thresh_preds = (y_pred_cont > best_thresh).astype(int)
    print(f"Train ROC-AUC: {auc:.6f}, Best Threshold: {best_thresh:.6f}, Accuracy: {best_acc:.6f}")
except Exception as e:
    # Best-effort evaluation: report the problem and continue with whatever
    # threshold is currently set (the 0.5 fallback unless the search finished).
    print("Evaluation skipped:", e)
    print(f"Using threshold {best_thresh} for test predictions")

THRESHOLD = best_thresh  # Set global threshold for test predictions


Train ROC-AUC: 0.586232, Best Threshold: 0.399859, Accuracy: 0.604067


In [10]:
# --- Test-set inference ---
print("Transforming & predicting test set...")
test_pred_cont = full_pipeline.predict(X_test)

# Binarise: strictly above THRESHOLD -> 1, otherwise 0.
test_pred_bin = np.greater(test_pred_cont, THRESHOLD).astype(int)

# --- Write the submission file (id + predicted label, no index column) ---
submission = pd.DataFrame({"id": test_ids, "song_popularity": test_pred_bin})
submission.to_csv("./data/submission.csv", index=False)
print("✅ Saved submission.csv")

Transforming & predicting test set...
✅ Saved submission.csv
