In [1]:
# basic
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random

# cross validation
from sklearn.model_selection import KFold

# models
from sklearn.tree import ExtraTreeClassifier  
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier    
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier as rf
from lightgbm import LGBMClassifier as lgbm
from catboost import CatBoostClassifier as catboost
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier as xgb


# evaluating
from sklearn.metrics import roc_auc_score

#optimization
import optuna

In [2]:
ss = pd.read_csv("/kaggle/input/song-popularity-prediction/sample_submission.csv")
train = pd.read_csv("/kaggle/input/song-popularity-prediction/train.csv")
test = pd.read_csv("/kaggle/input/song-popularity-prediction/test.csv")

TRAIN_SIZE = 0.5
VALID_SIZE = 0.5

TARGET="song_popularity"

In [3]:
def FE(df):
    
    def logme(ser: pd.Series):
        abs_ = ser.abs()
        abs_ = np.where(abs_ == 0, 0.001, abs_)
        return np.log(abs_)

    def update_trims(df):
            final_lower_trims = {'song_duration_ms': 1,
                                 'acousticness': 0,
                                 'danceability': 1,
                                 'energy': 0,
                                 'instrumentalness': 0,
                                 'liveness': 0,
                                 'loudness': 0,
                                 'speechiness': 1,
                                 'tempo': 0,
                                 'time_signature': 1,
                                 'audio_valence': 0}


            final_upper_trims = {'song_duration_ms': 1,
                                 'acousticness': 1,
                                 'danceability': 1,
                                 'energy': 1,
                                 'instrumentalness': 1,
                                 'liveness': 1,
                                 'loudness': 1,
                                 'speechiness': 1,
                                 'tempo': 1,
                                 'time_signature': 1,
                                 'audio_valence': 1}
            
            
            for col in df.columns:
                if col in final_upper_trims:
                    max_ = df[col].max()
                    min_ =  df[col].min()
                    p = (max_ - min_) / 100
                    new_max = max_ - (p * final_upper_trims[col])
                    new_min = min_ + (p * final_lower_trims[col])
                    df[col] = np.clip(df[col], new_min, new_max)
            return df

    
    df["acousticness"] =  logme(df["acousticness"])
    df["danceability"] =  logme(df["danceability"])
    df["instrumentalness"] =  logme(df["instrumentalness"])
    df["liveness"] =  logme(df["liveness"])
    df["speechiness"] =  logme(df["speechiness"])    
    
    df = update_trims(df)
    
    df = df.fillna(df.median())
    
    return df
    

In [4]:
def train_N_models(X,y):
    models = {} 
    
    lgbm_parms   = {'boosting_type': 'dart', 'num_leaves': 16, 'n_estimators': 438, 'max_depth': 33, 'min_samples_leaf': 6, 'learning_rate': 0.05666201057300166, 'subsample': 0.6846131162945679, 'reg_alpha': 0.8841562054396195, 'reg_lambda': 0.43480247488835144}
    lgbm_model = lgbm(**lgbm_parms)
    lgbm_model.fit(X,y) 
    models["lgbm"]= lgbm_model
    
    rf_parms = {'random_state': 2247,'n_estimators': 182, 'max_depth': 9, 'min_samples_split': 36, 'min_samples_leaf': 57, 'max_features': 13}
    rf_model = rf(**rf_parms) 
    rf_model.fit(X,y) 
    models["rf"]= rf_model
    
    GB_parms ={'n_estimators': 333, 'max_depth': 3, 'min_samples_split': 58, 'min_samples_leaf': 16, 'max_features': 9, 'learning_rate': 0.05690001817211911, 'subsample': 0.8865189659366153, 'criterion': 'mse'}
    GB_mosel = GradientBoostingClassifier(**GB_parms)
    GB_mosel.fit(X,y)
    models["GB"]= GB_mosel

    cat_parms = {'objective': 'Logloss', 'colsample_bylevel': 0.08751674271524071, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'iterations': 178, 'learning_rate': 0.15825880083719676, 'random_strength': 92, 'od_type': 'IncToDec', 'verbose': 0}
    cat_model = catboost(**cat_parms)
    cat_model.fit(X,y)
    models["cat"]= cat_model 
    
    xgb_parms = {'n_estimators': 84, 'booster': 'gbtree', 'max_depth': 3, 'min_samples_split': 68, 'min_samples_leaf': 32, 'max_features': 3, 'learning_rate': 0.1011207776914081, 'reg_alpha': 0.2769253824871142, 'reg_lambda': 0.38851574155058827, 'gamma': 0.914525582012808, 'subsample': 0.8187435357656347}
    xgb_model = xgb(**xgb_parms)
    xgb_model.fit(X,y)
    models["xgb"] = xgb_model
    

    ETC_parms = {'n_estimators': 472, 'max_depth': 12, 'min_samples_split': 107, 'min_samples_leaf': 9, 'max_features': 13}
    ETC_model = ExtraTreesClassifier(**ETC_parms)
    ETC_model.fit(X,y)
    models["ETC"]= ETC_model 
    
    
    return models

In [5]:
def predict_N_models(X,models, ens_method="avg"):
    preds_df = pd.DataFrame()
    for model in models:
        if model=="SG":
            preds_df[model] = models[model].predict(X)

        else:
            pos_ind = list(models[model].classes_).index(1)
            preds = models[model].predict_proba(X)
            preds_df[model] = [preds[row][pos_ind] for row in range(preds.shape[0])]
        
    if ens_method == "avg":
        return preds_df.mean(axis = 1)
    
    elif ens_method == "ens":
        return preds_df
        

In [6]:
def train_ens(models_preds, y):
    parms = {'verbose': 0, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.058798132946050255, 'depth': 4, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'iterations': 78, 'learning_rate': 0.01713074463652201, 'random_strength': 52, 'od_type': 'Iter', 'bagging_temperature': 4.812865115761063}
    catboost_model = catboost(**parms)
    catboost_model.fit(models_preds,y)

    return catboost_model

In [7]:
def predict_ens(preds, ens):
    pos_ind = list(ens.classes_).index(1)
    preds = ens.predict_proba(preds)
    return [preds[row][pos_ind] for row in range(preds.shape[0])]

In [8]:
FOLDS = 5
seeds = [947, 27, 1, 2022, 20]

test = FE(test)
folds_preds = []

for fold in range(FOLDS):
    random.seed(seeds[fold])
    train_index = random.sample(list(train.index), int(train.shape[0] * TRAIN_SIZE))
    left_index = [ind for ind in list(train.index) if ind not in train_index]
    valid_index = random.sample(left_index, int(train.shape[0] * VALID_SIZE))

    X_train, X_valid = train.iloc[train_index].reset_index().drop('index', axis=1), train.iloc[valid_index].reset_index().drop('index', axis=1)
    y_train, y_valid = train.iloc[train_index].reset_index()['song_popularity'], train.iloc[valid_index].reset_index()['song_popularity']

    X_train = X_train.drop(TARGET,axis=1)
    X_valid = X_valid.drop(TARGET,axis=1)

    X_train = FE(X_train)
    X_valid = FE(X_valid)
    
    models = train_N_models(X_train.drop("id", axis = 1), y_train)
    models_preds = predict_N_models(X_valid.drop("id", axis = 1), models, "ens")
    ensemble_model = train_ens(models_preds, y_valid)

    N_preds = predict_N_models(test.drop("id", axis = 1), models, "ens")
    folds_preds.append(predict_ens(N_preds, ensemble_model))





Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [9]:
final_preds = np.array(folds_preds).mean(axis =0)

In [10]:
ss["song_popularity"] = final_preds
ss.to_csv("submission.csv", index=False)

In [11]:
ss

Unnamed: 0,id,song_popularity
0,0,0.404362
1,1,0.417344
2,2,0.385500
3,3,0.393675
4,4,0.400625
...,...,...
9995,9995,0.396492
9996,9996,0.386148
9997,9997,0.401665
9998,9998,0.414575
