In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, KBinsDiscretizer, RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet, HuberRegressor, BayesianRidge, ARDRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin, clone
import os
import joblib

from src.feature_engineering import *
from src.modeling import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
#config
# Relative locations used by the rest of the notebook.
# DATA_PATH      : directory the train/test CSV files are read from.
# OUTPUT_PATH    : directory under which prediction CSV files are written.
# BLUEPRINT_PATH : directory holding "<model name>.joblib" blueprint estimators
#                  (presumably unfitted templates — they are the ones passed to
#                  cross-validation and refitted below; TODO confirm).
# MODEL_PATH     : directory holding "<model name>_trained.joblib" estimators.
DATA_PATH = "data/"
OUTPUT_PATH = "output/"
BLUEPRINT_PATH = "output/blueprint/"
MODEL_PATH = "models/"

# Names of the models that take part in the ensembles. Each active entry is used
# as the file stem when its blueprint / trained artifacts are loaded; entries
# that are commented out are simply not loaded.
BEST_MODELS_LIST = [
    "robust_lgbm_adil_params",
    "outlier_bayesian_alpha_1_1e_06_alpha_2_0_01_lambda_1_0_01_lambda_2_0_01",
    # "bayesian_alpha_1_1e_-06_alpha_2_0_01_lambda_1_0_01_lambda_2_0_01",
    "outlier_lasso_alpha_0_1_max_iter_1000",
    "outlier_elastic_net_alpha_1_0_l1_ratio_0_1",
    # "binning_pca_rf_n_bins_5_max_depth_10_min_samples_leaf_2_min_samples_split_2_n_estimator_200_n_components_0_99",
    # "outlier_pca_ridge_alpha_0_001_solver_auto_n_components_0_85",
    # "outlier_pca_ard_alpha_1e_-06_alpha_2_1e_-06_lambda_1_0_01_lambda_2_0_01_threshold_lambda_1000_n_components_0_9",
    "outlier_pca_xgb_lr_0_01_max_depth_3_n_estimators_100_n_components_0_85",
    # "robust_catboost_adil_params",
]

In [3]:
# Read the training split, report its dimensions and preview the first three rows.
train_csv = os.path.join(DATA_PATH, "train.csv")
df_train = pd.read_csv(train_csv)
print(df_train.shape)
df_train.head(3)

(524164, 11)


Unnamed: 0,id,RhythmScore,AudioLoudness,VocalContent,AcousticQuality,InstrumentalScore,LivePerformanceLikelihood,MoodScore,TrackDurationMs,Energy,BeatsPerMinute
0,0,0.60361,-7.636942,0.0235,5e-06,1e-06,0.051385,0.409866,290715.645,0.826267,147.5302
1,1,0.639451,-16.267598,0.07152,0.444929,0.349414,0.170522,0.65101,164519.5174,0.1454,136.15963
2,2,0.514538,-15.953575,0.110715,0.173699,0.453814,0.029576,0.423865,174495.5667,0.624667,55.31989


In [4]:
# Read the test split, report its dimensions and preview the first three rows.
test_csv = os.path.join(DATA_PATH, "test.csv")
df_test = pd.read_csv(test_csv)
print(df_test.shape)
df_test.head(3)

(174722, 10)


Unnamed: 0,id,RhythmScore,AudioLoudness,VocalContent,AcousticQuality,InstrumentalScore,LivePerformanceLikelihood,MoodScore,TrackDurationMs,Energy
0,524164,0.410013,-16.794967,0.0235,0.23291,0.012689,0.271585,0.664321,302901.5498,0.424867
1,524165,0.463071,-1.357,0.141818,0.057725,0.257942,0.097624,0.829552,221995.6643,0.846
2,524166,0.686569,-3.368928,0.167851,0.287823,0.210915,0.325909,0.304978,357724.0127,0.134067


In [5]:
# The target to predict; every other column except the row id is a feature.
target_col = "BeatsPerMinute"
excluded_cols = {'id', target_col}
feature_cols = [col for col in df_train.columns if col not in excluded_cols]

# Independent copies with a fresh 0..n-1 index so later edits never touch the raw frames.
X_train = df_train.loc[:, feature_cols].copy()
X_train = X_train.reset_index(drop=True)
y_train = df_train[target_col].copy()
y_train = y_train.reset_index(drop=True)
X_test = df_test.loc[:, feature_cols].copy()
X_test = X_test.reset_index(drop=True)

print(f"X_train shape : {X_train.shape}")
print(f"y_train shape : {y_train.shape}")
print(f"X_test shape : {X_test.shape}")

X_train shape : (524164, 9)
y_train shape : (524164,)
X_test shape : (174722, 9)


### Extracting current best models

In [6]:
# For every selected model keep both persisted artifacts, keyed by model name:
#   "blueprint" -> estimator loaded from BLUEPRINT_PATH/<name>.joblib
#   "trained"   -> estimator loaded from MODEL_PATH/<name>_trained.joblib
models_dict = {}

for model_name in BEST_MODELS_LIST:
    blueprint_file = os.path.join(BLUEPRINT_PATH, f"{model_name}.joblib")
    trained_file = os.path.join(MODEL_PATH, f"{model_name}_trained.joblib")
    # NOTE(review): joblib.load unpickles arbitrary objects — only load artifacts from a trusted source.
    blueprint = joblib.load(blueprint_file)
    trained = joblib.load(trained_file)
    models_dict[model_name] = {"blueprint": blueprint, "trained": trained}

### Checking original individual model performances

In [7]:
# Cross-validate each blueprint on its own (5 folds) and store its RMSE,
# computed as the square root of the mean per-fold MSE.
for model_name in BEST_MODELS_LIST:
    neg_mse_per_fold = cross_val_score(
        models_dict[model_name]["blueprint"],
        X_train,
        y_train,
        cv=5,
        scoring="neg_mean_squared_error",
    )
    # The scorer returns negated MSE, so flip the sign before taking the root.
    cv_rmse = (-neg_mse_per_fold.mean())**0.5
    models_dict[model_name]["cv_rmse"] = cv_rmse

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 119.063098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 118.990123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

In [8]:
# One row per model, in models_dict insertion order, with its cross-validated RMSE.
df_cv_rmse = pd.DataFrame(
    [(name, entry['cv_rmse']) for name, entry in models_dict.items()],
    columns=['model_name', 'cv_rmse'],
)
# Displayed best-first; df_cv_rmse itself is left in its original row order.
df_cv_rmse.sort_values('cv_rmse')

Unnamed: 0,model_name,cv_rmse
0,robust_lgbm_adil_params,26.461618
1,outlier_bayesian_alpha_1_1e_06_alpha_2_0_01_la...,26.466404
4,outlier_pca_xgb_lr_0_01_max_depth_3_n_estimato...,26.467218
2,outlier_lasso_alpha_0_1_max_iter_1000,26.467444
3,outlier_elastic_net_alpha_1_0_l1_ratio_0_1,26.467444


### Blending

In [9]:
class AveragingRegressor(BaseEstimator, RegressorMixin):
    """Blend several regressors by averaging their predictions.

    Parameters
    ----------
    models : sequence of estimators
        Template estimators. ``fit`` clones each one and fits the clone, so the
        objects passed in are never fitted or modified by this class.
    weights : array-like of shape (n_models,), default=None
        Per-model weights, in the same order as ``models``. When ``None`` the
        plain mean is used. ``np.average`` divides by the sum of the weights,
        so they do not have to sum to one.

    Attributes
    ----------
    models_ : list of estimators
        The fitted clones, in the same order as ``models``.
    """

    def __init__(self, models, weights=None):
        # Parameters are stored untouched so that sklearn's clone() / get_params() work.
        self.models = models
        self.weights = weights

    def fit(self, X, y):
        """Fit a fresh clone of every model on ``(X, y)`` and return ``self``.

        Raises
        ------
        ValueError
            If ``models`` is empty, or if one-dimensional ``weights`` do not
            contain exactly one entry per model. Both conditions would
            otherwise only fail inside ``predict``, after all models had
            already been fitted.
        """
        if len(self.models) == 0:
            raise ValueError("AveragingRegressor needs at least one model.")
        if self.weights is not None:
            weights_shape = np.shape(self.weights)
            if len(weights_shape) == 1 and weights_shape[0] != len(self.models):
                raise ValueError(
                    f"Got {weights_shape[0]} weights for {len(self.models)} models; "
                    "expected one weight per model."
                )

        # Each clone gets its own copy of the data — presumably to shield X / y
        # from estimators that modify their input in place.
        self.models_ = [clone(m).fit(X.copy(), y.copy()) for m in self.models]
        return self

    def predict(self, X):
        """Return the (weighted) row-wise average of the fitted models' predictions."""
        # One column of predictions per fitted model.
        preds = np.column_stack([m.predict(X.copy()) for m in self.models_])
        if self.weights is not None:
            return np.average(preds, axis=1, weights=self.weights)
        return np.mean(preds, axis=1)

In [10]:
#using simple averaging
# Every blueprint receives the same explicit weight 1/n.
n_models = len(models_dict.keys())
model = AveragingRegressor(
    models=[entry["blueprint"] for entry in models_dict.values()],
    weights=[1/n_models] * n_models,
)

# 5-fold CV of the blend; RMSE is the square root of the mean per-fold MSE.
neg_mse_per_fold = cross_val_score(
    model,
    X_train,
    y_train,
    cv=5,
    scoring="neg_mean_squared_error",
)
rmse = (-neg_mse_per_fold.mean())**0.5

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001091 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 119.063098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 118.990123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001528 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

In [11]:
# Report the current value of `rmse` (set by the equal-weight blending cell when run in order).
print(f"Blending - Simple Averaging - CV RMSE : {rmse}")

Blending - Simple Averaging - CV RMSE : 26.464154367409552


In [12]:
#weight based on individual cv rmse performance
def softmax(x):
    """Exponentiate ``x`` element-wise and normalise so the result sums to one."""
    # Shifting by the maximum leaves the ratios unchanged but keeps exp() from overflowing.
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    total = exps.sum()
    return exps / total

# Models and weights are both taken from df_cv_rmse rows, so weight i belongs to model i.
temp_model_list = [models_dict[key]["blueprint"] for key in df_cv_rmse['model_name']]
# Negate the RMSE so that a lower error yields a larger softmax weight.
temp_weights = softmax(df_cv_rmse['cv_rmse'] * -1)
# Floating-point sums are not guaranteed to hit 1 exactly, so compare with a
# tolerance instead of `==` to avoid a spurious AssertionError.
assert len(temp_weights) == len(temp_model_list), "need exactly one weight per model"
assert np.isclose(np.sum(temp_weights), 1.0), "softmax weights should sum to 1"

model = AveragingRegressor(
    models = temp_model_list,
    weights = temp_weights
)

# 5-fold CV of the blend; RMSE is the square root of the mean per-fold MSE.
rmse = cross_val_score(
    model,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=5,
).mean()
rmse = (-rmse)**0.5

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002248 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 119.063098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001243 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 118.990123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001937 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

In [13]:
# Report the current value of `rmse` (set by the softmax-weighted blending cell when run in order).
print(f"Blending - RMSE Softmax Weights - CV RMSE : {rmse}")

Blending - RMSE Softmax Weights - CV RMSE : 26.464144106038066


In [14]:
# Show the table in its stored (unsorted) row order — the order in which the
# blending cells read the model names from it.
df_cv_rmse

Unnamed: 0,model_name,cv_rmse
0,robust_lgbm_adil_params,26.461618
1,outlier_bayesian_alpha_1_1e_06_alpha_2_0_01_la...,26.466404
2,outlier_lasso_alpha_0_1_max_iter_1000,26.467444
3,outlier_elastic_net_alpha_1_0_l1_ratio_0_1,26.467444
4,outlier_pca_xgb_lr_0_01_max_depth_3_n_estimato...,26.467218


In [15]:
#custom weights based on observation
temp_model_list = [models_dict[key]["blueprint"] for key in df_cv_rmse['model_name']]
# Positional weights: entry i applies to the model in row i of df_cv_rmse,
# so the first model listed there receives 0.80 and the rest 0.05 each.
temp_weights = [0.80, 0.05, 0.05, 0.05, 0.05]
# Sanity checks. The sum is compared with a tolerance because these floats do
# not add up to exactly 1.0 (presumably why the exact `== 1` check was disabled).
assert len(temp_weights) == len(temp_model_list), "need exactly one weight per model"
assert np.isclose(sum(temp_weights), 1.0), "custom weights should sum to 1"

model = AveragingRegressor(
    models = temp_model_list,
    weights = temp_weights
)

# 5-fold CV of the blend; RMSE is the square root of the mean per-fold MSE.
rmse = cross_val_score(
    model,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=5,
).mean()
rmse = (-rmse)**0.5

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001418 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 119.063098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 118.990123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

In [16]:
# Report the current value of `rmse` (set by the custom-weight blending cell when run in order).
print(f"Blending - Custom Weights - CV RMSE : {rmse}")

Blending - Custom Weights - CV RMSE : 26.461011386413695


In [17]:
#generate predictions for test set
temp_model_list = [models_dict[key]["blueprint"] for key in df_cv_rmse['model_name']]
# Positional weights: entry i applies to the model in row i of df_cv_rmse.
temp_weights = [0.80, 0.05, 0.05, 0.05, 0.05]
# Tolerance-based sanity checks (an exact `== 1` comparison fails for these floats).
assert len(temp_weights) == len(temp_model_list), "need exactly one weight per model"
assert np.isclose(sum(temp_weights), 1.0), "custom weights should sum to 1"

model = AveragingRegressor(
    models = temp_model_list,
    weights = temp_weights
)

# Refit every model on the full training set before predicting the test set.
model.fit(X_train, y_train)
y_test_preds = model.predict(X_test)

#save predictions
df_preds = pd.DataFrame({
    "id": df_test["id"],
    target_col: y_test_preds
})
preds_file = os.path.join(OUTPUT_PATH, "preds/blending_custom_weights.csv")
# Create the target directory up front so the write cannot fail on a missing
# folder after the expensive fit above.
os.makedirs(os.path.dirname(preds_file), exist_ok=True)
df_preds.to_csv(preds_file, index=False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001484 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 524164, number of used features: 9
[LightGBM] [Info] Start training from score 119.034899


### Stacking

In [20]:
# (name, estimator) pairs, one per blueprint, in models_dict insertion order.
estimators = [(name, entry['blueprint']) for name, entry in models_dict.items()]

stacking = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(),   # meta-model fitted on the base models' predictions
    cv=5,                      # internal CV used to build the meta-features
)

# Outer 5-fold CV of the whole stack; RMSE is the square root of the mean per-fold MSE.
neg_mse_per_fold = cross_val_score(
    stacking,
    X_train,
    y_train,
    cv=5,
    scoring="neg_mean_squared_error",
)
cv_rmse = (-neg_mse_per_fold.mean())**0.5

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 119.063098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 335464, number of used features: 9
[LightGBM] [Info] Start training from score 119.038252
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000889 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

In [21]:
# Report the current value of `cv_rmse` (set by the stacking cross-validation cell when run in order).
print(f"Stacking Regressor - CV RMSE : {cv_rmse}")

Stacking Regressor - CV RMSE : 26.461057892697767


In [22]:
#generate predictions for test set
# Refit the stack on the full training set before predicting the test set.
stacking.fit(X_train, y_train)
y_test_preds = stacking.predict(X_test)

#save predictions
df_preds = pd.DataFrame({
    "id": df_test["id"],
    target_col: y_test_preds
})
preds_file = os.path.join(OUTPUT_PATH, "preds/stacking.csv")
# Create the target directory up front so the write cannot fail on a missing
# folder after the expensive fit above.
os.makedirs(os.path.dirname(preds_file), exist_ok=True)
df_preds.to_csv(preds_file, index=False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 524164, number of used features: 9
[LightGBM] [Info] Start training from score 119.034899
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 119.063098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota