In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import os
import joblib

from src.feature_engineering import *
from src.modeling import *

import warnings
warnings.filterwarnings("ignore")

In [24]:
# --- Notebook configuration -------------------------------------------------
# Input / output locations (relative to the notebook's working directory).
DATA_PATH = "data/"
OUTPUT_PATH = "output/"

# Degree passed to PolynomialFeatures when generating polynomial features.
POLY_FEATS_DEGREE = 3
# Number of bins per feature used when generating binned features.
BINNING_N_BINS = 4
# PSI cut-off used for feature selection.
PSI_THRESHOLD = 0.1
# Number of training rows sampled for the grid search.
GRID_SEARCH_N_SAMPLE = 20_000
# Number of cross-validation splits.
CV_SPLITS = 3
# Proportion of variance to retain after applying PCA.
PCA_N_COMPONENTS = 0.95
# How many of the best grid-search models are retrained on the whole train data.
N_BEST_MODELS = 5

In [3]:
# Load the training split and show its dimensions plus the first three rows.
train_csv_path = f"{DATA_PATH}train.csv"
df_train = pd.read_csv(train_csv_path)
print(df_train.shape)
df_train.head(n=3)

(524164, 11)


Unnamed: 0,id,RhythmScore,AudioLoudness,VocalContent,AcousticQuality,InstrumentalScore,LivePerformanceLikelihood,MoodScore,TrackDurationMs,Energy,BeatsPerMinute
0,0,0.60361,-7.636942,0.0235,5e-06,1e-06,0.051385,0.409866,290715.645,0.826267,147.5302
1,1,0.639451,-16.267598,0.07152,0.444929,0.349414,0.170522,0.65101,164519.5174,0.1454,136.15963
2,2,0.514538,-15.953575,0.110715,0.173699,0.453814,0.029576,0.423865,174495.5667,0.624667,55.31989


In [4]:
# Load the test split and show its dimensions plus the first three rows.
test_csv_path = f"{DATA_PATH}test.csv"
df_test = pd.read_csv(test_csv_path)
print(df_test.shape)
df_test.head(n=3)

(174722, 10)


Unnamed: 0,id,RhythmScore,AudioLoudness,VocalContent,AcousticQuality,InstrumentalScore,LivePerformanceLikelihood,MoodScore,TrackDurationMs,Energy
0,524164,0.410013,-16.794967,0.0235,0.23291,0.012689,0.271585,0.664321,302901.5498,0.424867
1,524165,0.463071,-1.357,0.141818,0.057725,0.257942,0.097624,0.829552,221995.6643,0.846
2,524166,0.686569,-3.368928,0.167851,0.287823,0.210915,0.325909,0.304978,357724.0127,0.134067


In [5]:
# Every training column except the row identifier and the target is a model feature.
non_feature_cols = {'id', 'BeatsPerMinute'}
feature_cols = [col for col in df_train.columns if col not in non_feature_cols]
feature_cols

['RhythmScore',
 'AudioLoudness',
 'VocalContent',
 'AcousticQuality',
 'InstrumentalScore',
 'LivePerformanceLikelihood',
 'MoodScore',
 'TrackDurationMs',
 'Energy']

In [6]:
# Name of the target column to predict.
label_col = "BeatsPerMinute"

### Feature Generation and Selection

In [7]:
#generate polynomial features
# The expansion is fitted on the training features only and then applied to both splits.
poly = PolynomialFeatures(degree=POLY_FEATS_DEGREE, interaction_only=False, include_bias=False)
poly.fit(df_train[feature_cols])


def _poly_feats_for(frame):
    """Return the polynomial expansion of ``frame[feature_cols]`` without the original columns.

    The result carries ``frame``'s index so that a later column-wise concat
    stays row-aligned even if the source frame does not use a default index.
    """
    expanded = pd.DataFrame(
        poly.transform(frame[feature_cols]),
        columns=poly.get_feature_names_out(),
        index=frame.index,
    )
    # exclude the original features: they are already present in the source frame
    return expanded.loc[:, ~expanded.columns.isin(feature_cols)]


df_train_poly_feats = _poly_feats_for(df_train)
print(f"df_train polynomial features shape : {df_train_poly_feats.shape}")

df_test_poly_feats = _poly_feats_for(df_test)
print(f"df_test polynomial features shape : {df_test_poly_feats.shape}")

df_train polynomial features shape : (524164, 210)
df_test polynomial features shape : (174722, 210)


In [8]:
#generate binning features
# Quantile bin edges are learned from the training features only and reused for the test split.
binning = KBinsDiscretizer(n_bins=BINNING_N_BINS, encode="ordinal", strategy="quantile", quantile_method="linear")
binning.fit(df_train[feature_cols])


def _binning_feats_for(frame):
    """Return the ordinal bin index of every feature in ``frame``, one ``<feature>_binned`` column each.

    The result carries ``frame``'s index so that a later column-wise concat
    stays row-aligned even if the source frame does not use a default index.
    """
    binned_names = [f"{feat}_binned" for feat in binning.get_feature_names_out()]
    return pd.DataFrame(
        binning.transform(frame[feature_cols]),
        columns=binned_names,
        index=frame.index,
    )


df_train_binning_feats = _binning_feats_for(df_train)
print(f"df_train binning features shape : {df_train_binning_feats.shape}")

df_test_binning_feats = _binning_feats_for(df_test)
print(f"df_test binning features shape : {df_test_binning_feats.shape}")

df_train binning features shape : (524164, 9)
df_test binning features shape : (174722, 9)


In [9]:
# Assemble the modelling tables: raw columns, polynomial terms and binned
# features are concatenated column-wise for each split.
train_parts = [df_train, df_train_poly_feats, df_train_binning_feats]
df_train_combined = pd.concat(train_parts, axis=1)
assert df_train_combined.columns.is_unique #make sure no duplicate columns
print(f"df_train_combined shape : {df_train_combined.shape}")

test_parts = [df_test, df_test_poly_feats, df_test_binning_feats]
df_test_combined = pd.concat(test_parts, axis=1)
assert df_test_combined.columns.is_unique #make sure no duplicate columns
print(f"df_test_combined shape : {df_test_combined.shape}")

df_train_combined shape : (524164, 230)
df_test_combined shape : (174722, 229)


In [10]:
# Feature selection via a PSI check: compare each feature's values in train
# against test so that features with an extreme distribution shift can be
# filtered out. The id and the label are not features and are skipped.
psi_dict = {
    feat: calculate_psi(df_train_combined[feat], df_test_combined[feat])
    for feat in df_train_combined.columns
    if feat not in ('id', label_col)
}
# One row per feature, largest PSI first.
df_psi = (
    pd.Series(psi_dict, name='psi')
    .to_frame()
    .sort_values('psi', ascending=False)
)
df_psi.head()

Unnamed: 0,psi
RhythmScore AudioLoudness InstrumentalScore,0.000188
AudioLoudness MoodScore Energy,0.000184
RhythmScore VocalContent LivePerformanceLikelihood,0.000144
VocalContent Energy^2,0.000143
RhythmScore MoodScore^2,0.000142


In [11]:
# Names of the features whose PSI is above the configured threshold.
exceeds_psi_threshold = df_psi['psi'] > PSI_THRESHOLD
high_psi_feats = df_psi[exceeds_psi_threshold].index.tolist()
len(high_psi_feats)
#no features with high psi, so no filtering needed

0

### Experiments

In [12]:
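# Explicit cross-validation splitter (currently disabled): the grid search below
# is given cv=CV_SPLITS directly instead of this splitter object.
# cv = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=1)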

In [13]:
# Create the output folder if needed; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [None]:
#define the models and parameters
# Maps a short model name to (estimator, parameter grid). The "model__" prefix
# addresses the pipeline step named "model" that the grid-search cell builds.
# The tree ensembles are seeded so that repeated runs of the grid search give
# comparable scores; the other estimators take no seed here.
MODEL_RANDOM_STATE = 1

models_and_params = {
    "knn": (KNeighborsRegressor(), {"model__n_neighbors": [3, 5, 10]}),
    "ridge": (Ridge(), {"model__alpha": [0.01, 0.1, 1, 10]}),
    "lasso": (Lasso(), {"model__alpha": [0.01, 0.1, 1, 10]}),
    "rf": (RandomForestRegressor(random_state=MODEL_RANDOM_STATE),
           {"model__n_estimators": [100, 200],
            "model__max_depth": [None, 10]}),
    "gbr": (GradientBoostingRegressor(random_state=MODEL_RANDOM_STATE),
            {"model__n_estimators": [100, 200],
             "model__learning_rate": [0.05, 0.1],
             "model__max_depth": [3, 5]}),
    # SVR is currently excluded from the search:
    # "svr": (SVR(), {"model__kernel": ["linear", "rbf"], "model__C": [0.1, 1, 10]}),
}

In [15]:
# Draw the subset of training rows used for the grid search. The draw is seeded
# so that every run tunes on the same rows and the reported scores are comparable.
SAMPLE_RANDOM_STATE = 1
df_train_sample = df_train_combined.sample(n=GRID_SEARCH_N_SAMPLE, random_state=SAMPLE_RANDOM_STATE)

# Model inputs are every column except the row id and the label.
X = df_train_sample.drop(columns=['id', label_col])
y = df_train_sample[label_col]

# X = X[:100] #for testing
# y = y[:100] #for testing

print(f"X shape : {X.shape}")
print(f"y shape : {y.shape}")

X shape : (20000, 228)
y shape : (20000,)


In [None]:
# Grid-search every (model, preprocessing variant) combination on the sampled
# data, record the winner of each combination with its CV RMSE, and persist it.
# NOTE(review): `Pipeline` and `OutlierRemoval` are not imported explicitly in
# this notebook; presumably they come from the `src.*` star imports - confirm.
# NOTE(review): the 'pca' variant runs PCA on the features as-is (no scaling
# step before it); confirm this is intended, since PCA is scale-sensitive.
best_models = {}

# Preprocessing variant -> factory for the step placed before the model.
# None means the model is fitted directly on the input features.
preprocessing_factories = {
    'original': None,
    'outlier_removal': OutlierRemoval,
    'standardization': StandardScaler,
    'pca': lambda: PCA(n_components=PCA_N_COMPONENTS),
}

for name, (model, params) in models_and_params.items():
    for method, make_step in preprocessing_factories.items():
        # the preprocessing step (if any) is named after its variant
        steps = [] if make_step is None else [(method, make_step())]
        steps.append(("model", model))
        pipe = Pipeline(steps)

        grid = GridSearchCV(
            estimator=pipe,
            param_grid=params,
            cv=CV_SPLITS,
            scoring="neg_mean_squared_error",
        )
        # copies keep the shared X / y untouched by whatever the pipeline does
        grid.fit(X.copy(), y.copy())

        # the score is a negated MSE, so flip the sign and take the square root
        best_rmse = (-grid.best_score_)**0.5
        winner = grid.best_estimator_
        best_models[f"{name}_{method}"] = [winner, best_rmse] #best model and RMSE
        print(f"{name}_{method} best estimator RMSE : {best_rmse}")
        #save best estimator to output folder
        joblib.dump(winner, OUTPUT_PATH + f"02_modeling__grid_search_{name}_{method}.joblib")

rf_pca best estimator RMSE : 26.63605436316777
gbr_pca best estimator RMSE : 26.61778898386265


In [19]:
# Leaderboard of the grid-search winners: one row per recorded run, lowest RMSE first.
df_best_models = (
    pd.DataFrame(best_models)
    .T
    .set_axis(['best_estimator', 'rmse'], axis=1)
    .sort_values('rmse')
)
df_best_models

Unnamed: 0,best_estimator,rmse
lasso_standardization,"(StandardScaler(), Lasso(alpha=1))",26.574791
ridge_pca,"(PCA(n_components=0.95), Ridge(alpha=0.01))",26.57604
lasso_pca,"(PCA(n_components=0.95), Lasso(alpha=0.01))",26.57604
lasso_outlier_removal,"(OutlierRemoval(), Lasso(alpha=10))",26.615761
gbr_pca,"(PCA(n_components=0.95), ([DecisionTreeRegress...",26.617789
rf_original,"((DecisionTreeRegressor(max_depth=10, max_feat...",26.622235
lasso_original,(Lasso(alpha=10)),26.624076
rf_pca,"(PCA(n_components=0.95), (DecisionTreeRegresso...",26.636054
ridge_outlier_removal,"(OutlierRemoval(), Ridge(alpha=10))",26.68623
ridge_original,(Ridge(alpha=10)),26.699151


In [20]:
# Persist the grid-search leaderboard next to the saved estimators.
grid_search_result_path = OUTPUT_PATH + "02_modeling__grid_search_result.csv"
df_best_models.to_csv(grid_search_result_path)

### Train best models with whole training data and generate test predictions 

In [25]:
# Keep the N_BEST_MODELS lowest-RMSE rows and expose the run name as a regular column.
df_best_models_filtered = (
    df_best_models
    .head(N_BEST_MODELS)
    .rename_axis('model_name')
    .reset_index()
)
df_best_models_filtered

Unnamed: 0,model_name,best_estimator,rmse
0,lasso_standardization,"(StandardScaler(), Lasso(alpha=1))",26.574791
1,ridge_pca,"(PCA(n_components=0.95), Ridge(alpha=0.01))",26.57604
2,lasso_pca,"(PCA(n_components=0.95), Lasso(alpha=0.01))",26.57604
3,lasso_outlier_removal,"(OutlierRemoval(), Lasso(alpha=10))",26.615761
4,gbr_pca,"(PCA(n_components=0.95), ([DecisionTreeRegress...",26.617789


In [26]:
# Create the predictions folder if needed; exist_ok avoids the check-then-create
# race and makes the cell safe to re-run.
os.makedirs(OUTPUT_PATH + "preds/", exist_ok=True)

In [27]:
# Refit each shortlisted pipeline on the whole training set and write its
# test-set predictions to the preds/ folder, one CSV per model.

# Model inputs: every combined column except the row id and the label. The same
# list is used for train and test so both are fed the columns in the same order.
model_input_cols = [c for c in df_train_combined.columns if c not in ('id', label_col)]

# Iterate over the rows that actually exist instead of range(N_BEST_MODELS), so
# a shortlist with fewer than N_BEST_MODELS entries does not raise a KeyError.
shortlisted_models = zip(
    df_best_models_filtered['model_name'],
    df_best_models_filtered['best_estimator'],
)
for model_name, estimator in shortlisted_models:
    # a fresh column selection per model keeps each fit isolated from any
    # in-place changes an earlier pipeline might have made to its input
    estimator.fit(
        df_train_combined[model_input_cols],
        df_train_combined[label_col]
    )
    y_test_preds = estimator.predict(df_test_combined[model_input_cols])

    # assign() builds a new frame, avoiding a column write on a slice of df_test_combined
    df_preds = df_test_combined[['id']].assign(**{label_col: y_test_preds})
    df_preds.to_csv(OUTPUT_PATH + f"preds/02_modeling__{model_name}_test_preds.csv", index=False)