In [2]:
import numpy as np
import pandas as pd 
import os
from typing import Callable, List, Tuple
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

import warnings
# Suppresses every Python warning for the rest of the session, so library
# warnings raised by the cells below will not be shown.
warnings.filterwarnings("ignore")

In [3]:
# Random seed used by the StratifiedKFold splitter so fold assignment is reproducible.
SEED = 42

# Reading the data

In [4]:
# Directory holding the competition CSVs. Kept in one place so the notebook can
# be pointed at a local copy of the data without editing every read_csv call.
DATA_DIR = "/kaggle/input/song-popularity-prediction"

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

# Looking at data

In [5]:
# Number of missing values per column in the training set.
train_df.isna().sum()

In [6]:
# Number of missing values per column in the test set.
test_df.isna().sum()

Several columns (`song_duration_ms`, `acousticness`, `danceability`, `energy`, `instrumentalness`, `key`, `liveness`, `loudness`) are lacking roughly 10% of their data.

In [7]:
# Count of distinct non-null values in each training column.
train_df.nunique(axis=0, dropna=True)

One of the columns with missing data (`key`) is categorical. Most imputers fill gaps with continuous values, so it may be worth rounding `key` and casting it back to a categorical type after imputing the missing values.

### Selecting needed and unnecessary columns

In [8]:
# Features treated as categorical (only referenced by commented-out code for now).
cat_features = ['key', 'audio_mode', 'time_signature']
# Identifier, target and fold label: excluded from the model inputs.
not_useful_features = ["id", "song_popularity", "k_fold"]
# Every remaining training column, in its original order, is a model input.
useful_features = [name for name in train_df.columns.tolist() if name not in not_useful_features]
test_df = test_df.loc[:, useful_features]

# Cross Validation

In [9]:
n_splits = 5
y = train_df['song_popularity']
X = train_df.drop(columns='song_popularity')

# Stratified, shuffled splitter; the fixed seed keeps the folds reproducible.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

In [10]:
# Label each training row with the index of the fold in which it is held out.
# `skf.split` yields positional indices, so they are mapped to row labels first.
# `.loc` is required here: `.at` is a scalar accessor and is not meant to take
# an array of labels.
for fold, (_, val_ind) in enumerate(skf.split(X, y)):
    train_df.loc[train_df.index[val_ind], "k_fold"] = fold

# Building predict functions

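There are two predict functions.  
The first one (`predict_pipeline_whole`) fits the imputer once on the whole training dataset (`train_df`), before the cross-validation loop.  
The second one (`predict_pipeline_k_1`) re-fits the imputer inside every fold, on the `k-1` training folds only (`x_train`), and then applies it to the held-out fold and to the test set.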

In [15]:
def predict_pipeline_whole(imputer: Callable, train_df: pd.DataFrame, test_df: pd.DataFrame, 
                           not_useful_features: List[str], useful_features: List[str], 
                           n_splits: int, add_indicator: bool = False
                           ) -> Tuple[List[np.ndarray], List[float], Callable, pd.DataFrame, pd.DataFrame]:
    """Impute once on the whole training frame, then cross-validate an XGBoost classifier.

    The imputer is fitted on ``train_df[useful_features]`` (all folds together) and
    applied to the test features. One classifier is then trained per fold on the
    imputed features and scored on the held-out fold.

    Parameters
    ----------
    imputer : object exposing ``fit_transform`` and ``transform``.
    train_df : training frame; must contain ``useful_features``,
        ``not_useful_features``, ``'song_popularity'`` and ``'k_fold'``.
    test_df : test frame; must contain ``useful_features``.
    not_useful_features : columns carried over unchanged into the exported
        training frame (not passed to the imputer or the model).
    useful_features : model input columns.
    n_splits : number of folds; fold labels ``0 .. n_splits - 1`` are read from
        ``train_df['k_fold']``.
    add_indicator : set to True when the imputer itself appends missing-value
        indicator columns to its output. Those columns are named
        ``<feature>_missing`` in the exported frames and are not fed to the model.

    Returns
    -------
    final_predictions : one array of positive-class probabilities for the test
        set per fold.
    val_scores : ROC AUC on the held-out fold, one value per fold.
    imputer : the fitted imputer.
    train_df_export : imputed features (plus indicator columns, if any)
        followed by ``not_useful_features``.
    test_df_export : imputed test features (plus indicator columns, if any).

    Raises
    ------
    ValueError : if the imputer output width does not match the expected
        number of feature (and indicator) columns.
    """
    feature_cols = list(useful_features)

    # np.asarray normalises the output: some imputers return arrays, others frames.
    train_values = np.asarray(imputer.fit_transform(train_df[feature_cols]))
    test_values = np.asarray(imputer.transform(test_df[feature_cols]))

    indicator_cols = []
    if add_indicator:
        # One indicator per feature that has missing values in the training data,
        # in feature order.
        indicator_cols = [col + "_missing" for col in feature_cols if train_df[col].isnull().any()]
    imputed_cols = feature_cols + indicator_cols

    if train_values.shape[1] != len(imputed_cols):
        raise ValueError(
            f"Imputer returned {train_values.shape[1]} columns but {len(imputed_cols)} were expected; "
            f"check that `add_indicator` matches the imputer configuration."
        )

    # Keep the original row labels so the concat below aligns row by row.
    train_imputed = pd.DataFrame(train_values, columns=imputed_cols, index=train_df.index)
    test_df_export = pd.DataFrame(test_values, columns=imputed_cols, index=test_df.index)
    train_df_export = pd.concat([train_imputed, train_df[not_useful_features]], axis=1)

    final_predictions = []
    val_scores = []
    x_test = test_df_export[feature_cols]

    for fold in range(n_splits):
        in_val = train_df['k_fold'] == fold

        # Train and validate on the imputed features, not on the raw frame.
        x_train = train_imputed.loc[~in_val, feature_cols]
        x_val = train_imputed.loc[in_val, feature_cols]
        y_train = train_df.loc[~in_val, 'song_popularity']
        y_val = train_df.loc[in_val, 'song_popularity']

        model = XGBClassifier(random_state=fold, n_jobs=4, use_label_encoder=False, eval_metric='auc')
        model.fit(x_train, y_train)

        # ROC AUC is a ranking metric: score it on probabilities, not hard labels.
        val_pred = model.predict_proba(x_val)[:, 1]
        test_pred = model.predict_proba(x_test)[:, 1]

        final_predictions.append(test_pred)
        roc_auc_value = roc_auc_score(y_val, val_pred)
        val_scores.append(roc_auc_value)
        print(fold, roc_auc_value)

    return final_predictions, val_scores, imputer, train_df_export, test_df_export

In [12]:
def predict_pipeline_k_1(imputer: Callable, train_df: pd.DataFrame, test_df: pd.DataFrame, 
                           not_useful_features: List[str], useful_features: List[str], 
                           n_splits: int) -> Tuple[List[np.ndarray], List[float], Callable]:
    """Cross-validate an XGBoost classifier, re-fitting the imputer inside every fold.

    For each fold the imputer is fitted on the ``k-1`` training folds only and then
    applied to the held-out fold and to the test features, so the validation rows
    never influence the imputation.

    Parameters
    ----------
    imputer : object exposing ``fit_transform`` and ``transform``; it is
        re-fitted on every fold.
    train_df : training frame; must contain ``useful_features``,
        ``'song_popularity'`` and ``'k_fold'``.
    test_df : test frame; must contain ``useful_features``.
    not_useful_features : unused; kept so the signature mirrors
        ``predict_pipeline_whole``.
    useful_features : model input columns.
    n_splits : number of folds; fold labels ``0 .. n_splits - 1`` are read from
        ``train_df['k_fold']``.

    Returns
    -------
    final_predictions : one array of positive-class probabilities for the test
        set per fold.
    val_scores : ROC AUC on the held-out fold, one value per fold.
    imputer : the imputer, as fitted on the last fold.
    """
    final_predictions = []
    val_scores = []
    feature_cols = list(useful_features)
    # The test features do not change between folds; select them once.
    x_test = test_df[feature_cols]

    for fold in range(n_splits):
        in_val = train_df['k_fold'] == fold

        x_train = train_df.loc[~in_val, feature_cols].reset_index(drop=True)
        x_val = train_df.loc[in_val, feature_cols].reset_index(drop=True)
        y_train = train_df.loc[~in_val, 'song_popularity'].reset_index(drop=True)
        y_val = train_df.loc[in_val, 'song_popularity'].reset_index(drop=True)

        # np.asarray normalises the output: some imputers return arrays, others frames.
        x_train_imputed = np.asarray(imputer.fit_transform(x_train))
        x_val_imputed = np.asarray(imputer.transform(x_val))
        x_test_imputed = np.asarray(imputer.transform(x_test))

        model = XGBClassifier(random_state=fold, n_jobs=4, use_label_encoder=False, eval_metric='auc')
        # Fit and predict on the imputed arrays; using the raw frames here would
        # make the imputer irrelevant to the reported score.
        model.fit(x_train_imputed, y_train)

        # ROC AUC is a ranking metric: score it on probabilities, not hard labels.
        val_pred = model.predict_proba(x_val_imputed)[:, 1]
        test_pred = model.predict_proba(x_test_imputed)[:, 1]

        final_predictions.append(test_pred)
        roc_auc_value = roc_auc_score(y_val, val_pred)
        val_scores.append(roc_auc_value)
        print(fold, roc_auc_value)

    return final_predictions, val_scores, imputer

# Sklearn's `SimpleImputer`

### Mean strategy

In [None]:
# Column-mean imputation without missing-value indicator columns.
simple_imputer = SimpleImputer(add_indicator=False, strategy="mean")

In [None]:
# Mean imputation fitted once on the whole training frame.
final_predictions_simple_mean_whole, val_scores, imp, _, _ = predict_pipeline_whole(
    imputer=simple_imputer,
    train_df=train_df,
    test_df=test_df,
    not_useful_features=not_useful_features,
    useful_features=useful_features,
    n_splits=n_splits,
)
print(
    f"ROC AUC score for 'predict_pipeline_whole' function with {imp} and {imp.strategy} strategy is "
    f"{np.mean(val_scores).round(3)} +- {np.std(val_scores).round(3)}"
)

In [None]:
# Mean imputation re-fitted inside every fold.
final_predictions_simple_mean_k_1, val_scores, imp = predict_pipeline_k_1(
    imputer=simple_imputer,
    train_df=train_df,
    test_df=test_df,
    not_useful_features=not_useful_features,
    useful_features=useful_features,
    n_splits=n_splits,
)
print(
    f"ROC AUC score for 'predict_pipeline_k_1' function with {imp} and {imp.strategy} strategy is "
    f"{np.mean(val_scores).round(3)} +- {np.std(val_scores).round(3)}"
)

### Median strategy

In [None]:
# Column-median imputation without missing-value indicator columns.
simple_imputer = SimpleImputer(add_indicator=False, strategy="median")

In [None]:
# Median imputation fitted once on the whole training frame.
final_predictions_simple_med_whole, val_scores, imp, _, _ = predict_pipeline_whole(
    imputer=simple_imputer,
    train_df=train_df,
    test_df=test_df,
    not_useful_features=not_useful_features,
    useful_features=useful_features,
    n_splits=n_splits,
)
print(
    f"ROC AUC score for 'predict_pipeline_whole' function with {imp} and {imp.strategy} strategy is "
    f"{np.mean(val_scores).round(3)} +- {np.std(val_scores).round(3)}"
)

In [None]:
# Median imputation re-fitted inside every fold.
final_predictions_simple_med_k_1, val_scores, imp = predict_pipeline_k_1(
    imputer=simple_imputer,
    train_df=train_df,
    test_df=test_df,
    not_useful_features=not_useful_features,
    useful_features=useful_features,
    n_splits=n_splits,
)
print(
    f"ROC AUC score for 'predict_pipeline_k_1' function with {imp} and {imp.strategy} strategy is "
    f"{np.mean(val_scores).round(3)} +- {np.std(val_scores).round(3)}"
)

# Sklearn's `KNNImputer`

In [None]:
# KNN imputer configured with a single neighbour (n_neighbors=1).
knn_imputer = KNNImputer(n_neighbors=1)

In [None]:
# KNN imputation fitted once on the whole training frame.
final_predictions_knn_whole, val_scores, imp, _, _ = predict_pipeline_whole(
    knn_imputer, train_df, test_df, not_useful_features, useful_features, n_splits)
# Report the neighbour count from the imputer itself so the message cannot go
# stale when `n_neighbors` is changed in the cell that builds it.
print(f"ROC AUC score for 'predict_pipeline_whole' function with {imp} and {imp.n_neighbors} n_neighbors is "
      f"{np.mean(val_scores).round(3)} +- {np.std(val_scores).round(3)}")

In [None]:
# KNN imputation re-fitted inside every fold.
final_predictions_knn_k_1, val_scores, imp = predict_pipeline_k_1(
    knn_imputer, train_df, test_df, not_useful_features, useful_features, n_splits)
# Report the neighbour count from the imputer itself so the message cannot go
# stale when `n_neighbors` is changed in the cell that builds it.
print(f"ROC AUC score for 'predict_pipeline_k_1' function with {imp} and {imp.n_neighbors} n_neighbors is "
      f"{np.mean(val_scores).round(3)} +- {np.std(val_scores).round(3)}")

# Sklearn's `IterativeImputer`

### Median strategy

In [None]:
# Iterative imputer initialised with column medians, capped at 20 iterations.
iterative_imputer = IterativeImputer(initial_strategy='median', max_iter=20)

In [None]:
# Median-initialised iterative imputation fitted once on the whole training frame.
final_predictions_it_med_whole, val_scores, imp, _, _ = predict_pipeline_whole(
    imputer=iterative_imputer,
    train_df=train_df,
    test_df=test_df,
    not_useful_features=not_useful_features,
    useful_features=useful_features,
    n_splits=n_splits,
)
print(
    f"ROC AUC score for 'predict_pipeline_whole' function with {imp} is {np.mean(val_scores).round(3)} "
    f"+- {np.std(val_scores).round(3)}"
)

In [None]:
# Median-initialised iterative imputation re-fitted inside every fold.
final_predictions_it_med_k_1, val_scores, imp = predict_pipeline_k_1(
    imputer=iterative_imputer,
    train_df=train_df,
    test_df=test_df,
    not_useful_features=not_useful_features,
    useful_features=useful_features,
    n_splits=n_splits,
)
print(
    f"ROC AUC score for 'predict_pipeline_k_1' function with {imp} is {np.mean(val_scores).round(3)} "
    f"+- {np.std(val_scores).round(3)}"
)

### Mean strategy

In [13]:
# Iterative imputer initialised with column means; it also appends
# missing-value indicator columns to its output.
iterative_imputer = IterativeImputer(add_indicator=True, initial_strategy='mean', max_iter=20)

In [14]:
# Mean-initialised iterative imputation on the whole training frame; the
# imputed frames are kept so they can be saved for later modelling.
(final_predictions_it_mean_whole,
 val_scores,
 imp,
 train_df_imputed,
 test_df_imputed) = predict_pipeline_whole(
    imputer=iterative_imputer,
    train_df=train_df,
    test_df=test_df,
    not_useful_features=not_useful_features,
    useful_features=useful_features,
    n_splits=n_splits,
    add_indicator=True,
)
print(
    f"ROC AUC score for {imp} and {imp.initial_strategy} strategy is {np.mean(val_scores).round(3)} "
    f"+- {np.std(val_scores).round(3)}"
)

In [None]:
# Write the imputed train and test frames to CSV, without the row index.
train_df_imputed.to_csv(path_or_buf="train_df_imputed_5folds.csv", index=False)
test_df_imputed.to_csv(path_or_buf="test_df_imputed_5folds.csv", index=False)

Saving datasets with imputed missing values for future modelling

In [None]:
# Iterative imputer initialised with column means (no indicator columns),
# used for the per-fold pipeline below.
iterative_imputer = IterativeImputer(max_iter=20, initial_strategy='mean')

In [None]:
# Mean-initialised iterative imputation re-fitted inside every fold.
final_predictions_it_mean_k_1, val_scores, imp = predict_pipeline_k_1(
    imputer=iterative_imputer,
    train_df=train_df,
    test_df=test_df,
    not_useful_features=not_useful_features,
    useful_features=useful_features,
    n_splits=n_splits,
)
print(
    f"ROC AUC score for 'predict_pipeline_k_1' function with {imp} and {imp.initial_strategy} strategy is "
    f"{np.mean(val_scores).round(3)} +- {np.std(val_scores).round(3)}"
)

# `LGBMImputer`

In [None]:
# !rm -r kuma_utils
!git clone https://github.com/analokmaus/kuma_utils.git

In [None]:
import sys
# Make the cloned repository directory importable. This must run before the
# import below — presumably the package lives inside the checkout; verify
# against the repository layout.
sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

In [None]:
# LGBMImputer from kuma_utils; n_iter is presumably the number of LightGBM
# iterations per imputed column — TODO confirm against the kuma_utils source.
lgbm_imtr = LGBMImputer(n_iter=100)

In [None]:
# LGBM-based imputation fitted once on the whole training frame.
final_predictions_lgbm_whole, val_scores, imp, _, _ = predict_pipeline_whole(
    imputer=lgbm_imtr,
    train_df=train_df,
    test_df=test_df,
    not_useful_features=not_useful_features,
    useful_features=useful_features,
    n_splits=n_splits,
)
print(
    f"ROC AUC score for 'predict_pipeline_whole' function with LGBMImputer is {np.mean(val_scores).round(3)} "
    f"+- {np.std(val_scores).round(3)}"
)

In [None]:
# LGBM-based imputation re-fitted inside every fold.
final_predictions_lgbm_k_1, val_scores, imp = predict_pipeline_k_1(
    imputer=lgbm_imtr,
    train_df=train_df,
    test_df=test_df,
    not_useful_features=not_useful_features,
    useful_features=useful_features,
    n_splits=n_splits,
)
print(
    f"ROC AUC score for 'predict_pipeline_k_1' function with LGBMImputer is {np.mean(val_scores).round(3)} "
    f"+- {np.std(val_scores).round(3)}"
)

# Imputation results

It seems that the iterative imputer with the 'mean' initial strategy is the way to go for this dataset, so I'm going to use it for future modelling.  

There is still one open question (for me): how should we measure the performance of imputation algorithms?   
Is it correct to fit the imputation algorithm on the whole training dataset and only then split it into train and validation subsets, or should we do this inside the cross-validation loop: fit the imputation algorithm on the train subset and use it to transform both the train and validation subsets?   

# Submitting first results

In [None]:
# Average the per-fold test predictions row by row (one column per fold).
preds = np.stack(final_predictions_it_mean_whole, axis=1).mean(axis=1)

In [None]:
# Put the averaged predictions into the submission template and save it.
sample_submission = sample_submission.assign(song_popularity=preds)
sample_submission.to_csv(path_or_buf="submission.csv", index=False)

# TODO

* Check imputation on whole dataset (train + test) and discuss train-test contamination (data leakage)