In [139]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [6]:
# Global random seed; passed as `random_state` to the stratified K-fold split below.
SEED = 42

# Reading the data

In [121]:
# Load the competition tables in one pass: labelled training rows, unlabelled
# test rows, and the submission template.
train_df, test_df, sample_submission = map(
    pd.read_csv,
    (
        '/kaggle/input/song-popularity-prediction/train.csv',
        '/kaggle/input/song-popularity-prediction/test.csv',
        "/kaggle/input/song-popularity-prediction/sample_submission.csv",
    ),
)

# Missing values imputation

In [122]:
# Number of missing entries in each training column.
train_df.isna().sum()

In [123]:
# Model inputs: every training column except the row id, the target and the
# fold-assignment column.
useful_features = [
    name
    for name in train_df.columns
    if name not in {"id", "song_popularity", "k_fold"}
]
# Restrict the test frame to exactly those feature columns.
test_df = test_df.loc[:, useful_features]

In [124]:
# Cross-validation setup: shuffled, stratified folds seeded with SEED.
n_splits = 5
y = train_df['song_popularity']
X = train_df.drop(columns='song_popularity')

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

In [125]:
# Tag every training row with the fold in which it serves as validation data.
# The column is initialised first so it stays integer-typed (no NaN placeholder).
train_df["k_fold"] = -1
for fold, (_, val_ind) in enumerate(skf.split(X, y)):
    # `val_ind` is an array of row positions. `.at` only addresses a single
    # cell, so `.loc` is used, with the positions translated to index labels.
    train_df.loc[train_df.index[val_ind], "k_fold"] = fold

In [126]:
# Preview the first rows of the training frame, now including `k_fold`.
train_df.head(n=5)

In [127]:
# Preview the test features. (`x_test` only exists as a local variable inside
# `predict`, so referencing it here fails on a fresh kernel.)
test_df.head()

In [147]:
def predict(imputer, train_df, test_df, n_splits):
    """Cross-validate an XGBoost classifier on data imputed by ``imputer``.

    For each fold the imputer is fitted on that fold's training rows only and
    then applied to the validation rows and to the test set. The feature
    columns are taken from the notebook-level ``useful_features`` list.

    Parameters
    ----------
    imputer : object
        Imputer exposing ``fit_transform`` and ``transform``. It is re-fitted
        (and therefore mutated) on every fold.
    train_df : pandas.DataFrame
        Training data holding the ``useful_features`` columns, the
        ``song_popularity`` target and the ``k_fold`` fold id.
    test_df : pandas.DataFrame
        Test rows, passed to ``imputer.transform`` unchanged.
    n_splits : int
        Number of folds; fold ids ``0 .. n_splits - 1`` are looked up in
        ``k_fold``.

    Returns
    -------
    final_predictions : list
        One array per fold with the predicted positive-class probability for
        every test row.
    val_scores : list of float
        Validation ROC AUC of each fold.
    """
    final_predictions = []
    val_scores = []

    for fold in range(n_splits):
        x_train = train_df[train_df['k_fold'] != fold].reset_index(drop=True)
        x_val = train_df[train_df['k_fold'] == fold].reset_index(drop=True)
        x_test = test_df.copy()

        y_train = x_train['song_popularity']
        y_val = x_val['song_popularity']

        x_train = x_train[useful_features]
        x_val = x_val[useful_features]

        # Fit the imputer on the training part only, then reuse it as-is.
        x_train_imputed = imputer.fit_transform(x_train)
        x_val_imputed = imputer.transform(x_val)
        test_imputed = imputer.transform(x_test)

        model = XGBClassifier(random_state=fold, n_jobs=4, use_label_encoder=False, eval_metric='auc')
        model.fit(x_train_imputed, y_train)

        # ROC AUC is a ranking metric: score it (and build the submission) from
        # positive-class probabilities rather than thresholded 0/1 labels.
        val_pred = model.predict_proba(x_val_imputed)[:, 1]
        test_pred = model.predict_proba(test_imputed)[:, 1]

        final_predictions.append(test_pred)
        roc_auc_value = roc_auc_score(y_val, val_pred)
        val_scores.append(roc_auc_value)
        print(fold, roc_auc_value)

    return final_predictions, val_scores

## Trying out knn imputer from sklearn

In [115]:
# Nearest-neighbour imputer; n_neighbors=1 uses a single neighbouring sample
# to fill each missing value.
knn_imputer = KNNImputer(n_neighbors=1)

In [116]:
# `predict` returns (per-fold test predictions, per-fold validation scores);
# unpack both so `final_predictions` is not bound to the whole tuple.
final_predictions, val_scores = predict(knn_imputer, train_df, test_df, n_splits)

## Trying out simple imputer from sklearn

In [148]:
# Baseline imputer: fill missing values with the column mean, without adding
# missing-value indicator columns.
simple_imputer = SimpleImputer(strategy="mean", add_indicator=False)

In [149]:
# Run the cross-validated pipeline with the mean imputer, keeping the per-fold
# test predictions and validation scores.
final_predictions, val_scores = predict(simple_imputer, train_df, test_df, n_splits)

In [158]:
# Summarise the fold scores of the mean imputer as mean +- standard deviation,
# both rounded to three decimals.
print(f"Mean ROC AUC score is {np.mean(val_scores).round(3)} +- {np.std(val_scores).round(3)}") 

## Trying out iterative imputer from sklearn

In [162]:
# Iterative imputer: at most 10 imputation rounds, with missing values first
# initialised to the column median.
iterative_imputer = IterativeImputer(max_iter=10, initial_strategy='median')

In [163]:
# Run the cross-validated pipeline with the iterative imputer, keeping the
# per-fold test predictions and validation scores.
final_predictions, val_scores = predict(iterative_imputer, train_df, test_df, n_splits)

In [164]:
# Summarise the fold scores of the iterative imputer as mean +- standard
# deviation, both rounded to three decimals.
print(f"Mean ROC AUC score is {np.mean(val_scores).round(3)} +- {np.std(val_scores).round(3)}") 

## Trying out LightGBM imputer

In [131]:
# Fetch the kuma_utils repository, from which LGBMImputer is imported below.
# git refuses to clone into an existing non-empty `kuma_utils` directory; remove
# a stale checkout first (`!rm -r kuma_utils`) when a fresh copy is needed.
!git clone https://github.com/analokmaus/kuma_utils.git

In [132]:
import sys
# Put the cloned checkout on the import path. The relative path assumes the
# notebook's working directory is where the clone was made — TODO confirm.
sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

In [135]:
# LightGBM-based imputer from kuma_utils with logging switched off.
# NOTE(review): n_iter presumably caps the boosting rounds per column —
# verify against the kuma_utils implementation.
lgbm_imtr = LGBMImputer(n_iter=100, verbose=False)

In [136]:
# `predict` returns (per-fold test predictions, per-fold validation scores).
# Unpack both: binding the whole tuple to `final_predictions` would break the
# fold averaging that follows.
final_predictions, val_scores = predict(lgbm_imtr, train_df, test_df, n_splits)

In [173]:
# Blend the folds: element-wise average of the per-fold test predictions.
preds = np.asarray(final_predictions).mean(axis=0)

In [175]:
# Write the averaged predictions into the submission template (modified in
# place) and save it without the DataFrame index.
sample_submission["song_popularity"] = preds
sample_submission.to_csv("submission.csv", index=False)