In [None]:
from lightgbm.sklearn import LGBMClassifier
import numpy as np
import optuna.integration.lightgbm as lgb
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

Load dataset

In [None]:
# Read the training table and then the test table from CSV files in the
# current working directory.
data, test_data = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

Group the rows by Id and compute the mean of every feature within each group

In [None]:
def _mean_per_id(frame):
    """Return one row per Id holding the NaN-skipping mean of every other column.

    `GroupBy.mean()` skips NaN by default, which is what `agg(np.nanmean)`
    resolved to. Passing the NumPy callable itself is deprecated in recent
    pandas releases, and the notebook's blanket warning filter would hide
    that FutureWarning.
    """
    # The sort does not change group membership; it is retained so rows are
    # fed to each group in the same order as before.
    return frame.sort_values(['Id', 'Week']).groupby(['Id']).mean().reset_index()


data = _mean_per_id(data)
test_data = _mean_per_id(test_data)

Split into features and target

In [None]:
# Target vector: the 'target' column of the aggregated training frame.
y = data['target']
# Feature matrix: every column except 'Id', 'Week' and the target itself.
X = data.drop(columns=['Id', 'Week', 'target'])

Tune the LightGBM hyperparameters with Optuna's LightGBMTunerCV, scoring each candidate by cross-validation (5 folds)

In [None]:
# Fixed LightGBM settings. They are handed to the tuner here and reused when
# the final classifiers are built in the training loop further down.
params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    learning_rate=0.005,
    random_state=42,
    n_jobs=16,
    verbosity=-1,
)

# Cross-validated tuner from optuna's LightGBM integration, run on the full
# training set. No fold count is passed, so the tuner's default applies.
tuner = lgb.LightGBMTunerCV(
    params,
    lgb.Dataset(X, y),
    num_boost_round=1000,
    verbose_eval=500,
    optuna_seed=42,
)
tuner.run()

Train with the tuned hyperparameters on 10 stratified folds: report each fold's validation AUC and average the folds' predictions on the test set

In [None]:
X = data.drop(['Id', 'Week', 'target'], axis=1)
y = data['target']
# The test features are the same for every fold, so build them once instead
# of dropping the columns again on each iteration.
X_test = test_data.drop(['Id', 'Week'], axis=1)

# LightGBMTunerCV tunes stepwise: each trial samples only the parameter(s) of
# the step it belongs to, so `tuner.study.best_trial.params` holds just that
# one step's values. `tuner.best_params` is the complete tuned parameter set.
# It already contains the base settings from `params`, so the two are merged
# into a single dict; unpacking both into the constructor would raise a
# TypeError on the duplicated keywords.
model_params = {**params, **tuner.best_params}

folds = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
# Running average of the per-fold test-set probabilities.
predictions = np.zeros(len(test_data))
for fold, (train_index, test_index) in enumerate(folds.split(X, y)):

    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    clf = LGBMClassifier(n_estimators=5000, **model_params)
    # NOTE(review): `early_stopping_rounds` and `verbose` are accepted by
    # LGBMClassifier.fit only in LightGBM < 4.0; newer releases expect the
    # `early_stopping()` / `log_evaluation()` callbacks instead. Left as-is
    # because the installed version is not visible here -- confirm.
    clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='auc', early_stopping_rounds=250, verbose=0)

    # Positive-class probabilities on the held-out fold, used only for the AUC report.
    y_preds = clf.predict_proba(X_valid)[:, 1]
    # Each fold contributes 1/n_splits of the final test prediction.
    predictions += clf.predict_proba(X_test)[:, 1] / folds.n_splits

    print(f'Fold {fold + 1}, Validation AUC = {roc_auc_score(y_valid, y_preds)}')

In [None]:
# Assemble the submission table (Id as the index, the averaged probability as
# the single 'Predicted' column) and write it to CSV.
submission = test_data[['Id']].assign(Predicted=predictions).set_index('Id')
submission.to_csv('submission.csv')