In [1]:
!pip install py_boost

Collecting py_boost
  Downloading py_boost-0.4.3-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.4/58.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: py_boost
Successfully installed py_boost-0.4.3
[0m

In [2]:
import sys
sys.path.append('../input/iterativestratification')

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from py_boost.gpu import *
from py_boost.multioutput.sketching import *

import warnings
warnings.filterwarnings('ignore', '.*DataFrame is highly fragmented*')

In [3]:
def calc_multilabel_roc_auc(Y_true: pd.DataFrame, Y_preds: pd.DataFrame):
    """Return the unweighted mean of the per-label ROC AUC scores.

    Every column of ``Y_true`` is scored with ``roc_auc_score`` against the
    column of the same name in ``Y_preds``; extra columns in ``Y_preds`` are
    ignored. The per-column scores are averaged with ``np.mean``.
    """
    per_label_auc = [
        roc_auc_score(Y_true[label], Y_preds[label]) for label in Y_true.columns
    ]
    return np.mean(per_label_auc)


def generate_features(train, test, cat_cols, num_cols):
    """Add count and group-mean features computed over train and test together.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames that both contain every column named in ``cat_cols`` and
        ``num_cols``. Neither input is modified.
    cat_cols : list of str
        Columns used as group-by keys for the mean features.
    num_cols : list of str
        Columns whose per-group mean is computed.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The first ``len(train)`` rows and the remaining rows of the stacked
        frame, each keeping its original row labels, with these columns added:

        * ``count_<c>`` for every ``c`` in ``cat_cols + num_cols``: number of
          rows (train + test) sharing that row's value of ``c``.
        * ``mean_<n>_per_<c>`` for every ``c`` in ``cat_cols`` and ``n`` in
          ``num_cols``: mean of ``n`` over the rows sharing that row's ``c``.
    """
    n_train = len(train)
    combined = pd.concat([train, test], axis=0)

    # Compute every new column first and attach them in one go. Inserting
    # hundreds of columns one at a time fragments the frame's internal blocks
    # (pandas reports this as "DataFrame is highly fragmented").
    generated = {}

    for col in cat_cols + num_cols:
        generated[f'count_{col}'] = combined.groupby(col)[col].transform('count').to_numpy()

    for cat in cat_cols:
        grouped = combined.groupby(cat)  # same grouping for every numeric column
        for num in num_cols:
            generated[f'mean_{num}_per_{cat}'] = grouped[num].transform('mean').to_numpy()

    # A name that already exists is overwritten in place, exactly as column
    # assignment would do, so calling this on already-augmented frames gives
    # the same columns instead of duplicates.
    already_present = set(combined.columns)
    new_names = [name for name in generated if name not in already_present]
    for name, values in generated.items():
        if name in already_present:
            combined[name] = values

    if new_names:
        # Values are plain arrays (row order == combined's row order), so the
        # repeated row labels coming from train and test need no alignment.
        new_block = pd.DataFrame({name: generated[name] for name in new_names},
                                 index=combined.index)
        combined = pd.concat([combined, new_block], axis=1)

    return combined.iloc[:n_train, :], combined.iloc[n_train:, :]

In [4]:
%%time

train = pd.read_csv('/kaggle/input/playground-series-s3e18/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e18/test.csv')
sub = pd.read_csv('/kaggle/input/playground-series-s3e18/sample_submission.csv')

print(train.shape, test.shape, sub.shape)

(14838, 38) (9893, 32) (9893, 3)
CPU times: user 151 ms, sys: 31 ms, total: 182 ms
Wall time: 286 ms


In [5]:
# Label columns of the training file, and columns excluded from the inputs.
target_cols = ['EC1', 'EC2', 'EC3', 'EC4', 'EC5', 'EC6']
cols_to_drop = ['id']

# Model inputs: every train column that is neither a target nor dropped,
# kept in the order in which it appears in train.
features = [
    col for col in train.columns
    if col not in target_cols + cols_to_drop
]

# Columns used as group-by keys when generate_features builds the
# mean_<n>_per_<c> features.
cat_cols = [
    'EState_VSA2', 'HallKierAlpha', 'NumHeteroatoms', 'PEOE_VSA10',
    'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'SMR_VSA10',
    'SMR_VSA5', 'SlogP_VSA3', 'fr_COO', 'fr_COO2',
]

# Every remaining input column is treated as numeric.
num_cols = [col for col in features if col not in cat_cols]

print(f'Categorical Features: {len(cat_cols)}')
print(f'Numeric Features: {len(num_cols)}')

Categorical Features: 13
Numeric Features: 18


In [6]:
# Separate model inputs from labels; test has inputs only.
X_train, Y_train = train[features], train[target_cols]
X_test = test[features]

print(X_train.shape, Y_train.shape)
print(X_test.shape)

(14838, 31) (14838, 6)
(9893, 31)


In [7]:
%%time

X_train, X_test = generate_features(X_train, X_test, cat_cols, num_cols)

CPU times: user 439 ms, sys: 30 ms, total: 469 ms
Wall time: 474 ms


In [8]:
# Number of cross-validation folds; also the divisor used when the per-fold
# test predictions are averaged.
N_FOLDS = 10

# Shuffled splitter with a fixed random_state.
cv = MultilabelStratifiedKFold(
    random_state=0,
    shuffle=True,
    n_splits=N_FOLDS,
)

In [9]:
# From here on `features` is the column Index of the augmented frame
# (raw inputs plus generated columns), no longer the list of raw input names.
features = X_train.columns
print(features.size)

296


In [10]:
%%time

scores = []
oof_predictions = np.zeros((len(X_train),2))
test_predictions = np.zeros((len(test),2))

for fold_n, (tr_idx, val_idx) in enumerate(cv.split(X_train, Y_train[['EC1', 'EC2']])):
    
    print(f'Fold {fold_n} started.')
    
    X_tr, X_val = X_train.loc[tr_idx], X_train.loc[val_idx]
    Y_tr, Y_val = Y_train.loc[tr_idx], Y_train.loc[val_idx]
    
    print(f'X_train shape: {X_tr.shape}. Y_train shape: {Y_tr.shape}')
    print(f'X_valid shape: {X_val.shape}. Y_valid shape: {Y_val.shape}')
    
    sketch = RandomProjectionSketch(1)
    
    model = GradientBoosting('multilabel',
                             ntrees=1000,
                             lr=0.03,
                             verbose=100,
                             lambda_l2=1,
                             gd_steps=1, 
                             subsample=1,
                             colsample=1,
                             min_data_in_leaf=10,
                             use_hess=False, 
                             max_bin=256,
                             max_depth=6,
                             multioutput_sketch=sketch)
    
    model.fit(X_tr.values, Y_tr.values, eval_sets = [{'X': X_val.values, 'y': Y_val.values}])
    
    fold_predictions = model.predict(X_val.values)[:,:2]
    test_predictions += model.predict(X_test[features].values)[:,:2] / N_FOLDS
    
    oof_predictions[val_idx] = fold_predictions
    
    fold_score = calc_multilabel_roc_auc(Y_val[['EC1', 'EC2']], pd.DataFrame(fold_predictions, columns = ['EC1', 'EC2']))
    scores.append(fold_score)
    print(f'Fold {fold_n} ROC AUC: {fold_score}')
    print(25 * '#')
    
print(f'Mean ROC AUC: {np.mean(scores)}')
print(f'Std ROC AUC: {np.std(scores)}')

Fold 0 started.
X_train shape: (13355, 296). Y_train shape: (13355, 6)
X_valid shape: (1483, 296). Y_valid shape: (1483, 6)
[18:51:26] Stdout logging level is INFO.
[18:51:26] GDBT train starts. Max iter 1000, early stopping rounds 100
[18:51:49] Iter 0; Sample 0, BCE = 0.5294502506271496; 
[18:51:51] Iter 100; Sample 0, BCE = 0.499819261257522; 
[18:51:53] Iter 200; Sample 0, BCE = 0.49752121090116225; 
[18:51:55] Iter 300; Sample 0, BCE = 0.49749501417257896; 
[18:51:56] Early stopping at iter 381, best iter 281, best_score 0.49731315794670367
Fold 0 ROC AUC: 0.6508215287785704
#########################
Fold 1 started.
X_train shape: (13354, 296). Y_train shape: (13354, 6)
X_valid shape: (1484, 296). Y_valid shape: (1484, 6)
[18:51:57] Stdout logging level is INFO.
[18:51:57] GDBT train starts. Max iter 1000, early stopping rounds 100
[18:51:57] Iter 0; Sample 0, BCE = 0.5332890398190299; 
[18:51:59] Iter 100; Sample 0, BCE = 0.5043535751640481; 
[18:52:00] Iter 200; Sample 0, BCE = 

In [11]:
# Fill the two submitted targets with the fold-averaged test predictions and
# write the file without the row index.
sub[['EC1', 'EC2']] = test_predictions
sub.to_csv('submission.csv', index=False)