In [11]:
"""
[V2]
Blend Models:
* tabnet 10 folds 3 seeds using non scored
* 2 stage NN with chris split using nonscored
* deepinsight-efficientnet-v7-b3-infer
* deepinsight-resnest-v2-infer

"""

# Environment switch: True selects the Kaggle input paths ("../input/...") and runs
# scripts from the working directory; False selects the local workspace paths.
kernel_mode = False

import os
import numpy as np
import pandas as pd
import time
import random
import math
import glob

from numba import njit
from scipy.optimize import minimize, fsolve

import optuna

import warnings
warnings.filterwarnings('ignore')

import gc
gc.enable()

# Seed passed to the Optuna TPE sampler in the blend cell.
rand_seed = 1120

# True: search blend weights on the OOF predictions with Optuna.
# False: run every inference script and blend with the weights in model_list.
optuna_mode = True
# Optuna study name; also used to build the SQLite storage file name (<study_name>.db).
study_name = "moa_blend_team_v2"
# Number of trials passed to study.optimize (a previous run used 500).
# n_trials = 500
n_trials = 3000

In [6]:
# !mkdir -p /root/.cache/torch/hub/checkpoints/
# !cp ../input/gen-efficientnet-pretrained/tf_efficientnet_*.pth /root/.cache/torch/hub/checkpoints/
# !cp ../input/deepinsight-resnest-v1-resnest50/*.pth /root/.cache/torch/hub/checkpoints/
# !cp ../input/deepinsight-resnest-v2-resnest50-output/*.pth /root/.cache/torch/hub/checkpoints/
# !ls -la /root/.cache/torch/hub/checkpoints/

In [7]:
# !cp ../input/kaggle-moa-team/scripts/* .
# !ls -la

In [15]:
# Root folder of the competition CSVs and OOF files: the Kaggle input mount in
# kernel mode, otherwise a local workspace path.
dataset_folder = "../input/lish-moa" if kernel_mode else "/workspace/Kaggle/MoA/"

# Add your model inference script here
# Tuple Format: (script, oof_filename, output_filename, weight)
#   script          - inference script launched with `!python` from model_path
#   oof_filename    - .npy file of out-of-fold predictions, loaded from dataset_folder
#   output_filename - CSV the script is expected to produce, read back for blending
#   weight          - blend weight applied only when optuna_mode is False
# NOTE(review): every weight below is 0, so the non-Optuna branch would add nothing
# to the zero-filled submission — fill in the tuned weights before using that path.
model_list = [
    ("Models data moa/script_tabnet_ns_oldcv.py",
     "Models data moa/oof_tabnet_ns_oldcv.npy",
     "submission_tabnet_ns_oldcv.csv", 0),
    
    ("Models data moa/script_nn_ns_newcv.py",
     "Models data moa/oof_nn_ns_newcv.npy",
     "submission_nn_ns_newcv.csv", 0),
    
    ("deepinsight_efficientnet_lightning_v7_b3_infer.py",
     "oof_deepinsight_efficientnet_v7_b3_0.014802440208660929.npy",
     "submission_effnet_v7_b3.csv", 0),
    
    ("deepinsight_resnest_lightning_v2_infer.py",
     "oof_deepinsight_ResNeSt_v2_resnest50_0.01455961217985703.npy",
     "submission_resnest_v2.csv", 0),
]

# Folder the inference scripts are launched from: the working directory in kernel
# mode, otherwise the dataset folder.
model_path = "." if kernel_mode else dataset_folder

In [31]:
# Training inputs: the feature table and the scored target table.
train_features = pd.read_csv(f"{dataset_folder}/train_features.csv", engine='c')
train_labels = pd.read_csv(f'{dataset_folder}/train_targets_scored.csv', engine='c')

# Every label column other than the sample identifier is a target class.
train_classes = [col for col in train_labels.columns if col != "sig_id"]

# Boolean mask over the training rows whose cp_type is "trt_cp", and the label
# rows it selects (re-indexed from 0 so they align positionally with OOF arrays).
non_control_group_rows = train_features["cp_type"].eq("trt_cp")
non_control_group_train_labels = (
    train_labels[non_control_group_rows]
    .copy()
    .reset_index(drop=True)
)

# Submission template with every column after the first reset to zero, ready to
# accumulate weighted model predictions.
submission = pd.read_csv(f'{dataset_folder}/sample_submission.csv')
submission.iloc[:, 1:] = 0

In [37]:
# Reference: https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0/notebook
# CPMP's logloss from https://www.kaggle.com/c/lish-moa/discussion/183010
def log_loss_numpy(y_pred, y_true):
    """Return the mean binary log loss over all flattened elements.

    Args:
        y_pred: array-like of predicted probabilities (any shape).
        y_true: array-like of targets with the same number of elements;
            an element counts as positive only when it equals 1.

    Returns:
        The mean of the per-element losses, with predictions clipped to
        [1e-15, 1 - 1e-15] so the logarithm stays finite.
    """
    targets = np.asarray(y_true).ravel()
    probs = np.clip(np.asarray(y_pred).ravel(), 1e-15, 1 - 1e-15)
    # Probability assigned to the observed outcome of each element.
    prob_of_truth = np.where(targets == 1, probs, 1 - probs)
    return (-np.log(prob_of_truth)).mean()


def func_numpy_metric(weights, oofs=None, y_true=None):
    """Log loss of a weighted blend of per-model OOF predictions.

    The previous version called ``log_loss_numpy`` without its required
    ``y_true`` argument, so every call raised ``TypeError``. It also read the
    global ``oof``, which the blend cell's loading loop rebinds to a single
    model's 2-D array rather than a stack of models.

    Args:
        weights: 1-D sequence with one weight per model.
        oofs: per-model OOF predictions stacked along axis 0, i.e. shape
            (n_models, ...). Defaults to the notebook-level ``model_oofs`` list.
        y_true: targets with the same number of elements as one model's OOF.
            Defaults to the notebook-level non-control training labels.

    Returns:
        The mean log loss of the blended prediction.
    """
    stacked = np.asarray(model_oofs if oofs is None else oofs)
    if y_true is None:
        y_true = non_control_group_train_labels[train_classes].values
    # Contract the model axis: blend = sum_m weights[m] * stacked[m].
    oof_blend = np.tensordot(weights, stacked, axes=((0), (0)))
    return log_loss_numpy(oof_blend, y_true)


def mean_logloss(y_pred, y_true):
    """Return the NaN-ignoring mean binary cross-entropy of y_pred against y_true.

    Args:
        y_pred: predicted probabilities (array or scalar).
        y_true: targets broadcastable against ``y_pred``.

    Returns:
        ``np.nanmean`` of the element-wise negative log-likelihood. A 1e-15
        offset is added inside each logarithm so exact 0/1 predictions stay finite.
    """
    eps = 1e-15
    negative_part = (1 - y_true) * np.log(1 - y_pred + eps)
    positive_part = y_true * np.log(y_pred + eps)
    log_likelihood = negative_part + positive_part
    return np.nanmean(-log_likelihood)

In [38]:
total_start = time.time()
if not optuna_mode:
    for i, (script, oof_filename, output_filename, weight) in enumerate(model_list):
        print(f"Generating submission file from {script} ......")
        infer_start = time.time()
        !python {model_path}/{script}
        infer_elapsed = time.time() - infer_start
        print(f"Time spent on inference: {infer_elapsed/60:.2f} minutes.")

        model_submit = pd.read_csv(output_filename, engine='c')
        print(model_submit.head(5))
        print(model_submit.shape)
        submission.iloc[:, 1:] += weight * model_submit.iloc[:, 1:]
else:
    ## Search Best Blend Weights by Optuna ##
    model_oofs = []

    for i, (script, oof_filename, output_filename, weight) in enumerate(model_list):
        print(f"Loading OOF from {oof_filename} ......")
        oof = np.load(f"{dataset_folder}/{oof_filename}")
        
        if oof.shape[0] == 23814:
            oof = oof[non_control_group_rows, :]

        oof_loss = mean_logloss(oof, non_control_group_train_labels[train_classes].values)
        print(f"OOF Validation Loss of {script}: {oof_loss:.6f}")
        model_oofs.append(oof)

    def objective(trial):
        weights = []
        for i in range(len(model_list)):
            weights.append(trial.suggest_float(f"w{i}", 0, 1.0))

        blend = np.zeros(model_oofs[0].shape)
        for i in range(len(model_list)):
            blend += weights[i] * model_oofs[i]
        blend = np.clip(blend, 0, 1.0)

        loss = mean_logloss(blend, non_control_group_train_labels[train_classes].values)
        return loss

    pruner = optuna.pruners.MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=0,
        interval_steps=1,
    )
    sampler = optuna.samplers.TPESampler(seed=rand_seed)
    study = optuna.create_study(direction="minimize",
                                pruner=pruner,
                                sampler=sampler,
                                study_name=study_name,
                                storage=f'sqlite:///{study_name}.db',
                                load_if_exists=True)

    study.optimize(objective,
                   n_trials=n_trials,
                   timeout=None,
                   gc_after_trial=True,
                   n_jobs=-1)

    trial = study.best_trial

#     for i, (script, oof_filename, output_filename, _) in enumerate(model_list):
#         optimal_weight = trial.params[f"w{i}"]
#         print(f"Generating submission file from {script} ...... (Weight: {optimal_weight})")
#         infer_start = time.time()
#         !python {model_path}/{script}
#         infer_elapsed = time.time() - infer_start
#         print(f"Time spent on inference: {infer_elapsed/60:.2f} minutes.")

#         model_submit = pd.read_csv(output_filename, engine='c')
#         print(model_submit.head(5))
#         print(model_submit.shape)
#         submission.iloc[:, 1:] += optimal_weight * model_submit.iloc[:, 1:]

    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

Loading OOF from Models data moa/oof_tabnet_ns_oldcv.npy ......
OOF Validation Loss of Models data moa/script_tabnet_ns_oldcv.py: 0.871528
Loading OOF from Models data moa/oof_nn_ns_newcv.npy ......
OOF Validation Loss of Models data moa/script_nn_ns_newcv.py: 0.016976
Loading OOF from oof_deepinsight_efficientnet_v7_b3_0.014802440208660929.npy ......
OOF Validation Loss of deepinsight_efficientnet_lightning_v7_b3_infer.py: 0.016016
Loading OOF from oof_deepinsight_ResNeSt_v2_resnest50_0.01455961217985703.npy ......
OOF Validation Loss of deepinsight_resnest_lightning_v2_infer.py: 0.015756


[32m[I 2020-11-25 17:44:43,897][0m A new study created in RDB with name: moa_blend_team_v2[0m
[32m[I 2020-11-25 17:44:44,714][0m Trial 3 finished with value: 5.660407814841266 and parameters: {'w0': 0.9757298437171829, 'w1': 0.8852232109655547, 'w2': 0.9464210437716456, 'w3': 0.02711559156986376}. Best is trial 3 with value: 5.660407814841266.[0m
[32m[I 2020-11-25 17:44:44,915][0m Trial 5 finished with value: 0.3321732752130106 and parameters: {'w0': 0.41068561704474305, 'w1': 0.15701546539831035, 'w2': 0.04405989232158347, 'w3': 0.7644084179206361}. Best is trial 5 with value: 0.3321732752130106.[0m
[32m[I 2020-11-25 17:44:44,935][0m Trial 0 finished with value: 5.15654264457879 and parameters: {'w0': 0.9191188225629842, 'w1': 0.189949417445464, 'w2': 0.1709586478137347, 'w3': 0.9615809825171439}. Best is trial 5 with value: 0.3321732752130106.[0m
[32m[I 2020-11-25 17:44:44,996][0m Trial 4 finished with value: 4.6656242033960815 and parameters: {'w0': 0.8645889518102593,

KeyboardInterrupt: 

In [8]:
# Wall-clock time since the blend cell set total_start; that cell must have run
# first in the same kernel session.
total_elapsed = time.time() - total_start
print(f"Total time spent: {total_elapsed/60:.2f} minutes.")

Total time spent: 22.97 minutes.


In [9]:
# Number of finished trials: 500
# Best trial:
#   Value: 0.014158536219669974
#   Params: 
#     w0: 0.3287684605023437
#     w1: 0.2763485706536088
#     w2: 0.3859487453003219

In [10]:
# Number of finished trials: 3000
# Best trial:
#   Value: 0.014287989662394677
#   Params: 
#     w0: 0.4397214034464735
#     w1: 0.5466210401214696

In [11]:
# Print the submission frame's shape, then show the frame itself as the cell output.
print(submission.shape)
submission

(3982, 207)


Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001140,0.001169,0.001982,0.016637,0.028251,0.004118,0.003356,0.003270,0.000252,...,0.001502,0.000647,0.002621,0.001826,0.000815,0.000746,0.000761,0.002113,0.002082,0.001310
1,id_001897cda,0.000194,0.000270,0.000732,0.000334,0.000743,0.001452,0.002962,0.006127,0.058353,...,0.000328,0.000143,0.001729,0.000768,0.004997,0.000174,0.004149,0.000435,0.000187,0.002191
2,id_002429b5b,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,id_00276f245,0.000512,0.000411,0.001805,0.019714,0.016225,0.004170,0.003897,0.003916,0.000429,...,0.000819,0.001271,0.001750,0.011008,0.031156,0.000446,0.004321,0.002950,0.002086,0.003045
4,id_0027f1083,0.003641,0.001858,0.001494,0.015493,0.023506,0.004922,0.005166,0.001399,0.000332,...,0.001317,0.000739,0.002620,0.001143,0.001230,0.000870,0.001134,0.001481,0.000477,0.001661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.000262,0.000490,0.000595,0.002090,0.004227,0.000717,0.000626,0.001199,0.000385,...,0.000660,0.010027,0.002168,0.349161,0.007396,0.001034,0.004639,0.000505,0.000806,0.000355
3978,id_ff925dd0d,0.004736,0.003249,0.001048,0.010602,0.020970,0.006351,0.005969,0.002876,0.000273,...,0.000628,0.000635,0.003043,0.001603,0.001152,0.001074,0.002228,0.001827,0.000363,0.000892
3979,id_ffb710450,0.004125,0.002012,0.000942,0.009287,0.030860,0.009721,0.003308,0.001775,0.000248,...,0.000615,0.000696,0.002354,0.000970,0.001489,0.000763,0.000839,0.001155,0.000323,0.001657
3980,id_ffbb869f2,0.001578,0.000898,0.001135,0.027620,0.025039,0.003751,0.009288,0.002486,0.000657,...,0.000972,0.000554,0.002377,0.001237,0.001722,0.000495,0.000787,0.002356,0.000556,0.002994


In [12]:
# Write the blended predictions to submission.csv without the index column.
# NOTE(review): with optuna_mode = True the blend cell never updates `submission`
# (that code is commented out), so the frame is still the zero-filled template
# here — confirm this is intended before submitting.
submission.to_csv('submission.csv', index=False)

In [13]:
!rm ./*.py
!ls -la

total 65024
drwxr-xr-x 4 root root     4096 Nov 22 16:31 .
drwxr-xr-x 6 root root     4096 Nov 22 16:07 ..
---------- 1 root root   234779 Nov 22 16:31 __notebook__.ipynb
drwxr-xr-x 2 root root     4096 Nov 22 16:08 checkpoints
-rw-r--r-- 1 root root 16575670 Nov 22 16:31 submission.csv
-rw-r--r-- 1 root root 16577393 Nov 22 16:16 submission_effnet_v7_b3.csv
-rw-r--r-- 1 root root 16569497 Nov 22 16:23 submission_resnest_v1.csv
-rw-r--r-- 1 root root 16598162 Nov 22 16:30 submission_resnest_v2.csv
drwxr-xr-x 2 root root     4096 Nov 22 16:08 wandb
