- a notebook to save preprocessing model and train/save NN models
- all necessary ouputs are stored in MODEL_DIR = output/kaggle/working/model
    - put those into dataset, and load it from inference notebook

In [1]:
kernel_mode = False
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import sys
if kernel_mode:
    sys.path.append(
        '../input/iterative-stratification/iterative-stratification-master')
    sys.path.append('../input/umaplearn/umap')

%mkdir model
%mkdir interim

from scipy.sparse.csgraph import connected_components
from umap import UMAP
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns
import time

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.manifold import TSNE

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
print(torch.cuda.is_available())
import warnings
# warnings.filterwarnings('ignore')

True


In [3]:
torch.__version__

'1.6.0+cu101'

In [4]:
dataset_folder = "../input/lish-moa" if kernel_mode else "/workspace/Kaggle/MoA"
model_output_folder = "../input/kibuna-nn-hs-1024-last-train-markpeng" if kernel_mode \
    else f"{dataset_folder}/kibuna-nn-hs-1024-last-train-markpeng"
BATCH_SIZE = 256
INFER_BATCH_SIZE = 512

In [5]:
NB = '25'

IS_TRAIN = True
MODEL_DIR = f"{model_output_folder}/model"
INT_DIR = f"{model_output_folder}/interim"

if IS_TRAIN:
    os.makedirs(model_output_folder, exist_ok=True)
    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(INT_DIR, exist_ok=True)

NSEEDS = 5  # 5
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 15
LEARNING_RATE = 5e-3
WEIGHT_DECAY = 1e-5
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False

NFOLDS = 5  # 5

PMIN = 0.0005
PMAX = 0.9995
SMIN = 0.0
SMAX = 1.0

In [6]:
train_features = pd.read_csv(f'{dataset_folder}/train_features.csv')
train_targets_scored = pd.read_csv(f'{dataset_folder}/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(f'{dataset_folder}/train_targets_nonscored.csv')

test_features = pd.read_csv(f'{dataset_folder}/test_features.csv')
sample_submission = pd.read_csv(f'{dataset_folder}/sample_submission.csv')

In [7]:
train_targets_nonscored = train_targets_nonscored.loc[:, train_targets_nonscored.sum() != 0]
print(train_targets_nonscored.shape)

(23814, 332)


In [8]:
# for c in train_targets_scored.columns:
#     if c != "sig_id":
#         train_targets_scored[c] = np.maximum(PMIN, np.minimum(PMAX, train_targets_scored[c]))
for c in train_targets_nonscored.columns:
    if c != "sig_id":
        train_targets_nonscored[c] = np.maximum(
            PMIN, np.minimum(PMAX, train_targets_nonscored[c]))

In [9]:
print("(nsamples, nfeatures)")
print(train_features.shape)
print(train_targets_scored.shape)
print(train_targets_nonscored.shape)
print(test_features.shape)
print(sample_submission.shape)

(nsamples, nfeatures)
(23814, 876)
(23814, 207)
(23814, 332)
(3982, 876)
(3982, 207)


In [10]:
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [11]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(seed=1903)

In [12]:
# GENES
n_comp = 90
n_dim = 45

data = pd.concat(
    [pd.DataFrame(train_features[GENES]),
     pd.DataFrame(test_features[GENES])])

if IS_TRAIN:
    fa = FactorAnalysis(n_components=n_comp,
                        random_state=1903).fit(data[GENES])
    pd.to_pickle(fa, f'{MODEL_DIR}/{NB}_factor_analysis_g.pkl')
    umap = UMAP(n_components=n_dim, random_state=1903).fit(data[GENES])
    pd.to_pickle(umap, f'{MODEL_DIR}/{NB}_umap_g.pkl')
else:
    fa = pd.read_pickle(f'{MODEL_DIR}/{NB}_factor_analysis_g.pkl')
    umap = pd.read_pickle(f'{MODEL_DIR}/{NB}_umap_g.pkl')

data2 = (fa.transform(data[GENES]))
data3 = (umap.transform(data[GENES]))

train2 = data2[:train_features.shape[0]]
test2 = data2[-test_features.shape[0]:]
train3 = data3[:train_features.shape[0]]
test3 = data3[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'fa_G-{i}' for i in range(n_comp)])
train3 = pd.DataFrame(train3, columns=[f'umap_G-{i}' for i in range(n_dim)])
test2 = pd.DataFrame(test2, columns=[f'fa_G-{i}' for i in range(n_comp)])
test3 = pd.DataFrame(test3, columns=[f'umap_G-{i}' for i in range(n_dim)])

train_features = pd.concat((train_features, train2, train3), axis=1)
test_features = pd.concat((test_features, test2, test3), axis=1)

#CELLS
n_comp = 50
n_dim = 25

data = pd.concat(
    [pd.DataFrame(train_features[CELLS]),
     pd.DataFrame(test_features[CELLS])])

if IS_TRAIN:
    fa = FactorAnalysis(n_components=n_comp,
                        random_state=1903).fit(data[CELLS])
    pd.to_pickle(fa, f'{MODEL_DIR}/{NB}_factor_analysis_c.pkl')
    umap = UMAP(n_components=n_dim, random_state=1903).fit(data[CELLS])
    pd.to_pickle(umap, f'{MODEL_DIR}/{NB}_umap_c.pkl')
else:
    fa = pd.read_pickle(f'{MODEL_DIR}/{NB}_factor_analysis_c.pkl')
    umap = pd.read_pickle(f'{MODEL_DIR}/{NB}_umap_c.pkl')

data2 = (fa.transform(data[CELLS]))
data3 = (umap.fit_transform(data[CELLS]))

train2 = data2[:train_features.shape[0]]
test2 = data2[-test_features.shape[0]:]
train3 = data3[:train_features.shape[0]]
test3 = data3[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'fa_C-{i}' for i in range(n_comp)])
train3 = pd.DataFrame(train3, columns=[f'umap_C-{i}' for i in range(n_dim)])
test2 = pd.DataFrame(test2, columns=[f'fa_C-{i}' for i in range(n_comp)])
test3 = pd.DataFrame(test3, columns=[f'umap_C-{i}' for i in range(n_dim)])

train_features = pd.concat((train_features, train2, train3), axis=1)
test_features = pd.concat((test_features, test2, test3), axis=1)

# drop_cols = [f'c-{i}' for i in range(n_comp,len(CELLS))]

In [13]:
from sklearn.preprocessing import QuantileTransformer

for col in (GENES + CELLS):
    vec_len = len(train_features[col].values)
    vec_len_test = len(test_features[col].values)
    raw_vec = pd.concat([train_features, test_features
                         ])[col].values.reshape(vec_len + vec_len_test, 1)
    if IS_TRAIN:
        transformer = QuantileTransformer(n_quantiles=100,
                                          random_state=123,
                                          output_distribution="normal")
        transformer.fit(raw_vec)
        pd.to_pickle(transformer,
                     f'{MODEL_DIR}/{NB}_{col}_quantile_transformer.pkl')
    else:
        transformer = pd.read_pickle(
            f'{MODEL_DIR}/{NB}_{col}_quantile_transformer.pkl')

    train_features[col] = transformer.transform(
        train_features[col].values.reshape(vec_len, 1)).reshape(1, vec_len)[0]
    test_features[col] = transformer.transform(
        test_features[col].values.reshape(vec_len_test,
                                          1)).reshape(1, vec_len_test)[0]

In [14]:
# PCAS = [col for col in train_features.columns if col.startswith('pca_')]
# UMAPS = [col for col in train_features.columns if col.startswith('umap_')]

In [15]:
# from sklearn.preprocessing import PolynomialFeatures
# n_deg = 2

# data = pd.concat([pd.DataFrame(train_features[PCAS]), pd.DataFrame(test_features[PCAS])])
# data2 = (PolynomialFeatures(degree=n_deg, include_bias=False).fit_transform(data[PCAS]))

# # print(data2)
# # data4 = (UMAP(n_components=n_dim, n_neighbors=5, random_state=1903).fit_transform(data[GENES]))
# # data5 = (UMAP(n_components=n_dim, min_dist=0.01, random_state=1903).fit_transform(data[GENES]))

# train2 = data2[:train_features.shape[0]]
# test2 = data2[-test_features.shape[0]:]

# # print(train2.shape)
# train2 = pd.DataFrame(train2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])
# test2 = pd.DataFrame(test2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])

# # drop_cols = [f'c-{i}' for i in range(n_comp,len(GENES))]
# # train_features = pd.concat((train_features, train2, train3, train4, train5), axis=1)
# # test_features = pd.concat((test_features, test2, test3, test4, test5), axis=1)
# train_features = pd.concat((train_features, train2), axis=1)
# test_features = pd.concat((test_features, test2), axis=1)


# data = pd.concat([pd.DataFrame(train_features[UMAPS]), pd.DataFrame(test_features[UMAPS])])
# data2 = (PolynomialFeatures(degree=n_deg, include_bias=False).fit_transform(data[UMAPS]))

# # print(data2)
# # data4 = (UMAP(n_components=n_dim, n_neighbors=5, random_state=1903).fit_transform(data[GENES]))
# # data5 = (UMAP(n_components=n_dim, min_dist=0.01, random_state=1903).fit_transform(data[GENES]))

# train2 = data2[:train_features.shape[0]]
# test2 = data2[-test_features.shape[0]:]

# # print(train2.shape)
# train2 = pd.DataFrame(train2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])
# test2 = pd.DataFrame(test2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])

# # drop_cols = [f'c-{i}' for i in range(n_comp,len(GENES))]
# # train_features = pd.concat((train_features, train2, train3, train4, train5), axis=1)
# # test_features = pd.concat((test_features, test2, test3, test4, test5), axis=1)
# train_features = pd.concat((train_features, train2), axis=1)
# test_features = pd.concat((test_features, test2), axis=1)

In [16]:
print(train_features.shape)
print(test_features.shape)

(23814, 1086)
(3982, 1086)


In [17]:
# train = train_features.merge(train_targets_scored, on='sig_id')
train = train_features.merge(train_targets_nonscored, on='sig_id')
train = train[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type'] != 'ctl_vehicle'].reset_index(
    drop=True)

# target = train[train_targets_scored.columns]
target = train[train_targets_nonscored.columns]

In [18]:
train = train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)

In [19]:
print(target.shape)
print(train_features.shape)
print(test_features.shape)
print(train.shape)
print(test.shape)

(21948, 332)
(23814, 1086)
(3982, 1086)
(21948, 1416)
(3624, 1085)


In [20]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [21]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds



Unnamed: 0,sig_id,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,vasopressin_receptor_antagonist,ve-cadherin_antagonist,vesicular_monoamine_transporter_inhibitor,vitamin_k_antagonist,voltage-gated_potassium_channel_activator,voltage-gated_sodium_channel_blocker,wdr5_mll_interaction_inhibitor,xanthine_oxidase_inhibitor,xiap_inhibitor,kfold
0,id_000644bb2,24,D1,1.146806,0.902075,-0.418339,-0.961202,-0.254770,-1.021300,-1.369236,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
1,id_000779bfc,72,D1,0.128824,0.676862,0.274345,0.090495,1.208863,0.688965,0.316734,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3
2,id_000a6266a,48,D1,0.790372,0.939951,1.428097,-0.121817,-0.002067,1.495091,0.238763,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
3,id_0015fd391,48,D1,-0.729866,-0.277163,-0.441200,0.766612,2.347817,-0.862761,-2.308829,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
4,id_001626bd3,72,D2,-0.444558,-0.481202,0.974729,0.977467,1.468304,-0.874772,-0.372682,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,72,D1,0.247623,-1.231184,0.221572,-0.354096,-0.332073,0.570635,-0.150125,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21944,id_fffb1ceed,24,D2,0.217613,-0.027031,-0.237430,-0.787215,-0.677817,0.919474,0.742866,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
21945,id_fffb70c0c,24,D2,-1.914666,0.581880,-0.588706,1.303439,-1.009079,0.852202,-0.302814,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21946,id_fffcb9e7c,24,D1,0.826302,0.411235,0.433297,0.307575,1.075324,-0.024425,0.051483,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3


In [22]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 1416)
(21948, 1417)
(3624, 1085)
(21948, 332)
(3982, 207)


In [23]:
class MoADataset:
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return (self.features.shape[0])

    def __getitem__(self, idx):
        dct = {
            'x': torch.tensor(self.features[idx, :], dtype=torch.float),
            'y': torch.tensor(self.targets[idx, :], dtype=torch.float)
        }
        return dct


class TestDataset:
    def __init__(self, features):
        self.features = features

    def __len__(self):
        return (self.features.shape[0])

    def __getitem__(self, idx):
        dct = {'x': torch.tensor(self.features[idx, :], dtype=torch.float)}
        return dct

In [24]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    model.train()
    final_loss = 0

    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
        #         print(inputs.shape)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

        final_loss += loss.item()

    final_loss /= len(dataloader)

    return final_loss


def valid_fn(model, loss_fn, dataloader, device):
    model.eval()
    final_loss = 0
    valid_preds = []

    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)

        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    return final_loss, valid_preds


def inference_fn(model, dataloader, device):
    model.eval()
    preds = []

    for data in dataloader:
        inputs = data['x'].to(device)

        with torch.no_grad():
            outputs = model(inputs)

        preds.append(outputs.sigmoid().detach().cpu().numpy())

    preds = np.concatenate(preds)

    return preds

In [25]:
class Model(nn.Module):
    def __init__(self, num_features, num_targets, hidden_size):
        super(Model, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.15)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features,
                                                     hidden_size))

        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.3)
        self.dense2 = nn.Linear(hidden_size, hidden_size)

        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.25)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))

    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.leaky_relu(self.dense1(x))

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.leaky_relu(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)

        return x

In [26]:
def process_data(data):

    data = pd.get_dummies(data, columns=['cp_time', 'cp_dose'])
    #     data.loc[:, 'cp_time'] = data.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2})
    #     data.loc[:, 'cp_dose'] = data.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})

    # --------------------- Normalize ---------------------
    #     for col in GENES:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #     for col in CELLS:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #--------------------- Removing Skewness ---------------------
    #     for col in GENES + CELLS:
    #         if(abs(data[col].skew()) > 0.75):

    #             if(data[col].skew() < 0): # neg-skewness
    #                 data[col] = data[col].max() - data[col] + 1
    #                 data[col] = np.sqrt(data[col])

    #             else:
    #                 data[col] = np.sqrt(data[col])

    return data

In [27]:
feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold', 'sig_id']]
len(feature_cols)

1087

In [28]:
num_features = len(feature_cols)
num_targets = len(target_cols)
hidden_size = 2048
# hidden_size=4096
# hidden_size=9192

In [29]:
def run_training(fold, seed):

    seed_everything(seed)

    train = process_data(folds)
    test_ = process_data(test)

    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index

    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)

    x_train, y_train = train_df[feature_cols].values, train_df[
        target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[
        target_cols].values

    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset,
                                              batch_size=INFER_BATCH_SIZE,
                                              shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=5e-3,
                                 weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                              pct_start=0.2,
                                              div_factor=1e3,
                                              max_lr=1e-2,
                                              epochs=EPOCHS,
                                              steps_per_epoch=len(trainloader))

    loss_fn = nn.BCEWithLogitsLoss()

    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0
    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf

    for epoch in range(EPOCHS):

        train_loss = train_fn(model, optimizer, scheduler, loss_fn,
                              trainloader, DEVICE)
        print(
            f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}"
        )
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(
            f"SEED: {seed} ,FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}"
        )

        if valid_loss < best_loss:

            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(),
                       f"model/{NB}-nonscored1-SEED{seed}-FOLD{fold}_.pth")

        elif (EARLY_STOP == True):

            early_step += 1
            if (early_step >= early_stopping_steps):
                break

    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset,
                                             batch_size=INFER_BATCH_SIZE,
                                             shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    model.load_state_dict(
        torch.load(f"model/{NB}-nonscored1-SEED{seed}-FOLD{fold}_.pth"))
    model.to(DEVICE)

    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)

    return oof, predictions

In [30]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))

    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)

        predictions += pred_ / NFOLDS
        oof += oof_

    return oof, predictions

In [None]:
SEED = [940, 1513, 1269, 1392, 1119, 1303]  #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

time_start = time.time()

for seed in SEED:

    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)
    print(f"elapsed time: {time.time() - time_start}")

train[target_cols] = oof
test[target_cols] = predictions

print(oof.shape)
print(predictions.shape)

SEED: 940, FOLD: 0, EPOCH: 0, train_loss: 0.6458044633053351
SEED: 940 ,FOLD: 0, EPOCH: 0, valid_loss: 0.23897353808085123
SEED: 940, FOLD: 0, EPOCH: 1, train_loss: 0.03576180375302616
SEED: 940 ,FOLD: 0, EPOCH: 1, valid_loss: 0.009251730516552925
SEED: 940, FOLD: 0, EPOCH: 2, train_loss: 0.00939711653020071
SEED: 940 ,FOLD: 0, EPOCH: 2, valid_loss: 0.009194845540655984
SEED: 940, FOLD: 0, EPOCH: 3, train_loss: 0.00933286329002484
SEED: 940 ,FOLD: 0, EPOCH: 3, valid_loss: 0.009400543756783009
SEED: 940, FOLD: 0, EPOCH: 4, train_loss: 0.009148614589070929
SEED: 940 ,FOLD: 0, EPOCH: 4, valid_loss: 0.009084091625279851
SEED: 940, FOLD: 0, EPOCH: 5, train_loss: 0.008975854771130758
SEED: 940 ,FOLD: 0, EPOCH: 5, valid_loss: 0.009017287960482968
SEED: 940, FOLD: 0, EPOCH: 6, train_loss: 0.009263736951718296
SEED: 940 ,FOLD: 0, EPOCH: 6, valid_loss: 0.010180441559188895
SEED: 940, FOLD: 0, EPOCH: 7, train_loss: 0.009065507411740828
SEED: 940 ,FOLD: 0, EPOCH: 7, valid_loss: 0.00897248637759023

SEED: 940, FOLD: 4, EPOCH: 6, train_loss: 0.008952772339293058
SEED: 940 ,FOLD: 4, EPOCH: 6, valid_loss: 0.008799775917496946
SEED: 940, FOLD: 4, EPOCH: 7, train_loss: 0.008867254244514566
SEED: 940 ,FOLD: 4, EPOCH: 7, valid_loss: 0.008773985422319837
SEED: 940, FOLD: 4, EPOCH: 8, train_loss: 0.008817838045997896
SEED: 940 ,FOLD: 4, EPOCH: 8, valid_loss: 0.008596631698310375
SEED: 940, FOLD: 4, EPOCH: 9, train_loss: 0.008740798651200274
SEED: 940 ,FOLD: 4, EPOCH: 9, valid_loss: 0.008562602930598788
SEED: 940, FOLD: 4, EPOCH: 10, train_loss: 0.00861561261927304
SEED: 940 ,FOLD: 4, EPOCH: 10, valid_loss: 0.00852917641815212
SEED: 940, FOLD: 4, EPOCH: 11, train_loss: 0.00849586079383026
SEED: 940 ,FOLD: 4, EPOCH: 11, valid_loss: 0.008476049225363467
SEED: 940, FOLD: 4, EPOCH: 12, train_loss: 0.008363409399770308
SEED: 940 ,FOLD: 4, EPOCH: 12, valid_loss: 0.00843964951733748
SEED: 940, FOLD: 4, EPOCH: 13, train_loss: 0.008220767467350199
SEED: 940 ,FOLD: 4, EPOCH: 13, valid_loss: 0.0084201

SEED: 1513, FOLD: 3, EPOCH: 10, train_loss: 0.008510033444811901
SEED: 1513 ,FOLD: 3, EPOCH: 10, valid_loss: 0.008998410672777228
SEED: 1513, FOLD: 3, EPOCH: 11, train_loss: 0.008380469832353401
SEED: 1513 ,FOLD: 3, EPOCH: 11, valid_loss: 0.008958581317630079
SEED: 1513, FOLD: 3, EPOCH: 12, train_loss: 0.008260354055496662
SEED: 1513 ,FOLD: 3, EPOCH: 12, valid_loss: 0.008916081446740363
SEED: 1513, FOLD: 3, EPOCH: 13, train_loss: 0.008100920593015093
SEED: 1513 ,FOLD: 3, EPOCH: 13, valid_loss: 0.008896990161803033
SEED: 1513, FOLD: 3, EPOCH: 14, train_loss: 0.008023450394039568
SEED: 1513 ,FOLD: 3, EPOCH: 14, valid_loss: 0.008887458696133561
SEED: 1513, FOLD: 4, EPOCH: 0, train_loss: 0.6479944033899169
SEED: 1513 ,FOLD: 4, EPOCH: 0, valid_loss: 0.25152041845851475
SEED: 1513, FOLD: 4, EPOCH: 1, train_loss: 0.03594132327893074
SEED: 1513 ,FOLD: 4, EPOCH: 1, valid_loss: 0.009253945408595933
SEED: 1513, FOLD: 4, EPOCH: 2, train_loss: 0.009558766661886719
SEED: 1513 ,FOLD: 4, EPOCH: 2, val

In [None]:
train.to_pickle(f"{INT_DIR}/{NB}-train_nonscore_pred.pkl")
test.to_pickle(f"{INT_DIR}/{NB}-test_nonscore_pred.pkl")

In [None]:
len(target_cols)

In [None]:
train[target_cols] = np.maximum(PMIN, np.minimum(PMAX, train[target_cols]))
valid_results = train_targets_nonscored.drop(columns=target_cols).merge(
    train[['sig_id'] + target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_nonscored[target_cols].values
y_true = y_true > 0.5
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]

print("CV log_loss: ", score)

CV log_loss:  0.014761779358699672
CV log_loss:  0.014519859174255039
CV log_loss:  0.014525173864593479
CV log_loss:  0.014354930596928602 # 3 umap features
CV log_loss:  0.014353604854355429 # more umap features
CV log_loss:  0.01436484670778641 # more hidden nodes

In [None]:
EPOCHS = 25
# NFOLDS = 5

In [None]:
# sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
# sub.to_csv('submission.csv', index=False)

In [None]:
nonscored_target = [
    c for c in train[train_targets_nonscored.columns] if c != "sig_id"
]

In [None]:
nonscored_target

In [None]:
train = pd.read_pickle(f"{INT_DIR}/{NB}-train_nonscore_pred.pkl")
test = pd.read_pickle(f"{INT_DIR}/{NB}-test_nonscore_pred.pkl")

In [None]:
# use nonscored target in the given file as feature
# if comment out below, use predicted nonscored target
# train = train.drop(nonscored_target, axis=1)
# train = train.merge(train_targets_nonscored, on="sig_id")
# train = train_features.merge(train_targets_scored, on='sig_id')
train = train.merge(train_targets_scored, on='sig_id')
# train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
# test = test[test['cp_type']!='ctl_vehicle'].reset_index(drop=True)

# target = train[train_targets_scored.columns]
target = train[train_targets_scored.columns]

In [None]:
# from sklearn.preprocessing import QuantileTransformer

for col in (nonscored_target):

    vec_len = len(train[col].values)
    vec_len_test = len(test[col].values)
    raw_vec = train[col].values.reshape(vec_len, 1)
    if IS_TRAIN:
        transformer = QuantileTransformer(n_quantiles=100,
                                          random_state=0,
                                          output_distribution="normal")
        transformer.fit(raw_vec)
        pd.to_pickle(transformer,
                     f"{MODEL_DIR}/{NB}_{col}_quantile_nonscored.pkl")
    else:
        transformer = pd.read_pickle(
            f"{MODEL_DIR}/{NB}_{col}_quantile_nonscored.pkl")

    train[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test[col] = transformer.transform(test[col].values.reshape(
        vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [None]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [None]:
train

In [None]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds

In [None]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

In [None]:
def process_data(data):

    data = pd.get_dummies(data, columns=['cp_time', 'cp_dose'])
    #     data.loc[:, 'cp_time'] = data.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2})
    #     data.loc[:, 'cp_dose'] = data.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})

    # --------------------- Normalize ---------------------
    #     for col in GENES:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #     for col in CELLS:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #--------------------- Removing Skewness ---------------------
    #     for col in GENES + CELLS:
    #         if(abs(data[col].skew()) > 0.75):

    #             if(data[col].skew() < 0): # neg-skewness
    #                 data[col] = data[col].max() - data[col] + 1
    #                 data[col] = np.sqrt(data[col])

    #             else:
    #                 data[col] = np.sqrt(data[col])

    return data

In [None]:
feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold','sig_id']]
len(feature_cols)

In [None]:
num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size=2048
# hidden_size=4096
# hidden_size=9192

In [None]:
def run_training(fold, seed):

    seed_everything(seed)

    train = process_data(folds)
    test_ = process_data(test)

    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index

    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)

    x_train, y_train = train_df[feature_cols].values, train_df[
        target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[
        target_cols].values

    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=5e-3,
                                 weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                              pct_start=0.2,
                                              div_factor=1e3,
                                              max_lr=1e-2,
                                              epochs=EPOCHS,
                                              steps_per_epoch=len(trainloader))

    loss_fn = nn.BCEWithLogitsLoss()
    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0

    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf

    for epoch in range(EPOCHS):

        train_loss = train_fn(model, optimizer, scheduler, loss_fn,
                              trainloader, DEVICE)
        print(
            f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}"
        )
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(
            f"SEED: {seed} ,FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}"
        )

        if valid_loss < best_loss:

            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(),
                       f"model/{NB}-scored1-SEED{seed}-FOLD{fold}_.pth")

        elif (EARLY_STOP == True):

            early_step += 1
            if (early_step >= early_stopping_steps):
                break

    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    model.load_state_dict(
        torch.load(f"model/{NB}-scored1-SEED{seed}-FOLD{fold}_.pth"))
    model.to(DEVICE)

    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)

    return oof, predictions

In [None]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))

    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)

        predictions += pred_ / NFOLDS
        oof += oof_

    return oof, predictions

In [None]:
SEED = [940, 1513, 1269, 1392, 1119, 1303]  #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

time_start = time.time()

for seed in SEED:

    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)
    print(f"elapsed time: {time.time() - time_start}")

train[target_cols] = oof
test[target_cols] = predictions

In [None]:
train.to_pickle(f"{INT_DIR}/{NB}-train-score-pred.pkl")
test.to_pickle(f"{INT_DIR}/{NB}-test-score-pred.pkl")

In [None]:
len(target_cols)

In [None]:
train[target_cols] = np.maximum(PMIN, np.minimum(PMAX, train[target_cols]))

valid_results = train_targets_scored.drop(columns=target_cols).merge(
    train[['sig_id'] + target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_true = y_true > 0.5
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]

print("CV log_loss: ", score)

- CV log_loss:  0.014761779358699672
- CV log_loss:  0.014519859174255039
- CV log_loss:  0.014525173864593479
- CV log_loss:  0.014354930596928602 # 3 umap features
- CV log_loss:  0.014353604854355429 # more umap features
- CV log_loss:  0.01436484670778641 # more hidden nodes
- CV log_loss:  0.014344688083211073
  - using predicted unscored targets as feature 
- CV log_loss:  0.013368097791623873
  - using given unscored targets as feature
  - bad in public lb
- CV log_loss:  0.01434373547175235
  - rankgauss predicted unscored targets
- CV log_loss:  0.014346100008158216
  - unscored targets pca/umap
- CV log_loss:  0.014328486629791769
  - NFOLDS=10, Epoch=20
- CV log_loss:  0.014299741080816082
  - NFOLDS=10, Epoch=20, 25
- CV log_loss:  0.014311301224480969
  - NFOLDS=10, Epoch=25
- CV log_loss:  0.01429269446076626
  - NFOLDS=10, Epoch=15, 25

In [None]:
# train = pd.read_pickle(f"../interim/23-train-score-pred.pkl")
# test = pd.read_pickle(f"../interim/23-test-score-pred.pkl")

In [None]:
train = pd.read_pickle(f"{INT_DIR}/{NB}-train-score-pred.pkl")
test = pd.read_pickle(f"{INT_DIR}/{NB}-test-score-pred.pkl")

In [None]:
EPOCHS = 25
# NFOLDS = 5

In [None]:
PMIN = 0.0005
PMAX = 0.9995
for c in train_targets_scored.columns:
    if c != "sig_id":
        train_targets_scored[c] = np.maximum(
            PMIN, np.minimum(PMAX, train_targets_scored[c]))

In [None]:
train_targets_scored.columns

In [None]:
train = train[train_targets_scored.columns]
train.columns = [
    c + "_pred" if (c != 'sig_id' and c in train_targets_scored.columns) else c
    for c in train.columns
]

In [None]:
test = test[train_targets_scored.columns]
test.columns = [
    c + "_pred" if (c != 'sig_id' and c in train_targets_scored.columns) else c
    for c in test.columns
]

In [None]:
train

In [None]:
# use nonscored target in the given file as feature
# if comment out below, use predicted nonscored target
# train = train.drop(nonscored_target, axis=1)
# train = train.merge(train_targets_nonscored, on="sig_id")
# train = train_features.merge(train_targets_scored, on='sig_id')
train = train.merge(train_targets_scored, on='sig_id')
# train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
# test = test[test['cp_type']!='ctl_vehicle'].reset_index(drop=True)

# target = train[train_targets_scored.columns]
target = train[train_targets_scored.columns]

In [None]:
# train["cp_time"] = train_features[train_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_time"]
# train["cp_dose"] = train_features[train_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_dose"]
# test["cp_time"] = test_features[test_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_time"]
# test["cp_dose"] = test_features[test_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_dose"]

In [None]:
from sklearn.preprocessing import QuantileTransformer

scored_target_pred = [
    c + "_pred" for c in train_targets_scored.columns if c != 'sig_id'
]

for col in (scored_target_pred):

    #     transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
    vec_len = len(train[col].values)
    vec_len_test = len(test[col].values)
    raw_vec = train[col].values.reshape(vec_len, 1)
    #     transformer.fit(raw_vec)
    if IS_TRAIN:
        transformer = QuantileTransformer(n_quantiles=100,
                                          random_state=0,
                                          output_distribution="normal")
        transformer.fit(raw_vec)
        pd.to_pickle(transformer,
                     f"{MODEL_DIR}/{NB}_{col}_quantile_scored.pkl")
    else:
        transformer = pd.read_pickle(
            f"{MODEL_DIR}/{NB}_{col}_quantile_scored.pkl")

    train[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test[col] = transformer.transform(test[col].values.reshape(
        vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [None]:
# train = train.drop('cp_type', axis=1)
# test = test.drop('cp_type', axis=1)

In [None]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [None]:
train

In [None]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds

In [None]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

In [None]:
folds

In [None]:
def process_data(data):

    #     data = pd.get_dummies(data, columns=['cp_time','cp_dose'])
    #     data.loc[:, 'cp_time'] = data.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2, 0:0, 1:1, 2:2})
    #     data.loc[:, 'cp_dose'] = data.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1, 0:0, 1:1})

    # --------------------- Normalize ---------------------
    #     for col in GENES:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #     for col in CELLS:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #--------------------- Removing Skewness ---------------------
    #     for col in GENES + CELLS:
    #         if(abs(data[col].skew()) > 0.75):

    #             if(data[col].skew() < 0): # neg-skewness
    #                 data[col] = data[col].max() - data[col] + 1
    #                 data[col] = np.sqrt(data[col])

    #             else:
    #                 data[col] = np.sqrt(data[col])

    return data

In [None]:
feature_cols = [c for c in folds.columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold','sig_id']]
len(feature_cols)

In [None]:
feature_cols

In [None]:
folds

In [None]:
EPOCHS = 25
num_features = len(feature_cols)
num_targets = len(target_cols)
hidden_size = 1024
# hidden_size=4096
# hidden_size=9192

In [None]:
def run_training(fold, seed):

    seed_everything(seed)

    train = (folds)
    test_ = (test)

    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index

    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)

    x_train, y_train = train_df[feature_cols].values, train_df[
        target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[
        target_cols].values

    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=5e-3,
                                 weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                              pct_start=0.2,
                                              div_factor=1e3,
                                              max_lr=1e-2,
                                              epochs=EPOCHS,
                                              steps_per_epoch=len(trainloader))

    loss_fn = nn.BCEWithLogitsLoss()
    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0

    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf

    for epoch in range(EPOCHS):

        train_loss = train_fn(model, optimizer, scheduler, loss_fn,
                              trainloader, DEVICE)
        print(
            f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}"
        )
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(
            f"SEED: {seed} ,FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}"
        )

        if valid_loss < best_loss:

            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(),
                       f"model/{NB}-scored2-SEED{seed}-FOLD{fold}_.pth")

        elif (EARLY_STOP == True):

            early_step += 1
            if (early_step >= early_stopping_steps):
                break

    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    model.load_state_dict(
        torch.load(f"model/{NB}-scored2-SEED{seed}-FOLD{fold}_.pth"))
    model.to(DEVICE)

    #   if not IS_TRAIN:
    # valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
    # oof[val_idx] = valid_preds

    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)

    return oof, predictions

In [None]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))

    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)

        predictions += pred_ / NFOLDS
        oof += oof_

    return oof, predictions

In [None]:
SEED = [940, 1513, 1269, 1392, 1119, 1303]  #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

time_start = time.time()

for seed in SEED:

    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)
    print(f"elapsed time: {time.time() - time_start}")

train[target_cols] = oof
test[target_cols] = predictions

In [None]:
train.to_pickle(f"{INT_DIR}/{NB}-train-score-stack-pred.pkl")
test.to_pickle(f"{INT_DIR}/{NB}-test-score-stack-pred.pkl")

In [None]:
train[target_cols] = np.maximum(PMIN, np.minimum(PMAX, train[target_cols]))
valid_results = train_targets_scored.drop(columns=target_cols).merge(
    train[['sig_id'] + target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_true = y_true > 0.5
y_pred = valid_results[target_cols].values

y_pred = np.minimum(SMAX, np.maximum(SMIN, y_pred))

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]

print("CV log_loss: ", score)

In [None]:
# for c in test.columns:
#     if c != "sig_id":
#         test[c] = np.maximum(PMIN, np.minimum(PMAX, test[c]))

sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id'] +
                                                             target_cols],
                                                        on='sig_id',
                                                        how='left').fillna(0)
# sub.to_csv('submission.csv', index=False)
sub.to_csv('submission_2stageNN_with_ns_oldcv_0.01822.csv', index=False)

In [None]:
sub