In [1]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
import tensorflow_addons as tfa

from sklearn.metrics import log_loss
from tqdm.notebook import tqdm

## Loading the data

In [2]:
# Read the four input CSVs in one pass; the unpacking order matches the
# order of the file stems listed below.
train_features, train_targets, test_features, sample_submission = (
    pd.read_csv(f'../input/{stem}.csv')
    for stem in (
        'train_features',
        'train_targets_scored',
        'test_features',
        'sample_submission',
    )
)

In [3]:
# Show only the first row as a quick look at the feature columns.
train_features.iloc[:1]

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176


## Creating folds

In [4]:
def create_fold(data, n_splits=10, seed=0):
    """Shuffle the target table and assign every row to a stratified fold.

    Parameters
    ----------
    data : pandas.DataFrame
        Target table with a ``sig_id`` column; every other column (except an
        already existing ``fold`` column) is treated as a label.
    n_splits : int, default 10
        Number of folds to create.
    seed : int, default 0
        Seed used for both the row shuffle and the splitter, so that the
        fold assignment is reproducible across kernel restarts.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``data`` with a fresh 0..n-1 index and an integer
        ``fold`` column appended as the last column. The input frame is left
        untouched.
    """
    # Seeded shuffle on a new frame: reproducible and does not mutate the caller's data.
    data = data.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Drop a stale fold column (e.g. when the cell is re-run) so it is never
    # used as a label and the new one always ends up last.
    if 'fold' in data.columns:
        data = data.drop(columns='fold')

    # Only real label columns may drive the stratification.
    label_columns = [col for col in data.columns if col != 'sig_id']
    targets = data[label_columns].values

    # random_state is only accepted together with shuffle=True on recent
    # scikit-learn versions, which the splitter's base class comes from.
    splitter = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    fold_ids = np.full(len(data), -1, dtype='int64')
    for fold, (_, valid) in enumerate(splitter.split(X=data, y=targets)):
        # `valid` holds positional indices, which equal the labels after reset_index.
        fold_ids[valid] = fold

    data['fold'] = fold_ids

    return data

In [5]:
# Replace the raw target table by its shuffled copy carrying the `fold` column.
train_targets = create_fold(data=train_targets)



In [None]:
# Persist the targets together with their fold assignment (index is not written).
train_targets.to_csv(path_or_buf='../output/fold_data.csv', index=False)

## Preprocessing the data

In [6]:
def preprocess(data):
    """One-hot encode ``cp_time``, ``cp_type`` and ``cp_dose``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table containing the three categorical columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without the three categorical columns and with their
        indicator columns appended in the order time, type, dose. Indicator
        columns are named after the raw category values (so the ``cp_time``
        indicators keep the original, non-prefixed labels). The input frame
        is not modified.
    """
    # Pin the indicator dtype: the library default differs between pandas
    # versions, and boolean indicators mixed with float features turn the
    # frame into an object array when it is converted for the network.
    cp_time = pd.get_dummies(data['cp_time'], dtype='uint8')
    cp_type = pd.get_dummies(data['cp_type'], dtype='uint8')
    cp_dose = pd.get_dummies(data['cp_dose'], dtype='uint8')

    # Drop the raw categoricals first, then append the indicators; every step
    # returns a new frame so nothing is changed in place.
    encoded = (
        data
        .drop(columns=['cp_time', 'cp_dose', 'cp_type'])
        .join(cp_time)
        .join(cp_type)
        .join(cp_dose)
    )

    return encoded

In [7]:
# Swap the raw feature table for its one-hot encoded version.
train_features = preprocess(data=train_features)

## Unifying the data

In [8]:
# Attach the targets and fold assignment to the features via `sig_id`.
# `validate` makes pandas raise if a `sig_id` is duplicated on either side.
train_df = train_features.merge(train_targets, on='sig_id', how='outer', validate='one_to_one')

# An outer join keeps unmatched ids as NaN-filled rows; with unique keys the
# three row counts can only agree when both tables hold exactly the same ids.
assert len(train_df) == len(train_features) == len(train_targets), \
    'train_features and train_targets do not contain the same sig_id values'

train_df.shape, train_features.shape, train_targets.shape

((23814, 1087), (23814, 880), (23814, 208))

## Creating the model and metric

In [9]:
def create_model(num_columns):
    """Build and compile the feed-forward multi-label classifier.

    The network is: input -> batch norm -> dropout(0.2) -> two hidden
    weight-normalised ReLU dense layers (2048 and 1048 units), each followed
    by batch norm and dropout(0.5) -> weight-normalised sigmoid layer with
    206 outputs.

    Parameters
    ----------
    num_columns : int
        Number of input features per sample.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with binary cross-entropy and an Adam optimizer wrapped
        in Lookahead (sync_period=10).
    """

    def _weight_normed_dense(units, activation):
        # Dense layer wrapped in weight normalisation, used for every dense layer below.
        return tfa.layers.WeightNormalization(
            tf.keras.layers.Dense(units, activation=activation)
        )

    model = tf.keras.Sequential([

        # Pass the shape as an explicit 1-tuple instead of a bare int.
        tf.keras.layers.Input(shape=(num_columns,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),

        _weight_normed_dense(2048, "relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),

        # NOTE(review): 1048 may have been meant as 1024; kept as-is so the
        # architecture (and any saved weights) stays unchanged.
        _weight_normed_dense(1048, "relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),

        _weight_normed_dense(206, "sigmoid"),

    ])

    model.compile(
        optimizer=tfa.optimizers.Lookahead(tf.optimizers.Adam(), sync_period=10),
        loss='binary_crossentropy',
    )

    return model

In [10]:
def metric(y_true, y_pred):
    """Mean of the per-column binary log loss.

    Parameters
    ----------
    y_true : pandas.DataFrame
        Ground-truth labels, one column per target.
    y_pred : pandas.DataFrame
        Predicted probabilities; must contain every column of ``y_true``.
        Rows are compared by position.

    Returns
    -------
    float
        Average over the columns of ``y_true`` of the log loss computed with
        the label set fixed to ``[0, 1]``.
    """
    # Take the target names from y_true itself rather than from a positional
    # slice of a notebook-level table, so the score does not depend on that
    # table's current column layout.
    per_target = [
        log_loss(y_true.loc[:, target], y_pred.loc[:, target].astype(float), labels=[0, 1])
        for target in y_true.columns
    ]

    return np.mean(per_target)

## Training the model

In [11]:
def run(fold):
    """Train on every fold except ``fold`` and score the best checkpoint on it.

    Reads the notebook-level ``train_df`` and ``train_targets`` frames. The
    weights with the lowest validation loss are written to
    ``Fold_{fold}.hdf5``, reloaded after training, and the resulting
    validation metric is printed.

    Parameters
    ----------
    fold : int
        Fold id (as stored in ``train_df['fold']``) to hold out for validation.

    Raises
    ------
    ValueError
        If no row of ``train_df`` belongs to ``fold``.
    """
    checkpoint_path = f'Fold_{fold}.hdf5'

    # `min_delta` is the supported name of the threshold formerly passed as `epsilon`.
    reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1,
                                       min_delta=1e-4, mode='min')

    cb_checkpt = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=0, save_best_only=True,
                                 save_weights_only=True, mode='min')

    #==============================================

    # Derive the column groups from the frames instead of fixed offsets, so
    # the split stays correct if the number of features or targets changes.
    target_cols = [col for col in train_targets.columns if col not in ('sig_id', 'fold')]
    not_features = set(target_cols) | {'sig_id', 'fold'}
    feature_cols = [col for col in train_df.columns if col not in not_features]

    is_valid = train_df['fold'] == fold
    if not is_valid.any():
        raise ValueError(f'fold {fold!r} does not occur in train_df')

    x_train = train_df.loc[~is_valid, feature_cols]
    y_train = train_df.loc[~is_valid, target_cols]

    x_valid = train_df.loc[is_valid, feature_cols]
    y_valid = train_df.loc[is_valid, target_cols]

    #==============================================

    model = create_model(len(feature_cols))

    model.fit(x_train, y_train, validation_data=(x_valid, y_valid), epochs=35, batch_size=128,
              callbacks=[reduce_lr_loss, cb_checkpt], verbose=2)

    #==============================================

    # Score with the best weights seen during training, not the last epoch's.
    model.load_weights(checkpoint_path)

    y_pred = model.predict(x_valid)

    y_pred = pd.DataFrame(y_pred, columns=y_valid.columns)

    print('Metric: ', metric(y_valid, y_pred))

In [12]:
# Train and evaluate with fold 0 held out for validation.
run(fold=0)

Epoch 1/35
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
168/168 - 2s - loss: 0.5639 - val_loss: 0.2124 - lr: 0.0010
Epoch 2/35
168/168 - 2s - loss: 0.1137 - val_loss: 0.0449 - lr: 0.0010
Epoch 3/35
168/168 - 2s - loss: 0.0377 - val_loss: 0.0275 - lr: 0.0010
Epoch 4/35
168/168 - 2s - loss: 0.0261 - val_loss: 0.0228 - lr: 0.0010
Epoch 5/35
168/168 - 2s - loss: 0.0225 - val_loss: 0.0203 - lr: 0.0010
Epoch 6/35
168/168 - 2s - loss: 0.0206 - val_loss: 0.0189 - lr: 0.0010
Epoch 7/35
168/168 - 2s - loss: 0.0196 - val_loss: 0.0183 - lr: 0.0010
Epoch 8/35
168/168 - 2s - loss: 0.0188 - val_loss: 0.0178 - lr: 0.0010
Epoch 9/35
168/168 - 2s - loss: 0.0184 - val_loss: 0.0173 - lr: 0.0010
Epoch 10/35
168/168 - 2s - loss: 0.0179 - val_loss: 0.0170 - lr: 0.0010
Epoch 11/35
168/168 - 2s - loss: 0.0175 - val_loss: 0.0166 - lr: 0.0010
Epoch 12/35
168/168 - 2s - loss: 0.0171 - val_loss: 0.0164 - lr: 0.0010
Epoch 13/35
168/168 - 2s - loss: 0.0169 - val_loss: 0.0162 - lr: