In [None]:
!pip install iterative-stratification

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K

from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow_addons as tfa

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn import preprocessing


from tqdm.notebook import tqdm

import math

In [None]:
# Read the feature tables, the scored targets and the sample submission.
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')

# Train and test features stacked row-wise (original row labels are kept).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
data = pd.concat([train_features, test_features])

ss = pd.read_csv('../input/lish-moa/sample_submission.csv')

print(len(train_features))
train_features.head()

In [None]:
def preprocess(df):
    """Return a model-ready copy of a raw feature table.

    The ``cp_type`` and ``sig_id`` columns are dropped, and ``cp_time`` /
    ``cp_dose`` are one-hot encoded. The input frame is not modified.

    The encoded columns are always ``cp_time_0``, ``cp_time_1``, ``cp_time_2``,
    ``cp_dose_0`` and ``cp_dose_1`` (appended after the remaining columns),
    even when some level does not occur in ``df``. This keeps frames that are
    encoded separately (e.g. train and test) column-compatible. A value that
    is not one of the known levels yields an all-zero group of indicators.

    Args:
        df: DataFrame with at least the columns ``sig_id``, ``cp_type``,
            ``cp_time`` (24 / 48 / 72) and ``cp_dose`` ('D1' / 'D2').

    Returns:
        A new DataFrame with the identifier/type columns removed and the two
        categorical columns replaced by indicator columns.
    """
    time_codes = {24: 0, 48: 1, 72: 2}
    dose_codes = {'D1': 0, 'D2': 1}

    # drop() returns a new frame, so the caller's data stays untouched.
    df = df.drop(columns=['cp_type', 'sig_id'])

    # Declaring the full category list makes get_dummies emit one column per
    # known level instead of one per level that happens to be present.
    df['cp_time'] = pd.Categorical(df['cp_time'].map(time_codes),
                                   categories=list(time_codes.values()))
    df['cp_dose'] = pd.Categorical(df['cp_dose'].map(dose_codes),
                                   categories=list(dose_codes.values()))

    return pd.get_dummies(df, columns=['cp_time', 'cp_dose'])

train = preprocess(train_features)
test = preprocess(test_features)

# Later cells pass `train_targets.values` straight to `model.fit` and treat
# every column of `train_targets` as a label, so the identifier column must not
# be part of it. `errors='ignore'` keeps this cell safe to re-run.
train_targets = train_targets.drop(columns=['sig_id'], errors='ignore')
train.head()

In [None]:
# Fit scaler to join train and test data
# Min-max scaling: (x - x_min) / (x_max - x_min), with the per-column min/max
# taken over train and test together.
scaler = preprocessing.MinMaxScaler()
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
scaler.fit(pd.concat([train, test]))

train_trans = scaler.transform(train)
test_trans = scaler.transform(test)

# transform() returns plain arrays; wrap them again so column names survive.
train = pd.DataFrame(train_trans, columns=train.columns)
test = pd.DataFrame(test_trans, columns=test.columns)

In [None]:
# Tiny constant used both as the label-smoothing rate of the training loss and
# as the clipping margin that keeps predictions away from exactly 0 and 1.
somthing_rate = 1e-15
P_MIN, P_MAX = somthing_rate, 1 - somthing_rate


def loss_fn(yt, yp):
    """Binary log loss of predictions ``yp`` against labels ``yt``.

    Predictions are clipped into ``[P_MIN, P_MAX]`` first; the label set is
    fixed to ``[0, 1]`` so the call also works when ``yt`` holds one class only.
    """
    clipped = np.clip(yp, a_min=P_MIN, a_max=P_MAX)
    return log_loss(yt, clipped, labels=[0, 1])

In [None]:
def create_model(num_columns, actv='relu'):
    """Build and compile the dense multi-label network.

    Layer stack:
      1. BatchNorm -> Dropout(0.2) -> weight-normalised Dense(1024, ``actv``)
      2. BatchNorm, then depending on ``actv``:
         * ``'elu'``: AlphaDropout(0.2) -> weight-normalised Dense(512, selu,
           lecun_normal initialiser)
         * anything else: Dropout(0.2) -> weight-normalised Dense(1024, ``actv``)
      3. BatchNorm -> weight-normalised Dense(206, sigmoid)

    The model is compiled with AdamW and binary cross-entropy that uses the
    module-level ``somthing_rate`` as its label-smoothing value.

    Args:
        num_columns: number of input features.
        actv: activation of the first hidden layer; also selects the variant
            of the second hidden block (see above).

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # Pass the input shape explicitly as a tuple, the documented form.
    model = tf.keras.Sequential([tf.keras.layers.Input(shape=(num_columns,))])

    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(0.2))
    model.add(tfa.layers.WeightNormalization(tf.keras.layers.Dense(1024, activation=actv)))

    # Both variants of the second block start with batch normalisation.
    model.add(tf.keras.layers.BatchNormalization())
    if actv == 'elu':
        model.add(tf.keras.layers.AlphaDropout(0.2))
        model.add(tfa.layers.WeightNormalization(tf.keras.layers.Dense(512, kernel_initializer='lecun_normal', activation='selu')))
    else:
        model.add(tf.keras.layers.Dropout(0.2))
        model.add(tfa.layers.WeightNormalization(tf.keras.layers.Dense(1024, activation=actv)))

    #============ Final Layer =================
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tfa.layers.WeightNormalization(tf.keras.layers.Dense(206, activation="sigmoid")))

    model.compile(
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer=tfa.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-5, clipvalue=756),
        loss=BinaryCrossentropy(label_smoothing=somthing_rate),
        # The plotting cells read history['accuracy'] / history['val_accuracy'].
        # An explicit BinaryAccuracy named 'accuracy' provides those keys and
        # scores each sigmoid output independently (the bare string 'accuracy'
        # may be resolved to categorical accuracy for a multi-unit output).
        metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
    )
    return model

In [None]:
# Use All feats as top feats
# Every column position of `train` is selected, i.e. no feature is dropped.
top_feats = list(range(train.shape[1]))
print("Top feats length:",len(top_feats))

In [None]:
# Build one model with the default activation just to inspect its layers.
mod = create_model(num_columns=len(top_feats))
mod.summary()

In [None]:
def metric(y_true, y_pred):
    """Mean of the per-column log losses.

    For each column name in the module-level ``train_targets`` the matching
    columns of ``y_true`` and ``y_pred`` (both DataFrames) are scored with
    ``loss_fn``; predictions are cast to float first. The average over all
    columns is returned.
    """
    per_target = [
        loss_fn(y_true.loc[:, col], y_pred.loc[:, col].astype(float))
        for col in train_targets.columns
    ]
    return np.mean(per_target)

In [None]:
# Repeated stratified K-fold training.
# Seeds 0 .. S_STARTS-1 train the 'relu' variant, the remaining seeds the 'elu'
# variant. For each variant, test predictions are averaged over all of its
# (seed, fold) models, and out-of-fold predictions over its seeds.
N_STARTS = 14
S_STARTS = N_STARTS // 2            # number of seeds that use the relu model
N_SPLITS = 7                        # folds per seed
N_ELU_STARTS = N_STARTS - S_STARTS  # number of seeds that use the elu model

target_cols = train_targets.columns

# Out-of-fold accumulators. Assigning 0.0 column-wise gives float columns, so
# adding float predictions later never writes floats into integer columns.
res_relu = train_targets.copy()
res_elu = train_targets.copy()
res_relu[target_cols] = 0.0
res_elu[target_cols] = 0.0

# Test-prediction accumulators, shaped like the sample submission.
ss_relu = ss.copy()
ss_elu = ss.copy()
ss_relu[target_cols] = 0.0
ss_elu[target_cols] = 0.0

#ss.loc[:, train_targets.columns] = 0
ss_dict = {}

historys = dict()

# Materialise the arrays once instead of on every fold.
X_all = train.values[:, top_feats]
X_test = test.values[:, top_feats]
y_all = train_targets.values

tf.random.set_seed(42)
for seed in range(N_STARTS):
    use_relu = seed < S_STARTS
    actv = 'relu' if use_relu else 'elu'
    # Accumulators that belong to the activation trained under this seed.
    ss_acc, res_acc = (ss_relu, res_relu) if use_relu else (ss_elu, res_elu)

    splitter = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=seed, shuffle=True)
    for n, (tr, te) in enumerate(splitter.split(train_targets, train_targets)):
        print(f"======{y_all[tr].shape}========{y_all[te].shape}=====")
        print(f'Seed: {seed} => Fold: {n} ==> ({actv.upper()} MODEL)')
        model = create_model(len(top_feats), actv=actv)

        checkpoint_path = f'repeat:{seed}_Fold:{n}.hdf5'
        reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.2, min_lr=1e-6, patience=4, verbose=1, mode='auto')
        cb_checkpt = ModelCheckpoint(checkpoint_path, monitor = 'val_loss', verbose = 1, save_best_only = True,
                                     save_weights_only = True, mode = 'auto')
        early = EarlyStopping(monitor="val_loss", mode="min", restore_best_weights=True, patience= 10, verbose = 1)

        history = model.fit(X_all[tr],
                  y_all[tr],
                  validation_data=(X_all[te], y_all[te]),
                  epochs=60, batch_size=128,
                  callbacks=[reduce_lr_loss, cb_checkpt, early], verbose=2
                 )

        # NOTE: the key depends on the seed only, so each fold overwrites the
        # previous one and only the last fold of every seed is retained.
        historys[f'history_{seed+1}'] = history
        print("Model History Saved.")

        # Predict with the best (lowest val_loss) weights saved by the checkpoint.
        model.load_weights(checkpoint_path)

        test_predict = model.predict(X_test)
        val_predict = model.predict(X_all[te])

        ss_acc.loc[:, target_cols] += test_predict
        res_acc.loc[te, target_cols] += val_predict

        fold_score = metric(train_targets.loc[te, target_cols], pd.DataFrame(val_predict, columns=target_cols))
        print(f'OOF Metric For SEED {seed} => FOLD {n} : {fold_score}')
        print('+-' * 10)

# Test rows were predicted by every (seed, fold) model of a variant, while each
# training row is out-of-fold exactly once per seed. The divisors use the named
# constants rather than the loop variable left over from the last iteration.
ss_relu.loc[:, target_cols] /= (N_SPLITS * S_STARTS)
res_relu.loc[:, target_cols] /= S_STARTS

ss_elu.loc[:, target_cols] /= (N_SPLITS * N_ELU_STARTS)
res_elu.loc[:, target_cols] /= N_ELU_STARTS

In [None]:
# Show Model loss in plots
import matplotlib.pyplot as plt

# Gather the first 40 epochs of every stored run. The lists are created once,
# before the loop, so that all runs contribute to the average.
loss = []
val_loss = []
for v in historys.values():
    loss.append(v.history['loss'][:40])
    val_loss.append(v.history['val_loss'][:40])

# Early stopping makes runs differ in length: pad the shorter ones with NaN and
# average with nanmean, so each epoch is averaged over the runs that reached it.
n_epochs = max((len(curve) for curve in loss), default=0)
mean_loss = np.nanmean([curve + [np.nan] * (n_epochs - len(curve)) for curve in loss], axis=0)
mean_val_loss = np.nanmean([curve + [np.nan] * (n_epochs - len(curve)) for curve in val_loss], axis=0)

fig=plt.figure(figsize = (10, 6))
plt.plot(mean_loss,marker='o', linestyle='--', color='tab:olive', alpha =0.6)
plt.plot(mean_val_loss,marker='o', linestyle='--', color='tab:cyan', alpha =0.6)
plt.yscale('log')
plt.yticks(ticks=[1,1e-1,1e-2], fontsize=12, family='serif')
plt.xticks(fontsize=12, family='serif')
plt.xlabel('Epochs',fontsize=15, family='serif')
plt.ylabel('Average log loss',fontsize=15, family='serif')
plt.legend(['Training','Validation'])
fig.savefig('loss.pdf',format='pdf')

In [None]:
# summarize history for accuracy
# `history` is the History object left over from the last fit of the training
# loop. The accuracy curves exist only if the model was compiled with an
# accuracy metric, so check first instead of failing with a KeyError (which
# would stop a "Run All" before the submission files are written).
if 'accuracy' in history.history and 'val_accuracy' in history.history:
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
else:
    print("No 'accuracy' metric was recorded during training; skipping the accuracy plot.")

In [None]:
# Show Model accuracy in plots
import matplotlib.pyplot as plt

# Gather the first 40 epochs of every stored run that recorded accuracy.
# dict.items() yields (key, value) pairs, so only the History values are
# iterated here; the lists are created once, before the loop.
acc = []
val_acc = []
for v2 in historys.values():
    if 'accuracy' in v2.history and 'val_accuracy' in v2.history:
        acc.append(v2.history['accuracy'][:40])
        val_acc.append(v2.history['val_accuracy'][:40])

if acc:
    # Runs may stop early at different epochs: pad with NaN and use nanmean so
    # each epoch is averaged over the runs that reached it.
    n_epochs = max(len(curve) for curve in acc)
    mean_acc = np.nanmean([curve + [np.nan] * (n_epochs - len(curve)) for curve in acc], axis=0)
    mean_val_acc = np.nanmean([curve + [np.nan] * (n_epochs - len(curve)) for curve in val_acc], axis=0)

    plt.figure(figsize = (15, 6))
    plt.plot(mean_acc)
    plt.plot(mean_val_acc)
    #plt.yscale('log')
    #plt.yticks(ticks=[1,1e-1,1e-2])
    plt.xlabel('Epochs')
    plt.ylabel('Model Accuracy')
    plt.legend(['Training','Validation'])
else:
    print("No 'accuracy' metric was recorded during training; skipping the accuracy plot.")

In [None]:
# Overall out-of-fold score of each activation variant.
for label, oof_pred in (('relu', res_relu), ('elu', res_elu)):
    print(f'OOF Metric ({label}): {metric(train_targets, oof_pred)}')

In [None]:
# Write one submission file per activation variant (row index omitted).
for frame, path in ((ss_relu, 'submission_relu.csv'), (ss_elu, 'submission_elu.csv')):
    frame.to_csv(path, index=False)