Our aim is to predict multiple targets of the Mechanism of Action (MoA) response(s) of different samples (`sig_id`), given various inputs such as gene expression data and cell viability data.

# 1. Import Packages

In [1]:
import os
import random
import numpy as np 
import pandas as pd 
import tensorflow as tf
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

In [2]:
SEED = 9
def all_seeds(s):
    random.seed(s)
    os.environ['PYTHONHASHSEED'] = str(s)
    np.random.seed(s)
    tf.random.set_seed(s)
    
all_seeds(SEED)

# 2. Load Data

In [3]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/lish-moa/test_features.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/sample_submission.csv


In [4]:
test_df = pd.read_csv('../input/lish-moa/test_features.csv')
train_df = pd.read_csv('../input/lish-moa/train_features.csv')
tr_target_df = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
sample_df = pd.read_csv('../input/lish-moa/sample_submission.csv')

target_cols = tr_target_df.columns[1:]
N_TARGETS = len(target_cols)

# 3. Preprocess Data

In [5]:
def preprocess_df(df):
    df['cp_type'] = (df['cp_type'] == 'trt_cp').astype(int)
    df['cp_dose'] = (df['cp_dose'] == 'D2').astype(int)
    return df

x_train = preprocess_df(train_df.drop(columns="sig_id"))
y_train = tr_target_df.drop(columns="sig_id")
x_test = preprocess_df(test_df.drop(columns="sig_id"))

N_FEATURES = x_train.shape[1]

In [6]:
print("Number of Features:",N_FEATURES)
print("Number of Targets:",N_TARGETS)
print("x_train shape:",x_train.shape)
print("y_train shape:",y_train.shape)
print("x_test shape:",x_test.shape)

Number of Features: 875
Number of Targets: 206
x_train shape: (23814, 875)
y_train shape: (23814, 206)
x_test shape: (3982, 875)


# 4. Building Model

In [7]:
EPOCHS = 64
BATCH_SIZE = 128
FOLDS = 5
REPEATS = 5
LR = 0.0008
N_TARGETS = len(target_cols)

def multi_log_loss(y_true, y_pred):
    losses = []
    for col in y_true.columns:
        losses.append(log_loss(y_true.loc[:, col], y_pred.loc[:, col]))
    return np.mean(losses)

In [8]:
def my_model():
    model = tf.keras.Sequential([
        
                tf.keras.layers.Input(N_FEATURES), 
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.1),
        
                tf.keras.layers.Dense(3500, activation="relu"),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.4),
        
                tf.keras.layers.Dense(1750, activation="relu"),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.2),
                
                tf.keras.layers.Dense(875, activation="relu"),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.1),
                
                tf.keras.layers.Dense(412, activation="relu"),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.2),
        
                tf.keras.layers.Dense(N_TARGETS, activation="sigmoid") 
            ])
    
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
                  loss = 'binary_crossentropy', 
                  metrics = ["accuracy"])
    return model

# 5. Training

In [9]:
def train(resume_models = None, repeat_number = 0, folds = 5, skip_folds = 0):
    
    models = []
    oof_pred = y_train.copy()  

    kfold = KFold(folds, shuffle = True, random_state = 9)
    for fold, (i_tr, i_va) in enumerate(kfold.split(x_train)):

        print('-'*85)
        print(f'Repeat {repeat_number}, Fold {fold}')
        
        cb_lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss', 
                                                              factor = 0.4, 
                                                              patience = 1, 
                                                              verbose = 2, 
                                                              min_delta = 0.0001, 
                                                              mode = 'auto')
        
        cb_early_stop = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', 
                                                         min_delta = 0, 
                                                         patience = 4, 
                                                         verbose = 1, 
                                                         mode = 'min')
        
        checkpt_path = f'repeat:{repeat_number}_Fold:{fold}.hdf5'
        cb_checkpt = tf.keras.callbacks.ModelCheckpoint(checkpt_path, 
                                                        monitor = 'val_loss', 
                                                        verbose = 2, 
                                                        save_best_only = True, 
                                                        save_weights_only = True, 
                                                        mode = 'min')

        model = my_model()
        model.fit(x_train.values[i_tr], y_train.values[i_tr],
                  validation_data = (x_train.values[i_va], y_train.values[i_va]),
                  callbacks = [cb_lr_schedule, cb_early_stop, cb_checkpt],
                  epochs = EPOCHS, 
                  batch_size = BATCH_SIZE, 
                  verbose = 2)
        
        model.load_weights(checkpt_path)
        oof_pred.loc[i_va, :] = model.predict(x_train.values[i_va])
        models.append(model)

    return models, oof_pred

In [10]:
models = []
oof_pred = []

for i in range(REPEATS):
    m, oof = train(repeat_number = i, folds = FOLDS)
    print('-'*85)
    models = models + m
    oof_pred.append(oof)

-------------------------------------------------------------------------------------
Repeat 0, Fold 0
Epoch 1/64

Epoch 00001: val_loss improved from inf to 0.07935, saving model to repeat:0_Fold:0.hdf5
149/149 - 22s - loss: 0.4460 - accuracy: 0.0222 - val_loss: 0.0793 - val_accuracy: 0.0292
Epoch 2/64

Epoch 00002: val_loss improved from 0.07935 to 0.02535, saving model to repeat:0_Fold:0.hdf5
149/149 - 22s - loss: 0.0399 - accuracy: 0.0388 - val_loss: 0.0253 - val_accuracy: 0.0407
Epoch 3/64

Epoch 00003: val_loss improved from 0.02535 to 0.02052, saving model to repeat:0_Fold:0.hdf5
149/149 - 22s - loss: 0.0227 - accuracy: 0.0553 - val_loss: 0.0205 - val_accuracy: 0.0831
Epoch 4/64

Epoch 00004: val_loss improved from 0.02052 to 0.01867, saving model to repeat:0_Fold:0.hdf5
149/149 - 21s - loss: 0.0197 - accuracy: 0.0764 - val_loss: 0.0187 - val_accuracy: 0.0779
Epoch 5/64

Epoch 00005: val_loss improved from 0.01867 to 0.01770, saving model to repeat:0_Fold:0.hdf5
149/149 - 22s - 

# 6. Mean OOF Log Loss

In [11]:
mean_oof_pred = y_train.copy()
mean_oof_pred.loc[:, target_cols] = 0
for i, p in enumerate(oof_pred):
    print(f"Repeat {i} OOF Log Loss: {multi_log_loss(y_train, p)}")
    mean_oof_pred.loc[:, target_cols] += p[target_cols]

mean_oof_pred.loc[:, target_cols] /= len(oof_pred)
print(f"Mean OOF Log Loss: {multi_log_loss(y_train, mean_oof_pred)}")
mean_oof_pred.loc[x_train['cp_type'] == 0, target_cols] = 0
print(f"Mean OOF Log Loss (ctl adjusted): {multi_log_loss(y_train, mean_oof_pred)}")

Repeat 0 OOF Log Loss: 0.015483665395740838
Repeat 1 OOF Log Loss: 0.01552470895805344
Repeat 2 OOF Log Loss: 0.015517090891618757
Repeat 3 OOF Log Loss: 0.015605339865058434
Repeat 4 OOF Log Loss: 0.01553044152174556
Mean OOF Log Loss: 0.015222323265951992
Mean OOF Log Loss (ctl adjusted): 0.015186998066889489


# 7. Predictions and Submission File

In [12]:
test_pred = sample_df.copy()
test_pred[target_cols] = 0

for model in models:
    test_pred.loc[:,target_cols] += model.predict(x_test)
test_pred.loc[:,target_cols] /= len(models)
test_pred.loc[x_test['cp_type'] == 0, target_cols] = 0
test_pred.to_csv('submission.csv', index=False)