# Contents

- [Environment Setup](#Environment_Setup)
- [Ensure Reproducibility](#Ensure_Reproducibility)
- [HyperParameters - All](#HyperParameters_-_All)
- [Competition Supplied Data](#Competition_Supplied_Data)
- [Import Data](#Import_Data)
- [PreProcessing](#PreProcessing)
- [Initial Filtering](#Initial_Filtering)
- [Feature Engineering](#Feature_Engineering)
- [Feature Reduction](#Feature_Reduction)
- [CV Folds](#CV_Folds)
- [PyTorch Setup](#PyTorch_Setup)
- [Training Process](#Training_Process)
- [Results](#Results)
- [Prepare Submission File](#Prepare_Submission_File)

<a id='Environment_Setup'></a>
# Environment Setup

In [1]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import log_loss, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import QuantileTransformer

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel

from xgboost import XGBClassifier

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import sys
# sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import warnings
warnings.filterwarnings('ignore')

### Ensure Reproducibility

In [2]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

<a id='HyperParameters_-_All'></a>
# HyperParameters - All

In [3]:
# INITIAL FILTERING PARAMETERS
Include_neg_10 = False
Variance_Threshold = True
VAR_THRESH=0.5
PCA_VAR = True
PCA_VAR_THRESH = 0.98

In [4]:
# PREPROCESSING PARAMETERS
Category_Encoding = "OHE"  # "OHE" or "Mapping"
Normalization = True
Scaling = False
Remove_Skewness = False
Quantile_Transforming = False

In [5]:
# FEATURE ENGINEERING PARAMETERS

In [6]:
# FEATURE REDUCTION PARAMETERS

In [7]:
# MODEL PARAMETERS
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 30
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7
HIDDEN_SIZE=1024

Label_Smoothing = True
SMOOTHING = 0.000625
CLAMPING = False
CLAMP_MIN = 0.0001
CLAMP_MAX = 0.9999

In [8]:
# POSTPROCESSING PARAMETERS
FORCE_BAD_COLS = True

### Hardcoded Features to Drop or Keep

In [9]:
KEPT_FEATURES = ['g-307']
DROPPED_FEATURES = []

<a id='Competition_Supplied_Data'></a>
# Competition Supplied Data

### Import Data

In [10]:
# 3 training related files
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

# 2 test related files
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

### Reformat/Reshape the data

In [11]:
# train merges the train features and targets
train = train_features.merge(train_targets_scored, on='sig_id')

In [12]:
# Then train drops the control cases and resets the pandas index
ctrl_train = train[train_features['cp_type'] == 'ctl_vehicle'].reset_index(drop=True)
train = train[train_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

# Similarly drop the control cases from the test set. Will later fill with 0.
ctrl_test = test_features[test_features['cp_type'] == 'ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

# Also drop these control cases from the nonscored target df
targets_nonscored = train_targets_nonscored[train_features['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)

In [13]:
# target is a subset of train with just sig_id and the 206 target columns
target = train[train_targets_scored.columns]

In [14]:
# Drop the cp_type column. Homogenous in each df anyways
train = train.drop('cp_type', axis=1)
ctrl_train = ctrl_train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)
ctrl_test = ctrl_test.drop('cp_type', axis=1)

In [15]:
# Data Dimensions
print(f"train: {train.shape}")
print(f"ctrl_train: {ctrl_train.shape}")
print(f"target: {target.shape}")
print(f"nonscored_target: {targets_nonscored.shape}")
print()
print(f"test: {test.shape}")
print(f"ctrl_test: {ctrl_test.shape}")
print(f"sample_submission: {sample_submission.shape}")

train: (21948, 1081)
ctrl_train: (1866, 1081)
target: (21948, 207)
nonscored_target: (21948, 403)

test: (3624, 875)
ctrl_test: (358, 875)
sample_submission: (3982, 207)


### Renaming Feature columns
Forcing all feature columns to start with 'f-' for easier tracking of features:  
- f- -> features  
- f-cat- -> categorical (no point in scaling)  
- f-c- -> CELLS  
- f-g- -> GENES  
- f-pca-c/g -> pca based on c/g  
- f-model_A- -> features created from model A


In [16]:
# Categoricals
train = train.rename(columns={'cp_time': 'f-cat-cp_time', 
                              'cp_dose': 'f-cat-cp_dose'})
test = test.rename(columns={'cp_time': 'f-cat-cp_time', 
                            'cp_dose': 'f-cat-cp_dose'})
# Cells and Genes
train.columns = ['f-' + col if (col.startswith('c-') or col.startswith('g-')) else col for col in train.columns]
test.columns = ['f-' + col if (col.startswith('c-') or col.startswith('g-')) else col for col in test.columns]

### Column Subsets

In [17]:
# List of column names for easy reference throughout
CAT_COLS = [col for col in test.columns if col.startswith('f-cat-')]
GENES = [col for col in test.columns if col.startswith('f-g-')]
CELLS = [col for col in test.columns if col.startswith('f-c-')]
temp_feature_cols = [col for col in test.columns if col.startswith('f-')]
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()
targets_nonscored_cols = targets_nonscored.drop('sig_id', axis=1).columns.values.tolist()
# feature_cols not defined here because new features will be created and some will be dropped

# Column Data Dimensions
print(f"CAT_COLS: {len(CAT_COLS)}")
print(f"GENES: {len(GENES)}")
print(f"CELLS: {len(CELLS)}")
print()
print(f"Temp Feature Columns: {len(temp_feature_cols)}")
print(f"Target Columns: {len(target_cols)}")
print(f"Nonscored target Columns: {len(targets_nonscored_cols)}")

CAT_COLS: 2
GENES: 772
CELLS: 100

Temp Feature Columns: 874
Target Columns: 206
Nonscored target Columns: 402


<a id='PreProcessing'></a>
# PreProcessing

### Quantile Transformation

In [18]:
if Quantile_Transforming:
#     all_data = pd.concat([pd.DataFrame(train[G_and_C]), pd.DataFrame(test[G_and_C])])
    non_cat_cols = list(set(temp_feature_cols) - set(CAT_COLS))
    all_data = pd.DataFrame(train[non_cat_cols])
    all_data_nans = all_data.copy()
#     all_data_nans[all_data == -10] = np.nan

    qt = QuantileTransformer(n_quantiles=100, output_distribution='normal', random_state=42)
    qt.fit(all_data_nans.values)
#     all_data_trans = qt.transform(all_data)

    train.loc[:, non_cat_cols] = qt.transform(train.loc[:, non_cat_cols])
    test.loc[:, non_cat_cols] = qt.transform(test.loc[:, non_cat_cols])

<a id='Initial_Filtering'></a>
# Initial Filtering

### Variance Threshold

In [19]:
var_drops = []
if Variance_Threshold:
    non_cat_cols = list(set(temp_feature_cols) - set(CAT_COLS))
    selector = VarianceThreshold(VAR_THRESH)
    all_data = pd.concat([pd.DataFrame(train[non_cat_cols]), pd.DataFrame(test[non_cat_cols])])
       
    selector.fit(all_data[non_cat_cols])
    kept_columns = all_data.columns.values[selector.get_support(indices=True)]
    dropped_columns = set(non_cat_cols) - set(kept_columns)
    
    var_drops = dropped_columns

print(f"Var drops: {len(var_drops)}")

Var drops: 10


### Drop Features from Filters

In [20]:
dropped_features = var_drops

# Incorporate Forced keep and drop features
dropped_features = set(dropped_features).union(set(DROPPED_FEATURES))
dropped_features = set(dropped_features) - set(KEPT_FEATURES)


print(f"Var Dropped Features: {len(var_drops)}")
print(var_drops)
print(f"Total Dropped Features: {len(dropped_features)}")

train.drop(columns=dropped_features, inplace=True)
test.drop(columns=dropped_features, inplace=True)

Var Dropped Features: 10
{'f-g-15', 'f-g-104', 'f-g-550', 'f-g-331', 'f-g-435', 'f-g-536', 'f-g-307', 'f-g-611', 'f-g-481', 'f-g-219'}
Total Dropped Features: 10


### Column Subsets

In [21]:
# List of column names for easy reference throughout
CAT_COLS = [col for col in test.columns if col.startswith('f-cat-')]
GENES = [col for col in test.columns if col.startswith('f-g-')]
CELLS = [col for col in test.columns if col.startswith('f-c-')]
temp_feature_cols = [col for col in test.columns if col.startswith('f-')]
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()
targets_nonscored_cols = targets_nonscored.drop('sig_id', axis=1).columns.values.tolist()
# feature_cols not defined here because new features will be created and some will be dropped

# Column Data Dimensions
print(f"CAT_COLS: {len(CAT_COLS)}")
print(f"GENES: {len(GENES)}")
print(f"CELLS: {len(CELLS)}")
print()
print(f"Temp Feature Columns: {len(temp_feature_cols)}")
print(f"Target Columns: {len(target_cols)}")
print(f"Nonscored target Columns: {len(targets_nonscored_cols)}")

CAT_COLS: 2
GENES: 762
CELLS: 100

Temp Feature Columns: 864
Target Columns: 206
Nonscored target Columns: 402


<a id='Feature_Engineering'></a>
# Feature Engineering

### -10 Boolean Columns

In [22]:
# Create Boolean columns for -10 cases

# Find Columns with an Unusual amount of -10 values -> I set a threshold of just 5 for now
if Include_neg_10:
    non_cat_cols = list(set(temp_feature_cols) - set(CAT_COLS))
    count_neg_10 = (train[non_cat_cols] == -10).astype(int).sum(axis=0)
    neg_10_cols = count_neg_10[count_neg_10 >= 5].index
    neg_10_bool_cols = ['f-cat-10-' + col for col in neg_10_cols] 
    len(neg_10_cols)

    train[neg_10_bool_cols] = 0
    for col, new_col in zip(neg_10_cols, neg_10_bool_cols):
        train.loc[train[train[col] == -10].index, new_col] = 1

    test[neg_10_bool_cols] = 0
    for col, new_col in zip(neg_10_cols, neg_10_bool_cols):
        test.loc[test[test[col] == -10].index, new_col] = 1

### Column Subsets

In [23]:
# List of column names for easy reference throughout
CAT_COLS = [col for col in test.columns if col.startswith('f-cat-')]
GENES = [col for col in test.columns if col.startswith('f-g-')]
CELLS = [col for col in test.columns if col.startswith('f-c-')]
temp_feature_cols = [col for col in test.columns if col.startswith('f-')]
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()
targets_nonscored_cols = targets_nonscored.drop('sig_id', axis=1).columns.values.tolist()
# feature_cols not defined here because new features will be created and some will be dropped

# Column Data Dimensions
print(f"CAT_COLS: {len(CAT_COLS)}")
print(f"GENES: {len(GENES)}")
print(f"CELLS: {len(CELLS)}")
print()
print(f"Temp Feature Columns: {len(temp_feature_cols)}")
print(f"Target Columns: {len(target_cols)}")
print(f"Nonscored target Columns: {len(targets_nonscored_cols)}")

CAT_COLS: 2
GENES: 762
CELLS: 100

Temp Feature Columns: 864
Target Columns: 206
Nonscored target Columns: 402


<a id='Feature_Reduction'></a>
# Feature Reduction

### PCA Variance Threshold

In [24]:
# Only trying Cells right now

if PCA_VAR:
    data = pd.concat([pd.DataFrame(train[CELLS]), pd.DataFrame(test[CELLS])])
    scaler = MinMaxScaler()
    data.loc[:, CELLS] = scaler.fit_transform(data[CELLS])
    
    pca_cells = PCA(n_components = PCA_VAR_THRESH)
    data2 = pca_cells.fit_transform(data)
    
    num_pca_cell_comp = data2.shape[1]
    print(f"n_components selected: {num_pca_cell_comp}")
          
    train2 = data2[:train.shape[0]]
    test2 = data2[train.shape[0]:]
    
    train2 = pd.DataFrame(train2, columns=[f'f-pca-c-{i}' for i in range(num_pca_cell_comp)])
    test2 = pd.DataFrame(test2, columns=[f'f-pca-c-{i}' for i in range(num_pca_cell_comp)])
    
#     train = train.drop(columns=CELLS)
    train = pd.concat([train, train2], axis=1)
#     test = test.drop(columns=CELLS)
    test = pd.concat([test, test2], axis=1)

n_components selected: 72


### Column Subsets

In [25]:
# List of column names for easy reference throughout
CAT_COLS = [col for col in test.columns if col.startswith('f-cat-')]
GENES = [col for col in test.columns if col.startswith('f-g-')]
CELLS = [col for col in test.columns if col.startswith('f-c-')]
temp_feature_cols = [col for col in test.columns if col.startswith('f-')]
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()
targets_nonscored_cols = targets_nonscored.drop('sig_id', axis=1).columns.values.tolist()
# feature_cols not defined here because new features will be created and some will be dropped

# Column Data Dimensions
print(f"CAT_COLS: {len(CAT_COLS)}")
print(f"GENES: {len(GENES)}")
print(f"CELLS: {len(CELLS)}")
print()
print(f"Temp Feature Columns: {len(temp_feature_cols)}")
print(f"Target Columns: {len(target_cols)}")
print(f"Nonscored target Columns: {len(targets_nonscored_cols)}")

CAT_COLS: 2
GENES: 762
CELLS: 100

Temp Feature Columns: 936
Target Columns: 206
Nonscored target Columns: 402


<a id='CV_Folds'></a>
# CV Folds

In [26]:
# folds will be identical to train, but with a kfold column. 
# folds wll be used in the actual training
folds = train.copy()
mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)

In [27]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 1143)
(21948, 1144)
(3624, 937)
(21948, 207)
(3982, 207)


<a id='PyTorch_Setup'></a>
# PyTorch Setup

### Define Dataset Classes

In [28]:
class MoADataset:
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float),
            'y' : torch.tensor(self.targets[idx, :], dtype=torch.float)            
        }
        return dct
    
class TestDataset:
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float)
        }
        return dct

### Define train/val/inference functions

In [29]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    # Tell the model which mode it is in. At the very least this enables dropout for train mode only.
    model.train()
    final_loss = 0
    
    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
#         print(inputs.shape)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    
    return final_loss


def valid_fn(model, loss_fn, dataloader, device):
    # Tell the model which mode it is in. At the very least this disables dropout for eval mode.
    model.eval()
    final_loss = 0
    valid_preds = []
    
    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)

        if CLAMPING:
            loss = loss_fn(torch.clamp(outputs, min=CLAMP_MIN, max=CLAMP_MAX), targets)
            valid_preds.append(torch.clamp(outputs, min=CLAMP_MIN, max=CLAMP_MAX).detach().cpu().numpy())
        else:
            loss = loss_fn(outputs, targets)
            valid_preds.append(outputs.detach().cpu().numpy())
            
        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    # Tell the model which mode it is in. At the very least this disables dropout for eval mode.
    model.eval()
    preds = []
    
    for data in dataloader:
        inputs = data['x'].to(device)

        with torch.no_grad():
            outputs = model(inputs)
            
        if CLAMPING:
            preds.append(torch.clamp(outputs, min=CLAMP_MIN, max=CLAMP_MAX).detach().cpu().numpy())
        else:
            preds.append(outputs.detach().cpu().numpy())
        
    preds = np.concatenate(preds)
    
    return preds

### Custom Label Smoothing Loss Function

In [30]:
import torch
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

class SmoothBCEwLogits(_WeightedLoss):
    def __init__(self, smoothing=0.0):
        super().__init__()
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        m = nn.Sigmoid()
        
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1), self.smoothing)
#         loss = F.binary_cross_entropy_with_logits(inputs, targets,)
        loss = F.binary_cross_entropy(inputs, targets,)
        loss = loss.mean()

        return loss


### Define Models

In [31]:
class Model(nn.Module):
    def __init__(self, num_features, num_targets, hidden_size, verbose=False):
        super(Model, self).__init__()    
        
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.2)
        self.fc1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))
        self.prelu1 = nn.PReLU()
        
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.2)
        self.fc2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))
        self.prelu2 = nn.PReLU()
        
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.5)
        self.fc3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))
        self.sigmoid = nn.Sigmoid()
        
    # https://www.kaggle.com/c/lish-moa/discussion/188651
    def recalibrate_layer(self, layer):
        if(torch.isnan(layer.weight_v).sum() > 0):
            print ('recalibrate layer.weight_v')
            layer.weight_v = torch.nn.Parameter(torch.where(torch.isnan(layer.weight_v), torch.zeros_like(layer.weight_v), layer.weight_v))
            layer.weight_v = torch.nn.Parameter(layer.weight_v + 1e-10)

        if(torch.isnan(layer.weight).sum() > 0):
            print ('recalibrate layer.weight')
            layer.weight = torch.where(torch.isnan(layer.weight), torch.zeros_like(layer.weight), layer.weight)
            layer.weight += 1e-10
            
    def forward(self, x):
        # lesson learned here: do not use F.prelu. Worse results.
        
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        self.recalibrate_layer(self.fc1)
        x = self.prelu1(self.fc1(x))

        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        self.recalibrate_layer(self.fc2)
        x = self.prelu2(self.fc2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        self.recalibrate_layer(self.fc3)
        x = self.sigmoid(self.fc3(x))
        
        return x

### Preprocessing Steps

In [32]:
def process_data(input_data):
    data = input_data.copy()
    non_cat_cols = list(set(temp_feature_cols) - set(CAT_COLS))
    ###### Category Encoding-----------------------------------------------------
    # One-Hot-Encoding
    if Category_Encoding == "OHE":
        data = pd.get_dummies(data, prefix='f-cat-OHE-', prefix_sep='', columns=CAT_COLS)
        print(f" Number of OHE columns added: {len([col for col in data.columns if col.startswith('f-cat-OHE-')])}")
    # Label-Encoding
    
    # Manual-Mapping
    if Category_Encoding == "Mapping":
        data.loc[:, 'f-cat-cp_time'] = data.loc[:, 'f-cat-cp_time'].map({24: 1, 48: 2, 72: 3}).values
        data.loc[:, 'f-cat-cp_dose'] = data.loc[:, 'f-cat-cp_dose'].map({'D1': 0, 'D2': 1}).values
    
    ###### Normalizing------------------------------------------------------------
    # normalizes the train against train and the test against test
    if Normalization:
        scaler = StandardScaler(with_mean=True, with_std=True)
        data.loc[:, non_cat_cols] = scaler.fit_transform(data.loc[:, non_cat_cols])
    
    ###### Scaling
    # scales the train against train and the test against test
    if Scaling:
        MinMaxscaler = MinMaxScaler(feature_range=(-1, 1))
        data.loc[:, non_cat_cols] = MinMaxscaler.fit_transform(data.loc[:, non_cat_cols])
    
    ###### Remove Skewness --------------------------------------------------------
    # Removing Skewness
    if Remove_Skewness:
        for col in non_cat_cols:
            if(abs(data[col].skew()) > 0.75):

                if(data[col].skew() < 0): # neg-skewness
                    data[col] = data[col].max() - data[col] + 1
                    data[col] = np.sqrt(data[col])

                else:
                    data[col] = np.sqrt(data[col])
    #------------------------------------------------------------------
    
    return data

<a id='Training_Process'></a>
## Training Process

### Single Fold Training Function

In [33]:
def run_training(X_train, y_train, X_test, proc_feature_columns, target_columns, HyperParams, fold, seed):
    # List of global things passed into here that aren't identified
    # - valid_loss_array = np.zeros((EPOCHS, NFOLDS))
    # - seed_everything function to avoid randomness
    seed_everything(seed)

    # trn_idx is never used, but val_idx is. Both kept because it looks cleaner.
    trn_idx = X_train[X_train['kfold'] != fold].index
    val_idx = X_train[X_train['kfold'] == fold].index
    
    # For the current fold, separate the train(80%) and val(20%) data by the kfold column
    X_train_df = X_train[X_train['kfold'] != fold].reset_index(drop=True)
    X_valid_df = X_train[X_train['kfold'] == fold].reset_index(drop=True)
    y_train_df = y_train[X_train['kfold'] != fold].reset_index(drop=True)
    y_valid_df = y_train[X_train['kfold'] == fold].reset_index(drop=True)
    
    # separate the data by column and put into respective arrays
    x_train = X_train_df[proc_feature_columns].values
    y_train = y_train_df[target_columns].values
    x_valid = X_valid_df[proc_feature_columns].values
    y_valid = y_valid_df[target_columns].values
    
    # init class with the given train and val data
    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    
    # DataLoader is set up as a 'Map-style' dataset 
    # a.k.a. it utilizes the --getitem-- and --len-- protocols we defined in the class
    
    # shuffle is good for training because we do not want any significance to order while training. 
    # Once the weights are determined, shuffle doesn't matter(for val and test)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=HyperParams["BATCH_SIZE"], shuffle=True, num_workers=5, pin_memory=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=HyperParams["BATCH_SIZE"], shuffle=False, num_workers=5, pin_memory=True)
    
    # Our model is simply the layers and forward function. These parameters define the model
    model = Model(
        num_features=len(proc_feature_columns),
        num_targets=len(target_columns),
        hidden_size=HIDDEN_SIZE,
    )
    
    # Device is set to cuda if available, else cpu (in the hyperparameter section)
    model.to(HyperParams["DEVICE"])
    
    # model.parameters() returns an iterator over the module parameters (most commonly used such as in this case, for the optimizer)
    # OneCycleLR looks to be a very fast learining rate convergence method.
    optimizer = torch.optim.Adam(model.parameters(), 
                                 lr=HyperParams["LEARNING_RATE"], 
                                 weight_decay=HyperParams["WEIGHT_DECAY"])
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, 
                                              pct_start=0.1, 
                                              div_factor=1e3, 
                                              max_lr=1e-2, 
                                              epochs=HyperParams["EPOCHS"], 
                                              steps_per_epoch=len(trainloader))
    
    if Label_Smoothing:
        loss_tr = SmoothBCEwLogits(smoothing=SMOOTHING)
    else:
        loss_tr = nn.BCELoss()
    
    loss_val = nn.BCELoss()

    # create array of zeros for oof predictions. Shape is num of observations x num targets
    oof = np.zeros((X_train.shape[0], len(target_columns)))

    # start off with a big number. Will update each cycle on improvements
    best_loss = np.inf
    
    # currently set for 30
    for epoch in range(HyperParams["EPOCHS"]):
        
        # In train_fn: return the loss for the data through our model. 
            # No predictions. 
            # We step the optimizer and scheduler.
        # In valid_fn: return loss and predictions
            # No training is done, as in weights/parameters are left unchanged
        train_loss = train_fn(model, optimizer,scheduler, loss_tr, trainloader, HyperParams["DEVICE"])
        print(f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}")
        valid_loss, valid_preds = valid_fn(model, loss_val, validloader, HyperParams["DEVICE"])
        print(f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}")
        valid_loss_array[epoch, fold] = valid_loss
        
        # if valid loss is the best so far, save the corresponding model
        if valid_loss < best_loss:
            
            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(), f"FOLD{fold}_.pth")
            
    #--------------------- PREDICTION---------------------
    # After training the model, load the best model based on validation results, and run once with the test data for its predictions
    x_test = X_test[proc_feature_columns].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=HyperParams["BATCH_SIZE"], shuffle=False)
    
    # not sure why the model is re-initialized here. Is, and hsould be, identical to the train/val model. 
    # The load_state is still necessary though.
    model = Model(
        num_features=len(proc_feature_columns),
        num_targets=len(target_columns),
        hidden_size=HIDDEN_SIZE,
    )
    
    model.load_state_dict(torch.load(f"FOLD{fold}_.pth"))
    model.to(HyperParams["DEVICE"])
    
    predictions = np.zeros((len(X_test), len(target_columns)))
    predictions = inference_fn(model, testloader, HyperParams["DEVICE"])
    
    # oof: The oof prediction array
    # predictions: The test set prediction array
    return oof, predictions


### Multi-Fold Training Function

In [34]:
def run_k_fold(X_train, y_train, X_test, feature_columns, target_columns, HyperParams, seed):
    # Set up the oof and predictions arrays to be filled each time.
    oof = np.zeros((X_train.shape[0], len(target_columns)))
    predictions = np.zeros((len(X_test), len(target_columns)))
    
    # Process Features here for X_train and X_test rather than every fold redoing it
    X_train = process_data(X_train)
    X_test = process_data(X_test)
    proc_feature_columns = [c for c in X_train.columns if c.startswith('f-')]
    
    for fold in range(HyperParams["NFOLDS"]):
        oof_, pred_ = run_training(X_train, y_train, X_test, proc_feature_columns, target_columns, HyperParams, fold, seed)
        
        # predictions happen each fold, this method averages the predicitons over all of the folds.
        predictions += pred_ / HyperParams["NFOLDS"]
        # Only a subset of the oof is nonzero each time corresponding to the fold
        oof += oof_
        
    return oof, predictions

### Multi-Seed Training

In [35]:
# HyperParameters
HyperParams = {
    "EPOCHS": EPOCHS,
    "NFOLDS": NFOLDS,
    "DEVICE": DEVICE,
    "BATCH_SIZE": BATCH_SIZE,
    "HIDDEN_SIZE": HIDDEN_SIZE,
    "LEARNING_RATE": LEARNING_RATE,
    "WEIGHT_DECAY": WEIGHT_DECAY,
}

### Input for Model A
Model A will use the same parameters as our original model (for now). 
Model A trains with our original features but wiht the nonscored targets, to predict them. 
These predictions will then be used as features into our original model.

In [36]:
# List of seeds to run over.
SEEDS = [1, 2, 3, 4, 42]

# list of target/feature column names - should be 206 targets of them
target_columns = target_cols + targets_nonscored_cols
feature_columns = [c for c in folds.columns if c.startswith('f-')]

# training x and kfold column. Contains 'sig_id' column
X_train = folds[['sig_id'] + feature_columns + ['kfold']]
y_train = folds[['sig_id'] + target_cols].merge(targets_nonscored, on='sig_id', how='left')

# test data. Includes 'sig_id' and features, no target columns or kfold
X_test = test.copy()

### Train Model A

In [None]:
# This chunk is just putting a wrapper around the main training function 'run_k_fold'
# To allow many SEEDs to be tested and averaged at once.

oof = np.zeros((X_train.shape[0], len(target_columns)))
predictions = np.zeros((len(X_test), len(target_columns)))
valid_loss_array = np.zeros((HyperParams["EPOCHS"], HyperParams["NFOLDS"]))

for seed in SEEDS:
    
    oof_, predictions_ = run_k_fold(X_train.copy(), y_train.copy(), X_test.copy(), feature_columns, target_columns, HyperParams, seed)
    oof += oof_ / len(SEEDS)
    predictions += predictions_ / len(SEEDS)

# Make new outputs later
Model_A_validation_predictions = X_train[['sig_id']]
Model_A_validation_predictions[target_columns] = oof

Model_A_test_predictions = X_test[['sig_id']]
Model_A_test_predictions[target_columns] = predictions

# Model A Outputs
Model A will output features to be used as input features for the train and test sets of Model B

In [None]:
# Training Features with sig_id column
Model_A_to_Model_B_train = Model_A_validation_predictions
Model_A_to_Model_B_train.columns = ['f-model_A-' + col if (col != 'sig_id') else col for col in Model_A_to_Model_B_train.columns]
# Test Features
Model_A_to_Model_B_test = Model_A_test_predictions
Model_A_to_Model_B_test.columns = ['f-model_A-' + col if (col != 'sig_id') else col for col in Model_A_to_Model_B_test.columns]

### Use columns with highest correlation (from unchanged target file)

In [None]:
nonzero_targets_df = targets_nonscored.drop('sig_id', axis=1).loc[:, (targets_nonscored.drop('sig_id', axis=1).sum() > 0).values]
corr_mtx = nonzero_targets_df.corr()
corr_map = corr_mtx[corr_mtx>=.7]
corr_map[corr_map == 1] = np.nan
# Columns that contain high correlation
corr_cols = corr_map.columns[(corr_map.sum(axis=1) >0).values]

temp_corr_cols = ['f-model_A-' + col for col in corr_cols]

print(f"Number of Columns kept from Model A based on high Correlations {len(corr_cols)}")

Model_A_to_Model_B_train = Model_A_to_Model_B_train[['sig_id'] + temp_corr_cols]
Model_A_to_Model_B_test = Model_A_to_Model_B_test[['sig_id'] + temp_corr_cols]

### Plot Validation Loss by Fold

In [None]:
# Plot the validation loss for each fold over the epochs
# valid_loss_array is an array of the validaiton loss for each fold for each epoch
fold_names = [f"Fold_{fold}" for fold in range(NFOLDS)]
val_loss_df = pd.DataFrame(data=valid_loss_array, columns=fold_names)
p = sns.lineplot(data=val_loss_df, dashes=False)
p.set(ylim=(0.003, 0.01))

### Input for Model B
Model B is trained to predict the <b>scored</b> targets.

In [None]:
# List of seeds to run over.
# SEEDS = [42]

# list of target/feature column names - should be 206 targets of them
target_columns = target_cols
feature_columns = [c for c in folds.columns if c.startswith('f-')]

# training x and kfold column. Contains 'sig_id' column
X_train = folds[['sig_id'] + feature_columns + ['kfold']]
X_train = X_train.merge(Model_A_to_Model_B_train, on='sig_id', how='left')

y_train = folds[['sig_id'] + target_columns]

# test data. Includes 'sig_id' and features, no target columns or kfold
X_test = test.copy()
X_test = X_test.merge(Model_A_to_Model_B_test, on='sig_id', how='left')

# redeclaring the feature columns now that I have modified the input sets for train/test. Not the cleanest.
feature_columns = [c for c in X_train.columns if c.startswith('f-')]

In [None]:
# This chunk is just putting a wrapper around the main training function 'run_k_fold'
# To allow many SEEDs to be tested and averaged at once.

oof = np.zeros((X_train.shape[0], len(target_columns)))
predictions = np.zeros((len(X_test), len(target_columns)))
valid_loss_array = np.zeros((HyperParams["EPOCHS"], HyperParams["NFOLDS"]))

for seed in SEEDS:
    
    oof_, predictions_ = run_k_fold(X_train.copy(), y_train.copy(), X_test.copy(), feature_columns, target_columns, HyperParams, seed)
    oof += oof_ / len(SEEDS)
    predictions += predictions_ / len(SEEDS)

# Make new outputs later
validation_predictions = X_train[['sig_id']]
validation_predictions[target_columns] = oof
test_predictions = X_test[['sig_id']]
test_predictions[target_columns] = predictions

#### Save OOF Predictions to CSV

In [None]:
# Saving OOF predictions on non-control set for analysis
validation_predictions.to_csv('unprocessed_validation_predictions.csv', index=False)

<a id='Results'></a>
# Results

## Results - Raw

### Plot Validation Loss by Fold

In [None]:
# Plot the validation loss for each fold over the epochs
# valid_loss_array is an array of the validaiton loss for each fold for each epoch
fold_names = [f"Fold_{fold}" for fold in range(NFOLDS)]
val_loss_df = pd.DataFrame(data=valid_loss_array, columns=fold_names)
p = sns.lineplot(data=val_loss_df, dashes=False)
p.set(ylim=(0.014, 0.020))

### CV log_loss

In [None]:
raw_y_true = target[target_cols].values
raw_y_pred = validation_predictions[target_cols].values

raw_scores = []
for i in range(len(target_cols)):
    raw_scores.append(log_loss(raw_y_true[:, i], raw_y_pred[:, i]))
    
raw_scores_arr = np.asarray(raw_scores)
print("Raw CV log_loss: ", raw_scores_arr.mean())

In [None]:
raw_scores_df = pd.DataFrame(raw_scores_arr, index=target_cols)
raw_scores_df = raw_scores_df.sort_values(by=raw_scores_df.columns[0], ascending=True)
raw_scores_df.columns = ["Raw_Log_Loss"]
# raw_scores_df.tail(20).plot.barh()

### ROC AUC

In [None]:
roc_auc = roc_auc_score(raw_y_true, raw_y_pred)

fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(raw_y_true.shape[1]):
    roc_auc[i] = roc_auc_score(raw_y_true[:, i], raw_y_pred[:, i])
    fpr[i], tpr[i], _ = roc_curve(raw_y_true[:, i], raw_y_pred[:, i])
roc_arr_raw = np.asarray(list(roc_auc.values()))

In [None]:
plt.hist(roc_arr_raw, bins = 30)
plt.title(f'ROC Scores by Label: {np.round(roc_arr_raw.mean(), 4)} Average')

raw_roc_mean = roc_arr_raw.mean()
print(f"Raw roc_score: {np.round(raw_roc_mean, 4)}")

In [None]:
bad_cols = np.argwhere(roc_arr_raw < 0.5)
for col in bad_cols:
    print(target_cols[col[0]])
    plt.figure()
    lw = 2
    plt.plot(fpr[col[0]], tpr[col[0]], color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[col[0]])
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

## Results - Adjusted
CV Loss reported above and in the plots above are only for non-control cases.   
Below is the adjusted CV_loss for when control cases are hard-coded to 0 and included in loss statistics.

In [None]:
# train_targets_scored is the original unaltered df.
train_temp = pd.read_csv('../input/lish-moa/train_features.csv')
valid_results = train_targets_scored.drop(columns=target_cols).merge(validation_predictions, on='sig_id', how='left').fillna(0)

if FORCE_BAD_COLS:
    for col in bad_cols:
        valid_results.iloc[:, col[0]+1] = 1.75/3624
        
valid_results.loc[train_temp['cp_type'] == 'ctl_vehicle', 1:] = 0

adj_y_true = train_targets_scored[target_cols].values
adj_y_pred = valid_results[target_cols].values

### CV LogLoss

In [None]:
adj_scores = []
for i in range(len(target_cols)):
    adj_scores.append(log_loss(adj_y_true[:, i], adj_y_pred[:, i]))
    
adj_scores_arr = np.asarray(adj_scores)
print("Adj CV log_loss: ", adj_scores_arr.mean())

In [None]:
adj_scores_df = pd.DataFrame(adj_scores_arr, index=target_cols)
adj_scores_df = adj_scores_df.sort_values(by=adj_scores_df.columns[0], ascending=True)
adj_scores_df.columns = ["Adj_Log_Loss"]

log_loss_df = raw_scores_df.merge(adj_scores_df, left_index=True, right_index=True)

fig, ax = plt.subplots(figsize=(20, 10))
log_loss_df.tail(20).plot.barh(ax=ax)
plt.show()

### ROC AUC

In [None]:
roc_auc = roc_auc_score(adj_y_true, adj_y_pred)

fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(adj_y_true.shape[1]):
    roc_auc[i] = roc_auc_score(adj_y_true[:, i], adj_y_pred[:, i])
    fpr[i], tpr[i], _ = roc_curve(adj_y_true[:, i], adj_y_pred[:, i])
roc_arr_adj = np.asarray(list(roc_auc.values()))

In [None]:
plt.hist(roc_arr_adj, bins = 30)
plt.title(f'ROC Scores by Label: {np.round(roc_arr_adj.mean(), 4)} Average')

adj_roc_mean = roc_arr_adj.mean()
print(f"Adj roc_score: {np.round(adj_roc_mean, 4)}")

In [None]:
# Using roc_arr_raw to show the improvement on earlier problem examples
bad_cols = np.argwhere(roc_arr_raw < 0.5)
for col in bad_cols:
    print(target_cols[col[0]])
    plt.figure()
    lw = 2
    plt.plot(fpr[col[0]], tpr[col[0]], color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[col[0]])
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

### Collective Stats

In [None]:
print(f"Raw CV log_loss: {raw_scores_arr.mean()}  ")
print(f"Adj CV log_loss: {adj_scores_arr.mean()}  ")
print(f"Raw roc_score: {np.round(raw_roc_mean, 4)}  ")
print(f"Adj roc_score: {np.round(adj_roc_mean, 4)}  ")

<a id='Prepare_Submission_File'></a>
# Prepare Submission File

In [1]:
# Here is the part where cp_type == ctl_vehicle is handled. 
# We merge on sig_id, so the sig_id's missing from our test predictions(only the ctl-vehicle set) remain 0.
test_temp = pd.read_csv('../input/lish-moa/test_features.csv')

sub = sample_submission.drop(columns=target_cols).merge(test_predictions, on='sig_id', how='left').fillna(0)

if FORCE_BAD_COLS:
    for col in bad_cols:
        sub.iloc[:, col[0]] = 1.75/3624

sub.loc[test_temp['cp_type'] == 'ctl_vehicle', 1:] = 0
        
sub.to_csv('submission.csv', index=False)

NameError: name 'pd' is not defined