In [2]:
import sys
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
### General ###
import os
import copy
import tqdm
import pickle
import random
import warnings
warnings.filterwarnings("ignore")
sys.path.append("../input/rank-gauss")
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

### Data Wrangling ###
import numpy as np
import pandas as pd
from scipy import stats

### Machine Learning ###
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from pickle import load,dump

### Deep Learning ###
import torch
from torch import nn
import torch.optim as optim
from torch.nn import functional as F
from torch.nn.modules.loss import _WeightedLoss
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Tabnet 
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

### Colored console output ###
from colorama import Fore
c_ = Fore.CYAN
m_ = Fore.MAGENTA
r_ = Fore.RED
b_ = Fore.BLUE
y_ = Fore.YELLOW
g_ = Fore.GREEN

In [4]:
from sklearn.preprocessing import QuantileTransformer

In [5]:
os.listdir('../input/lish-moa')

['sample_submission.csv',
 'test_features.csv',
 'train_drug.csv',
 'train_features.csv',
 'train_targets_nonscored.csv',
 'train_targets_scored.csv']

In [6]:
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
df = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [7]:
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [8]:
seed = 42

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` and, when CUDA is available, seeds the
    CUDA generators and sets the cuDNN ``deterministic``/``benchmark`` flags.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing else to configure on CPU-only machines.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
set_seed(seed)

In [9]:
from feature_engineering_blend import *

train_features = pd.read_csv('../input/lish-moa/train_features.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
train_features, test_features = inference_preprocess(train_features, test_features,                                             input_path =  "../input/results_tabnet/")

making gaussian distributions
performing pca on genes
number of columns for pca G is: 600
performing pca on cells
number of columns for pca C is: 50
variance threshold: 0.85
adding clusters generated from KMeans as features
adding statistics and square of columns as new features
g_sum
new number of columns: 1239


In [10]:
train = train_features.merge(train_targets_scored, on='sig_id')
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

target = train[train_targets_scored.columns]

In [11]:
train = train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)

In [12]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [13]:
target=target[target_cols]

In [14]:
train = pd.get_dummies(train, columns=['cp_time','cp_dose'])
test = pd.get_dummies(test, columns=['cp_time','cp_dose'])

In [15]:
feature_cols = [c for c in train.columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['sig_id']]

In [16]:
print(train.shape)
print(test.shape)

(21948, 1447)
(3624, 1241)


In [17]:
train = train[feature_cols]
test = test[feature_cols]

In [18]:
X_test = test.values

In [19]:
class LogitsLogLoss(Metric):
    """Mean binary log loss evaluated on raw (pre-sigmoid) scores.

    ``y_pred`` is passed through a sigmoid before the loss is computed, and a
    constant of 5e-5 is added inside each logarithm to avoid ``log(0)``.
    """

    def __init__(self):
        self._maximize = False
        self._name = "logits_ll"

    def __call__(self, y_true, y_pred):
        eps = 5e-5
        # Sigmoid turns the raw scores into probabilities.
        probs = 1 / (1 + np.exp(-y_pred))
        negative_part = (1 - y_true) * np.log(1 - probs + eps)
        positive_part = y_true * np.log(probs + eps)
        return np.mean(-(negative_part + positive_part))

In [20]:
# NOTE(review): not referenced by any other cell visible in this notebook.
MAX_EPOCH = 200

# Keyword arguments unpacked into TabNetRegressor(**tabnet_params) before each
# saved checkpoint is loaded by the TabNet inference loop.
tabnet_params = dict(
    n_d = 32,
    n_a = 32,
    n_steps = 1,
    gamma = 1.3,
    lambda_sparse = 0,
    optimizer_fn = optim.Adam,
    optimizer_params = dict(lr = 2e-2, weight_decay = 1e-5),
    mask_type = "entmax",
    scheduler_params = dict(mode = "min", patience = 5, min_lr = 1e-5, factor = 0.9),
    scheduler_fn = ReduceLROnPlateau,
    seed = seed,  # initial value only; the inference loop overwrites it per model seed
    verbose = 10
)

In [21]:
# Collect one sigmoid-activated test prediction matrix per (seed, fold) checkpoint.
test_cv_preds = []

NB_SPLITS = 10
# Kept as a notebook-level name; the splitter is no longer iterated because the
# loop below only needs the fold number, not the train/validation indices.
mskf = MultilabelStratifiedKFold(n_splits = NB_SPLITS, random_state = 0, shuffle = True)
SEED = [20,21,22]
for s in SEED:
    tabnet_params['seed'] = s
    # Iterating the fold numbers directly avoids recomputing the full
    # multilabel stratified split once per seed just to count folds.
    for fold_nb in range(NB_SPLITS):

        model = TabNetRegressor(**tabnet_params)
        ### Predict on test ###
        # Checkpoint files are numbered from 1, hence fold_nb+1.
        model.load_model(f"../input/results_tabnet/TabNet_seed_{s}_fold_{fold_nb+1}.zip")
        preds_test = model.predict(X_test)
        # The model outputs raw scores; apply the sigmoid to get probabilities.
        test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

# Stacked along a new leading axis: one slice per loaded checkpoint.
test_preds_all = np.stack(test_cv_preds)

Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device used : cpu
Device use

In [22]:
# Target column names, taken from the sample submission (`df`) minus the id column.
all_feat = [col for col in df.columns if col not in ["sig_id"]]
# Predictions exist only for non-control rows, so rebuild their sig_id list from
# the raw test file to match the length of test_preds_all with the submission.
# NOTE: this rebinds `test` to the raw, unprocessed test features.
test = pd.read_csv("../input/lish-moa/test_features.csv")
sig_id = test[test["cp_type"] != "ctl_vehicle"].sig_id.reset_index(drop = True)
# Average over the leading axis of test_preds_all (one slice per checkpoint).
tmp = pd.DataFrame(test_preds_all.mean(axis = 0), columns = all_feat)
# Row i of `tmp` is paired with the i-th non-control sig_id (both use a 0..n-1 index).
tmp["sig_id"] = sig_id


# Left-merge back onto every test sig_id; rows without a prediction (the
# ctl_vehicle rows) come out as NaN and are filled with 0 below.
submission_tabnet = pd.merge(test[["sig_id"]], tmp, on = "sig_id", how = "left")
submission_tabnet.fillna(0, inplace = True)
#submission_tabnet.to_csv("submission_tabnet.csv", index = None)
submission_tabnet.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001001,0.00118,0.002022,0.019151,0.022228,0.005246,0.002998,0.005494,0.000341,...,0.000791,0.001165,0.003596,0.000863,0.000768,0.000551,0.00053,0.002032,0.002621,0.001774
1,id_001897cda,0.000553,0.00097,0.00205,0.003869,0.002154,0.002155,0.002373,0.010095,0.001831,...,0.000823,0.001038,0.004116,0.000436,0.003736,0.000538,0.003671,0.001173,0.007768,0.003193
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.000941,0.001069,0.001757,0.00952,0.014225,0.004072,0.002586,0.004707,0.000272,...,0.000677,0.001883,0.002904,0.014936,0.00434,0.000663,0.001643,0.001733,0.000565,0.001571
4,id_0027f1083,0.001404,0.001343,0.001599,0.01577,0.02087,0.004714,0.003634,0.003071,0.000482,...,0.000753,0.00082,0.00295,0.001631,0.001472,0.00068,0.000892,0.001871,0.000559,0.00155


In [23]:
print(f"{b_}submission_tabnet.shape: {r_}{submission_tabnet.shape}")

[34msubmission_tabnet.shape: [31m(3982, 207)


In [24]:
def check_submission(actual, control_path):
    """Assert that a submission frame matches a reference CSV.

    Args:
        actual: DataFrame with a ``sig_id`` column plus prediction columns.
        control_path: Path of the reference submission CSV to compare against.

    Raises:
        AssertionError: if the prediction values differ beyond 5 decimals
            (or differ in shape), or if the ``sig_id`` columns are not
            identical and in the same order.
    """
    sub_control = pd.read_csv(control_path)
    np.testing.assert_almost_equal(
        actual=actual.drop("sig_id", axis=1).to_numpy(),
        desired=sub_control.drop("sig_id", axis=1).to_numpy(),
        decimal=5,
        err_msg='',
        verbose=True,
    )

    # Materialise each id column once; the previous per-row loop rebuilt both
    # lists on every iteration, which is quadratic in the number of rows.
    actual_ids = actual["sig_id"].tolist()
    control_ids = sub_control["sig_id"].tolist()
    assert actual_ids == control_ids, "sig_id values/order differ from the control submission"
    
check_submission(submission_tabnet, "../input/results_tabnet/submission.csv") 

# Neural Network

In [25]:
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
df = pd.read_csv('../input/lish-moa/sample_submission.csv')


GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]



In [26]:
train_features, test_features = inference_preprocess(train_features, test_features, input_path="../input/results_NN/")

making gaussian distributions
performing pca on genes
number of columns for pca G is: 600
performing pca on cells
number of columns for pca C is: 50
variance threshold: 0.85
adding clusters generated from KMeans as features
adding statistics and square of columns as new features
g_sum
new number of columns: 1239


In [27]:
train_drug = pd.read_csv("../input/lish-moa/train_drug.csv")
train = train_features.merge(train_targets_scored, on='sig_id')
train = train.merge(train_targets_nonscored, on='sig_id')
train = train.merge(train_drug, on='sig_id')
train = train[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)

In [28]:
train = train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)


In [29]:
target_cols = [x for x in train_targets_scored.columns if x != 'sig_id']
aux_target_cols = [x for x in train_targets_nonscored.columns if x != 'sig_id']
all_target_cols = target_cols + aux_target_cols

num_targets = len(target_cols)
num_aux_targets = len(aux_target_cols)
num_all_targets = len(all_target_cols)

print('num_targets: {}'.format(num_targets))
print('num_aux_targets: {}'.format(num_aux_targets))
print('num_all_targets: {}'.format(num_all_targets))

num_targets: 206
num_aux_targets: 402
num_all_targets: 608


In [30]:
def process_data(data):
    """Return `data` with `cp_time` and `cp_dose` one-hot encoded.

    Args:
        data: DataFrame containing the `cp_time` and `cp_dose` columns.

    Returns:
        A new DataFrame where those two columns are replaced by indicator columns.
    """
    categorical_cols = ['cp_time', 'cp_dose']
    return pd.get_dummies(data, columns=categorical_cols)

In [31]:
feature_cols = [c for c in process_data(train).columns if c not in all_target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold', 'sig_id', 'drug_id']]
num_features = len(feature_cols)
num_features

1240

In [32]:
train = process_data(train)
test = process_data(test)

In [33]:
class MoADataset:
    """Map-style dataset pairing feature rows with target rows.

    Both arrays are indexed as ``array[idx, :]``; each item is a dict holding
    float tensors under the keys ``'x'`` (features) and ``'y'`` (targets).
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        feature_row = self.features[idx, :]
        target_row = self.targets[idx, :]
        return {
            'x': torch.tensor(feature_row, dtype=torch.float),
            'y': torch.tensor(target_row, dtype=torch.float),
        }
    
class TestDataset:
    """Map-style dataset over a feature array only (no targets).

    Each item is a dict with a single float tensor under the key ``'x'``,
    built from ``features[idx, :]``.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        feature_row = self.features[idx, :]
        return {'x': torch.tensor(feature_row, dtype=torch.float)}
    
    

def inference_fn(model, dataloader, device):
    """Run `model` over `dataloader` and return sigmoid outputs as one array.

    Args:
        model: Module called on each batch; switched to eval mode first.
        dataloader: Iterable of dicts whose ``'x'`` entry is the input tensor.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        NumPy array made by concatenating the per-batch sigmoid outputs.
    """
    model.eval()
    batch_probs = []

    with torch.no_grad():
        for batch in dataloader:
            raw_scores = model(batch['x'].to(device))
            batch_probs.append(torch.sigmoid(raw_scores).detach().cpu().numpy())

    return np.concatenate(batch_probs)

In [34]:
# HyperParameters
# DEVICE and BATCH_SIZE are read by make_predictions during inference.
# NOTE(review): EPOCHS, WEIGHT_DECAY, MAX_LR, DIV_FACTOR and PCT_START are not
# referenced by any cell visible in this notebook — presumably leftovers from
# the training notebook; confirm before removing.

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 24
BATCH_SIZE = 128

WEIGHT_DECAY = {'ALL_TARGETS': 1e-5, 'SCORED_ONLY': 3e-6}
MAX_LR = {'ALL_TARGETS': 1e-2, 'SCORED_ONLY': 3e-3}
DIV_FACTOR = {'ALL_TARGETS': 1e3, 'SCORED_ONLY': 1e2}
PCT_START = 0.1

class Model(nn.Module):
    """Five-layer fully connected network producing raw (pre-sigmoid) scores.

    Layout: batch-norm -> linear for the input block, then three hidden
    blocks of batch-norm -> dropout -> linear (each followed by leaky ReLU),
    and a final batch-norm -> dropout -> weight-normalised linear head.

    Attribute names and creation order are part of the checkpoint format
    (state-dict keys), so they must not be renamed or reordered.
    """

    def __init__(self, num_features, num_targets):
        super().__init__()
        self.hidden_size = [1500, 1250, 1000, 750]
        self.dropout_value = [0.5, 0.35, 0.3, 0.25]
        width1, width2, width3, width4 = self.hidden_size
        drop2, drop3, drop4, drop5 = self.dropout_value

        # Input block (no dropout on the raw features).
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.Linear(num_features, width1)

        # Hidden blocks.
        self.batch_norm2 = nn.BatchNorm1d(width1)
        self.dropout2 = nn.Dropout(drop2)
        self.dense2 = nn.Linear(width1, width2)

        self.batch_norm3 = nn.BatchNorm1d(width2)
        self.dropout3 = nn.Dropout(drop3)
        self.dense3 = nn.Linear(width2, width3)

        self.batch_norm4 = nn.BatchNorm1d(width3)
        self.dropout4 = nn.Dropout(drop4)
        self.dense4 = nn.Linear(width3, width4)

        # Output head, wrapped in weight normalisation.
        self.batch_norm5 = nn.BatchNorm1d(width4)
        self.dropout5 = nn.Dropout(drop5)
        self.dense5 = nn.utils.weight_norm(nn.Linear(width4, num_targets))

    def forward(self, x):
        x = F.leaky_relu(self.dense1(self.batch_norm1(x)))

        hidden_blocks = (
            (self.batch_norm2, self.dropout2, self.dense2),
            (self.batch_norm3, self.dropout3, self.dense3),
            (self.batch_norm4, self.dropout4, self.dense4),
        )
        for norm, drop, dense in hidden_blocks:
            x = F.leaky_relu(dense(drop(norm(x))))

        # No activation on the head: callers apply the sigmoid themselves.
        return self.dense5(self.dropout5(self.batch_norm5(x)))
    
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed one-hot target distribution.

    The true class receives weight ``1 - smoothing`` and every other entry
    receives ``smoothing / (classes - 1)``.

    Args:
        classes: Number of classes used to spread the smoothing mass.
        smoothing: Probability mass moved away from the true class.
        dim: Dimension over which the log-softmax and the sum are taken.
    """

    def __init__(self, classes, smoothing=0.0, dim=-1):
        super().__init__()
        self.cls = classes
        self.dim = dim
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, pred, target):
        log_probs = pred.log_softmax(dim=self.dim)

        with torch.no_grad():
            off_class_weight = self.smoothing / (self.cls - 1)
            true_dist = torch.full_like(log_probs, off_class_weight)
            # The scatter is always along dimension 1, independent of self.dim.
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)

        per_sample = torch.sum(-true_dist * log_probs, dim=self.dim)
        return torch.mean(per_sample)
    
    


In [35]:
def make_predictions(model_path):
    """Load one saved `Model` checkpoint and predict on the processed test set.

    Reads the notebook globals `test`, `feature_cols`, `num_features`,
    `num_targets`, `BATCH_SIZE` and `DEVICE`.

    Args:
        model_path: Path of a state-dict file saved for `Model`.

    Returns:
        NumPy array of sigmoid-activated predictions, as produced by
        `inference_fn`, in the row order of `test`.
    """
    x_test = test[feature_cols].values
    testdataset = TestDataset(x_test)
    # shuffle=False keeps predictions aligned with the rows of `test`.
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
    )

    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    # The former np.zeros pre-allocation was dead code: it was overwritten
    # immediately and depended on the unrelated global `target` left over
    # from the TabNet section.
    return inference_fn(model, testloader, DEVICE)

In [45]:
def create_submission_NN(model_dir="../input/results_NN/", n_folds=7, n_seeds=7,
                         sample_submission_path="../input/lish-moa/sample_submission.csv"):
    """Average every fold/seed checkpoint and build the NN submission frame.

    Reads the notebook globals `test` and `target_cols`, and calls
    `make_predictions` once per checkpoint. Called with no arguments it uses
    the same paths and counts that were previously hard-coded.

    Args:
        model_dir: Directory holding ``SCORED_ONLY_FOLD{fold}_SEED{seed}.pth`` files.
        n_folds: Number of folds per seed (fold indices ``0..n_folds-1``).
        n_seeds: Number of seeds (seed indices ``0..n_seeds-1``).
        sample_submission_path: CSV that defines the submission row order.

    Returns:
        DataFrame with ``sig_id`` plus one column per target; rows of the
        sample submission that have no prediction are filled with 0.
    """
    n_models = n_folds * n_seeds
    predictions = np.zeros((len(test), len(target_cols)))

    for seed in range(n_seeds):
        print("seed:", seed)
        for fold in range(n_folds):
            model_path = os.path.join(model_dir, f"SCORED_ONLY_FOLD{fold}_SEED{seed}.pth")
            # Equal-weight average, accumulated one checkpoint at a time.
            predictions += make_predictions(model_path) / n_models

    pred_df = pd.DataFrame(data = predictions, columns = target_cols)
    # Relies on `test` having a 0..n-1 index so ids line up with prediction rows.
    pred_df = pd.concat((test["sig_id"], pred_df), axis = 1)
    sample_submission = pd.read_csv(sample_submission_path)
    # The left merge keeps every submission row; ids absent from `test` become NaN -> 0.
    sub = sample_submission.drop(columns=target_cols).merge(pred_df[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
    return sub

In [46]:
submission_NN = create_submission_NN()

seed: 0
seed: 1
seed: 2
seed: 3
seed: 4
seed: 5
seed: 6


In [48]:
check_submission(submission_NN, "../input/results_NN/submission.csv") 

# Resnet

In [51]:
# Import train data, drop sig_id, cp_type
import tensorflow.keras as keras
import json
# Raw competition tables for the ResNet branch (re-read from disk).
train_data = pd.read_csv('../input/lish-moa/train_features.csv')
test_data = pd.read_csv('../input/lish-moa/test_features.csv')
targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

# Column names stored under the 'start_predictors' key of the JSON file.
json_file_path = '../input/main_predictors.json'
with open(json_file_path, 'r') as j:
    predictors = json.load(j)['start_predictors']


def preprocess_data_resnet(train_data, test_data, targets, predictors):
    """Build the two NumPy feature inputs (plus targets) for the ResNet models.

    Steps, in order: quantile-transform the g-/c- columns to a normal
    distribution (fit on train, applied to test), append 100 gene PCA and
    10 cell PCA components, drop ``ctl_vehicle`` rows and ``cp_type``,
    one-hot encode ``cp_time``/``cp_dose``, drop ``sig_id``, and select the
    `predictors` columns as a second input.

    Note: the quantile transform is assigned back into the passed-in frames,
    so the caller's `train_data` and `test_data` objects are modified.

    Args:
        train_data: Raw train features DataFrame (with sig_id, cp_type, ...).
        test_data: Raw test features DataFrame with the same columns.
        targets: Scored targets DataFrame including ``sig_id``.
        predictors: Column labels used to build the second input.

    Returns:
        Tuple ``(train_data, train_data2, test_data, test_data2, targets)``
        of NumPy arrays; the ``*2`` arrays hold only the `predictors` columns.
    """
    # Names of the 772 gene and 100 cell columns that get quantile-transformed.
    columns_numeric = ["g-" + str(i) for i in range(772)] + ["c-" + str(i) for i in range(100)]
    GENES = [col for col in train_data.columns if col.startswith('g-')]
    CELLS = [col for col in train_data.columns if col.startswith('c-')]
    
    
    print("performing Rank Gauss")
    # Rank Gauss: fit on train only, then apply the same mapping to test.
    transformer = QuantileTransformer(n_quantiles=1000,random_state=0, output_distribution="normal")
    train_data[columns_numeric] = transformer.fit_transform(train_data[columns_numeric])
    test_data[columns_numeric] = transformer.transform(test_data[columns_numeric])

    
    # PCA features (fit on train, applied to test)
    print("performing PCA on GENES")
    # Gene columns
    n_comp_genes = 100
    pca_genes = PCA(n_components=n_comp_genes, random_state=42)
    train_pca_genes = pca_genes.fit_transform(train_data[GENES])
    test_pca_genes = pca_genes.transform(test_data[GENES])
    # Append the components as new columns pca_G-0 .. pca_G-99.
    train_pca_genes = pd.DataFrame(train_pca_genes, columns=[f'pca_G-{i}' for i in range(n_comp_genes)])
    test_pca_genes = pd.DataFrame(test_pca_genes, columns=[f'pca_G-{i}' for i in range(n_comp_genes)])
    train_data = pd.concat((train_data, train_pca_genes), axis=1)
    test_data = pd.concat((test_data, test_pca_genes), axis=1)
    print("after PCA on GENES, number of features:", train_data.shape[1])
    
    
    print("performing PCA on CELLS")
    # Cell columns
    n_comp_cells = 10
    pca_cells = PCA(n_components=n_comp_cells, random_state=42)
    train_pca_cells = pca_cells.fit_transform(train_data[CELLS])
    test_pca_cells = pca_cells.transform(test_data[CELLS])
    # Append the components as new columns.
    # NOTE(review): these are labelled 'pca_G-{i}' too, so pca_G-0..pca_G-9 now
    # exist twice in the frame. Looks like a copy-paste slip, but renaming would
    # change what `train_data[predictors]` selects if `predictors` contains any
    # of these labels — confirm against the saved models before changing.
    train_pca_cells = pd.DataFrame(train_pca_cells, columns=[f'pca_G-{i}' for i in range(n_comp_cells)])
    test_pca_cells = pd.DataFrame(test_pca_cells, columns=[f'pca_G-{i}' for i in range(n_comp_cells)])
    train_data = pd.concat((train_data, train_pca_cells), axis=1)
    test_data = pd.concat((test_data, test_pca_cells), axis=1)
    print("after PCA on CELLS, number of features:", train_data.shape[1])
    
    # This extended list is not read again anywhere below in this function.
    columns_numeric = columns_numeric + [f'pca_G-{i}' for i in range(n_comp_cells)] + [f'pca_G-{i}' for i in range(n_comp_genes)]
    
    print("dropping cp_type and rows with cp_type = ctl_vehicle from train, test and target dataframes")
    train_data = train_data[train_data["cp_type"] != "ctl_vehicle"].drop("cp_type", axis = 1)
    test_data = test_data[test_data["cp_type"] != "ctl_vehicle"].drop("cp_type", axis = 1)
    # Merge on sig_id, then keep only the target columns of the merged frame.
    features_labels = train_data.merge(targets, on='sig_id')
    targets = features_labels[targets.columns]  
    train_data = pd.get_dummies(train_data, columns=['cp_time','cp_dose'])
    test_data = pd.get_dummies(test_data, columns=['cp_time','cp_dose'])
    
    targets.drop("sig_id", inplace = True, axis=1)
    train_data.drop("sig_id", inplace = True, axis=1)
    test_data.drop("sig_id", inplace = True, axis=1)
    # Second input: only the columns named in `predictors`.
    train_data2 = train_data[predictors]
    test_data2 = test_data[predictors]
    
    train_data = train_data.to_numpy()
    test_data = test_data.to_numpy()
    targets = targets.to_numpy()
    train_data2 = train_data2.to_numpy()
    test_data2 = test_data2.to_numpy()
    
    return train_data, train_data2, test_data, test_data2,  targets



train_data, train_data2, test_data, test_data2,  targets = preprocess_data_resnet(train_data, test_data, targets, predictors)


# Prediction Clipping Thresholds

p_min = 0.0005
p_max = 0.9995

# OOF Evaluation Metric with clipping and no label smoothing

def logloss(y_true, y_pred):
    """Mean binary log loss with predictions clipped to [p_min, p_max].

    Passed to `keras.models.load_model` as a custom object. `p_min` and
    `p_max` are read from the notebook globals.

    Args:
        y_true: Tensor of 0/1 labels.
        y_pred: Tensor of predicted probabilities, same shape as `y_true`.

    Returns:
        Scalar tensor holding the mean negative log-likelihood.
    """
    # Imported here because no visible cell binds `tf` or `backend`; without
    # these the function raises NameError the first time it is called.
    import tensorflow as tf
    from tensorflow.keras import backend

    y_pred = tf.clip_by_value(y_pred,p_min,p_max)
    return -backend.mean(y_true*backend.log(y_pred) + (1-y_true)*backend.log(1-y_pred))

n_test = test_data.shape[0]
n_labels = targets.shape[1]

n_folds = 10
# Running equal-weight average over every (seed, fold) model.
y_pred = np.zeros((n_test,n_labels))

# Seeds used in the saved model directory names. n_seeds is derived from this
# range so the averaging denominator cannot drift from the models actually loaded.
resnet_seeds = range(10, 17)
n_seeds = len(resnet_seeds)
# A dedicated loop variable keeps the notebook-level `seed` from being overwritten.
for resnet_seed in resnet_seeds:
    print("current seed:", resnet_seed)
    for fold in range(n_folds):
        model = keras.models.load_model("../input/results_resnet/TwoHeads_seed_" + str(resnet_seed) + "_fold_" + str(fold), custom_objects={'logloss':logloss})
        # The model takes two inputs: the full feature matrix and the predictor subset.
        y_pred += model.predict([test_data, test_data2]) / (n_folds * n_seeds)



performing Rank Gauss
performing PCA on GENES
after PCA on GENES, number of features: 976
performing PCA on CELLS
after PCA on CELLS, number of features: 986
dropping cp_type and rows with cp_type = ctl_vehicle from train, test and target dataframes
current seed: 10
current seed: 11
current seed: 12
current seed: 13
current seed: 14
current seed: 15
current seed: 16


In [55]:
sub_resnet = pd.read_csv('../input/lish-moa/sample_submission.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
y_pred = np.clip(y_pred,p_min,p_max)

# Select the target columns by label (everything after sig_id). An integer
# slice such as `1:` inside .loc is not valid on string column labels in
# current pandas, so the labels are passed explicitly.
# Rows are matched positionally: y_pred holds one row per non-control test row.
sub_resnet.loc[test_features['cp_type'] != 'ctl_vehicle', sub_resnet.columns[1:]] = y_pred

# Set ctl_vehicle to 0
sub_resnet.loc[test_features['cp_type'] == 'ctl_vehicle', sub_resnet.columns[1:]] = 0

# Save Submission
#sub_resnet.to_csv('submission_resnet.csv', index=False)
check_submission(sub_resnet, "../input/results_resnet/submission.csv") 

In [56]:
# Blend weights, applied in this order: NN, TabNet, ResNet.
weight = [0.33387592693072893, 0.3345993850605896, 0.3315246880086815]


# The sample submission supplies sig_id and the column layout.
submission = pd.read_csv('../input/lish-moa/sample_submission.csv')
# Zero the targets first; the next statement overwrites the same cells.
submission.iloc[:, 1:] = 0
# Weighted sum of the three model submissions.
# NOTE(review): assumes all three frames share the sample submission's row order
# and column labels — sig_id agreement between them is not checked here.
submission.iloc[:, 1:] = submission_NN.iloc[:, 1:] * weight[0] + submission_tabnet.iloc[:, 1:] * weight[1] + sub_resnet.iloc[:, 1:] * weight[2]

#submission.to_csv('submission.csv', index=False)

In [57]:
print(submission.shape)
submission.head()

(3982, 207)


Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000859,0.001217,0.002069,0.01711,0.021275,0.004772,0.002751,0.005552,0.000416,...,0.001143,0.001231,0.003854,0.001197,0.000916,0.000652,0.000826,0.00183,0.003465,0.001496
1,id_001897cda,0.000555,0.000967,0.001594,0.002448,0.001468,0.001771,0.005085,0.013294,0.025995,...,0.000712,0.000968,0.003476,0.000585,0.007731,0.000576,0.005445,0.001285,0.004174,0.003499
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.000802,0.000869,0.002009,0.015152,0.018656,0.005114,0.00324,0.005064,0.000424,...,0.000679,0.001453,0.003222,0.024122,0.005562,0.000658,0.001538,0.002031,0.001259,0.002222
4,id_0027f1083,0.001796,0.001289,0.001706,0.018504,0.023352,0.004596,0.004608,0.002297,0.000487,...,0.001035,0.000773,0.003514,0.001878,0.00126,0.000735,0.001164,0.00187,0.000675,0.001305
