In [1]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/test_features.csv


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics import log_loss
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

In [3]:
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored  = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
s_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [4]:
def average_log_loss(y_true, y_pred):
    print(y_true.shape, y_pred.shape)
    num_samples, num_outputs = y_true.shape
    loss = 0.00
    for i in range(num_outputs):
        loss += log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
    loss /= num_outputs
    return loss

In [5]:
def preprocess(df):
    df = df.drop(columns=['sig_id'])
    df.cp_dose = df.cp_dose.map({'D1': -1, 'D2': 1})
    df.cp_time = df.cp_time.map({24: -1, 48: 0, 72: 1})
    df.cp_type = df.cp_type.map({'trt_cp': -1, 'ctl_vehicle': 1})
    return df

In [6]:
train_features = preprocess(train_features)
train_targets_scored = train_targets_scored.drop(columns=['sig_id'])
test_features = preprocess(test_features)

targets_np = train_targets_scored.to_numpy()

In [7]:
g_cols = [col for col in train_features.columns if col.startswith('g-')]
c_cols = [col for col in train_features.columns if col.startswith('c-')]
cp_cols = [col for col in train_features.columns if col.startswith('cp_')]

def scaler_and_PCA(pca_num_components, train, test):
    data = np.concatenate((train, test), axis=0)
    n = train.shape[0]
    
    # variance threshold
    selector = VarianceThreshold(threshold=0.8)
    data = selector.fit_transform(data)
    
    # scale
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)

    # PCA
    pca = PCA(pca_num_components)
    pca_data = pca.fit_transform(scaled_data)

    train_trans = pca_data[:n, :]
    test_trans = pca_data[n:, :]

    return train_trans, test_trans


train_X_g = train_features[g_cols].to_numpy()
test_X_g = test_features[g_cols].to_numpy()
train_X_g, test_X_g = scaler_and_PCA(80, train_X_g, test_X_g)

train_X_c = train_features[c_cols].to_numpy()
test_X_c = test_features[c_cols].to_numpy()
train_X_c, test_X_c = scaler_and_PCA(20, train_X_c, test_X_c)

features_np = np.concatenate((train_features[cp_cols].to_numpy(), train_X_g, train_X_c), axis=1)
test_np = np.concatenate((test_features[cp_cols].to_numpy(), test_X_g, test_X_c), axis=1)
print('Shape after scaler and PCA', features_np.shape)

Shape after scaler and PCA (23814, 103)


In [8]:
best_model = None
best_loss = 999999999999999999
kf = KFold(n_splits=5)
j = 1
for train_indices, val_indices in kf.split(features_np):
    X_train, Y_train = features_np[train_indices, :], targets_np[train_indices, :]
    X_val, Y_val = features_np[val_indices, :], targets_np[val_indices, :]

    all_categories = list(train_targets_scored.columns)
    model_dict = {}
    for i in tqdm(range(206)):
        if Y_train[:, i].max() == 0:
            model_dict[all_categories[i]] = logistic_model
        else:
            logistic_model = LogisticRegression(C=0.1, max_iter=1000, class_weight={0: 0.4, 1: 0.6})
            logistic_model.fit(X_train, Y_train[:, i])
            model_dict[all_categories[i]] = logistic_model
    Y_pred = np.zeros(Y_val.shape)
    i = 0
    for category in tqdm(all_categories):
        Y_pred[:, i] = np.copy(model_dict[category].predict_proba(X_val)[:, 1])
        i += 1
    cur_loss = average_log_loss(Y_val, Y_pred)
    print('Log_loss', j, cur_loss)
    if cur_loss < best_loss:
        best_model = model_dict
        best_loss = cur_loss
    j += 1

print('Best loss is:', best_loss)

100%|██████████| 206/206 [00:58<00:00,  3.54it/s]
100%|██████████| 206/206 [00:00<00:00, 996.73it/s] 


(4763, 206) (4763, 206)
Log_loss 1 0.01649091835483895


100%|██████████| 206/206 [01:00<00:00,  3.43it/s]
100%|██████████| 206/206 [00:00<00:00, 918.55it/s]


(4763, 206) (4763, 206)
Log_loss 2 0.01668314475610082


100%|██████████| 206/206 [00:57<00:00,  3.56it/s]
100%|██████████| 206/206 [00:00<00:00, 1052.86it/s]


(4763, 206) (4763, 206)
Log_loss 3 0.016552801681052536


100%|██████████| 206/206 [00:59<00:00,  3.49it/s]
100%|██████████| 206/206 [00:00<00:00, 983.55it/s] 


(4763, 206) (4763, 206)
Log_loss 4 0.016740139663352253


100%|██████████| 206/206 [00:55<00:00,  3.70it/s]
100%|██████████| 206/206 [00:00<00:00, 1024.18it/s]


(4762, 206) (4762, 206)
Log_loss 5 0.016826482064448
Best loss is: 0.01649091835483895


In [9]:
Y_res = s_submission.drop(columns=['sig_id']).to_numpy()
i = 0
all_categories = list(train_targets_scored.columns)
for category in tqdm(all_categories):
    Y_res[:, i] = np.copy(best_model[category].predict_proba(test_np)[:, 1])
    i += 1
# POSTPROCESS
for i in range(test_np.shape[0]):
    if test_np[i][0] == 1:
        Y_res[i, :] = np.zeros(Y_res.shape[1])
s_res = pd.DataFrame(Y_res, columns=all_categories)
s_res = pd.concat([s_submission['sig_id'], s_res], axis=1)
s_res.to_csv('submission.csv', index=False)

100%|██████████| 206/206 [00:00<00:00, 1225.78it/s]
