In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
# Per-model prediction files for the training rows, read in a fixed order.
train_lr, train_gru, train_lstm = (
    pd.read_csv(csv_path)
    for csv_path in (
        './train_nb_logistic_regression_100.csv',
        './train_gru_106.csv',
        './train_lstm_100.csv',
    )
)

# Model name -> that model's training-row predictions. Insertion order matters:
# later cells enumerate the keys and index the blending weights by position.
train_dict = dict(zip(
    ('logistic_regression', 'gru', 'lstm'),
    (train_lr, train_gru, train_lstm),
))

In [None]:
# Per-model submission files (predictions to be blended), read in the same
# model order as the training-row predictions.
val_lr, val_gru, val_lstm = (
    pd.read_csv(csv_path)
    for csv_path in (
        './submission_nb_logistic_regression_100.csv',
        './submission_gru_106.csv',
        './submission_lstm_100.csv',
    )
)

# Model name -> that model's submission predictions, using the same keys as
# `train_dict` so both can be looked up by model name.
val_dict = dict(zip(
    ('logistic_regression', 'gru', 'lstm'),
    (val_lr, val_gru, val_lstm),
))

In [None]:
# Ground-truth frame; its label columns are scored against the model predictions.
# NOTE(review): rows are presumably in the same order as the train_* prediction
# files, since they are compared positionally -- confirm.
train_targets = pd.read_csv(filepath_or_buffer='./input/train.csv')

In [None]:
# Columns to blend: every column of the GRU submission file except the row id.
coly = [label for label in val_gru.columns if label != 'id']

In [None]:
def _item():
    return (0., [])

def coef_seacrh_recursive(col_name, 
                          models, 
                          prev_coefs=None, 
                          coef_dict=None, 
                          cum_pred=None,
                          step=0.01):
    """Grid-search blending weights (summing to 1) for one label column.

    The models are consumed from the end of `models`: every model except the
    first gets each weight on a `step` grid that fits into the remaining
    weight mass, and the first model receives whatever is left. Each complete
    weight vector is scored with ROC-AUC of the weighted sum of the models'
    `train_dict` predictions against `train_targets[col_name]`; the best
    vector seen so far is kept in `coef_dict[col_name]`.

    Parameters
    ----------
    col_name : str
        Label column to optimise; read from every `train_dict[model]` frame
        and from `train_targets`.
    models : list of str
        Keys of `train_dict` still to be assigned a weight.
    prev_coefs : list of float, optional
        Weights already assigned to the models that followed `models` in the
        original list, in that order. Top-level callers leave this unset.
    coef_dict : dict-like, optional
        Mapping `col_name -> (best_score, best_weights)` updated in place.
        A fresh `defaultdict(_item)` is created when omitted.
    cum_pred : array-like, optional
        Weighted sum of the predictions of the models already assigned.
    step : float, optional
        Grid resolution for the weights (default 0.01).

    Returns
    -------
    dict-like
        `coef_dict`. The stored weight list is ordered like the top-level
        `models` list, so position `i` belongs to `models[i]`.

    Raises
    ------
    ValueError
        If `step` is not positive.

    Notes
    -----
    Predictions and targets are combined positionally.
    NOTE(review): this presumes the prediction files and `train_targets` share
    the same row order -- confirm.
    """
    if step <= 0:
        raise ValueError('step must be positive, got %r' % (step,))
    # Build fresh containers per call; default-argument objects would be
    # shared between unrelated top-level calls.
    if prev_coefs is None:
        prev_coefs = []
    if coef_dict is None:
        coef_dict = defaultdict(_item)

    # Weight mass still available; clamp so float round-off never yields a
    # tiny negative weight.
    left_coefs = max(0., 1 - sum(prev_coefs + [0.]))
    X_test = train_dict[models[-1]][col_name]
    y_test = train_targets[col_name]

    if cum_pred is None:
        cum_pred = np.zeros_like(X_test, dtype=float)

    if len(models) == 1:
        # Last remaining model takes all the leftover weight.
        curr_pred = cum_pred + left_coefs * X_test
        score = roc_auc_score(y_test, curr_pred)

        if coef_dict[col_name][0] < score:
            # Prepend so the stored weights follow `models` order, which is
            # how callers index them.
            coef_dict[col_name] = (score, [left_coefs] + prev_coefs)

        return coef_dict

    # Integer step count instead of np.arange with a float step, which can
    # emit one value past the intended upper bound.
    n_steps = int(round(left_coefs / step))
    for k in range(n_steps + 1):
        coef = min(k * step, left_coefs)
        curr_pred = cum_pred + coef * X_test
        coef_seacrh_recursive(col_name, 
                              models[:-1], 
                              [coef] + prev_coefs,
                              coef_dict,
                              curr_pred,
                              step)

    return coef_dict

In [None]:
# Run the weight search for every label column and report the best blend.
coef_dict = defaultdict(_item)
scores = []
model_names = list(train_dict.keys())
for col_name in coly:
    coef_seacrh_recursive(col_name, model_names, coef_dict=coef_dict)
    best_score, best_weights = coef_dict[col_name]
    print('%s:' % col_name)
    print('\tROC-AUC: %s' % best_score)
    # Weights are matched to models by position in train_dict's key order.
    # NOTE(review): this presumes the search stores weights in that same
    # order -- verify against coef_seacrh_recursive.
    for idx, name in enumerate(model_names):
        print('\t\t%s - %s' % (name, best_weights[idx]))
    scores.append(best_score)
# Mean of the per-column best ROC-AUC values.
print('Total score: %s' % np.mean(scores))

In [None]:
# Display the search result: label column -> (best ROC-AUC, list of blending weights).
coef_dict

In [None]:
# Load the test rows, replacing missing values with the placeholder string.
test = pd.read_csv('./input/test.csv')
test = test.fillna(value='Unknown')

# Start the submission frame with only the row identifiers; the blended
# label columns are added to it later.
submission = pd.DataFrame({'id': test['id']})

In [None]:
# Build each submission column as the weighted sum of the models' clipped
# predictions, then clip the blend itself into the same [1e-3, 1 - 1e-3] range.
for col_name in coly:
    cum_sum = None
    for idx, name in enumerate(train_dict.keys()):
        # Weight looked up by the model's position in train_dict's key order.
        # NOTE(review): presumes the stored weights follow that order -- verify
        # against coef_seacrh_recursive.
        weight = coef_dict[col_name][1][idx]
        clipped = np.clip(val_dict[name][col_name], 1e-3, 1 - 1e-3)
        add = clipped * weight
        cum_sum = add if cum_sum is None else cum_sum + add

    submission[col_name] = np.clip(cum_sum, 1e-3, 1 - 1e-3)

In [None]:
# Write the blended predictions without the DataFrame index column.
submission.to_csv(path_or_buf='ensemble_004.csv', index=False)