In [2]:
import os
from pdb import set_trace as bp
from functools import reduce

import numpy as np
import math
import pandas as pd

from constant import target_indices

In [1]:
def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    The largest element is subtracted before exponentiating. This does not
    change the mathematical result, but it keeps ``np.exp`` from overflowing
    to ``inf`` (which would turn the quotient into ``nan``) for large inputs.

    Args:
        x: Array-like of numbers (list, ``np.ndarray`` or ``pd.Series``).

    Returns:
        The exponentiated values divided by their total, so they sum to 1.
        A ``pd.Series`` input yields a ``pd.Series`` with the same index;
        lists and arrays yield an ``np.ndarray``.
    """
    if np.size(x) == 0:
        # Nothing to normalise, and np.max would raise on an empty input.
        return np.exp(x)
    # np.subtract (rather than ``x - ...``) also accepts plain Python lists.
    shifted = np.subtract(x, np.max(x))
    exps = np.exp(shifted)
    return exps / np.sum(exps)


def divide_by_sum(df):
    """Scale every row of ``df`` by that row's total.

    Args:
        df: DataFrame of numeric values.

    Returns:
        A new DataFrame with the same index and columns in which each value
        has been divided by the sum of its row.
    """
    # One total per row (summing across the columns).
    row_totals = df.sum(axis=1)
    # axis=0 aligns the totals with the row index so each row gets its own divisor.
    normalised = df.div(row_totals, axis=0)
    return normalised


def divide_by_softmax(df):
    """Apply ``softmax`` to each row of ``df``.

    Args:
        df: DataFrame whose rows are passed to ``softmax`` one at a time.

    Returns:
        The result of ``DataFrame.apply`` over the rows, i.e. one softmax
        output per input row.
    """
    # axis=1 hands each row (as a Series) to softmax.
    row_wise = df.apply(softmax, axis=1)
    return row_wise


def get_weights_by_normal_distribution(pbs):
    """Turn a 1-D list of scores into weights via the standard normal CDF.

    Each score is standardised (z-score against the mean and population
    standard deviation of ``pbs``), mapped through the standard normal CDF,
    and the resulting values are normalised to sum to 1. Higher scores
    therefore receive larger weights.

    Args:
        pbs: 1-D sequence of numeric scores.

    Returns:
        ``np.ndarray`` of weights, one per score, summing to 1. When all
        scores are identical (including a single score) the weights are
        uniform, because the z-score is undefined for zero variance. An empty
        input yields an empty array.
    """
    pbs = np.asarray(pbs, dtype=float)
    if pbs.size == 0:
        # Avoid the mean-of-empty warning; there is nothing to weight.
        return np.array([])

    _mean = np.mean(pbs)
    _std = np.std(pbs)
    if _std == 0:
        # Zero variance would give 0/0 = nan z-scores; no score stands out,
        # so weight every entry equally.
        return np.full(pbs.shape, 1.0 / pbs.size)

    n_std = (pbs - _mean) / _std
    # Standard normal CDF: Phi(z) = 0.5 * (1 + erf(z / sqrt(2))).
    sqrt2 = 2 ** .5
    pvalues = np.array([.5 * (math.erf(n / sqrt2) + 1) for n in n_std])
    weights = pvalues / np.sum(pvalues)
    # weights = softmax(pvalues)
    return weights

In [3]:
def merge_dfs(dfs, models_name, weights,
              save_dir='../save_dir/ensemble_result/brute_force/'):
    """Blend per-model score frames with weights and save the top-3 picks as CSV.

    Each DataFrame in ``dfs`` is multiplied by its weight, the weighted frames
    are summed (labels missing from one frame count as 0), and for every row
    the three highest-scoring columns are written out in descending order.

    Args:
        dfs: Non-empty list of DataFrames with one row per ``chid`` (taken
            from the index) and one column per candidate target. An empty
            list makes ``reduce`` raise ``TypeError``.
        models_name: Model names, one per frame; used only to build the
            output file name.
        weights: Weights indexed in parallel with ``dfs`` and ``models_name``.
        save_dir: Directory that receives the CSV. Created (with parents)
            if it does not exist. Defaults to the previously hard-coded
            location.

    Returns:
        None. Writes ``<save_dir>/<name><weight>_<name><weight>....csv`` with
        the columns ``chid, top1, top2, top3``.

    Raises:
        AssertionError: If the output file already exists, so that earlier
            results are never overwritten. (Note that assertions are skipped
            when Python runs with ``-O``.)
    """
    file_name = '_'.join([f'{name}{weights[i]}' for i, name in enumerate(models_name)])
    # os.path.join avoids the doubled separator produced by concatenating a
    # directory that already ends in '/'.
    save_path = os.path.join(save_dir, f'{file_name}.csv')
    # makedirs also creates missing parents and tolerates an existing directory.
    os.makedirs(save_dir, exist_ok=True)
    assert not os.path.exists(save_path), f'{save_path} already exists'

    # sum weighted logits
    weighted = [df * weights[i] for i, df in enumerate(dfs)]
    df = reduce(lambda x, y: x.add(y, fill_value=0), weighted)

    # get top3: argsort is ascending, so take the last three column positions
    # of each row and flip them to get best-first order.
    topk_df_indices = df.values.argsort(axis=1)[:, -3:]
    topk_df_indices = np.flip(topk_df_indices, axis=1)
    # Map column positions through target_indices and shift by +1.
    # NOTE(review): assumes target_indices is a NumPy array supporting fancy
    # indexing, ordered like the frame columns - confirm against `constant`.
    topk_values = target_indices[topk_df_indices].astype(int) + 1

    # create dataframe: first column is the row index (chid), then the top-3.
    df_top3 = pd.DataFrame(
        data=np.concatenate([np.array(df.index).reshape(-1, 1), topk_values], axis=1),
        columns=['chid', 'top1', 'top2', 'top3']).astype(int).astype(str)

    df_top3.to_csv(save_path, index=False)

In [5]:
# Root directory under which every logits.csv listed below is stored.
ensemble_dir = '../save_dir/ensemble_result'

# One logits file per model.
# NOTE(review): presumably listed in the same order as `models_name` - confirm.
df_paths = [
    f'{ensemble_dir}/mm_cnn_hidden256_5fold/logits.csv',
    f'{ensemble_dir}/mm_hidden256_5fold_test/logits.csv',
    f'{ensemble_dir}/nn3_attn_5fold/logits.csv',
    f'{ensemble_dir}/mm_CnnAggBn_hidden256_5fold/logits.csv',
    f'{ensemble_dir}/mm_nnbn_h192/logits.csv'
]
# Short labels for the models.
models_name = ['MMCnn', 'MMNN', 'nn3Attn', 'CnnAggBn','MMNNBn']

# pbs = [0.72782, 0.72697, 0.72652, 0.72669, 0.72724]
# Read each file, order its rows by chid, index by chid, then apply the
# row-wise softmax.
dfs = [
    divide_by_softmax(
        pd.read_csv(csv_path)
        .sort_values(by='chid')
        .set_index('chid')
    )
    for csv_path in df_paths
]


# merge_dfs([], models_name, [0.1, 0.2, 0.3, 0.4, 0.5])

../save_dir/ensemble_result/MMCnn0.1_MMNN0.2_nn3Attn0.3_CnnAggBn0.4_MMNNBn0.5
