In [8]:
import numpy as np
import pandas as pd

# 1. Function

In [9]:
def blend(PATHS, aggregator):
    '''
    Blend several submission files into one dataframe with a row-wise aggregate.

    Every file is read with pd.read_csv and is expected to provide a 'SUB_ID'
    column and a 'DATA_VALUE' column. The 'DATA_VALUE' column of the i-th file
    (1-based) is renamed to 'DATA_VALUE_<i>', the files are joined on 'SUB_ID',
    and the aggregate over all 'DATA_VALUE_<i>' columns is stored in a new
    'DATA_VALUE' column.

    @param PATHS        non-empty sequence of filepaths to submission files
    @param aggregator   callable, string, dictionary, or list of string/callables;
                        passed unchanged to DataFrame.agg(aggregator, axis=1)

                        see: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.aggregate.html

                        Function to use for aggregating the data.
                        If a function, must either work when passed a
                        DataFrame or when passed to DataFrame.apply.
                        For a DataFrame, can pass a dict, if the keys
                        are DataFrame column names.

                        Accepted Combinations are:

                        string function name
                        function
                        list of functions
                        dict of column names -> functions (or list of functions)

                        NOTE(review): the result is assigned to the single
                        'DATA_VALUE' column, so the aggregator should reduce
                        each row to one value (e.g. 'mean'). List/dict forms
                        that yield several values per row presumably cannot be
                        assigned here - verify before relying on them.

    @return             DataFrame holding 'SUB_ID', one 'DATA_VALUE_<i>' column
                        per file (plus any other columns the files carry) and
                        the aggregated 'DATA_VALUE' column

    @raise ValueError   if PATHS is empty
    '''

    if len(PATHS) == 0:
        raise ValueError('PATHS must contain at least one submission file')

    df_joined = None
    for i, path in enumerate(PATHS, start=1):
        # Each file is read exactly once; the first one also seeds the id column.
        df = pd.read_csv(path).rename(columns={'DATA_VALUE': 'DATA_VALUE_' + str(i)})
        if df_joined is None:
            df_joined = df[['SUB_ID']].copy()
        # how='right' keeps the keys of the frame being joined in, so after the
        # loop the rows follow the SUB_IDs of the last file.
        df_joined = df_joined.join(df.set_index('SUB_ID'), on='SUB_ID', how='right')

    column_names = ['DATA_VALUE_' + str(i) for i in range(1, len(PATHS) + 1)]

    # Row-wise aggregation over the per-file prediction columns only.
    df_joined['DATA_VALUE'] = df_joined[column_names].agg(aggregator, axis=1)

    return df_joined

def export(df, PATH):
    ''' Writes the submission columns of an aggregated dataframe to a CSV file. '''
    submission_columns = ['SUB_ID', 'DATA_VALUE']
    # Only the id and the blended value are written; the row index is left out.
    df.loc[:, submission_columns].to_csv(PATH, index=False)

In [13]:
# Example Usage

PATHS = [
    '../data/predictions/prediction_boosting_clustered_MAX.csv',
    '../data/predictions/prediction_simple_random_forest_clustered_MAX copy.csv',
    '../data/predictions/prediction_simple_random_forest_MAX copy.csv'
]

# Blend the listed prediction files using the row-wise mean.
df_aggr = blend(PATHS, 'mean')

export(df_aggr, '../data/predictions/aggregated_submission.csv')

# A bare expression is only rendered when it is the last statement of a cell,
# so the preview goes at the end instead of in the middle.
df_aggr.head()