In [2]:
import numpy as np
import pandas as pd

%matplotlib inline

## Params

In [309]:
# Period switch: True selects the 'evaluation' files, False the 'validation' files.
IS_EVAL = False
DATA_PATH = 'data/'

# Base names only: the '_evaluation' / '_validation' suffix is appended automatically.
FORECAST_FILES = [
    'forecast_prophet_store_dpt',
    'forecast_wavenet_store_dpt',
    'lgb_estim_top_down',
]
WEIGHT_FILE = 'lgb_weights'
OUTPUT_NAME = 'top_down_ensemble'

# Suffix shared by every input and output file name of this run.
PERIOD_LABEL = 'evaluation' if IS_EVAL else 'validation'

## Load data

In [310]:
# Every external file of the run ends with the same '_<period>.csv' suffix.
period_suffix = '_' + PERIOD_LABEL + '.csv'

# One DataFrame per forecast source, keyed by the base file name and
# ordered by (store_id, dept_id).
forecast = {}
for FORECAST_FILE in FORECAST_FILES:
    source_frame = pd.read_csv(DATA_PATH + 'external/' + FORECAST_FILE + period_suffix)
    forecast[FORECAST_FILE] = source_frame.sort_values(['store_id', 'dept_id'])

weights = pd.read_csv(DATA_PATH + 'external/' + WEIGHT_FILE + period_suffix)

## Load true solution

In [311]:
# Observed sales for days d_1914..d_1941, summed per (store_id, dept_id) and
# relabelled F1..F28 so they line up with the forecast frames.
# NOTE(review): the day range is fixed regardless of IS_EVAL -- confirm this is
# intended when IS_EVAL is True.
true_day_cols = ['d_%s' % c for c in range(1914, 1942)]

# Read through DATA_PATH like every other input (the path used to be hard-coded),
# and parse only the columns that are actually aggregated.
true_solution = pd.read_csv(DATA_PATH + 'raw/sales_train_evaluation.csv',
                            usecols=['store_id', 'dept_id'] + true_day_cols)
true_solution = true_solution.groupby(['store_id', 'dept_id'])[true_day_cols].sum()
true_solution.columns = ['F%s' % c for c in range(1, 29)]
true_solution = true_solution.reset_index()

# Stored next to the model forecasts under the key 'true'.
forecast['true'] = true_solution

In [312]:
# viz
# Stack all sources; the keys of `forecast` end up in the 'level_0' column.
stacked_sources = pd.concat(forecast).reset_index().drop('level_1', axis=1)

# Long format (one row per source / store / dept / remaining column), then
# back to one column per source.
long_sources = pd.melt(stacked_sources, id_vars=['store_id', 'dept_id', 'level_0'])
explo = long_sources.pivot_table(index=['store_id', 'dept_id', 'variable'],
                                 columns=['level_0']).reset_index()

# Flat column names are assigned by position.
# NOTE(review): this relies on the order in which pivot_table emits the
# 'level_0' columns matching prophet, wavenet, lgb, true -- confirm if
# FORECAST_FILES changes.
explo.columns = ['store_id', 'dept_id', 'd', 'prophet', 'wavenet', 'lgb', 'true']

# 'F1'..'F28' -> integers so that the sort below is numeric, not lexical.
explo['d'] = explo['d'].str.replace('F', '').astype(int)

explo = explo.sort_values(['store_id', 'dept_id', 'd'])


In [313]:
# Constant all-zero "forecast" column, used as the reference prediction.
explo = explo.assign(neutral=0)

In [314]:
def _group_rmse(frame, model):
    """Root-mean-square error of column `model` against column 'true',
    computed separately for every (store_id, dept_id) group of `frame`.

    Returns a DataFrame with columns store_id, dept_id and `model`
    (the RMSE of that group).
    """
    return frame.groupby(['store_id', 'dept_id'])\
                .apply(lambda x: np.sqrt(((x['true'] - x[model])**2).mean()))\
                .reset_index().rename(columns={0: model})


# Same computation for each candidate; previously four copy-pasted pipelines.
prophet_errors = _group_rmse(explo, 'prophet')
wavenet_errors = _group_rmse(explo, 'wavenet')
lgb_errors = _group_rmse(explo, 'lgb')
neutral_errors = _group_rmse(explo, 'neutral')

In [338]:
# Per-(store_id, dept_id) ridge blend of the three model forecasts.
#
# The cross term X^T y is rebuilt from the RMSEs alone, using
#     x.y = 0.5 * (||y||^2 + ||x||^2 - ||y - x||^2)
# with ||y||^2 = n * e0**2 (e0 is the RMSE of the all-zero 'neutral' column)
# and ||y - x||^2 = n * e**2 for each model's RMSE e.
list_ensemble = list()
# Must exist before the loop: it was never initialised, so a fresh kernel
# raised NameError on the first assignment.
weights_dict = dict()
for group_key, gp in explo.groupby(['store_id', 'dept_id']):
    # RMSE of each model and of the all-zero baseline on this group.
    es = np.array([np.sqrt(((gp['true'] - gp[model_col])**2).mean())
                   for model_col in ['prophet', 'wavenet', 'lgb']])
    e0 = np.sqrt(((gp['true'] - gp['neutral'])**2).mean())

    X = gp[['prophet', 'wavenet', 'lgb']].values
    n, m = X.shape  # n rows in the group, m models
    l = 0.0001      # ridge penalty, scaled by n below

    pTy = .5 * (n * e0**2 + (X**2).sum(axis=0) - n * es**2)
    w = np.linalg.pinv(X.T.dot(X) + l * n * np.eye(m)).dot(pTy)
    weights_dict[group_key] = w

    # assign() returns a new frame instead of writing onto the groupby slice,
    # which is what triggered the SettingWithCopyWarning.
    list_ensemble.append(gp.assign(ensemble=X.dot(w)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [340]:
# Stack the per-(store_id, dept_id) frames collected in list_ensemble into one DataFrame.
ensemble = pd.concat(list_ensemble)

In [341]:
# Overall RMSE of each candidate over all rows of `ensemble`.
for model_name in ['prophet', 'wavenet', 'lgb', 'ensemble']:
    residual = ensemble['true'] - ensemble[model_name]
    print(model_name, np.sqrt((residual**2).mean()))

prophet 98.71759349430133
wavenet 110.8929846744209
lgb 118.62957439714238
ensemble 78.41090015930227


In [343]:
# Back to wide format: one row per (store_id, dept_id), one column per value of 'd'.
ensemble = ensemble.pivot_table(
    index=['store_id', 'dept_id'],
    columns=['d'],
    values=['ensemble'],
).reset_index()

# Replace the pivoted column labels with the flat names F1..F28 (assigned by position).
horizon_cols = ['F%s' % c for c in range(1, 29)]
ensemble.columns = ['store_id', 'dept_id'] + horizon_cols

In [344]:
# Submission template.
sample_submission = pd.read_csv(DATA_PATH + 'raw/sample_submission.csv')

# Only the three key columns are needed from the sales table, so skip parsing
# the remaining columns; the trailing selection keeps the column order fixed.
id_cols = ['id', 'store_id', 'dept_id']
submission_ids = pd.read_csv(DATA_PATH + 'raw/sales_train_' + PERIOD_LABEL + '.csv',
                             usecols=id_cols)[id_cols]

## Reshape forecast & weights

In [345]:
# Attach the row `id` to each frame, then drop the grouping columns.
# Both merges join on whatever columns the two frames have in common.
group_cols = ['store_id', 'dept_id']
ensemble = submission_ids.merge(ensemble).drop(columns=group_cols)
weights = submission_ids.merge(weights).drop(columns=group_cols)

## Generate submission

In [346]:
# Element-wise product of the ensemble and the weights, aligned on `id`.
weighted_forecast = ensemble.set_index('id') * weights.set_index('id')

# Overwrite the matching cells of the template; rows of the template whose id
# is absent from the product keep their template values.
res = sample_submission.set_index('id')
res.update(weighted_forecast)
res = res.reset_index()
res

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.968868,0.857337,0.806886,0.916843,0.999603,1.155798,1.355122,1.061533,1.011617,...,0.951224,1.142778,1.050906,0.926373,0.836951,0.799583,0.880296,0.940557,1.084827,1.071882
1,HOBBIES_1_002_CA_1_validation,0.221557,0.209325,0.253616,0.231805,0.259665,0.258121,0.292516,0.211631,0.244296,...,0.228458,0.274063,0.263771,0.250313,0.207450,0.227118,0.219710,0.235970,0.257133,0.248575
2,HOBBIES_1_003_CA_1_validation,0.505714,0.478158,0.464786,0.418200,0.466899,0.561244,0.583049,0.448192,0.419251,...,0.470077,0.540812,0.535150,0.480380,0.439462,0.418804,0.410891,0.447449,0.538217,0.507772
3,HOBBIES_1_004_CA_1_validation,2.228134,1.853840,1.662609,1.673875,2.147355,2.529960,2.560452,2.095643,2.005074,...,2.040224,2.519888,2.513748,2.223000,1.887380,1.762730,1.761605,2.091446,2.684666,2.376198
4,HOBBIES_1_005_CA_1_validation,1.165846,1.089261,1.022740,1.110707,1.305795,1.417988,1.684869,1.398182,1.328381,...,1.231963,1.435643,1.389103,1.213826,1.094196,1.048561,1.093534,1.241992,1.465695,1.501426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60976,FOODS_3_824_WI_3_evaluation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60977,FOODS_3_825_WI_3_evaluation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60978,FOODS_3_826_WI_3_evaluation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [347]:
# Write the submission to <DATA_PATH>submission/<OUTPUT_NAME>_<PERIOD_LABEL>.csv, without the row index.
res.to_csv(DATA_PATH + 'submission/' + OUTPUT_NAME + '_' + PERIOD_LABEL + '.csv', index=False)