In [None]:
import numpy as np
import pandas as pd
from functools import reduce
import pickle

%matplotlib inline

## Params

In [None]:
# Columns that define the granularity of the ensembling: one weight vector
# is fitted per distinct combination of these keys.
ENS_LEVEL_KEYS = ["store_id", "dept_id"]

# Models to blend. Each name is used both to locate the model's submission
# file and as the column holding that model's forecast.
LIST_ALGO = [
    "lgb_estim",
    "tf_estim",
    "Prophet_store_dpt_lgb_weights",
]

# Stem used to name the saved weights file and the blended submission file.
OUTPUT_NAME = "lgb_tf_prophet_ensembling"

## Load useful data

In [None]:
# Only the identifier / hierarchy columns are kept from the raw sales file;
# the daily sales columns are dropped.
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
ids = pd.read_csv('data/raw/sales_train_validation.csv')[id_columns]

## Load and melt forecast files

In [None]:
# One long-format frame per model: columns are `id`, `variable` (the wide
# column name) and a value column named after the model.
melted_df = []

for algo in LIST_ALGO:
    forecast_path = 'data/submission/' + algo + '_validation.csv'
    forecast_wide = pd.read_csv(forecast_path)

    # Keep only the rows whose id mentions 'validation'.
    is_validation_row = forecast_wide['id'].str.contains('validation')
    forecast_long = forecast_wide[is_validation_row].melt(id_vars='id', value_name=algo)

    melted_df.append(forecast_long)

## Load and melt solution file

In [None]:
# Days d_1914..d_1941 of the evaluation file are relabelled F1..F28 so they
# line up with the forecast columns of the submission files.
day_columns = ['d_%s' % day for day in range(1914, 1942)]
horizon_columns = ['F%s' % step for step in range(1, 29)]

true = pd.read_csv('data/raw/sales_train_evaluation.csv')[['id'] + day_columns]
true.columns = ['id'] + horizon_columns

# Rewrite the ids to the 'validation' form used by the forecast rows kept above.
true['id'] = true['id'].str.replace('evaluation', 'validation')

# Long format with the observed value in a column named 'true', then queue it
# for merging alongside the model forecasts.
true = true.melt(id_vars='id', value_name='true')
melted_df.append(true)

## Merge all & add neutral forecast (only 0)

In [None]:
# Successively join all long-format frames on their shared columns, attach
# the hierarchy columns from `ids`, and add a constant-zero "neutral" forecast.
forecasts_and_truth = reduce(lambda left, right: left.merge(right), melted_df)
errors = ids.merge(forecasts_and_truth).assign(neutral=0)
errors.head(5)

## Calculate ensembling weights & apply them

In [None]:
# Fit one ridge-regularised weight vector per ENS_LEVEL_KEYS group and apply it.
#
# The cross term X^T y is not computed from `true` directly; it is rebuilt
# from RMSEs with the identity
#     x . y = 0.5 * (||y||^2 + ||x||^2 - ||y - x||^2)
# where n * rmse_neutral^2 = ||y||^2 (the neutral forecast is all zeros) and
# n * rmse_algo^2 = ||y - x||^2 for each model column x.

# Ridge penalty scale; the effective penalty is ridge_lambda * (rows in group).
ridge_lambda = 0.0001

list_ensemble = []
weights_dict = {}  # group key -> weight vector, ordered like LIST_ALGO

# The loop variable is named explicitly rather than `_`: it is used as a real
# dictionary key, and `_` is IPython's "last output" variable in a notebook.
for group_key, gp in errors.groupby(ENS_LEVEL_KEYS):
    n_rows = len(gp)
    n_algos = len(LIST_ALGO)
    preds = gp[LIST_ALGO].values  # one column of forecasts per model

    # RMSE of the all-zero forecast and of every individual model.
    rmse_neutral = np.sqrt(np.mean(np.square(gp['true'] - gp['neutral'])))
    rmse_algos = np.array([
        np.sqrt(np.mean(np.square(gp['true'] - gp[algo])))
        for algo in LIST_ALGO
    ])

    # X^T y reconstructed from the RMSEs (see identity above).
    pTy = 0.5 * (n_rows * rmse_neutral**2 + (preds**2).sum(axis=0) - n_rows * rmse_algos**2)

    # Ridge solution: w = (X^T X + lambda * n * I)^+ X^T y
    gram = preds.T.dot(preds) + ridge_lambda * n_rows * np.eye(n_algos)
    w = np.linalg.pinv(gram).dot(pTy)
    weights_dict[group_key] = w

    # `assign` returns a new frame, so nothing is written into the group
    # object handed out by groupby.
    list_ensemble.append(gp.assign(ensemble=preds.dot(w)))

ensemble = pd.concat(list_ensemble)

# Forecasts cannot be negative. Assign the clipped column back explicitly:
# a chained `ensemble['ensemble'].clip(0, inplace=True)` does not update the
# frame when pandas Copy-on-Write is active.
ensemble['ensemble'] = ensemble['ensemble'].clip(lower=0)
ensemble.head()

## Compare RMSE

In [None]:
# Overall RMSE of every individual model and of the blended forecast.
for model in LIST_ALGO + ['ensemble']:
    residual = ensemble['true'] - ensemble[model]
    rmse = np.sqrt(np.mean(np.square(residual)))
    print(model, rmse)

## Format outputs

In [None]:
# Back to the wide layout: one row per id, one column per horizon F1..F28.
horizon_columns = ['F%s' % step for step in range(1, 29)]

wide = ensemble.pivot_table(values=['ensemble'], index=['id'], columns=['variable'])
wide = wide.reset_index()

# Flatten the two-level column labels: keep the lower level (F1, F2, ...)
# when present, otherwise the upper one (the 'id' column).
wide.columns = [upper if lower == '' else lower for upper, lower in wide.columns.tolist()]

# Enforce the column order id, F1, ..., F28.
ensemble = wide[['id'] + horizon_columns]
ensemble.head()

## Save weights and validation outputs

In [None]:
# Persist the per-group weight vectors. The context manager guarantees the
# file handle is closed even if pickling raises.
with open('data/external/weights_' + OUTPUT_NAME + '.pkl', 'wb') as f:
    pickle.dump(weights_dict, f)

In [None]:
# Write the blended forecast without the DataFrame index column.
validation_output_path = 'data/submission/' + OUTPUT_NAME + '_validation.csv'
ensemble.to_csv(validation_output_path, index=False)