In [1]:
import pandas as pd
import numpy as np

## Data

In [2]:
sales = pd.read_csv('sales_train_validation.csv')
price = pd.read_csv('sell_prices.csv')
calendar = pd.read_csv('calendar.csv')
submission_format = pd.read_csv('sample_submission.csv')

## Preprocessing

In [3]:
#Remove 'd_' before number in column 'd'
calendar['d'] = [i.replace('d_','') for i in calendar['d']]
#Generate id in price df
price['id'] = price['item_id'] + "_" + price['store_id'] + '_validation'

## Weight for the level 12 series

In [4]:
sales

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,2,0,0,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,2,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,1,0,0,1,0,3,1,3


In [5]:
#Use second last 28 days since we use last 28 days as test set
for day in range(1858,1886):
    wk_id = list(calendar[calendar['d']==str(day)]['wm_yr_wk'])[0]
    wk_price_df = price[price['wm_yr_wk']==wk_id]
    sales = sales.merge(wk_price_df[['sell_price','id']],on=['id'],how='inner')
    #Unit sales means the amount of money made at that day. It's trivial to see since Total money = sell_price * num of sales
    sales['unit_sales_' + str(day)] = sales['sell_price'] * sales['d_'+str(day)]
    sales.drop(['sell_price'],axis=1, inplace=True)

In [6]:
#Sum of all unit_sales
sales['dollar_sales'] = sales[[c for c in sales.columns if c.find('unit_sales')==0]].sum(axis=1)

In [7]:
#Now we drop all unit_sales columns
sales.drop([c for c in sales.columns if c.find('unit_sales')==0],axis=1,inplace = True)

In [8]:
sales['weight'] = sales['dollar_sales'] / sales['dollar_sales'].sum()
# sales.drop('dollar_sales',axis=1, inplace=True)

In [9]:
sales.shape

(30490, 1921)

**Multi-label training process**

In [10]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [11]:
sales

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,dollar_sales,weight
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,1,1,3,0,1,1,231.28,0.000063
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,1,0,0,0,0,35.73,0.000010
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,1,0,1,1,1,44.55,0.000012
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,5,4,1,0,1,3,7,2,245.92,0.000067
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,1,1,2,2,2,4,89.28,0.000024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,0,0,0,0,1,0,0,1,20.66,0.000006
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,1,0,0.00,0.000000
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,0,2,0,1,0,0,1,0,139.30,0.000038
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,1,0,0,1,0,3,1,3,44.80,0.000012


In [12]:
#Last 28 days, i.e. from 1886 to 1914
target_col = [col for col in sales.columns if col.find('d_')==0 and int(col.split('_')[1])>=1886]
y_train = sales[target_col]

In [13]:
#training features, note that it's important to decide the starting date of training data
#Cat features
cat_onehot = pd.get_dummies(sales[['dept_id','cat_id','store_id','state_id']])

In [14]:
sales

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,dollar_sales,weight
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,1,1,3,0,1,1,231.28,0.000063
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,1,0,0,0,0,35.73,0.000010
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,1,0,1,1,1,44.55,0.000012
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,5,4,1,0,1,3,7,2,245.92,0.000067
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,1,1,2,2,2,4,89.28,0.000024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,0,0,0,0,1,0,0,1,20.66,0.000006
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,1,0,0.00,0.000000
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,0,2,0,1,0,0,1,0,139.30,0.000038
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,1,0,0,1,0,3,1,3,44.80,0.000012


In [15]:
useless_cols = ['dept_id','cat_id','store_id','state_id','weight']
features = [col for col in sales.columns if col not in useless_cols]
train_set = sales[features]

In [16]:
x_train = train_set.drop(['id','item_id'],axis=1)

In [17]:
y_train

Unnamed: 0,d_1886,d_1887,d_1888,d_1889,d_1890,d_1891,d_1892,d_1893,d_1894,d_1895,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,1,0,0,0,0,0,1,0,4,2,...,1,3,0,1,1,1,3,0,1,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,0,0,0,0,3,1,2,1,3,1,...,1,0,5,4,1,0,1,3,7,2
4,1,0,4,4,0,1,4,0,1,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,1,0,0,1
30486,0,0,0,0,0,1,1,2,0,0,...,0,0,0,0,0,0,0,0,1,0
30487,1,0,0,0,0,1,1,2,1,4,...,2,1,0,2,0,1,0,0,1,0
30488,3,0,0,0,0,0,1,1,2,1,...,0,0,1,0,0,1,0,3,1,3


Exception ignored in: <function Booster.__del__ at 0x000001B5F676E1E0>
Traceback (most recent call last):
  File "d:\python demo\lib\site-packages\xgboost\core.py", line 957, in __del__
    if self.handle is not None:
AttributeError: 'Booster' object has no attribute 'handle'


TypeError: train() missing 1 required positional argument: 'params'

<xgboost.core.DMatrix at 0x1b5f67be978>

### Naive logic baseline
* All 0's
* Mean from all prev data: 1.641176
* Mean of previous 10,20,30,40,50 days
* Same as the last 28 days: 0.855823
* Weekly propagation

*All 0's*

In [88]:
for d in range(1886,1914):
    sales['F_'+str(d)] = 0

*Mean from all prev data*

In [214]:
cols = [c for c in sales.columns if c.find('d_')==0]
mean_all_prev = sales[cols].mean(axis=1)
sales['mean'] = mean_all_prev

In [215]:
for d in range(1886,1914):
    sales['F_'+str(d)] = sales['mean']

*Same as last 28 days*

In [245]:
for d in range(1886,1914):
    sales['F_'+str(d)] = sales['d_'+str(d-28)]

## Infer forecast, ground truth values, and weights for all higher level aggregations

In [246]:
agg_df = pd.DataFrame(sales[[c for c in sales.columns if c.find('d_')==0 or c.find('F_')==0]].sum()).transpose()
agg_df['level']=1
agg_df['weight']=1/12
column_agg = agg_df.columns

In [247]:
agg_df

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913,level,weight
0,32631,31749,23783,25412,19146,29211,28010,37932,32736,25572,...,47825,37360,35475,34786,34003,45611,53863,46360,1,0.083333


In [248]:
level_groupings = {2:['state_id'],3:['store_id'],4:['cat_id'],
                   5:['dept_id'],6:['state_id','cat_id'],
                   7:['state_id','dept_id'],8:['store_id','cat_id'],
                   9:['store_id','dept_id'],10:['item_id'],
                   11:['item_id','state_id']}

In [219]:
#Interactive data groupings of different aggregation level by modifying level
sales.groupby(by=level_groupings[11]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,F_1904,F_1905,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913
item_id,state_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
FOODS_1_001,CA,6,3,2,3,7,5,8,3,5,2,...,3.506012,3.506012,3.506012,3.506012,3.506012,3.506012,3.506012,3.506012,3.506012,3.506012
FOODS_1_001,TX,0,1,2,2,0,4,0,0,4,3,...,1.576581,1.576581,1.576581,1.576581,1.576581,1.576581,1.576581,1.576581,1.576581,1.576581
FOODS_1_001,WI,0,2,0,1,0,9,2,1,2,5,...,1.319394,1.319394,1.319394,1.319394,1.319394,1.319394,1.319394,1.319394,1.319394,1.319394
FOODS_1_002,CA,3,3,4,4,3,3,0,2,1,1,...,1.730789,1.730789,1.730789,1.730789,1.730789,1.730789,1.730789,1.730789,1.730789,1.730789
FOODS_1_002,TX,0,0,2,0,0,0,0,1,0,0,...,0.685311,0.685311,0.685311,0.685311,0.685311,0.685311,0.685311,0.685311,0.685311,0.685311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HOUSEHOLD_2_515,TX,0,0,0,0,0,0,0,0,0,0,...,0.166231,0.166231,0.166231,0.166231,0.166231,0.166231,0.166231,0.166231,0.166231,0.166231
HOUSEHOLD_2_515,WI,0,0,0,0,0,0,0,0,0,0,...,0.145844,0.145844,0.145844,0.145844,0.145844,0.145844,0.145844,0.145844,0.145844,0.145844
HOUSEHOLD_2_516,CA,0,2,0,1,1,1,0,2,1,2,...,0.587036,0.587036,0.587036,0.587036,0.587036,0.587036,0.587036,0.587036,0.587036,0.587036
HOUSEHOLD_2_516,TX,2,1,0,0,0,0,1,1,0,0,...,0.495034,0.495034,0.495034,0.495034,0.495034,0.495034,0.495034,0.495034,0.495034,0.495034


In [277]:
#Automate the process of appending time series of different level of aggs into agg_df
for level in level_groupings:
    df = sales.groupby(by=level_groupings[level]).sum().reset_index(drop=True)
    df['level'] = level
    df['weight'] /= 12
    agg_df = agg_df.append(df[column_agg])
del df

KeyError: "['dept_id', 'store_id', 'state_id', 'cat_id', 'item_id'] not in index"

In [250]:
sales['weight'] /= 12

In [251]:
print(sales.shape[0],agg_df.shape[0],sales.shape[0]+agg_df.shape[0])

30490 12350 42840


In [252]:
agg_df['weight'].sum() + sales['weight'].sum()

0.9999999999999996

## RMSSE Calculation

In [253]:
h = 28
n = 1885
def rmsse(ground_truth, forecast, train_series, axis=1):
    assert axis == 0 or axis == 1
    if axis == 1:
        #Using axis = 1 we must guarantee these are matrices and not arrays
        assert ground_truth.shape[1] > 1 and forecast.shape[1] > 1 and train_series.shape[1] > 1
    numerator = ((ground_truth - forecast) ** 2).sum(axis=axis)
    if axis == 1:
        denominator = 1/(n-1) * ((train_series[:,1:]-train_series[:,:-1]) ** 2).sum(axis=axis)
    else:
        denominator = 1/(n-1) * ((train_series[1:]-train_series[:-1]) ** 2).sum(axis=axis)
    return (1/h * numerator/denominator) ** 0.5

In [254]:
#When calling .find(a) == 0 that means a has been found
train_series_cols = [c for c in sales.columns if c.find('d_')==0][:-28]
ground_truth_cols = [c for c in sales.columns if c.find('d_')==0][-28:]
forecast_cols = [c for c in sales.columns if c.find('F_')==0]

In [255]:
sales['rmsse'] = rmsse(np.array(sales[ground_truth_cols]),np.array(sales[forecast_cols]),np.array(sales[train_series_cols]))
agg_df['rmsse'] = rmsse(np.array(agg_df[ground_truth_cols]),np.array(agg_df[forecast_cols]),np.array(agg_df[train_series_cols]))

In [256]:
sales['wrmsse'] = sales['weight'] * sales['rmsse']
agg_df['wrmsse'] = agg_df['weight'] * agg_df['rmsse']

In [257]:
sales['wrmsse'].sum() + agg_df['wrmsse'].sum()

0.8558233199674036

### Submission file generation

In [258]:
def sub_format(df):
    #Rename columns
    sub_cols = [f'F{i}' for i in range(1,29)]
    df.columns = sub_cols
    
    #Required ids
    validation_ids = sales['id'].values
    evaluation_ids = [i.replace('validation', 'evaluation') for i in validation_ids]
    ids = np.concatenate([validation_ids, evaluation_ids])
    
    predictions = pd.DataFrame(ids, columns=['id'])
    forecast = pd.concat([df] * 2).reset_index(drop=True)
    predictions = pd.concat([predictions, forecast], axis=1)
    predictions = predictions.set_index('id')
    return predictions

In [259]:
forecast = sales[[c for c in sales.columns if c.find('F_')==0]]

In [260]:
same_as_last_28 = sub_format(forecast)

In [261]:
same_as_last_28.to_csv('last_28.csv')