## Import

In [7]:
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn

import pandas as pd

import os

import numpy as np

from skorch import NeuralNetRegressor
from skorch.callbacks import EarlyStopping, Checkpoint, LRScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from skorch.helper import predefined_split

from models import FFNeuralNetwork, LSTMNeuralNetwork, LSTMDataset
from utilities import create_scaled_data_by_col, rmsle


## Loading Data and Scaling

In [8]:
data_dir = 'data/'
df = pd.read_csv(os.path.join(data_dir, 'train_data.csv'))
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by=['date', 'store_nbr'])
display(df.head())

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,store_type,cluster,oil,...,dow_avg_transactions,dow_rolling_1_transactions,dow_rolling_3_transactions,dow_rolling_7_transactions,prev_1_sales,prev_7_sales,prev_14_sales,prev_1_transactions,prev_7_transactions,prev_14_transactions
0,2013-02-01,1,0,3.0,0,0,0,0,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2013-02-01,1,1,0.0,0,0,0,0,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013-02-01,1,2,0.0,0,0,0,0,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2013-02-01,1,3,941.0,0,0,0,0,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2013-02-01,1,4,0.0,0,0,0,0,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
min_max_cols = ['store_nbr', 'city', 'state', 'store_type', 'cluster', 'h_type_nat', 'h_description_nat', 'h_transferred_nat', 'h_type_loc', 'h_description_loc', 'h_transferred_loc', 'month', 'day', 'day_of_week']
normalize_cols = ['onpromotion', 'oil', 'dow_avg_sales', 'dow_rolling_1_sales', 'dow_rolling_3_sales', 'prev_1_sales', 'prev_7_sales', 'prev_14_sales', 'dow_avg_transactions', 'dow_rolling_1_transactions', 'dow_rolling_3_transactions', 'prev_1_transactions', 'prev_7_transactions', 'prev_14_transactions']
x_cols = min_max_cols + normalize_cols
y_cols = ['sales']
split_col = 'family'

print(min_max_cols)
print(df.columns)

final_run = False

if final_run:
    train_df = df
else:
    rows_before = (df['date'] < '2017-07-27')
    rows_after = ~rows_before

    print('rows_before', rows_before.sum())
    print('rows_after', rows_after.sum())
    print('rows_total', len(df))

    train_df = df[rows_before]
    val_df = df[rows_after]

train_df_by_cluster = {}
scaler_x_by_cluster = {}
scaler_y_by_cluster = {}

for cluster in df[split_col].unique():
    cluster_df, cluster_min_max_scaler, cluster_normalize_scaler, cluster_y_scaler = create_scaled_data_by_col(train_df, min_max_cols, normalize_cols, y_cols, split_col, cluster)
    train_df_by_cluster[cluster] = cluster_df
    scaler_x_by_cluster[cluster] = (cluster_min_max_scaler, cluster_normalize_scaler)
    scaler_y_by_cluster[cluster] = cluster_y_scaler

if not final_run:
    val_df_by_cluster = {}

    for cluster in df[split_col].unique():
        val_cluster_min_max_scaler, val_cluster_normalize_scaler = scaler_x_by_cluster[cluster]
        val_cluster_y_scaler = scaler_y_by_cluster[cluster]

        val_cluster_df = val_df[val_df[split_col] == cluster]
        val_cluster_df = val_cluster_df.drop(columns=split_col)

        val_cluster_x_min_max = val_cluster_df[min_max_cols].values.astype(np.float32)
        val_cluster_x_normalize = val_cluster_df[normalize_cols].values.astype(np.float32)
        val_cluster_y = val_cluster_df[y_cols].values.reshape(-1, len(y_cols)).astype(np.float32)

        val_cluster_x_min_max = val_cluster_min_max_scaler.transform(val_cluster_x_min_max)
        val_cluster_x_normalize = val_cluster_normalize_scaler.transform(val_cluster_x_normalize)
        val_cluster_y = val_cluster_y_scaler.transform(val_cluster_y)

        val_cluster_df[min_max_cols] = val_cluster_x_min_max
        val_cluster_df[normalize_cols] = val_cluster_x_normalize
        val_cluster_df[y_cols] = val_cluster_y

        val_df_by_cluster[cluster] = val_cluster_df

['store_nbr', 'city', 'state', 'store_type', 'cluster', 'h_type_nat', 'h_description_nat', 'h_transferred_nat', 'h_type_loc', 'h_description_loc', 'h_transferred_loc', 'month', 'day', 'day_of_week']
Index(['date', 'store_nbr', 'family', 'sales', 'onpromotion', 'city', 'state',
       'store_type', 'cluster', 'oil', 'h_type_nat', 'h_description_nat',
       'h_transferred_nat', 'h_type_loc', 'h_description_loc',
       'h_transferred_loc', 'transactions', 'year', 'month', 'day',
       'day_of_week', 'dow_avg_sales', 'dow_rolling_1_sales',
       'dow_rolling_3_sales', 'dow_rolling_7_sales', 'dow_avg_transactions',
       'dow_rolling_1_transactions', 'dow_rolling_3_transactions',
       'dow_rolling_7_transactions', 'prev_1_sales', 'prev_7_sales',
       'prev_14_sales', 'prev_1_transactions', 'prev_7_transactions',
       'prev_14_transactions'],
      dtype='object')


## NN Training

In [10]:
net_by_cluster = {}
train_params = {
                "criterion": nn.L1Loss,
                "optimizer": torch.optim.AdamW,
                "optimizer__weight_decay": 1e-8,
                #'train_split' : None,
                #"train_split": predefined_split(Dataset(val_x, val_y)),
                "lr": 0.001,
                "batch_size": 128,
                "max_epochs": 1000,
                "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                "iterator_train__shuffle": False,
                "iterator_train__num_workers": 2,
                "iterator_train__pin_memory": True,
                "iterator_valid__shuffle": False,
                "iterator_valid__num_workers": 2,
                "iterator_valid__pin_memory": True,
                "verbose": 2,
        }

net_params = {
    'input_dim': len(x_cols),
    'out_dim': 1,
    'hidden_dim': 200,
    'num_hidden_layers': 6,
    }

In [11]:
for cluster in df[split_col].unique():
    train_df = train_df_by_cluster[cluster]
    train_x = train_df[x_cols].values.astype(np.float32)
    train_y = train_df[y_cols].values.reshape(-1, len(y_cols)).astype(np.float32)

    if not final_run:
        val_df = val_df_by_cluster[cluster]
        train_params['train_split'] = predefined_split(Dataset(val_df[x_cols].values.astype(np.float32), val_df[y_cols].values.reshape(-1, len(y_cols)).astype(np.float32)))
    else:
        train_params['train_split'] = None

    callbacks = [EarlyStopping(patience=15, threshold=0.001, threshold_mode='abs', monitor='valid_loss', lower_is_better=True),
            Checkpoint(monitor='valid_loss_best', f_params=f'sales_forecaster_{cluster}.pt', dirname='models/'),
            LRScheduler(policy=ReduceLROnPlateau, monitor='valid_loss', factor=0.5, patience=5, threshold=0.001, threshold_mode='abs', mode='min', verbose=True)
            ]

    train_params['callbacks'] = callbacks

    net = NeuralNetRegressor(FFNeuralNetwork(**net_params), **train_params)

    net.fit(train_x, train_y)
    net_by_cluster[cluster] = net



KeyError: "Key 'valid_loss' was not found in history."

# Load Nets from Checkpoints

In [5]:
for cluster in df[split_col].unique():
    net = NeuralNetRegressor(FFNeuralNetwork(**net_params), **train_params)
    net.initialize()
    net.load_params(f_params=f'models/sales_forecaster_{cluster}.pt')
    net_by_cluster[cluster] = net

## LSTM Training

In [23]:
endogenous_cols = [
        'sales', 'onpromotion', 'oil', 
       'dow_avg_sales', 'dow_rolling_3_sales', 'dow_rolling_7_sales',
       'dow_avg_transactions', 'dow_rolling_3_transactions',
       'dow_rolling_7_transactions', 'rolling_7_sales', 'rolling_14_sales',
       'rolling_7_transactions', 'rolling_14_transactions']

exogenous_cols = [
    'h_type_nat', 'h_description_nat', 'h_transferred_nat', 'h_type_loc',
    'h_description_loc', 'h_transferred_loc', 'month', 'day', 'day_of_week', 'store_nbr'
    ]

out_cols = ['sales']


lstm_net_by_cluster = {}
lstm_net_params = {
    'input_dim': 512,
    'endogenous_dim': len(endogenous_cols)*54,
    'endogenous_len': 5,
    'exogenous_dim': len(exogenous_cols),
    'hidden_dim': 1024,
    'out_dim': 54,
    'out_seq_len': 15,
    'num_layers': 4
}

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 1000

for cluster in df[split_col].unique():
    train_df = train_df_by_cluster[cluster]
    train_dataset = LSTMDataset(train_df, 5, 15, 'date', endogenous_cols, exogenous_cols, out_cols, 'store_nbr')
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=30, shuffle=False)

    scaler_y = scaler_y_by_cluster[cluster]

    val_df = val_df_by_cluster[cluster]
    val_dataset = LSTMDataset(val_df, 5, 15, 'date', endogenous_cols, exogenous_cols, out_cols, 'store_nbr')

    model = LSTMNeuralNetwork(**lstm_net_params)
    optim = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-8)
    criterion = nn.L1Loss()
    model.train()
    model.to(device)
    model.zero_grad()

    val = val_dataset[0]
    val_endog = torch.tensor(val['endog']).unsqueeze(0).to(device).to(torch.float32)
    val_exog = torch.tensor(val['exog']).unsqueeze(0).to(device).to(torch.float32)
    val_y = scaler_y.inverse_transform(val['label'].reshape(-1,1).astype(np.float32))

    for epoch in range(num_epochs):
        for sample in train_loader:
            endog = sample['endog'].to(device).to(torch.float32)
            exog = sample['exog'].to(device).to(torch.float32)
            y = sample['label'].to(device).to(torch.float32)
            optim.zero_grad()
            output = model((endog, exog))
            loss = criterion(output, y)
            loss.backward()
            optim.step()
        with torch.no_grad():
            model.eval()
            val_pred = model((val_endog, val_exog))
            val_pred = val_pred.view(-1,1).cpu().detach().numpy()
            val_pred = scaler_y.inverse_transform(val_pred)

            val_loss1 = rmsle(val_y, val_pred.clip(0))
            val_loss2 = np.abs(val_y - val_pred).sum()
            print(f'Epoch {epoch+1}/{num_epochs}, RMSLE: {val_loss1.item():6.5f} L1: {val_loss2:6.5f}')
            model.train()


    lstm_net_by_cluster[cluster] = model

Epoch 1/1000, RMSLE: 0.59281 L1: 2621.91406
Epoch 2/1000, RMSLE: 0.59805 L1: 2591.96606
Epoch 3/1000, RMSLE: 0.61355 L1: 2627.24707
Epoch 4/1000, RMSLE: 0.61980 L1: 2648.24902
Epoch 5/1000, RMSLE: 0.63566 L1: 2669.86035
Epoch 6/1000, RMSLE: 0.64438 L1: 2688.27637
Epoch 7/1000, RMSLE: 0.66854 L1: 2746.78809
Epoch 8/1000, RMSLE: 0.68094 L1: 2764.41455
Epoch 9/1000, RMSLE: 0.69042 L1: 2781.10596
Epoch 10/1000, RMSLE: 0.69891 L1: 2787.39185
Epoch 11/1000, RMSLE: 0.69744 L1: 2791.86279
Epoch 12/1000, RMSLE: 0.69394 L1: 2789.51074
Epoch 13/1000, RMSLE: 0.70647 L1: 2797.06030
Epoch 14/1000, RMSLE: 0.69428 L1: 2787.90845
Epoch 15/1000, RMSLE: 0.70769 L1: 2804.49365
Epoch 16/1000, RMSLE: 0.69218 L1: 2788.97607
Epoch 17/1000, RMSLE: 0.71373 L1: 2809.92358
Epoch 18/1000, RMSLE: 0.68829 L1: 2788.63428
Epoch 19/1000, RMSLE: 0.71235 L1: 2810.70557
Epoch 20/1000, RMSLE: 0.68650 L1: 2790.86255
Epoch 21/1000, RMSLE: 0.71120 L1: 2808.48950


## Random Forest

In [13]:
from sklearn.ensemble import RandomForestRegressor
cluster_rfs = {}

for cluster in df[split_col].unique():
    train_df = train_df_by_cluster[cluster]

    train_x = train_df[x_cols].values.astype(np.float32)
    train_y = train_df[y_cols].values.reshape(-1, len(y_cols)).astype(np.float32)

    rf = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42, n_jobs=4)
    rf.fit(train_x, train_y.squeeze())

    cluster_rfs[cluster] = rf

## XGBoost

In [18]:
import xgboost as xgb
cluster_xgb = {}
for cluster in df[split_col].unique():
    train_x = train_x_by_cluster[cluster]
    train_y = train_y_by_cluster[cluster]

    xgb_model = xgb.XGBRegressor(n_estimators=1000, max_depth=12, learning_rate=0.001, random_state=42, n_jobs=2)
    xgb_model.fit(train_x, train_y.squeeze())

    cluster_xgb[cluster] = xgb_model

## Predict on Training Data

In [27]:
net_train_preds = []
rf_train_preds = []

for cluster in df[split_col].unique():
    train_df = train_df_by_cluster[cluster]

    train_x = train_df[x_cols].values.astype(np.float32)
    train_y = train_df[y_cols].values.reshape(-1, len(y_cols)).astype(np.float32)
    y_scaler = scaler_y_by_cluster[cluster]

    net = net_by_cluster[cluster]
    rf = cluster_rfs[cluster]

    net_preds = net.predict(train_x)
    rf_preds = rf.predict(train_x)

    train_df['sales_nn'] = net_preds
    train_df['sales_rf'] = rf_preds

## Validation Loss Evaluation

In [None]:
rf_preds = []
net_preds = []
xgb_preds = []
val_y_true = []

for cluster in df[split_col].unique():
    val_cluster_df = val_df_by_cluster[cluster]
    val_x = val_cluster_df[x_cols].values.astype(np.float32)
    val_y = val_cluster_df[y_cols].values.reshape(-1, len(y_cols)).astype(np.float32)

    rf = cluster_rfs[cluster]
    net = net_by_cluster[cluster]

    rf_preds.append(scaler_y_by_cluster[cluster].inverse_transform(rf.predict(val_x).reshape(-1, 1)))
    net_preds.append(scaler_y_by_cluster[cluster].inverse_transform(net.predict(val_x).reshape(-1, 1)).clip(0))    
    #xgb_preds.append(scaler_y_by_cluster[cluster].inverse_transform(cluster_xgb[cluster].predict(val_x).reshape(-1, 1)))
    val_y_true.append(scaler_y_by_cluster[cluster].inverse_transform(val_y))

rf_preds = np.concatenate(rf_preds)
net_preds = np.concatenate(net_preds)
#xgb_preds = np.concatenate(xgb_preds)
val_y_true = np.concatenate(val_y_true)

print(f'RF RMSLE: {rmsle(val_y_true, rf_preds)}')
#print(f'XGB RMSLE: {rmsle(val_y_true, xgb_preds)}')
print(f'NN RMSLE: {rmsle(val_y_true, net_preds)}')

RF RMSLE: 0.4127919069148833
NN RMSLE: 0.3906131386756897


## Loading Test Data

In [6]:
test_df = pd.read_csv(os.path.join(data_dir, 'test_data.csv'), index_col=0)
display(test_df.head())

test_x_by_cluster = {}
test_id_by_cluster = {}

for cluster in df[split_col].unique():
    test_cluster_min_max_scaler, test_cluster_normalize_scaler = scaler_x_by_cluster[cluster]
    test_cluster_y_scaler = scaler_y_by_cluster[cluster]

    test_cluster_x_df = test_df[test_df[split_col] == cluster]
    test_cluster_x_df = test_cluster_x_df.drop(columns=split_col)

    test_cluster_x_min_max = test_cluster_x_df[min_max_cols].values.astype(np.float32)
    test_cluster_x_normalize = test_cluster_x_df[normalize_cols].values.astype(np.float32)

    test_cluster_x_min_max = test_cluster_min_max_scaler.transform(test_cluster_x_min_max)
    test_cluster_x_normalize = test_cluster_normalize_scaler.transform(test_cluster_x_normalize)

    test_x_by_cluster[cluster] = np.concatenate([test_cluster_x_min_max, test_cluster_x_normalize], axis=1)
    test_id_by_cluster[cluster] = test_cluster_x_df.index


test_preds_dfs = []

for cluster in df[split_col].unique():
    test_x = test_x_by_cluster[cluster]
    id = test_id_by_cluster[cluster]
    #rf = cluster_rfs[cluster]

    #pred_rf = scaler_y_by_cluster[cluster].inverse_transform(rf.predict(test_x).reshape(-1, 1))
    #pred_xgb = scaler_y_by_cluster[cluster].inverse_transform(cluster_xgb[cluster].predict(test_x).reshape(-1, 1))
    pred_nn = scaler_y_by_cluster[cluster].inverse_transform(net_by_cluster[cluster].predict(test_x).reshape(-1, 1)).clip(0)
    
    cluster_df = pd.DataFrame(np.concatenate([pred_nn], axis=1), index=id, columns=['sales_nn'])
    #cluster_df = pd.DataFrame(np.concatenate([pred_rf, pred_nn], axis=1), index=id, columns=['sales_rf', 'sales_nn'])

    test_preds_dfs.append(cluster_df)

test_preds_df = pd.concat(test_preds_dfs)

test_df = test_df.merge(test_preds_df, on='id', how='left')

sub_df_nn = test_df[['sales_nn']]
#sub_df_rf = test_df[['sales_rf']]
#sub_df_xgb = test_df[['sales_xgb']]

#sub_df_rf = sub_df_rf.rename(columns={'sales_rf': 'sales'})
#sub_df_xgb = sub_df_xgb.rename(columns={'sales_xgb': 'sales'})
sub_df_nn = sub_df_nn.rename(columns={'sales_nn': 'sales'})


display(sub_df_nn.head())
#display(sub_df_rf.head())
#display(sub_df_xgb.head())

sub_df_nn.to_csv('data/submission_nn.csv')
#sub_df_xgb.to_csv('data/submission_xgb.csv')
#sub_df_rf.to_csv('data/submission_rf.csv')

Unnamed: 0_level_0,store_nbr,family,onpromotion,city,state,store_type,cluster,oil,h_type_nat,h_description_nat,...,prev_1_sales,prev_7_sales,prev_14_sales,dow_avg_transactions,dow_rolling_1_transactions,dow_rolling_3_transactions,dow_rolling_7_transactions,prev_1_transactions,prev_7_transactions,prev_14_transactions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3000888,1,0,0,0,0,0,13,46.8,0,0,...,4.0,4.0,2.0,1861.739316,1892.0,1864.0,1888.857143,1766.0,1892.0,1903.0
3000889,1,1,0,0,0,0,13,46.8,0,0,...,0.0,0.0,0.0,1861.739316,1892.0,1864.0,1888.857143,1766.0,1892.0,1903.0
3000890,1,2,2,0,0,0,13,46.8,0,0,...,2.0,2.0,3.0,1861.739316,1892.0,1864.0,1888.857143,1766.0,1892.0,1903.0
3000891,1,3,20,0,0,0,13,46.8,0,0,...,2418.0,2645.0,2242.0,1861.739316,1892.0,1864.0,1888.857143,1766.0,1892.0,1903.0
3000892,1,4,0,0,0,0,13,46.8,0,0,...,0.0,0.0,0.0,1861.739316,1892.0,1864.0,1888.857143,1766.0,1892.0,1903.0


Unnamed: 0_level_0,sales
id,Unnamed: 1_level_1
3000888,3.959671
3000889,0.0
3000890,5.111362
3000891,2302.990234
3000892,0.0
