In [1]:
import os
import sys
import math
import pickle
import logging
from pathlib import Path

import scipy as sp

from sklearn.linear_model import LinearRegression

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Apply the global theme first: `sns.set` resets the plotting context and
# style to their defaults, so calling it *after* `set_context` would silently
# discard the "poster" context requested here.
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_context("poster")
sns.set_style("whitegrid")

import pandas as pd
# Let pandas show up to 120 rows and 120 columns before truncating a frame.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 120)

# Send log records of level INFO and above to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# Handle on the root logger.
_logger = logging.getLogger()

In [2]:
from bhm_at_scale.handler import ModelHandler
from bhm_at_scale.model import model, guide, local_guide, check_model_guide, predictive_model, Site
from bhm_at_scale.utils import summary, stats_to_df, preds_to_df

In [3]:
import jax.numpy as jnp
from jax import random, ops
from jax import lax
from jax import jit
from jax.numpy import DeviceArray
import numpy as np
import numpyro
from numpyro import optim
import numpyro.distributions as dist
from numpyro.infer import ELBO, SVI, Predictive
from numpyro.infer.svi import SVIState

In [4]:
# Load the preprocessed training array. `np.load` on an .npz archive returns
# an `NpzFile` that keeps the underlying file open, so read it inside a
# context manager and copy the array out before the archive is closed.
with np.load('../data/preprocessed/X_train.npz') as train_archive:
    X_train = jnp.array(train_archive['arr_0'])
X_train.shape

(1000, 942, 24)

## Fit the hierarchical model

In [5]:
# Run the project's model/guide check on the training data before fitting,
# then create the handler that is used for training.
check_model_guide(X_train, guide=guide, model=model)
train_handler = ModelHandler(guide=guide, model=model)

In [6]:
# First optimisation pass: 5,000 epochs at a large learning rate, logging the
# loss every 1,000 epochs.
train_handler.fit(
    X_train,
    n_epochs=5_000,
    log_freq=1_000,
    lr=0.1,
)

epoch:    0 loss:      114879.0703
epoch: 1000 loss:        6734.7886
epoch: 2000 loss:        6423.1748
epoch: 3000 loss:        6406.2109
epoch: 4000 loss:        6383.2534
epoch: 5000 loss:        6394.3198


6371.9638671875

In [7]:
# Second pass on the same handler: 1,000 more epochs at a much smaller
# learning rate, logging the loss every 200 epochs.
train_handler.fit(
    X_train,
    n_epochs=1_000,
    log_freq=200,
    lr=0.001,
)

epoch:    0 loss:        6371.9639
epoch:  200 loss:        6369.5010
epoch:  400 loss:        6368.5288
epoch:  600 loss:        6368.1260
epoch:  800 loss:        6370.3936
epoch: 1000 loss:        6370.7739


6367.49755859375

## Checkpoint: Save/restore current state

In [8]:
# Write the handler's optimiser state to disk so it can be restored later.
checkpoint_path = '../data/result/optim_state.pickle'
with open(checkpoint_path, 'bw') as checkpoint_file:
    train_handler.dump_optim_state(checkpoint_file)

In [9]:
# Start from a fresh handler and restore the checkpointed optimiser state.
# NOTE(review): the checkpoint is presumably a pickle; only load files you trust.
train_handler = ModelHandler(guide=guide, model=model)
with open('../data/result/optim_state.pickle', 'br') as checkpoint_file:
    train_handler.load_optim_state(checkpoint_file)
# A short `fit` call is needed afterwards to initialize `svi` on the handler.
train_handler.fit(X_train, lr=0.001, n_epochs=100)

6367.42529296875

## Predict on training set and check fitted parameters

In [10]:
# Wrap the predictive variant of the model, built from the fitted parameters,
# in its own handler and hand it the optimiser state of the training handler.
fitted_params = train_handler.model_params
pred_handler = ModelHandler(model=predictive_model(fitted_params), guide=guide)
pred_handler.optim_state = train_handler.optim_state

In [11]:
# Draw 200 samples of the `days` site for the training data.
preds_samples = pred_handler.predict(
    X_train,
    return_sites=[Site.days],
    num_samples=200,
)

In [12]:
# Draw 200 samples of the coefficient sites from the training handler.
latent_sites = [Site.coefs, Site.coef_mus, Site.coef_sigmas]
latent_samples = train_handler.predict(
    X_train,
    return_sites=latent_sites,
    num_samples=200,
)

In [13]:
# Write the samples of each of the two sites to its own CSV file.
for site in (Site.coef_mus, Site.coef_sigmas):
    samples_df = pd.DataFrame(latent_samples[site])
    samples_df.to_csv(
        f'../data/result/{site}.csv',
        index=False,
    )

In [14]:
# Summarise the latent samples and save them as a table. The labels passed to
# `stats_to_df` are the columns of the preprocessed frame except the first two
# and the last one (presumably the feature names; TODO confirm).
stats = summary(latent_samples, poisson=True)
df_edf = pd.read_csv('../data/preprocessed/edf.csv')
feature_columns = df_edf.columns[2:-1]
df_stats = stats_to_df(stats, feature_columns)
df_stats.to_csv('../data/result/stats.csv', index=False)

In [15]:
# Summarise the sampled training predictions and store the `days` summary as CSV.
preds = summary(preds_samples, poisson=False)
days_summary = preds[Site.days]
df_preds = preds_to_df(days_summary)
df_preds.to_csv('../data/result/train_preds.csv', index=False)

## Predict on the test set with only a little data

In [16]:
# Load the preprocessed test array. `np.load` on an .npz archive returns an
# `NpzFile` that keeps the underlying file open, so read it inside a context
# manager and copy the array out before the archive is closed.
with np.load('../data/preprocessed/X_test.npz') as test_archive:
    X_test = jnp.array(test_archive['arr_0'])
X_test.shape

(115, 942, 24)

In [17]:
# Number of leading days of each test series that are treated as known history.
known_days = 7
# Keep every entry along the first and last axis, but only the first
# `known_days` entries along the second axis.
X_test_known = X_test[:, :known_days]

### Fit on known data

In [18]:
# Same model as before, paired with a local guide that is built from the
# parameters fitted on the training data.
global_params = train_handler.model_params
train_local_handler = ModelHandler(model=model, guide=local_guide(global_params))

In [19]:
# First pass on the known test days: 1,000 epochs at a large learning rate,
# logging the loss every 200 epochs.
train_local_handler.fit(
    X_test_known,
    n_epochs=1_000,
    log_freq=200,
    lr=0.1,
)

epoch:    0 loss:          68.6854
epoch:  200 loss:          49.3264
epoch:  400 loss:          49.5037
epoch:  600 loss:          50.0767
epoch:  800 loss:          50.1149
epoch: 1000 loss:          50.4847


49.72354507446289

In [20]:
# Second pass on the known test days: 1,000 more epochs at a much smaller
# learning rate, logging the loss every 200 epochs.
train_local_handler.fit(
    X_test_known,
    n_epochs=1_000,
    log_freq=200,
    lr=0.001,
)

epoch:    0 loss:          49.7235
epoch:  200 loss:          48.7058
epoch:  400 loss:          48.2243
epoch:  600 loss:          48.0160
epoch:  800 loss:          48.2491
epoch: 1000 loss:          48.5169


47.84416580200195

### Predict future of test data

In [21]:
# Merge the parameters of the fit on the training data with those of the local
# fit; on key collisions the local values win (same precedence as `dict.update`).
# A new dict is built instead of calling `.update()` on the object returned by
# `train_handler.model_params`, so the training handler's parameters cannot be
# modified as a side effect and re-running this cell stays idempotent.
params = {**train_handler.model_params, **train_local_handler.model_params}
pred_local_handler = ModelHandler(model=predictive_model(params), guide=local_guide(params))
pred_local_handler.optim_state = train_local_handler.optim_state

In [22]:
# Draw 200 samples of the `days` site for the full test series.
preds_samples = pred_local_handler.predict(
    X_test,
    return_sites=[Site.days],
    num_samples=200,
)

In [23]:
# Summarise the test predictions and save them as CSV. StoreId is shifted by
# 1000 before saving, presumably so that test stores do not clash with the
# ids of the training stores (TODO confirm).
preds = summary(preds_samples, poisson=False)
df_preds = preds_to_df(preds[Site.days])
df_preds = df_preds.assign(StoreId=df_preds.StoreId + 1000)
df_preds.to_csv('../data/result/test_preds.csv', index=False)

### Compare with a conventional log-linear regression (a stand-in for Poisson regression) using Scikit-Learn

In [24]:
# Plain scikit-learn linear regression with default settings; it is fitted on
# the log-transformed target of a single store in the following cells.
reg = LinearRegression()

In [25]:
# Pick one store from the known test days, replacing NaNs by 1.0.
store_id = 16
store_known = np.nan_to_num(X_test_known, nan=1.0)[store_id]
# The last column is the regression target, all other columns are the features.
X = store_known[:, :-1]
y = store_known[:, -1]

In [26]:
# Fit on the log-transformed target to obtain a multiplicative relationship
# on the original scale.
log_target = np.log(y)
reg.fit(X, log_target)

LinearRegression()

In [27]:
# In-sample residuals on the original scale. They are close to zero, i.e. the
# regression overfits heavily, since there are more features than target values.
fitted_values = np.exp(reg.predict(X))
fitted_values - y

array([-1.8477440e-06, -7.8125000e-03,  8.7890625e-03,  9.7656250e-04,
        5.8593750e-03, -9.5367432e-07, -2.9296875e-03], dtype=float32)

In [28]:
# Same residuals for the Bayesian model (mean over the samples): no overfitting.
mean_days = jnp.mean(preds_samples[Site.days], axis=0)
mean_days[store_id, :known_days] - y

DeviceArray([5634.325   , -189.23486 , -944.6699  , -159.33008 ,
              -24.464844, 5609.98    ,  185.25    ], dtype=float32)

### Compare the coefficients of conventional regression to the hierarchical model

In [29]:
# For many features there is no meaningful value, i.e. 0, since they were not
# encountered in training.
regression_coefs = reg.coef_
print(regression_coefs)

[ 1.1747563  -2.2386415   2.1442924   1.9889396   1.9385103   1.8024149
 -6.8102717   1.1747563   2.2386413  -2.2386413   0.          0.
 -0.09434899  0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.        ]


In [30]:
# With the global prior it is possible to derive meaningful values for every
# coefficient: average the 200 sampled coefficient sets and show this store's row.
coefs_samples = pred_local_handler.predict(
    X_test_known,
    return_sites=[Site.coefs],
    num_samples=200,
)
mean_coefs = jnp.mean(coefs_samples[Site.coefs], axis=0)
print(mean_coefs[store_id])

[2.7901838  2.6591167  2.6881819  2.6126733  2.6567695  2.554392
 2.4938328  0.332276   3.115697   2.926444   2.6920984  2.9548492
 0.05613966 0.06542115 2.8379254  2.9023964  3.5701404  3.2074354
 4.056987   2.9304535  2.7463415  2.8231895  2.9590065 ]


## Now compare those coefficients to the ones fitted on the whole time-series

In [31]:
# Fit a separate local handler on the complete series of the selected store
# only; the slice keeps the leading axis so the input stays three-dimensional.
single_store_series = X_test[store_id:store_id + 1]
all_local_handler = ModelHandler(
    model=model,
    guide=local_guide(train_handler.model_params),
)
all_local_handler.fit(
    single_store_series,
    n_epochs=10_000,
    log_freq=1_000,
    lr=0.001,
)

epoch:     0 loss:        7813.1240
epoch:  1000 loss:        7226.9434
epoch:  2000 loss:        7251.1406
epoch:  3000 loss:        7163.1885
epoch:  4000 loss:        7166.0078
epoch:  5000 loss:        7116.7900
epoch:  6000 loss:        7119.0220
epoch:  7000 loss:        7191.9624
epoch:  8000 loss:        7128.1733
epoch:  9000 loss:        7165.4966
epoch: 10000 loss:        7136.5723


7158.43896484375

In [32]:
# Many coefficients are very similar to the ones from the fit on the known
# days only, but keep in mind that these values live in log-space!
all_coefs_samples = all_local_handler.predict(
    X_test[store_id:store_id + 1],
    return_sites=[Site.coefs],
    num_samples=200,
)
mean_all_coefs = jnp.mean(all_coefs_samples[Site.coefs], axis=0)
print(mean_all_coefs[0])

[ 2.8149917   2.7551818   2.618212    2.6453648   2.6726866   2.5200973
  2.59303     0.22652954  3.1844525   3.116345    2.542967    2.9477375
 -0.0321863   0.06836611  2.872647    2.9254913   3.56679     3.215816
  4.052345    2.9164748   2.724135    2.8247738   2.9598227 ]
