In [38]:
%load_ext autoreload
%autoreload 2
import sys
import gc

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as data_utils
from torch.autograd import Variable

from tqdm import tqdm

sys.path.append('../')
from wiki.utils import clock
from wiki import rnn, rnn_predict, newphet, val, submissions
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [78]:
# Directory holding the input CSV files, relative to this notebook. The
# trailing slash matters: file names are appended to it by plain concatenation.
base_dir = "../data/"

In [79]:
# Read the training table. base_dir already ends in '/', so the file name is
# simply appended to it.
train_path = ''.join((base_dir, 'train_1.csv'))
train_df = pd.read_csv(train_path)

In [83]:
# Everything except the 'Page' label column, as a plain 2-D array
# (rows follow train_df's rows, columns follow its remaining columns).
values = train_df.drop(labels='Page', axis='columns').values
values.shape

(145063, 550)

Features to add:
1. AGE (number in sequence)
2. DOW
3. WEEK OF YEAR
4. EMBEDDING 
  - with dimensions roughly taken from the Amazon paper<sup>[1](https://arxiv.org/pdf/1704.04110.pdf)</sup>
    - 20 output dimensions, since they did a grid search and found over a large range of input sizes that 20 was best
    - all series as input dimensions. In the paper they picked only 5, but they don't seem to say what these were. 
  - This would increase the number of parameters by a factor of 20, so may well be computationally intractable. To lower:
    - Could lower input dimensions by grouping pages together (lowering number of parameters by 3\*)
    - Could lower output dimensions
    - If all else fails, could use language for the input dimensions

In [89]:
# Column labels after the first column are treated as the date labels of the
# series; keep the first and last so the full calendar range can be rebuilt.
dates = train_df.columns[1:].values
s_date, e_date = dates[0], dates[-1]

In [105]:
# Rebind `dates` from the raw column labels to a DatetimeIndex covering
# s_date..e_date, so calendar accessors such as .dayofweek are available.
dates = pd.date_range(start=s_date, end=e_date)

In [114]:
# Per-date calendar features, one entry per element of `dates`.
ages = np.arange(len(dates))    # position of each date in the sequence
dows = dates.dayofweek.values   # day of week (pandas: Monday=0 ... Sunday=6)
# DatetimeIndex.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement. Fall back to the old attribute on
# pandas versions that predate isocalendar().
if hasattr(dates, 'isocalendar'):
    # isocalendar() yields a nullable UInt32 column; convert it to a plain
    # integer ndarray so it stacks with the other feature arrays.
    woys = dates.isocalendar().week.to_numpy(dtype=np.int64)
else:
    woys = dates.weekofyear.values

In [145]:
# One integer id per series, shaped as a column vector (n_series, 1).
# NumPy broadcasting only prepends missing axes on the left, so the trailing
# axis has to be added by hand for this to broadcast against `values`.
series_idxs = np.arange(values.shape[0])[:, np.newaxis]

In [154]:
def br(x):
    """Broadcast ``x`` to the shape of the global ``values`` array.

    Args:
        x: array-like that is broadcastable to ``values.shape``.

    Returns:
        A read-only NumPy view of ``x`` with shape ``values.shape``; the
        underlying data is not copied.

    Raises:
        ValueError: if ``x`` cannot be broadcast to ``values.shape``.
    """
    # `values` is looked up at call time, so br always follows the current array.
    return np.broadcast_to(x, values.shape)

In [157]:
# Stack the raw values with the broadcast side features along a new last axis.
# Last-axis layout: [value, age, day-of-week, week-of-year, series id].
feature_planes = [values] + [br(f) for f in (ages, dows, woys, series_idxs)]
features = np.stack(feature_planes, axis=-1)

In [158]:
# Sanity check: the last axis should have length 5, one slot per stacked array.
features.shape

(145063, 550, 5)

In [2]:
# Instantiate the project's RNN with its default constructor arguments.
model = rnn.RNN()

In [29]:
#Placeholder for the input. Only its shape matters here; np.empty leaves the
#contents uninitialised.
n_series, n_steps, n_feats = 145063, 490, 1
X = np.empty((n_series, n_steps, n_feats))

In [33]:
# One learnable 20-dimensional vector per series; the table size is taken from
# the first axis of the placeholder X.
embedding = nn.Embedding(X.shape[0], 20)

Will need to add a few things to the last dimension of X. Currently it just holds the time series value, but we'll want to increase it to size 5 - to hold (time series value, age, DOW, WOY, embedding id) 

In [69]:
# The X defined earlier is a NumPy placeholder, which has no .long(); the
# recorded output (torch.Size([256, 490])) came from a torch batch that is not
# defined in the visible cells. Rebuild an equivalent batch from `features` so
# this cell and the ones after it run on a fresh kernel:
# first 256 series, first 490 time steps, all 5 feature slots.
X = Variable(torch.from_numpy(np.ascontiguousarray(features[:256, :490])).float())
# The last feature slot holds the series id; cast it to integer indices.
X[:,:,-1].long().size()

torch.Size([256, 490])

In [74]:
# Look up the embedding vectors for the series-id slot (index 4) and inspect
# the tensor type behind the result.
series_ids = X[:, :, 4].long()
type(embedding(series_ids).data)

torch.FloatTensor

In [75]:
# Tensor type of everything except the trailing slot, for comparison with the
# embedding output before concatenating the two.
dense_feats = X[:, :, :-1]
type(dense_feats.data)

torch.FloatTensor

In [77]:
# Replace the trailing integer series id with its embedding vector and append
# it to the remaining features (4 + 20 = 24 values per time step).
# The id slot is addressed as -1 so it always matches the `:-1` slice.
series_ids = X[:, :, -1].long()
# .size() limits the cell output to the shape, matching the recorded output,
# instead of printing the whole tensor.
torch.cat([X[:, :, :-1], embedding(series_ids)], dim=2).size()

torch.Size([256, 490, 24])

In [5]:
# Display the module repr to inspect the layer configuration.
model

RNN (
  (rnn): GRU(1, 128, num_layers=2, batch_first=True, dropout=0.2)
  (out): Linear (128 -> 1)
  (loss_func): L1Loss (
  )
)

In [22]:
# Total number of scalar parameters across every parameter tensor of the model.
count = sum(p.numel() for p in model.parameters())

In [23]:
# Show the parameter total computed in the previous cell.
count

149505

In [28]:
# Size of the proposed embedding table relative to the current model:
# 145063 series x 20 embedding dims, divided by 3 for the page-grouping idea.
embedding_params = 145063 * 20 / 3
embedding_params / count

6.468590794064858