In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import torch.utils.data as data_utils
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Seed NumPy as well as torch: the training loop further down picks its
# split position with np.random.randint, so seeding torch alone does not make
# a run repeatable.
np.random.seed(1)
torch.manual_seed(1)

<torch._C.Generator at 0x118093228>

In [29]:
# Mini-batch size given to the DataLoader; train() also compares every batch
# it draws against this value.
batch_size = 16

In [4]:
# Directory that holds train_1.csv (a relative path).
base_dir = '../data/'

In [5]:
# Read only the first 100 rows and replace missing values with 0.
# os.path.join keeps the path valid whether or not base_dir ends with a slash.
# NOTE(review): the name `train` is rebound by the `def train()` cell further
# down, so this cell has to be re-run before any cell that expects the DataFrame.
train = pd.read_csv(os.path.join(base_dir, 'train_1.csv'), nrows=100).fillna(0)

In [12]:
# Feature matrix: every column except the 'Page' identifier.
# NOTE(review): only 'Page' is dropped, so the '2016-12-31' column used as the
# target below is still part of X.
X = train.drop(labels='Page', axis='columns').values
# Target vector: the '2016-12-31' column.
Y = train.loc[:, '2016-12-31'].values

In [16]:
# Remember the 2-D shape of X so it can be restored after the flatten/scale step.
shape = X.shape
shape

(100, 550)

In [17]:
Y.shape

(100,)

In [21]:
# Scale X to [0, 1] using a single global min/max: the matrix is flattened to
# one column for fitting and then put back into its original shape.
x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(X.reshape(-1, 1)).reshape(shape)
# Use a separate scaler for the target. Refitting one shared scaler on Y would
# throw away the fit for X and make it impossible to inverse-transform values
# expressed in X's scale. `sc` still ends up fitted on Y, exactly as before.
sc = MinMaxScaler()
Y = sc.fit_transform(Y.reshape(-1, 1)).reshape(-1)

In [22]:
X.shape

(100, 550)

In [23]:
Y.shape

(100,)

In [33]:
torch.from_numpy(Y).unsqueeze(-1).shape

torch.Size([100, 1])

The `unsqueeze(-1)` calls below add one more dimension (of size 1, at the end) to each tensor. I also believe `shuffle=True` only shuffles along the first dimension, i.e. it reorders whole sequences rather than the values inside them. Shuffling could probably be turned off with no ill effects.

In [34]:
# Convert both arrays to float32 tensors and append a trailing axis of size 1.
series_tensor = torch.from_numpy(X).float().unsqueeze(-1)
target_tensor = torch.from_numpy(Y).float().unsqueeze(-1)

# Batches pair each series with its target; rows are reshuffled on every pass.
trainloader = data_utils.DataLoader(
    data_utils.TensorDataset(series_tensor, target_tensor),
    batch_size=batch_size,
    shuffle=True,
)

The below is borrowed from some bloke on the discussion

In [35]:
# NOTE: despite its name, this value is passed to nn.RNN as `input_size`
# (the number of features per time step), not as a number of time steps.
seq_len = 1

In [None]:
class RNN(nn.Module):
    """Two-layer ReLU RNN with a linear head that emits one value per sequence.

    Parameters
    ----------
    input_size : int, optional
        Number of features per time step. When omitted, the notebook-level
        ``seq_len`` value is used, which is what the original definition did.
    """

    def __init__(self, input_size=None):
        super().__init__()

        if input_size is None:
            # Fall back to the notebook-level setting.
            input_size = seq_len

        self.hidden_units = 128
        # Empty container kept so the module's attributes stay the same;
        # forward() never uses it.
        self.model = nn.Sequential()

        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=self.hidden_units,
            num_layers=2,  # number of RNN layers
            batch_first=True,  # batch dimension is first
            nonlinearity='relu',
            dropout=0.2,
        )

        # I can change the below to two softplus outputs for
        # mean and variance in the paper version (see notes below)
        self.out = nn.Linear(self.hidden_units, 1)

    def forward(self, x, h_state=None):
        """Run the RNN over ``x`` and project the last time step to one value.

        Parameters
        ----------
        x : tensor of shape (batch, time_step, input_size)
        h_state : tensor of shape (n_layers, batch, hidden_size) or None
            Initial hidden state; ``None`` lets ``nn.RNN`` start from zeros.

        Returns
        -------
        (prediction, h_state)
            ``prediction`` has shape (batch, 1); ``h_state`` is the hidden
            state returned by ``nn.RNN``.
        """
        # r_out: (batch, time_step, hidden_size)
        r_out, h_state = self.rnn(x, h_state)

        # Only the final time step is returned, so the linear head is applied
        # to that step alone instead of to every step followed by a slice.
        return self.out(r_out[:, -1, :]), h_state

Notes on the above 
- should check which activation I use (remember something about atan being good for LSTM)
- num hidden units is the same, as is dropout, as is the number of rnn layers
- i explicitly gave whole sequence and number of timesteps (as opposed to `seq_len = 1` above - mine is literally just as if `seq_len` is larger) but should be the same thing (based on my somewhat shaky understanding of Keras LSTM). however might be more difficult to implement [this paper](https://arxiv.org/pdf/1704.04110.pdf) in the Keras version which suggests 
  - outputting mean and variance and maximising log likelihood of negative binomial distribution
  - including extra features specifically day of week, week/month of year, and 
  - an embedding to capture groupings of like pages
- however could be fine - eg metafeatures would be easily implemented via functional api
- the only interesting bit is the bit which essentially saves the outputs as a sequence (aka, what I was doing via Keras' built in features) 
  

Below directly ripped from discussion guy's gist

In [None]:
def train(pred_date_len=60, n_steps=200, lr=1e-3):
    """Fit an ``RNN`` on randomly positioned windows drawn from ``trainloader``.

    Each step takes one full batch of series, picks a random split position,
    feeds everything before the split to the network and regresses (MSE) onto
    the sum of the ``pred_date_len`` values that follow the split. When the
    loop finishes, the model is written to ``rnn.mdl`` and the last hidden
    state is pickled to ``h_state``.

    Parameters
    ----------
    pred_date_len : int
        Number of time steps summed to form the target.
        NOTE(review): the original code used this name without ever defining
        it; 60 is a placeholder — confirm the intended forecast horizon.
    n_steps : int
        Number of optimisation steps (the original hard-coded 200).
    lr : float
        Adam learning rate (the original hard-coded 1e-3).

    Raises
    ------
    ValueError
        If ``pred_date_len`` is not positive, if the loader holds fewer than
        ``batch_size`` sequences, or if the sequences are too short to place a
        split with ``pred_date_len`` steps on either side.
    """
    import pickle  # only needed for the hidden-state dump at the end

    if pred_date_len < 1:
        raise ValueError('pred_date_len must be a positive integer')
    if len(trainloader.dataset) < batch_size:
        # Without this guard the batch-drawing loop below would never finish.
        raise ValueError('trainloader holds fewer than batch_size sequences')

    # Use the GPU when one is available instead of requiring CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    rnn = RNN().to(device)
    print(rnn)
    optimizer = torch.optim.Adam(rnn.parameters(), lr=lr)   # optimize all rnn parameters
    #here the loss function would have to be changed to the log likelihood version
    #and possibly written - which would mean computing gradient (and hessian?) of it
    loss_func = nn.MSELoss()
    h_state = None      # for initial hidden state

    dataiter = iter(trainloader)
    for step in range(n_steps):
        # Draw the next full-size batch. Restart the loader when it is
        # exhausted, and skip a trailing partial batch: the carried-over
        # hidden state is sized for exactly batch_size sequences.
        x_all = None
        while x_all is None or x_all.size(0) < batch_size:
            try:
                x_all, _ = next(dataiter)
            except StopIteration:
                dataiter = iter(trainloader)

        n_days = x_all.size(1)
        if n_days <= 2 * pred_date_len:
            raise ValueError(
                'sequences of length %d are too short for pred_date_len=%d'
                % (n_days, pred_date_len)
            )
        pos = np.random.randint(pred_date_len, n_days - pred_date_len)

        x = x_all[:, :pos, :].to(device)
        # Summing over the time axis leaves shape (batch, 1), which matches the
        # prediction. A flat (batch,) target would be silently broadcast
        # against it inside MSELoss.
        y = x_all[:, pos:pos + pred_date_len, :].sum(dim=1).to(device)

        prediction, h_state = rnn(x, h_state)   # rnn output
        # !! next step is important !!
        # Detach the hidden state so backprop does not reach into the previous
        # iteration's graph.
        h_state = h_state.detach()

        loss = loss_func(prediction, y)         # mean squared error
        optimizer.zero_grad()                   # clear gradients for this training step
        loss.backward()                         # backpropagation, compute gradients
        print(step, loss.item())
        optimizer.step()                        # apply gradients

    torch.save(rnn, 'rnn.mdl')
    with open('h_state', 'wb') as fh:
        pickle.dump(h_state, fh)
    print('saved rnn.mdl and h_state.')

Notes on above:
- rewriting the loss function to the log likelihood maximizer for negative binomial distribution is doable
    - upon thinking further it seems that a poisson distribution should more accurately model pageviews (see [this paper](http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=18EDC4D891D952D5706850A0B33D0561?doi=10.1.1.77.3333&rep=rep1&type=pdf) for discussion to the contrary though) and pytorch already has a loss function for the negative log likelihood w/ poisson distribution (`nn.PoissonNLLLoss`)
    - _however_ we're scaling the inputs so we lose the whole integer thing anyway, in which case a logistic distribution might be more appropriate? hooker will know more about this. pytorch will have some log likelihood loss function 
- adding meta features is again doable by concatenating them onto the `x` and `y` inputs built inside the training loop of `train()`
- to implement an embedding, would presumably have to add the embedding variables to the optimizer's initialiser, then do a lookup to concatenate them before giving them to the RNN. 
  - though it seems somewhat likely that pytorch has something built to deal with embeddings already too