In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import torch.utils.data as data_utils
import pandas as pd
import os
import sys
import gc
import pickle as pkl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
sys.path.append('../')
from wiki.utils import clock

In [2]:
torch.manual_seed(1)
torch.cuda.set_device(0)

In [3]:
batch_size = 256

In [6]:
base_dir = '../data/'

In [7]:
train = pd.read_csv(base_dir+'train_1.csv').fillna(0)

In [8]:
X = train.drop('Page', axis=1).values

In [9]:
shape = X.shape ; shape

(145063, 550)

In [10]:
sc = StandardScaler()
X = sc.fit_transform(X.T).T
print(X.shape)
assert(np.isclose(np.mean(X[0]),0))
# input shape: samples, timesteps, features
X = X.reshape(X.shape[0], X.shape[1], 1)
print(X.shape)
np.max(X)

(145063, 550)
(145063, 550, 1)


23.43074902772026

In [11]:
X.shape

(145063, 550, 1)

In [12]:
X_train, X_val = train_test_split(X, test_size=0.1, random_state=12)

The unsqueeze below is adding another dimension to the tensors (size 1, at the end). Also I believe the shuffle only shuffles the first dimension (thus shuffling the sequences whole). This could also probably be turned off with no ill effects

In [13]:
trainloader = data_utils.DataLoader(
    data_utils.TensorDataset(
        torch.from_numpy(X_train).float().unsqueeze(-1)
    ),
    batch_size=batch_size, shuffle=True
)

In [14]:
valloader = data_utils.DataLoader(
    data_utils.TensorDataset(
        torch.from_numpy(X_val).float().unsqueeze(-1)
    ),
    batch_size=batch_size, shuffle=False
)

The below is borrowed from some bloke on the discussion

In [15]:
seq_len = 1

In [17]:
class RNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.hidden_units = 128
        self.n_layers = 2
        
        self.rnn = nn.GRU(
            input_size=1,
            hidden_size=self.hidden_units,
            num_layers=self.n_layers, #number of RNN layers
            batch_first=True, #batch dimension is first
            #nonlinearity='relu',
            dropout=0.2
        )

        #I can change the below to two softplus outputs for
        #mean and variance in the paper version (see notes below)
        self.out = nn.Linear(self.hidden_units, 1)
        
    def forward(self, x, h_state):
        # dimensions:
        # x (batch, time_step, input_size)
        # h_state (n_layers, batch, hidden_size)
        # r_out (batch, time_step, hidden_size)
        r_out, h_state = self.rnn(x, h_state)
        return self.out(r_out), h_state
    
    def init_hidden(self, batch_size):
        hidden = Variable(torch.zeros(self.n_layers, batch_size, self.hidden_units)).cuda()
        return hidden

Notes on the above 
- todo implement [this paper](https://arxiv.org/pdf/1704.04110.pdf) which suggests
  - outputting mean and variance and maximising log likelihood of negative binomial distribution
  - including extra features specifically day of week, week/month of year, and 
  - an embedding to capture groupings of like pages

In [18]:
for x in trainloader:
    print(x.shape)
    break

torch.Size([256, 550, 1]) torch.Size([256, 1])


In [19]:
print(X.shape[0])

145063


In [20]:
def predict_batch(model, batch, pred_date_len):
    output = []
    h_state = model.init_hidden(batch.size()[0])
    x=Variable(batch, volatile=True).cuda()
    encoder_out, h_state = model(x, h_state)

    input_variable = encoder_out[:,-1:,:]
    output.append(input_variable)
    for i in range(pred_date_len-1):
        encoder_out, h_state = model(input_variable, h_state)
        input_variable = encoder_out
        output.append(encoder_out)
    
    return torch.cat(output, dim=1)

In [21]:
def predict(model, dataloader, pred_date_len):
    all_output = []
    for data_all in dataloader:
        output = predict_batch(model, data_all, pred_date_len)
        all_output.append(output)
    return torch.cat(all_output, dim=0)

In [22]:
def validate(model, valloader, pred_date_len):
    loss_func = nn.L1Loss()
    loss = 0
    for data_all in valloader:
        sequences = data_all[:,:-pred_date_len,:]
        targets = Variable(data_all[:,-pred_date_len:,:], volatile=True).cuda()
        output = predict_batch(model, sequences, pred_date_len)
        loss += loss_func(output, targets)
    print('Val loss, %f' % float(loss.data[0])/pred_date_len)

In [23]:
def train(trainloader, valloader, batch_size):
    rnn = RNN().cuda()
    print(rnn)
    pred_date_len = 60
    teacher_forcing_ratio = 0.5
    
    optimizer = torch.optim.Adam(rnn.parameters(), lr=1e-3)
    #here the loss function would have to be changed to the log likelihood version
    #and possibly written - which would mean computing gradient (and hessian?) of it
    loss_func = nn.L1Loss()

    for x_all in trainloader:
        loss = 0
        h_state = rnn.init_hidden(batch_size)  
        if x_all.size(0)<batch_size:
            dataiter = iter(trainloader)
            x_all, _ = dataiter.next()
        pos=np.random.randint(pred_date_len,x_all.size(1)-pred_date_len)
        # print('pos:',pos,x_all.size())
        x=Variable(x_all[:,:pos,:]).cuda()
        y=Variable(x_all[:,pos:pos+pred_date_len,:]).cuda()
        
        encoder_out, h_state = rnn(x, h_state) #run through 'encoder' stage
        
        #Now 'decoder' stage
        use_teacher_forcing = np.random.rand() < teacher_forcing_ratio
        for i in range(pred_date_len):
            input_variable = y[:,i:i+1,:] if use_teacher_forcing else encoder_out[:,-1:,:]
            encoder_out, h_state = rnn(input_variable, h_state)
            loss += loss_func(encoder_out, y[:,i:i+1,:])
        # print(x, y)

        #This next line repacks the hidden state, breaking the connection
        #so that the backprop doesn't try to keep going back.
        #Only necessary for stateful model (which seems wrong here, between
        #different series)
        #h_state = Variable(h_state.data)
              # cross entropy loss
        optimizer.zero_grad()                   
        loss.backward() 
        if step % 20 == 0:
            print(step,loss.data[0] / pred_date_len)
        optimizer.step()
    
    torch.save(rnn,'rnn.mdl')
    validate(rnn, valloader, pred_date_len) 
    #pkl.dump(h_state,open('h_state','wb'))
    print('saved rnn.mdl and h_state.')
    return rnn

Notes on above:
- rewriting the loss function to the log likelihood maximizer for negative binomial distribution is doable
    - upon thinking further it seems that a poisson distribution more accurately should model pageviews (see [this paper](http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=18EDC4D891D952D5706850A0B33D0561?doi=10.1.1.77.3333&rep=rep1&type=pdf) for discussion to the contrary though) and pytorch has a loss function for negative likelihood w/ poisson distribution already
    - _however_ we're scaling the inputs so we lose the whole integer thing anyway, in which case a logistic distribution might be more appropriate? hooker will know more about this. pytorch will have some log likelihood loss function 
- adding meta features again doable by concatenating them onto the `Variable`s
- to implement an embedding, would presumably have to add the embedding variables to to optimizer's initialiser, then do a lookup to concatentate them before giving them to the RNN. 
  - though it seems somewhat likely that pytorch has something built to deal with embeddings already too

In [26]:
with clock():
    rnn = train(trainloader, valloader, batch_size)

RNN (
  (rnn): GRU(1, 128, num_layers=2, batch_first=True, dropout=0.2)
  (out): Linear (128 -> 1)
)
0 0.003522155433893204
20 0.0001297897504021724
40 2.3441255325451494e-05
60 1.0610820997195939e-05
80 8.707001688890159e-06
100 7.879538073514898e-06
120 6.106500707877179e-06
140 5.111899129891147e-06
160 4.2763126354354124e-06
180 3.6098404962103815e-06
200 3.006507419437791e-06
220 2.706205608167996e-06
240 2.344474584485094e-06
260 2.053584224389245e-06
280 1.8520832478922481e-06
300 2.2827647626399993e-06
320 1.4572029613191263e-06
340 1.3739812857238575e-06
360 1.1985051969531923e-06
380 1.0661109020778288e-06
400 1.1393985914764926e-06
420 8.963846388117721e-07
440 7.728679823533943e-07
460 2.7284064951042333e-06
480 7.018406904535368e-07
500 6.303918174429175e-07
520 5.623238394036889e-07
540 5.403860996011645e-07
560 5.268810127745383e-07
saved rnn.mdl and h_state.
Elapsed time 164.19793605804443 seconds


  "type " + obj.__name__ + ". It won't be checked "


In [28]:
validate(rnn, valloader, 60)

RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1503970438496/work/torch/lib/THC/generic/THCStorage.cu:66