In [4]:
import torch
from torch import nn
import numpy as np
import unidecode
import string
import random

# Load the corpus and transliterate it to plain ASCII with unidecode.
# The context manager closes the file handle as soon as the text has been read.
with open('sherlock.txt') as corpus_file:
    train_data = unidecode.unidecode(corpus_file.read())
vocab = string.printable  # every printable ASCII character is a vocabulary symbol
vocab_length = len(vocab)  # number of symbols in the vocabulary
data_len = len(train_data)  # number of characters in the training corpus

# utility function
# get_batch randomly samples a contiguous window of batch_size + 1 characters from a text corpus
# (the extra character allows the window to be split into inputs and next-character targets)
def get_batch(text_corpus, batch_size=100):
    """Return a random contiguous window of ``batch_size + 1`` items from ``text_corpus``.

    The window is one item longer than ``batch_size`` so that it can be split
    into ``batch_size`` inputs and ``batch_size`` shifted targets.

    Args:
        text_corpus: sliceable sequence (e.g. a string) to sample from.
        batch_size: number of input positions wanted; the returned window has
            ``batch_size + 1`` items.

    Returns:
        A slice of ``text_corpus`` of length exactly ``batch_size + 1``.

    Raises:
        ValueError: if ``text_corpus`` has fewer than ``batch_size + 1`` items.
    """
    window = batch_size + 1
    # Measure the corpus that was actually passed in, not a module-level global.
    corpus_len = len(text_corpus)
    if corpus_len < window:
        raise ValueError(
            "text_corpus has %d items but a window of %d is required"
            % (corpus_len, window)
        )
    # random.randint is inclusive on both ends, so the last valid start is
    # corpus_len - window; anything later would return a truncated window.
    start = random.randint(0, corpus_len - window)
    return text_corpus[start:start + window]

## Creating network

In [19]:
class RNN(nn.Module):
    """Character-level recurrent model: embedding -> GRU -> linear -> log-softmax.

    The model consumes a single sequence at a time (batch size 1) and emits,
    for every position, log-probabilities over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, n_layers=1, rnn_type='gru'):
        """Build the embedding, recurrent and output layers.

        Args:
            vocab_size: number of distinct symbols; sizes both the embedding
                table and the output layer.
            embedding_size: dimension of each embedding vector.
            hidden_size: dimension of the recurrent hidden state.
            n_layers: number of stacked recurrent layers.
            rnn_type: recurrent cell to use; only ``'gru'`` is implemented.

        Raises:
            NotImplementedError: if ``rnn_type`` is anything other than ``'gru'``.
        """
        super().__init__()
        self._vocab_size = vocab_size
        self._embedding_size = embedding_size
        self._hidden_size = hidden_size
        self._n_layers = n_layers

        # create layers. If rnn_type is gru, use gru
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        if rnn_type == 'gru':
            self.rnn = nn.GRU(embedding_size, hidden_size, n_layers)
        else:
            # other cells (for example an LSTM) are still to be implemented
            raise NotImplementedError(
                "rnn_type %r is not implemented; use 'gru'" % (rnn_type,)
            )
        self.h2o = nn.Linear(hidden_size, vocab_size)  # the hidden to output layer
        # The scores fed to this layer are [seq_len, vocab_size], so the
        # normalisation must run over dim=1 (the vocabulary). dim=0 would make
        # each symbol's probabilities sum to one across time steps instead.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, h):
        """Run one sequence through the network.

        Args:
            x: 1-D long tensor of symbol indices, shape ``[seq_len]``,
                e.g. ``x = [0, 5, 24, 0, 66]``.
            h: initial hidden state of shape ``(n_layers, 1, hidden_size)``.

        Returns:
            A tuple ``(log_softmax, hidden)``: ``log_softmax`` has shape
            ``[seq_len, vocab_size]`` and holds per-position log-probabilities
            over the vocabulary; ``hidden`` is the final hidden state returned
            by the recurrent layer.
        """
        # step 1: get sequence length
        seq_len = x.size(0)
        # step 2: embed the indices and reshape to [seq_len x batch_size(=1) x input_dim]
        embed = self.embedding(x).view(seq_len, 1, -1)
        # step 3: run the recurrent layers, passing in the hidden state
        rnn_out, hidden = self.rnn(embed, h)
        # step 4: flatten to 2D and apply the h2o (hidden to output) layer to
        # get the non-normalized scores
        prediction = self.h2o(rnn_out.view(seq_len, -1))
        # step 5: normalize the scores over the vocabulary with log-softmax
        log_softmax = self.softmax(prediction)
        return log_softmax, hidden

    def init_hidden(self):
        """Return an all-zero hidden state of shape ``(n_layers, 1, hidden_size)``.

        The tensor is created with the same device and dtype as the model
        weights so it can be fed to ``forward`` after the module has been moved.
        """
        return self.h2o.weight.new_zeros(self._n_layers, 1, self._hidden_size)
        
            
    

In [22]:
# practice tests

# step 1: create our network; the vocabulary size comes from vocab_length rather
# than a hard-coded 100 (embedding and hidden sizes are both 100)
net = RNN(vocab_length, 100, 100)
# step 2: create a training batch of data, size 101, and convert each character
# to its position in vocab as a pytorch long tensor
dat_text = get_batch(train_data, 100)
dat = torch.LongTensor([vocab.find(item) for item in dat_text])
# str.find returns -1 for a character that is not in vocab; fail here rather
# than inside the embedding lookup
assert dat.min() >= 0, "batch contains a character outside the vocabulary"
# step 3: convert our dat into input/output: the target is the input shifted by one
inp, target = dat[:-1], dat[1:]
ho = net.init_hidden()
# step 4: forward pass (no gradients needed for this smoke test) and display the
# per-position log-probabilities
with torch.no_grad():
    log_probs, ho_next = net(inp, ho)
log_probs


tensor([[-4.7465, -4.5403, -4.6543,  ..., -4.6799, -4.7051, -4.6042],
        [-4.6086, -4.5102, -4.4681,  ..., -4.7299, -4.6988, -4.5894],
        [-4.6840, -4.5864, -4.6596,  ..., -4.5976, -4.5235, -4.4925],
        ...,
        [-4.6045, -4.6790, -4.5352,  ..., -4.6645, -4.6794, -4.6135],
        [-4.7059, -4.6267, -4.7804,  ..., -4.6240, -4.9693, -4.2340],
        [-4.7111, -4.7448, -4.5945,  ..., -4.6057, -4.8176, -4.4243]])

In [None]:
# Instantiate the criterion: nn.CrossEntropyLoss is a module class and has to be
# constructed before it can be called as loss(prediction, target).
loss = nn.CrossEntropyLoss()

In [None]:
# NOTE(review): this cell repeats the previous one; one of the two can be removed.
# Instantiate the criterion: nn.CrossEntropyLoss is a module class and has to be
# constructed before it can be called as loss(prediction, target).
loss = nn.CrossEntropyLoss()

In [30]:
# NOTE(review): `a` is not defined in any visible cell -- presumably a tensor left
# over from an earlier kernel session; confirm before relying on this cell.
a.size()

torch.Size([191, 1, 100])

In [192]:
# Two lookup tables over the vocabulary: position -> symbol and symbol -> position.
idx_to_word = dict(enumerate(vocab))
word_to_idx = {symbol: position for position, symbol in idx_to_word.items()}

In [195]:
# Spot-check both lookup directions for the same symbol.
h_position = word_to_idx['h']
symbol_at_17 = idx_to_word[17]
print(h_position, symbol_at_17)

17 h


In [12]:
# IPython-only help lookup for nn.GRU (interactive exploration; this line is not
# valid plain Python and can be dropped from a finished notebook).
nn.GRU?

In [143]:
# Shape demo for a 2-layer GRU fed a random batch.
rnn = nn.GRU(input_size=10, hidden_size=20, num_layers=2)
# input is laid out as [seq_len=5, batch=3, input_size=10]
input = torch.randn(5, 3, 10)
# initial hidden state: [num_layers=2, batch=3, hidden_size=20]
h0 = torch.randn(2, 3, 20)
output, hn = rnn(input, hx=h0)

In [4]:
# Embedding table with 100 rows (one per symbol index), each a 20-dimensional vector.
embed = nn.Embedding(num_embeddings=100, embedding_dim=20)

In [6]:
# Look up a 100 x 100 grid of random symbol indices covering the whole table.
# torch.rand(...).long() would truncate uniform [0, 1) floats to 0, so every
# lookup would hit row 0; torch.randint draws real indices instead.
out = embed(torch.randint(0, embed.num_embeddings, (100, 100)))

out.shape

# IPython-only help lookup for nn.Embedding (interactive exploration; this line is
# not valid plain Python and can be dropped from a finished notebook).
nn.Embedding?

In [196]:
# Sample string used by the iteration examples below.
g = "hello tofuboi"

In [197]:
# Iteration style 1: walk the positions and index into the string.
for position in range(len(g)):
    letter = g[position]
    print(letter)

h
e
l
l
o
 
t
o
f
u
b
o
i


In [198]:
# Iteration style 2: iterate over the characters directly.
for letter in g:
    print(letter)

h
e
l
l
o
 
t
o
f
u
b
o
i


In [200]:
# Iteration style 3: enumerate yields each position together with its character.
for position, letter in enumerate(g):
    print(position, letter)

0 h
1 e
2 l
3 l
4 o
5  
6 t
7 o
8 f
9 u
10 b
11 o
12 i
