In [1]:
import torch
from torch import nn
import numpy as np
import unidecode
import string
import random

train_data = unidecode.unidecode(open('sherlock.txt').read()) # load the text file, reading it
vocab = string.printable # use all printable string characters as vocabulary
vocab_length = len(vocab) # vocabulary length
data_len = len(train_data) # get length of training data

# utility function
# get_batch utility function will randomly sample a batch of data of size k from a text corpus
def get_batch(text_corpus, batch_size=100):
    start = random.randint(0, data_len-batch_size)
    end = start + batch_size + 1
    return text_corpus[start:end]

## Creating network

In [21]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, n_layers=1, rnn_type='gru'):
        """rnn class making
        
        
        """
        super().__init__()
        self._vocab_size = vocab_size # this is our vocabulary size, i.e 100
        self._embedding_size = embedding_size # this is our embedding size, i.e the output size of embedding our sparse
        # matrix, set at say 50
        self._hidden_size = hidden_size # hidden size for the hidden rnn
        self._n_layers = n_layers
        
        # create layers. If rnn_type is gru, use gru
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        if rnn_type == 'gru':
            self.rnn = nn.GRU(embedding_size, hidden_size, n_layers)
        else:
            raise NotImplementedError # this is to be implemented, for example replace with lstm
        self.h2o = nn.Linear(hidden_size, vocab_size) # the hidden to output layer
        self.softmax = nn.LogSoftmax(dim=0) # numerically stable implementation of log of softmax. Need the log-softmax for
        # computing the cross entropy log loss
        
    def forward(self, x, h):
        """given an x and a hidden h, forward pass through our network. Our final output should be a softmax prediction
        over all the vocabulary.
        
        Args:
            x: input of shape [seq_len] x will be a long tensor of size seq_len, essentially a list of integers ranging from
            0 to 100, i.e x = [0, 5, 24, 0, 66]
            h: h_0** of shape `(num_layers * num_directions, batch, hidden_size)`
        
        """
        # step 1, get sequence length
        seq_len = x.size()[0]
        # step 2. pass our input through our embedding layer, and get the output "embed", reshape it via view to get
        # it ready for the rnn layer
        embed = self.embedding(x).view(seq_len, 1, -1) # rnn takes input of shape [seq_len x batch_size x input_dim]
        # step 3. forward pass our embed through our rnn layers, make sure to pass in hidden as well
        rnn_out, hidden = self.rnn(embed, h) # compute the rnn output
        # step 4: using our rnn output, pass it through the i2o(input to output) linear layer (remember to reshape to 2D)
        # and get the non-normalized output prediction
        prediction = self.h2o(rnn_out.view(seq_len,-1))
        # step 5: normalize our prediction by taking the log_softmax
        log_softmax = self.softmax(prediction)
        # return log softmax prediction and hidden
        return log_softmax, hidden
    
    def init_hidden(self):
        return torch.zeros(self._n_layers, 1, self._hidden_size)
    
    
def logprob_to_words(logprob, vocab):
    """given a sequence of logprobs from a network and a vocabulary, turn the logprob into words
    
    """
    seq_len = logprob.shape[0]
    max_val, max_idx = logprob.max(dim=1)
    txt = ''
    for item in max_idx:
        txt+=(idx_to_word[int(item)])
    
    return txt
        
        

            
    

In [5]:
# practice tests

# step 1: create our network and optimizer
net = RNN(100, 100, 100)
net.cuda()
optim = torch.optim.Adam(net.parameters(),lr=1e-4)
# step 2: create a training batch of data, size 101, format this data and convert it to pytorch long tensors
dat = get_batch(train_data,100)
dat = torch.LongTensor([vocab.find(item) for item in dat])
# step 3: convert our dat into input/output
x_t = dat[:-1].cuda()
y_t = dat[1:].cuda()
# step 4: initialize hidden state and forward pass
ho = net.init_hidden().cuda()
log_prob, hidden = net.forward(x_t, ho)
# step 5: import our loss and compute the loss
loss_func = nn.CrossEntropyLoss()
loss = -loss_func(log_prob, y_t)


RuntimeError: cuda runtime error (77) : an illegal memory access was encountered at /opt/conda/conda-bld/pytorch_1524585239153/work/aten/src/THC/generic/THCTensorCopy.c:20

## Training Loop

In [68]:
epochs = 5000
seq_batch_size = 150
print_yes = 100
loss_func = torch.nn.functional.nll_loss

# step 1: create our network and optimizer
net = RNN(100, 100, 100)
optim = torch.optim.Adam(net.parameters(),lr=5e-4)

# lets see if we can overfit
dada = get_batch(train_data, 100)
dada = torch.LongTensor([vocab.find(item) for item in dada])
x_dada = dada[:-1]
y_data = dada[1:]

# main training loop:
for epoch in range(epochs):
    dat = dat = get_batch(train_data,seq_batch_size)
    dat = torch.LongTensor([vocab.find(item) for item in dat])
    # pull x and y
    x_t = dat[:-1]
    y_t = dat[1:]
    # initialize hidden state and forward pass
    hidden = net.init_hidden()
    logprob, hidden = net.forward(x_t, hidden)
    loss = loss_func(logprob, y_t)
    # update
    optim.zero_grad()
    loss.backward()
    optim.step()
    # print the loss for every kth iteration
    if epoch % print_yes == 0:
        print('*'*100)
        print('\n epoch {}, loss:{} \n'.format(epoch, loss))
        print('sample speech:\n', logprob_to_words(logprob, vocab))
    


****************************************************************************************************

 epoch 0, loss:5.056777000427246 

sample speech:
 L&T38~3TUucP)>U4ia55k)s[F8Zk.q9OdPHQdYP)>UqSn4	nyULL?3,,T?x#PH, H^>~,6WR>B9p,HF>U]P^UH`LUP6]zPT?x#Pwld4'g%Z,Z7aauuakTn"9WO.S~L5UG. 9KdLUGG
****************************************************************************************************

 epoch 100, loss:4.560878276824951 

sample speech:
 d.buubhe bolng ngg
""

u.whhuwon  wnQ  .ba
bn..nenybnhwhe bourrr.barr+er wa
bongrban "er.
bulnd Swugwhe bhen.  woi. wolon  e bonerbouzgwzlthwnE  .""er
****************************************************************************************************

 epoch 200, loss:4.3668437004089355 

sample speech:
 r,erg.Ihu""ug
I 
Iourbhndwxd$u Inhbngborver Iaryywnd.Iouu "I 
Iourbong.Hour"hndgu  Ihne baobn ener Iaobeu  Ind IoxverbnhIhubougeborv.yn,.."rgdy
"I Ine
***********************************************************************************

****************************************************************************************************

 epoch 2700, loss:3.624995231628418 

sample speech:
  ur,brxp,d'

aid!ber

"Qy Krxp.d"
IherKaz?2uxp?d,thzn-ezhd-"
"Qos,"Htwas Khozreng-bfey Kavl-1nd2iug:bnauth-br.
Hor?y-wuze!, bhi K0s Knqaxplx;wf tour b
****************************************************************************************************

 epoch 2800, loss:4.041996479034424 

sample speech:
 hepk-Hir Cuizh-Holpzgngdfxssw0s Ihet Lnl.eugh;Mirq0ud!&hme!&oyk-Homrh Mhck Mn Iirqav bhrzkyd.Sh-Lh-,,3n "Mirqav bfeLovnh.r.w8mlyck2ow:2ovk--zd"
"Yolee
****************************************************************************************************

 epoch 2900, loss:3.864773750305176 

sample speech:
 h-bexuyzk.3n."Mn?wn qflezus:&het IirKav
bixt;&or !;nht,&ow,whetkLor ed,y.""hich
In
qnqoxt-zge.&8opk.&f Llq0xk!d?ng-Sothre."Tnthe Kfher,3avd,"MirKav
q
*******************************************************************************

KeyboardInterrupt: 

In [24]:
# test
test_batch = get_batch(train_data, 500)
test_batch =torch.LongTensor([vocab.find(item) for item in test_batch])

x_test = test_batch[:-1]
y_test = test_batch[1:]

test_hid = net.init_hidden()

test_pred, _ = net.forward(x_test, test_hid)

print(test_pred)
print(y_test)

tensor([[ -6.1075,  -6.1198,  -6.2239,  ...,  -6.1796,  -6.2226,
          -6.3669],
        [ -6.0468,  -6.2410,  -6.0143,  ...,  -6.6546,  -6.3249,
          -6.3432],
        [ -6.3289,  -6.1807,  -6.2836,  ...,  -5.9856,  -6.4997,
          -6.5055],
        ...,
        [ -6.5863,  -6.1425,  -6.3313,  ...,  -6.8871,  -6.0193,
          -6.0095],
        [ -5.8683,  -6.2851,  -6.3315,  ...,  -6.6275,  -6.6342,
          -6.0157],
        [ -5.7948,  -6.1305,  -6.5514,  ...,  -6.1083,  -6.3550,
          -6.2403]])
tensor([ 21,  14,  94,  10,  23,  13,  94,  22,  30,  12,  17,  94,
         21,  14,  28,  28,  94,  10,  22,  18,  10,  11,  21,  14,
         75,  94,  55,  17,  14,  34,  94,  10,  27,  14,  94,  10,
         94,  22,  24,  28,  29,  96,  30,  23,  25,  21,  14,  10,
         28,  10,  23,  29,  94,  12,  24,  30,  25,  21,  14,  73,
         94,  11,  30,  29,  94,  15,  24,  27,  29,  30,  23,  10,
         29,  14,  21,  34,  94,  44,  94,  28,  25,  14,  23,  13,


In [57]:
word_to_idx = {}
idx_to_word = {}
for idx,word in enumerate(vocab):
    word_to_idx[word] = idx
    idx_to_word[idx] = word

In [60]:
def logprob_to_words(logprob, vocab):
    """given a sequence of logprobs from a network and a vocabulary, turn the logprob into words
    
    """
    seq_len = logprob.shape[0]
    max_val, max_idx = logprob.max(dim=1)
    txt = ''
    for item in max_idx:
        txt+=(idx_to_word[int(item)])
    
    return txt


logprob_to_words(test_pred, vocab)


'\ny,Hnd.IapkeHixt.Mnynnly,"Ther wnddwnwau.,"pdey-v,kd,Moul!y, IulhHor[,rgk,ry.I waoxg.Hau.,Mf Ma whn..wngthe joteed,.wnd.Ia wf,,Iouue"Iaich InddMox-hMhuwxvkeMfher,Hngtfd,"ouegd,Hf Mherjulnl.ng."""IWukMhiumov .Mn!.r.Ha wndenekyHn MherjuueexeHuxdkir,Ma wic\rdwiv "ere.wupnd,.Ifdthe jhec,, I#oe"Tophk ,y,Hovydwou,,Iupt,Mn!.r."uexve-v ,Mnd.Iiic er,d,Ioued,eng Mhuwax,was ekd,"""IYWe Iox. \nIavn.Hax Ihreggg Mhuwax IUaxwnddwexe.wapkeHfeechd.Mhuwour Irnt.Mold,r."Ior[bovlygg MngtisheIfl biic..Moumovkbn whuwop'

In [53]:
kk = torch.LongTensor([5,4,3,2])
int(kk.data[0])

5

In [None]:
rnn = nn.GRU(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
output, hn = rnn(input, h0)

In [None]:
embed = nn.Embedding(100,20)

out = embed(torch.LongTensor([2, 5, 0, 99, 34]))
out.shape

In [None]:
seq = torch.LongTensor([2,5,4,3,1])
seq

In [None]:
out = embed(torch.rand(100,100).long())

out.shape

nn.Embedding?

In [None]:
g = 'hello tofuboi'

In [None]:
for i in range(len(g)):
    print(g[i])

In [None]:
for char in g:
    print(char)

In [None]:
for idx, char in enumerate(g):
    print(idx,char)