In [24]:
import numpy as np

In [25]:
class Layer(object):
    
    def __init__(self):
        self.parameters = list()
        
    def get_parameters(self):
        return self.parameters

In [None]:
class SGD(object):
    
    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha
    
    def zero(self):
        for p in self.parameters:
            p.grad.data *= 0
        
    def step(self, zero=True):
        
        for p in self.parameters:
            
            p.data -= p.grad.data * self.alpha
            
            if(zero):
                p.grad.data *= 0

In [None]:

    
class Tanh(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, input):
        return input.tanh()
    
class Sigmoid(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, input):
        return input.sigmoid()





class Tanh(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, input):
        return input.tanh()


class Sigmoid(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, input):
        return input.sigmoid()
    

class CrossEntropyLoss(object):
    
    def __init__(self):
        super().__init__()
    
    def forward(self, input, target):
        return input.cross_entropy(target)



In [26]:
class Tensor (object):
    
    def __init__(self,data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):
        
        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None
        if(id is None):
            self.id = np.random.randint(0,100000)
        else:
            self.id = id
        
        self.creators = creators
        self.creation_op = creation_op
        self.children = {}
        
        if(creators is not None):
            for c in creators:
                if(self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1

    def all_children_grads_accounted_for(self):
        for id,cnt in self.children.items():
            if(cnt != 0):
                return False
        return True 
        
    def backward(self,grad=None, grad_origin=None):
        if(self.autograd):
 
            if(grad is None):
                grad = Tensor(np.ones_like(self.data))

            if(grad_origin is not None):
                if(self.children[grad_origin.id] == 0):
                    raise Exception("cannot backprop more than once")
                else:
                    self.children[grad_origin.id] -= 1

            if(self.grad is None):
                self.grad = grad
            else:
                self.grad += grad
            
            # grads must not have grads of their own
            assert grad.autograd == False
            
            # only continue backpropping if there's something to
            # backprop into and if all gradients (from children)
            # are accounted for override waiting for children if
            # "backprop" was called on this variable directly
            if(self.creators is not None and 
               (self.all_children_grads_accounted_for() or 
                grad_origin is None)):

                if(self.creation_op == "add"):
                    self.creators[0].backward(self.grad, self)
                    self.creators[1].backward(self.grad, self)
                    
                if(self.creation_op == "sub"):
                    self.creators[0].backward(Tensor(self.grad.data), self)
                    self.creators[1].backward(Tensor(self.grad.__neg__().data), self)

                if(self.creation_op == "mul"):
                    new = self.grad * self.creators[1]
                    self.creators[0].backward(new , self)
                    new = self.grad * self.creators[0]
                    self.creators[1].backward(new, self)                    
                    
                if(self.creation_op == "mm"):
                    c0 = self.creators[0]
                    c1 = self.creators[1]
                    new = self.grad.mm(c1.transpose())
                    c0.backward(new)
                    new = self.grad.transpose().mm(c0).transpose()
                    c1.backward(new)
                    
                if(self.creation_op == "transpose"):
                    self.creators[0].backward(self.grad.transpose())

                if("sum" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.expand(dim,
                                                               self.creators[0].data.shape[dim]))

                if("expand" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.sum(dim))
                    
                if(self.creation_op == "neg"):
                    self.creators[0].backward(self.grad.__neg__())
                    
                if(self.creation_op == "sigmoid"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (self * (ones - self)))
                
                if(self.creation_op == "tanh"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (ones - (self * self)))
                
                if(self.creation_op == "index_select"):
                    new_grad = np.zeros_like(self.creators[0].data)
                    indices_ = self.index_select_indices.data.flatten()
                    grad_ = grad.data.reshape(len(indices_), -1)
                    for i in range(len(indices_)):
                        new_grad[indices_[i]] += grad_[i]
                    self.creators[0].backward(Tensor(new_grad))
                    
                if(self.creation_op == "cross_entropy"):
                    dx = self.softmax_output - self.target_dist
                    self.creators[0].backward(Tensor(dx))
                    
    def __add__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        if(self.autograd):
            return Tensor(self.data * -1,
                          autograd=True,
                          creators=[self],
                          creation_op="neg")
        return Tensor(self.data * -1)
    
    def __sub__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data - other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="sub")
        return Tensor(self.data - other.data)
    
    def __mul__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data * other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="mul")
        return Tensor(self.data * other.data)    

    def sum(self, dim):
        if(self.autograd):
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          creators=[self],
                          creation_op="sum_"+str(dim))
        return Tensor(self.data.sum(dim))
    
    def expand(self, dim,copies):

        trans_cmd = list(range(0,len(self.data.shape)))
        trans_cmd.insert(dim,len(self.data.shape))
        new_data = self.data.repeat(copies).reshape(list(self.data.shape) + [copies]).transpose(trans_cmd)
        
        if(self.autograd):
            return Tensor(new_data,
                          autograd=True,
                          creators=[self],
                          creation_op="expand_"+str(dim))
        return Tensor(new_data)
    
    def transpose(self):
        if(self.autograd):
            return Tensor(self.data.transpose(),
                          autograd=True,
                          creators=[self],
                          creation_op="transpose")
        
        return Tensor(self.data.transpose())
    
    def mm(self, x):
        if(self.autograd):
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          creators=[self,x],
                          creation_op="mm")
        return Tensor(self.data.dot(x.data))
    
    def sigmoid(self):
        if(self.autograd):
            return Tensor(1 / (1 + np.exp(-self.data)),
                          autograd=True,
                          creators=[self],
                          creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        if(self.autograd):
            return Tensor(np.tanh(self.data),
                          autograd=True,
                          creators=[self],
                          creation_op="tanh")
        return Tensor(np.tanh(self.data))
    
    def index_select(self, indices):

        if(self.autograd):
            new = Tensor(self.data[indices.data],
                         autograd=True,
                         creators=[self],
                         creation_op="index_select")
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])

    def softmax(self):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp,
                                       axis=len(self.data.shape)-1,
                                       keepdims=True)
        return softmax_output    
    
    def cross_entropy(self, target_indices):

        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp,
                                       axis=len(self.data.shape)-1,
                                       keepdims=True)
        
        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t),-1)
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * (target_dist)).sum(1).mean()
    
        if(self.autograd):
            out = Tensor(loss,
                         autograd=True,
                         creators=[self],
                         creation_op="cross_entropy")
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out

        return Tensor(loss)
        
    
    def __repr__(self):
        return str(self.data.__repr__())
    
    def __str__(self):
        return str(self.data.__str__())  

In [28]:
class Embedding(Layer):
    
    def __init__(self, vocab_size, dim):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.dim = dim
        
        # this random initialiation style is just a convention from word2vec
        self.weight = Tensor((np.random.rand(vocab_size, dim) - 0.5) / dim, autograd=True)
        
        self.parameters.append(self.weight)
    
    def forward(self, input):
        return self.weight.index_select(input)

In [27]:
class Linear(Layer):

    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        W = np.random.randn(n_inputs, n_outputs) * np.sqrt(2.0/(n_inputs))
        self.weight = Tensor(W, autograd=True)
        self.bias = Tensor(np.zeros(n_outputs), autograd=True)
        
        self.parameters.append(self.weight)
        self.parameters.append(self.bias)

    def forward(self, input):
        return input.mm(self.weight)+self.bias.expand(0,len(input.data))


class Sequential(Layer):
    
    def __init__(self, layers=list()):
        super().__init__()
        
        self.layers = layers
    
    def add(self, layer):
        self.layers.append(layer)
        
    def forward(self, input):
        for layer in self.layers:
            input = layer.forward(input)
        return input
    
    def get_parameters(self):
        params = list()
        for l in self.layers:
            params += l.get_parameters()
        return params

In [13]:
def generate_sample(n=30, init_char=' '):
    s = ""
    hidden = model.init_hidden(batch_size=1)
    input = Tensor(np.array([word2index[init_char]]))
    for i in range(n):
        rnn_input = embed.forward(input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)
        output.data *= 10
        temp_dist = output.softmax()
        temp_dist /= temp_dist.sum()

        m = (temp_dist > np.random.rand()).argmax()
#         m = output.data.argmax()
        c = vocab[m]
        input = Tensor(np.array([m]))
        s += c
    return s

In [14]:

class RNNCell(Layer):
    
    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        super().__init__()

        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output
        
        if(activation == 'sigmoid'):
            self.activation = Sigmoid()
        elif(activation == 'tanh'):
            self.activation == Tanh()
        else:
            raise Exception("Non-linearity not found")

        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)
        
        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()        
    
    def forward(self, input, hidden):
        from_prev_hidden = self.w_hh.forward(hidden)
        combined = self.w_ih.forward(input) + from_prev_hidden
        new_hidden = self.activation.forward(combined)
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden
    
    def init_hidden(self, batch_size=1):
        return Tensor(np.zeros((batch_size,self.n_hidden)), autograd=True)

In [15]:
import sys,random,math
from collections import Counter
import numpy as np
import sys

np.random.seed(0)
# dataset from http://karpathy.github.io/2015/05/21/rnn-effectiveness/
f = open('shakespear.txt','r')
raw = f.read()
f.close()

vocab = list(set(raw))
word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i
indices = np.array(list(map(lambda x:word2index[x], raw)))

embed = Embedding(vocab_size=len(vocab),dim=512)
model = RNNCell(n_inputs=512, n_hidden=512, n_output=len(vocab))

criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

batch_size = 32
bptt = 16
n_batches = int((indices.shape[0] / (batch_size)))

trimmed_indices = indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches).transpose()

input_batched_indices = batched_indices[0:-1]
target_batched_indices = batched_indices[1:]

n_bptt = int(((n_batches-1) / bptt))
input_batches = input_batched_indices[:n_bptt*bptt].reshape(n_bptt,bptt,batch_size)
target_batches = target_batched_indices[:n_bptt*bptt].reshape(n_bptt, bptt, batch_size)

In [16]:
raw[0:5]

'That,'

In [17]:
indices[0:5]

array([30,  3, 17, 56, 57])

In [18]:
batched_indices[0:5]

array([[30, 13, 47, 58, 45,  3, 30, 35, 35, 17, 17, 36, 45, 35, 57, 14,
        35, 22,  3, 47, 36, 17, 35, 35, 35,  6, 35,  6,  6, 35, 35, 39],
       [ 3, 14, 14, 58, 11, 36, 17,  8, 22, 39, 28, 36, 28, 56, 35, 14,
        37, 45, 45, 14, 28, 27, 37, 48,  3, 39, 10,  5, 39,  5, 46, 35],
       [17, 14, 44, 36, 36, 17, 28, 60, 45, 35, 28, 47, 48,  3, 15, 19,
        36,  8, 28, 14, 48, 36, 38, 36, 36, 23, 45, 57, 35, 45, 45, 37],
       [56, 32, 30, 38, 35, 38, 37, 35,  8, 39, 35, 35, 13, 36, 36, 40,
        36, 57, 22, 53, 60, 35, 45, 36, 38, 28,  8, 14, 34, 38,  5, 45],
       [57, 40,  6, 44, 15, 35, 45, 38, 35, 45, 15, 55, 14, 35, 35,  2,
        39, 35, 35, 45, 13, 22, 15, 11, 57, 17, 28, 40,  8, 36, 36, 22]])

In [19]:

input_batches[0][0:5]

array([[30, 13, 47, 58, 45,  3, 30, 35, 35, 17, 17, 36, 45, 35, 57, 14,
        35, 22,  3, 47, 36, 17, 35, 35, 35,  6, 35,  6,  6, 35, 35, 39],
       [ 3, 14, 14, 58, 11, 36, 17,  8, 22, 39, 28, 36, 28, 56, 35, 14,
        37, 45, 45, 14, 28, 27, 37, 48,  3, 39, 10,  5, 39,  5, 46, 35],
       [17, 14, 44, 36, 36, 17, 28, 60, 45, 35, 28, 47, 48,  3, 15, 19,
        36,  8, 28, 14, 48, 36, 38, 36, 36, 23, 45, 57, 35, 45, 45, 37],
       [56, 32, 30, 38, 35, 38, 37, 35,  8, 39, 35, 35, 13, 36, 36, 40,
        36, 57, 22, 53, 60, 35, 45, 36, 38, 28,  8, 14, 34, 38,  5, 45],
       [57, 40,  6, 44, 15, 35, 45, 38, 35, 45, 15, 55, 14, 35, 35,  2,
        39, 35, 35, 45, 13, 22, 15, 11, 57, 17, 28, 40,  8, 36, 36, 22]])

In [20]:
target_batches[0][0:5]

array([[ 3, 14, 14, 58, 11, 36, 17,  8, 22, 39, 28, 36, 28, 56, 35, 14,
        37, 45, 45, 14, 28, 27, 37, 48,  3, 39, 10,  5, 39,  5, 46, 35],
       [17, 14, 44, 36, 36, 17, 28, 60, 45, 35, 28, 47, 48,  3, 15, 19,
        36,  8, 28, 14, 48, 36, 38, 36, 36, 23, 45, 57, 35, 45, 45, 37],
       [56, 32, 30, 38, 35, 38, 37, 35,  8, 39, 35, 35, 13, 36, 36, 40,
        36, 57, 22, 53, 60, 35, 45, 36, 38, 28,  8, 14, 34, 38,  5, 45],
       [57, 40,  6, 44, 15, 35, 45, 38, 35, 45, 15, 55, 14, 35, 35,  2,
        39, 35, 35, 45, 13, 22, 15, 11, 57, 17, 28, 40,  8, 36, 36, 22],
       [35, 41, 60, 48,  6, 36, 56, 36, 15, 35, 36,  3, 14, 48, 15,  0,
        35,  5, 37, 60, 14, 45, 35, 35, 35, 15, 48, 39, 58, 35, 57, 60]])

In [21]:
def train(iterations=10):
    for iter in range(iterations):
        total_loss = 0
        n_loss = 0

        hidden = model.init_hidden(batch_size=batch_size)
        for batch_i in range(len(input_batches)):

            hidden = Tensor(hidden.data, autograd=True)
            loss = None
            losses = list()
            for t in range(bptt):
                input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)

                target = Tensor(target_batches[batch_i][t], autograd=True)    
                batch_loss = criterion.forward(output, target)
                losses.append(batch_loss)
                if(t == 0):
                    loss = batch_loss
                else:
                    loss = loss + batch_loss
            for loss in losses:
                ""
            loss.backward()
            optim.step()
            total_loss += loss.data
            log = "\r Iter:" + str(iter)
            log += " - Batch "+str(batch_i+1)+"/"+str(len(input_batches))
            log += " - Loss:" + str(np.exp(total_loss / (batch_i+1)))
            if(batch_i == 0):
                log += " - " + generate_sample(n=70, init_char='\n').replace("\n"," ")
            if(batch_i % 10 == 0 or batch_i-1 == len(input_batches)):
                sys.stdout.write(log)
        optim.alpha *= 0.99
        print()

In [22]:
train()

 Iter:0 - Batch 191/195 - Loss:107.40643328648638 tNtttN t tNttN ttN t  NNt Ntt  N  t NN NNt tNtttt tttNttN Nt NtNttN  
 Iter:1 - Batch 191/195 - Loss:23.436273104177086 hethhthhthhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
 Iter:2 - Batch 191/195 - Loss:16.766544117819698 h thetN the the the the the the the the the the the the the the the th
 Iter:3 - Batch 191/195 - Loss:14.042128491419456dthithe the the the the the the the the the the the the the the the t
 Iter:4 - Batch 191/195 - Loss:12.590958509037547d the the the the the the the the the the the the the the the the the
 Iter:5 - Batch 191/195 - Loss:11.624646049372346hd the the the the the the the the the the the the the the the the the
 Iter:6 - Batch 191/195 - Loss:10.890022032749949hd with the the the the the the the the the the the the the the the th
 Iter:7 - Batch 191/195 - Loss:10.293872287268048d Not the the the the the the the the the the the the the the the the
 Iter:8 - Batch 191/195 - Loss:9.79772519

In [23]:
print(generate_sample(n=2000, init_char='\n'))

hishe Nom haNlath ham he haNl he he Nom hath hNaNlN ham ham hNam hNam ham he Nom hNam ham Nom hNam ham hNNENE:
The hNNE ham Nom hNNE ham ham Nom he hNom ham hNNE ham Nom Nom ham Nom hath haNl Nom he Nom hNNE ham Nom hNaNlN haNl hNNE ham haNl hNam haNlN ham he ham ham Nom hNNENE:
The hNaNlN haNl he Nom he hNoNd ham he Nom haNl he hNNE ham ham he ham ham he haNl Nom he Nom ham hNNE haNl Nom Nom hNam haNl he Nom hNam Nom he ham he Nom ham he haNl he Nom hNNE ham he he ham he hNaNlN haNl Nom haNlath hNNE ham Nom hNom ham hNNE haNl haNlaNl ham he Nom he Nom hNNE ham he hNam haNl hNam he hNNE haNl Nom he ham Nom hNNE ham Nom Nom haNl Nom hath Nom Nom he hNam hNam hNNE ham he haNl Nom he hNam he Nom ham he Nom hNamN ham Nom he hNNE haNl he ham hNNE ham Nom ham Nom he haNlN haNl Nom he ham he hNom he hNNE haNl Nom haNl Nom he ham hNNE ham he hNoNd ham he Nom ham he he hNoNd hNam Nom Nom hNNE ham hNNE ham he hNam he he Nom he hNNE ham ham he haNlath hNam haNl ham he ham he hNNE ham Nom ham Nom 