In [1]:
import numpy as np

class Tensor (object):
    def __init__(self,data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 index_select_indices=None,
                 id=None):  # ID to uniquely identify the Tensor

        self.data = np.array(data)
        self.creators = creators
        self.creation_op = creation_op
        self.grad = None
        self.autograd = autograd
        self.children = {} # To keep track of Tensor's child tensors

        # Generate unique integer as an ID to the Tensor
        if(id is None):
            id = np.random.randint(0,100000)
        self.id = id

        # When creating a tensor, if creators are available, 
        #       we modify the creator children dictionary to keep track of newly created child.
        if(creators is not None):
            for c in creators:
                if(self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1

    # Helper function to check whether all gradients have backpropagated from the child Tensors.
    def all_children_grads_accounted_for(self):
        for id, cnt in self.children.items():
            if(cnt != 0):
                return False
        return True


    def backward(self, grad=None, grad_origin=None):
        if(self.autograd):
            if(grad_origin is not None):
                if(self.children[grad_origin.id] == 0):
                    raise Exception("cannot backprop more than once")
                else:
                    self.children[grad_origin.id] -= 1

            # Accumulate gradients from all the paths and add them up
            if(self.grad is None):
                self.grad = grad
            else:
                self.grad += grad

            if(self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None)):
                if(self.creation_op == "add"):
                    self.creators[0].backward(self.grad, self)
                    self.creators[1].backward(self.grad, self)

                if(self.creation_op == "neg"):
                    # Taking the negation of the gradient tensor
                    self.creators[0].backward(self.grad.__neg__())

                if(self.creation_op == "sub"):
                    new = Tensor(self.grad.data)
                    self.creators[0].backward(new, self)
                    new = Tensor(self.grad.__neg__().data)
                    self.creators[1].backward(new, self)

                if(self.creation_op == "mul"):
                    new = self.grad * self.creators[1]
                    self.creators[0].backward(new , self)
                    new = self.grad * self.creators[0]
                    self.creators[1].backward(new, self)

                if(self.creation_op == "mm"):
                    act = self.creators[0]
                    weights = self.creators[1]

                    # This isequivalent to --> layer_1_delta=layer_2_delta.dot(weights_1_2.T) part
                    new = self.grad.mm(weights.transpose()) 
                    act.backward(new)
                    new = self.grad.transpose().mm(act).transpose()
                    weights.backward(new)

                if(self.creation_op == "transpose"):
                    self.creators[0].backward(self.grad.transpose())

                if("sum" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    ds = self.creators[0].data.shape[dim]
                    self.creators[0].backward(self.grad.expand(dim,ds))

                if("expand" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.sum(dim))

                if(self.creation_op == "sigmoid"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (self * (ones - self)))

                if(self.creation_op == "tanh"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (ones - (self * self)))

                if(self.creation_op == "index_select"):
                    new_grad = np.zeros_like(self.creators[0].data)
                    indices_ = self.index_select_indices.data.flatten()
                    grad_ = self.grad.data.reshape(len(indices_), -1)
                    
                    # Adding up gradient according to the indices
                    for i in range(len(indices_)):
                        new_grad[indices_[i]] += grad_[i]

                    self.creators[0].backward(Tensor(new_grad))
                
                if(self.creation_op == "cross_entropy"):
                    dx = self.softmax_output - self.target_dist
                    self.creators[0].backward(Tensor(dx))


    def __add__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(  self.data + other.data,
                            autograd=True,
                            creators=[self,other],
                            creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        if(self.autograd):
            return Tensor(  self.data * -1,
                            autograd=True,
                            creators=[self],
                            creation_op="neg")
        return Tensor(self.data * -1)

    def __sub__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(  self.data - other.data,
                            autograd=True,
                            creators=[self,other],
                            creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(  self.data * other.data,
                            autograd=True,
                            creators=[self,other],
                            creation_op="mul")
        return Tensor(self.data * other.data)

    def sum(self, dim):
        if(self.autograd):
            return Tensor(  self.data.sum(dim), # Getting the sum over desired dimension
                            autograd=True,
                            creators=[self],
                            creation_op="sum_" + str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim, copies):
        trans_cmd = list(range(0, len(self.data.shape)))
        trans_cmd.insert(dim, len(self.data.shape))

        new_shape = list(self.data.shape) + [copies]

        new_data = self.data.repeat(copies).reshape(new_shape)
        new_data = new_data.transpose(trans_cmd)

        if(self.autograd):
            return Tensor(  new_data,
                            autograd=True,
                            creators=[self],
                            creation_op="expand_"+str(dim))
        return Tensor(new_data)

    def transpose(self):
        if(self.autograd):
            return Tensor(  self.data.transpose(),
                            autograd=True,
                            creators=[self],
                            creation_op="transpose")
        return Tensor(self.data.transpose())

    def mm(self, x):
        if(self.autograd):
            return Tensor(  self.data.dot(x.data),
                            autograd=True,
                            creators=[self,x],
                            creation_op="mm")
        return Tensor(self.data.dot(x.data))

    # Non Linearity functions
    def sigmoid(self):
        if(self.autograd):
            return Tensor(  1 / (1 + np.exp(-self.data)),
                            autograd=True,
                            creators=[self],
                            creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        if(self.autograd):
            return Tensor(  np.tanh(self.data),
                            autograd=True,
                            creators=[self],
                            creation_op="tanh")
        return Tensor(np.tanh(self.data))

    # Embedding Layer support (forward propagation)
    def index_select(self, indices):
        if(self.autograd):
            new = Tensor(self.data[indices.data],
                         autograd=True,
                         creators=[self],
                         creation_op="index_select")
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])

    def cross_entropy(self, target_indices):

        # Similar to Tensorflow `from_logits=True` parameter in cross_entropy
        temp = np.exp(self.data)
        softmax_output = temp / np.sum( temp,
                                        axis=len(self.data.shape)-1,
                                        keepdims=True)

        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t),-1)

        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * (target_dist)).sum(1).mean()

        if(self.autograd):
            out = Tensor(   loss,
                            autograd=True,
                            creators=[self],
                            creation_op="cross_entropy")
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out
        return Tensor(loss)

    def __repr__(self):
        return str(self.data.__repr__())
    def __str__(self):
        return str(self.data.__str__())

### Below is a implementation of Stochastic Gradient Descent Class

In [2]:
class SGD(object):
    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha

    # To Make everything zero/ reset
    def zero(self):
        for p in self.parameters:
            p.grad.data *= 0
    
    # Gradient Descent Step
    def step(self, zero=True):
        for p in self.parameters:
            p.data -= p.grad.data * self.alpha
            if(zero):
                p.grad.data *= 0

In [3]:
import numpy
np.random.seed(0)

data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

w = list()
w.append(Tensor(np.random.rand(2,3), autograd=True))
w.append(Tensor(np.random.rand(3,1), autograd=True))

optim = SGD(parameters=w, alpha=0.1)

for i in range(10):
    pred = data.mm(w[0]).mm(w[1])
    loss = ((pred - target)*(pred - target)).sum(0)
    
    print(loss)
    loss.backward(Tensor(np.ones_like(loss.data)))
    optim.step()

[0.58128304]
[0.48988149]
[0.41375111]
[0.34489412]
[0.28210124]
[0.2254484]
[0.17538853]
[0.1324231]
[0.09682769]
[0.06849361]


### In most DL frameworks Layers are some kind of predefined way of forward propagating data with a high level function like `forward` to do the propagation. Below is a sample implementation of such layer class.

In [4]:
class Layer(object):
    def __init__(self):
        self.parameters = list()

    def get_parameters(self):
        return self.parameters

class Linear(Layer):
    def __init__(self, n_inputs, n_outputs):
        super().__init__()

        # Placeholders for the Weights and Bias parameters in a Linear Layer (Dense layer)
        W = np.random.randn(n_inputs, n_outputs)*np.sqrt(2.0/(n_inputs))
        self.weight = Tensor(W, autograd=True)
        self.bias = Tensor(np.zeros(n_outputs), autograd=True)

        self.parameters.append(self.weight)
        self.parameters.append(self.bias)

    def forward(self, input):
        return input.mm(self.weight) + self.bias.expand(0, len(input.data))

In [5]:
class Sequential(Layer):
    def __init__(self, layers=list()):
        super().__init__()
        self.layers = layers

    def add(self, layer):
        self.layers.append(layer)
    
    def forward(self, input):
        for layer in self.layers:
            input = layer.forward(input)
        return input
    
    def get_parameters(self):
        params = list()
        for l in self.layers:
            params += l.get_parameters()
        return params

### Below include a imlpementation of a sequential model using above defined classes.

In [6]:
data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

model = Sequential([Linear(2,3), Linear(3,1)])
optim = SGD(parameters=model.get_parameters(), alpha=0.05)

for i in range(10):
    pred = model.forward(data)

    loss = ((pred - target)*(pred - target)).sum(0)
    
    loss.backward(Tensor(np.ones_like(loss.data)))
    optim.step()
    print(loss)

[2.6865042]
[11.271212]
[38.09060898]
[9.34796148]
[2.28355956]
[0.93643926]
[0.61224936]
[0.43932735]
[0.32808561]
[0.24902135]


In [7]:
class MSELoss(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, pred, target):
        return ((pred - target)*(pred - target)).sum(0)

### Same as above except, now we use a new class object `MSELoss` to calculate the error between the prediction and target values.

In [8]:
import numpy
np.random.seed(0)

data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

model = Sequential([Linear(2,3), Linear(3,1)])

criterion = MSELoss()
optim = SGD(parameters=model.get_parameters(), alpha=0.05)

for i in range(10):
    pred = model.forward(data)

    # Calculate the error amount
    loss = criterion.forward(pred, target)

    # Calculate the gradients based on the operations done.
    loss.backward(Tensor(np.ones_like(loss.data)))
    
    # Backpropagating the loss gradient
    optim.step()
    print(loss)

[2.33428272]
[0.06743796]
[0.0521849]
[0.04079507]
[0.03184365]
[0.02479336]
[0.01925443]
[0.01491699]
[0.01153118]
[0.00889602]


In [9]:
class Tanh(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.tanh()

class Sigmoid(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.sigmoid()


In [10]:
import numpy
np.random.seed(0)

data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

model = Sequential([Linear(2,3), Tanh(), Linear(3,1), Sigmoid()])

criterion = MSELoss()
optim = SGD(parameters=model.get_parameters(), alpha=1)

for i in range(10):
    pred = model.forward(data)
    loss = criterion.forward(pred, target)
    loss.backward(Tensor(np.ones_like(loss.data)))
    optim.step()
    print(loss)

[1.06372865]
[0.75148144]
[0.57384259]
[0.39574294]
[0.2482279]
[0.15515294]
[0.10423398]
[0.07571169]
[0.05837623]
[0.04700013]


### This is an implementation of Embedding layer with indices support, To make sure everything work had to add few modifications to the Tensor class including new property `index_select_indices` and backpropagation support for the indices

In [11]:
class Embedding(Layer):

    def __init__(self, vocab_size, dim):
        super().__init__()

        self.vocab_size = vocab_size
        self.dim = dim

        weight = (np.random.rand(vocab_size, dim) - 0.5)/dim
        self.weight = Tensor(weight, autograd=True)

        self.parameters.append(self.weight)
    
    def forward(self, input):
        return self.weight.index_select(input)


In [12]:
np.random.seed(0)

data = Tensor(np.array([1,2,1,2]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

embed = Embedding(5,3)
model = Sequential([embed, Tanh(), Linear(3,1), Sigmoid()])

criterion = MSELoss()
optim = SGD(parameters=model.get_parameters(), alpha=0.5)

for i in range(10):
    pred = model.forward(data)
    loss = criterion.forward(pred, target)
    loss.backward(Tensor(np.ones_like(loss.data)))
    optim.step()
    print(loss)

[0.98874126]
[0.6658868]
[0.45639889]
[0.31608168]
[0.2260925]
[0.16877423]
[0.13120515]
[0.10555487]
[0.08731868]
[0.07387834]


In [13]:
class CrossEntropyLoss(object):
    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        return input.cross_entropy(target)

In [14]:
import numpy
np.random.seed(0)
# data indices
data = Tensor(np.array([1,2,1,2]), autograd=True)
# target indices
target = Tensor(np.array([0,1,0,1]), autograd=True)

model = Sequential([Embedding(3,3), Tanh(), Linear(3,4)])
criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters(), alpha=0.1)

for i in range(10):
    pred = model.forward(data)

    loss = criterion.forward(pred, target)
    loss.backward(Tensor(np.ones_like(loss.data)))
    
    optim.step()
    print(loss)

1.3885032434928422
0.9558181509266036
0.6823083585795604
0.509525996749312
0.39574491472895856
0.31752527285348264
0.2617222861964216
0.22061283923954225
0.18946427334830068
0.16527389263866668


In [15]:
class RNNCell(Layer):
    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        super().__init__()
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output

        if(activation == 'sigmoid'):
            self.activation = Sigmoid()
        elif(activation == 'tanh'):
            self.activation == Tanh()
        else:
            raise Exception("Non-linearity not found")

        self.w_ih = Linear(n_inputs, n_hidden)  # Input to Hidden --> dim: Embed size
        self.w_hh = Linear(n_hidden, n_hidden)  # Hidden to Hidden
        self.w_ho = Linear(n_hidden, n_output)  # Hidden to Output --> dim: defined size

        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()

    def forward(self, input, hidden):
        from_prev_hidden = self.w_hh.forward(hidden)
        combined = self.w_ih.forward(input) + from_prev_hidden
        new_hidden = self.activation.forward(combined)
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden

    def init_hidden(self, batch_size=1):
        return Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)

### Preparing the data for RNN training for word prediction.

In [50]:
import sys, random, math
from collections import Counter
import numpy as np

f = open('./datasets/tasksv11/en/qa1_single-supporting-fact_train.txt','r')
raw = f.readlines()
f.close()

tokens = list()
for line in raw[0:1000]:
    temp = []
    for i in line.lower().replace("\n","").replace("\t", " ").split(" ")[:]:
        if(i != "" and not i.isdigit()):
            temp.append(i)

    tokens.append(temp)

# Make all the sentences to the same length if they are not enough by adding "-" symbol to the front.
new_tokens = list()
for line in tokens:
    new_tokens.append(['-'] * (6 - len(line)) + line)
tokens = new_tokens

# Creating the vocabulary with unique words in the dataset
vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)
vocab = list(vocab)

# make word to index mapping
word2index = {}
for i, word in enumerate(vocab):
    word2index[word]=i

# Function to turn sentence to list of indices
def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])

    return idx

# Convert all the sentences to list of indices
indices = list()
for line in tokens:
    idx = list()
    for w in line:
        idx.append(word2index[w])
    indices.append(idx)

data = np.array(indices)

### Building the RNN model with an Embedding layer

In [57]:
embed = Embedding(vocab_size=len(vocab), dim=16)
model = RNNCell(n_inputs=16, n_hidden=16, n_output=len(vocab))

criterion = CrossEntropyLoss()

params = model.get_parameters() + embed.get_parameters()

optim = SGD(parameters=params, alpha=0.05)

In [62]:
batch_size = 100

for iter in range(1000):
    total_loss = 0
    hidden = model.init_hidden(batch_size=batch_size)

    # RNN Cell inside training (5 training parts per sequence)
    for t in range(5):
        input = Tensor(data[0:batch_size, t], autograd=True)
        rnn_input = embed.forward(input=input)

        output, hidden = model.forward(input=rnn_input, hidden=hidden)

    target = Tensor(data[0:batch_size,t+1], autograd=True)
    loss = criterion.forward(output, target)

    loss.backward()
    optim.step()

    total_loss += loss.data

    if(iter % 200 == 0):
        p_correct = (target.data == np.argmax(output.data,axis=1)).mean()
        print_loss = total_loss / (len(data)/batch_size)
        print("Loss:",print_loss,"% Correct:",p_correct)

Loss: 0.3162421422842724 % Correct: 0.0
Loss: 0.16556653764166424 % Correct: 0.26
Loss: 0.1612976424004052 % Correct: 0.32
Loss: 0.15528054860427262 % Correct: 0.31
Loss: 0.14431556676685914 % Correct: 0.38


### Prediction

In [63]:
batch_size = 1
hidden = model.init_hidden(batch_size=batch_size)

for t in range(5):
    input = Tensor(data[0:batch_size, t], autograd=False)
    rnn_input = embed.forward(input=input)
    output, hidden = model.forward(input=rnn_input, hidden=hidden)

target = Tensor(data[0:batch_size,t+1], autograd=False)
loss = criterion.forward(output, target)

ctx = ""
for idx in data[0:batch_size][0][0:-1]:
    ctx += vocab[idx] + " "

print("Context:",ctx)
print("Pred:", vocab[output.data.argmax()])

Context: - mary moved to the 
Pred: office.
