In [5]:
import numpy as np
import collections

### Zadatak 1: učitavanje podataka i batching

In [85]:
class TextDataset():
    
    def __init__(self, input_file, batch_size, sequence_length):
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        
        self.preprocess(input_file)
    
    def preprocess(self, input_file):
        with open(input_file, 'r') as f:
            data = f.read() # .decode('utf-8') <- python 2

        # count and sort most frequent characters
        char_counter = collections.Counter(data)
        self.sorted_chars = list(zip(*char_counter.most_common()))[0]

        # self.sorted_chars contains just the characters ordered descending by frequency
        self.char2id = dict(zip(self.sorted_chars, range(len(self.sorted_chars))))
        # reverse the mapping
        self.id2char = {k:v for v,k in self.char2id.items()}
        # convert the data to ids
        self.x = np.array(list(map(self.char2id.get, data)))

    def encode(self, sequence):
        # returns the sequence encoded as integers
        return np.array(list(map(self.char2id.get, sequence)))

    def decode(self, encoded_sequence):
        # returns the sequence decoded as letters
        return ''.join(list(map(self.id2char.get, encoded_sequence)))
    
    def create_minibatches(self):
        self.i = 0
        self.num_batches = int(len(self.x) / (self.batch_size * self.sequence_length)) # calculate the number of batches

        #######################################
        #       Convert data to batches       #
        #######################################

        num_elements = self.num_batches * self.batch_size * self.sequence_length
        
        crop_x = self.x[:num_elements]
        self.minibatches_x = crop_x.reshape(self.num_batches, self.batch_size, self.sequence_length)
        
        crop_y = self.x[1:num_elements + 1]
        self.minibatches_y = crop_y.reshape(self.num_batches, self.batch_size, self.sequence_length)
    
    def next_minibatch(self):
        # handling batch pointer & reset
        # new_epoch is a boolean indicating if the batch pointer was reset
        # in this function call
        
        batch_x, batch_y = self.minibatches_x[self.i], self.minibatches_y[self.i]
        new_epoch = self.i == 0
        
        self.i += 1
        if self.i == self.num_batches:
            self.i = 0
        
        return new_epoch, batch_x, batch_y

In [86]:
input_file = 'data/selected_conversations.txt'
batch_size = 100
sequence_length = 30

dataset = TextDataset(input_file, batch_size, sequence_length)

raw = 'Test123'
print('Raw:', raw)

encoded = dataset.encode(raw)
print('Encoded:', encoded)

decoded = dataset.decode(encoded)
print('Decoded:', decoded)

dataset.create_minibatches()
_, batch_x, batch_y = dataset.next_minibatch()
print('First two sequences of x:', dataset.decode(batch_x[0]), dataset.decode(batch_x[1]))
print('First two sequences of y:', dataset.decode(batch_y[0]), dataset.decode(batch_y[1]))

Raw: Test123
Encoded: [32  1  9  2 59 60 66]
Decoded: Test123
First two sequences of x: CORNELIUS:
I can't pretend to  be your husband... David's in 
First two sequences of y: ORNELIUS:
I can't pretend to b e your husband... David's in g


### Zadatak 2: obična jednoslojna povratna neuronska mreža

In [87]:
class RNN():
    
    def __init__(self, hidden_size, sequence_length, vocab_size, learning_rate):
        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.learning_rate = learning_rate
        
        def norm_init(size):
            loc = 1e-2
            scale = 1e-2
            return np.random.normal(loc=loc, scale=scale, size=size)

        self.U = norm_init((self.vocab_size, self.hidden_size)) # ... input projection
        self.W = norm_init((self.hidden_size, self.hidden_size)) # ... hidden-to-hidden projection
        self.b = np.zeros((self.hidden_size, 1)) # ... input bias

        self.V = norm_init((self.hidden_size, self.vocab_size)) # ... output projection
        self.c = np.zeros((self.vocab_size, 1)) # ... output bias

        # memory of past gradients - rolling sum of squares for Adagrad
        self.memory_U, self.memory_W, self.memory_V = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        self.memory_b, self.memory_c = np.zeros_like(self.b), np.zeros_like(self.c)
        
    def rnn_step_forward(self, x, h_prev, U, W, b):
        # A single time step forward of a recurrent neural network with a 
        # hyperbolic tangent nonlinearity.

        # x - input data (minibatch size x input dimension)
        # h_prev - previous hidden state (minibatch size x hidden size)
        # U - input projection matrix (input dimension x hidden size)
        # W - hidden to hidden projection matrix (hidden size x hidden size)
        # b - bias of shape (hidden size x 1)
        
        h = np.tanh(np.dot(h_prev, W) + np.dot(x, U) + b.transpose())
        cache = (h, W, h_prev, x)

        # return the new hidden state and a tuple of values needed for the backward step
        return h, cache


    def rnn_forward(self, x, h0, U, W, b):
        # Full unroll forward of the recurrent neural network with a 
        # hyperbolic tangent nonlinearity

        # x - input data for the whole time-series (minibatch size x sequence_length x input dimension)
        # h0 - initial hidden state (minibatch size x hidden size)
        # U - input projection matrix (input dimension x hidden size)
        # W - hidden to hidden projection matrix (hidden size x hidden size)
        # b - bias of shape (hidden size x 1)
        
        h, cache = [], []

        h_prev = h0
        for i in range(x.shape[1]):
            h_t, cache_t = self.rnn_step_forward(x[:,i], h_prev, U, W, b)

            h.append(h_t)
            cache.append(cache_t)
            
            h_prev = h_t

        # return the hidden states for the whole time series (T+1)
        # and a tuple of values needed for the backward step
        return np.stack(h, axis=1), cache
    
    def rnn_step_backward(self, grad_next, cache):
        # A single time step backward of a recurrent neural network with a 
        # hyperbolic tangent nonlinearity.

        # grad_next - upstream gradient of the loss with respect to the next hidden state and current output
        # cache - cached information from the forward pass
        
        h, W, h_prev, x = cache
        dtanh = 1 - h**2
        
        dh_prev = np.dot(grad_next * dtanh, W)
        dU = np.dot(x.transpose(), grad_next * dtanh)
        dW = np.dot(h_prev.transpose(), grad_next * dtanh)
        db = np.sum(grad_next * dtanh, axis=0).reshape(-1, 1)

        # compute and return gradients with respect to each parameter
        # HINT: you can use the chain rule to compute the derivative of the
        # hyperbolic tangent function and use it to compute the gradient
        # with respect to the remaining parameters
        return dh_prev, dU, dW, db


    def rnn_backward(self, dh, cache):
        # Full unroll forward of the recurrent neural network with a 
        # hyperbolic tangent nonlinearity

        dU = np.zeros(self.U.shape)
        dW = np.zeros(self.W.shape)
        db = np.zeros(self.b.shape)

        # compute and return gradients with respect to each parameter
        # for the whole time series.
        # Why are we not computing the gradient with respect to inputs (x)?
        
        dh_prev = 0
        for i in reversed(range(self.sequence_length)):
            grad_next = dh[:,i] + dh_prev
            
            dh_prev, dU_t, dW_t, db_t = self.rnn_step_backward(grad_next, cache[i])
            
            dU += dU_t
            dW += dW_t
            db += db_t

        return dU, dW, db
    
    def output(self, h, V, c):
        # Calculate the output probabilities of the network
        return np.dot(h, V) + c.transpose()

    def output_loss_and_grads(self, h, V, c, y):
        # Calculate the loss of the network for each of the outputs

        # h - hidden states of the network for each timestep. 
        #     the dimensionality of h is (batch size x sequence length x hidden size (the initial state is irrelevant for the output)
        # V - the output projection matrix of dimension hidden size x vocabulary size
        # c - the output bias of dimension vocabulary size x 1
        # y - the true class distribution - a one-hot vector of dimension 
        #     vocabulary size x 1 - you need to do this conversion prior to
        #     passing the argument. A fast way to create a one-hot vector from
        #     an id could be something like the following code:

        #   y[timestep] = np.zeros((vocabulary_size, 1))
        #   y[timestep][batch_y[timestep]] = 1

        #     where y might be a dictionary.

        batch_size = y.shape[0]
        
        loss = None
        dh = np.zeros(self.W.shape)
        dV = np.zeros(self.V.shape)
        dc = None
        
        # calculate the output (o) - unnormalized log probabilities of classes
        o = self.output(h, V, c)
        
        # calculate yhat - softmax of the output
        yhat = np.exp(o) / np.sum(np.exp(o), axis=2, keepdims=True)
        
        # calculate the cross-entropy loss
        loss = -np.sum(y * np.log(yhat)) / (batch_size * self.sequence_length)
        
        # calculate the derivative of the cross-entropy softmax loss with respect to the output (o)
        do = yhat - y
        
        # calculate the gradients with respect to the output parameters V and c
        for i in range(batch_size):
            dV += np.dot(h[i].transpose(), do[i])
        
        dc = np.sum(np.sum(do, axis=0) / batch_size, axis=0).reshape(-1, 1)
        
        # calculate the gradients with respect to the hidden layer h
        dh = np.dot(do, V.transpose())
        
        return loss, dh, dV, dc
    
    # The inputs to the function are just indicative since the variables are mostly present as class properties
    def update(self, dU, dW, db, dV, dc,
               U, W, b, V, c,
               memory_U, memory_W, memory_b, memory_V, memory_c):
        a_min = -5
        a_max = 5
        for param, dparam, mem in zip([U, W, b, V, c],
                                      [dU, dW, db, dV, dc],
                                      [memory_U, memory_W, memory_b, memory_V, memory_c]):
            dparam = np.clip(dparam, a_min, a_max)
            # update memory matrices
            mem += dparam * dparam
            # perform the Adagrad update of parameters
            param += -self.learning_rate * dparam / np.sqrt(mem + 1e-8)
    
    def step(self, h0, x_oh, y_oh):
        # Makes one forward and backward pass
        
        # h0 - initial hidden state (hidden_size x 1)
        # x_oh - one-hot input (minibatch_size x sequence_length x vocab_size) 
        # y_oh - one-hot output (minibatch_size x sequence_length x vocab_size)
        
        h, cache = self.rnn_forward(x_oh, h0, self.U, self.W, self.b)
        
        loss, dh, dV, dc = self.output_loss_and_grads(h, self.V, self.c, y_oh)
        
        dU, dW, db = self.rnn_backward(dh, cache)
        
        self.update(dU, dW, db, dV, dc,
                    self.U, self.W, self.b, self.V, self.c,
                    self.memory_U, self.memory_W, self.memory_b, self.memory_V, self.memory_c)
        
        # return loss and last hidden state
        return loss, h[:,-1]
    
    def sample(self, seed_onehot, n_sample):
        res = []
        
        h0 = np.zeros((1, self.hidden_size))
        # inicijalizirati h0 na vektor nula
        # seed string pretvoriti u one-hot reprezentaciju ulaza

        x_oh = seed_onehot.reshape(1, seed_onehot.shape[0], seed_onehot.shape[1])
        h, _ = self.rnn_forward(x_oh, h0, self.U, self.W, self.b)
        h0 = h[:,-1]
        
        o = self.output(h0, self.V, self.c)
        yhat = np.exp(o) / np.sum(np.exp(o))
        
        encoded = np.argmax(yhat)
        res.append(encoded)
        x_oh = np.zeros((1, 71))
        x_oh[0][encoded] = 1
        
        for i in range(n_sample - 1):
            h, _ = self.rnn_step_forward(x_oh, h0, self.U, self.W, self.b)
        
            o = self.output(h, self.V, self.c)
            yhat = np.exp(o) / np.sum(np.exp(o))

            encoded = np.argmax(yhat)
            res.append(encoded)
            x_oh = np.zeros((1, 71))
            x_oh[0][encoded] = 1
            
            h0 = h
        
        return res

In [88]:
def run_language_model(dataset, max_epochs, hidden_size=100, sequence_length=30, learning_rate=1e-1, sample_every=100):
    
    vocab_size = len(dataset.sorted_chars)
    rnn = RNN(hidden_size, sequence_length, vocab_size, learning_rate) # initialize the recurrent network

    current_epoch = 0 
    batch = 0

    h0 = np.zeros((hidden_size, 1))

    average_loss = 0

    while current_epoch < max_epochs: 
        e, x, y = dataset.next_minibatch()
        
        if e: 
            current_epoch += 1
            # h0 = np.zeros((hidden_size, 1))
            h0 = np.zeros((batch_size, hidden_size))
            # why do we reset the hidden state here?

        def to_one_hot(encoded, sequence_length, vocab_size):
            N = encoded.shape[0]
            oh = np.zeros((N, sequence_length, vocab_size))
            for i in range(N):
                oh[i][range(sequence_length), encoded[i]] = 1
            return oh
        
        # One-hot transform the x and y batches
        x_oh, y_oh = to_one_hot(x, sequence_length, vocab_size), to_one_hot(y, sequence_length, vocab_size)

        # Run the recurrent network on the current batch
        # Since we are using windows of a short length of characters,
        # the step function should return the hidden state at the end
        # of the unroll. You should then use that hidden state as the
        # input for the next minibatch. In this way, we artificially
        # preserve context between batches.
        loss, h0 = rnn.step(h0, x_oh, y_oh)

        if batch % sample_every == 0:
            print(batch, round(loss, 8))
            # run sampling (2.2)
            seed = 'HAN:\nIs that good or bad?\n\n'
            n_sample = 300
            
            def to_one_hot(encoded, vocab_size):
                N = encoded.shape[0]
                oh = np.zeros((N, vocab_size))
                #oh[range(N)][encoded[range(N)]] = 1
                for i in range(N):
                    oh[i][encoded[i]] = 1
                return oh
            
            seed_encoded = dataset.encode(seed)
            seed_onehot = to_one_hot(seed_encoded, vocab_size)
            
            encoded = rnn.sample(seed_onehot, n_sample)
            decoded = dataset.decode(encoded)
            print(decoded)
        
        batch += 1
        
    print('test:')
    seed = 'HAN:\nIs that good or bad?\n\n'
    n_sample = 300

    def to_one_hot(encoded, vocab_size):
        N = encoded.shape[0]
        oh = np.zeros((N, vocab_size))
        #oh[range(N)][encoded[range(N)]] = 1
        for i in range(N):
            oh[i][encoded[i]] = 1
        return oh

    seed_encoded = dataset.encode(seed)
    seed_onehot = to_one_hot(seed_encoded, vocab_size)

    encoded = rnn.sample(seed_onehot, n_sample)
    decoded = dataset.decode(encoded)
    print(decoded)

In [98]:
input_file = 'data/selected_conversations.txt'
batch_size = 100
sequence_length = 30

dataset = TextDataset(input_file, batch_size, sequence_length)
dataset.create_minibatches()

max_epochs = 10
hidden_size = 100
learning_rate = 5e-2
sample_every = 300

run_language_model(dataset, max_epochs, hidden_size, sequence_length, learning_rate, sample_every)

0 4.26320847
LILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILILI
300 2.22116085
MANERE:
What and at the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
600 2.17843483
DORE:
I we wan the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the t
900 2.16084504
MAVERE:
I wan the wan the wan the wan t