In [3]:
import numpy as np
import string
import tensorflow as tf

In [4]:
# Read the whole text file, lower-case it, and split it into one entry per
# '\n'-separated line. `data` ends up as a list of strings.
file_path = 'facebook-names.txt'
with open(file_path, 'r') as file:
    data = file.read().lower().split('\n')

In [5]:
def char_level_dict(data):
    """Build character-level lookup tables for a list of strings.

    Args:
        data: iterable of strings whose characters form the vocabulary.

    Returns:
        A 4-tuple ``(char_to_index, index_to_char, char_set, total_char)``:
        - char_to_index: dict mapping each character to its integer id.
        - index_to_char: dict mapping each integer id back to its character.
        - char_set: sorted list of the distinct characters, always including
          the newline character.
        - total_char: all strings of ``data`` concatenated together.
    """
    total_char = "".join(data)
    # '\n' is always part of the vocabulary, even if it never occurs in data.
    char_set = sorted(set(total_char) | {'\n'})
    char_to_index = {symbol: position for position, symbol in enumerate(char_set)}
    index_to_char = dict(enumerate(char_set))
    return char_to_index, index_to_char, char_set, total_char

In [6]:
# Build the vocabulary tables and the concatenated corpus from the loaded lines.
(char_to_index,
 index_to_char,
 char_set,
 total_char) = char_level_dict(data)

In [7]:
# Show the character -> index lookup table.
print(char_to_index)

{'\n': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [8]:
# Show the index -> character lookup table (inverse of char_to_index).
print(index_to_char)

{0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}


In [9]:
# Vocabulary size and total number of characters in the concatenated corpus.
unique_char_size, total_char_size = len(char_set), len(total_char)
print(
    f'There are {unique_char_size} unique characters and {total_char_size} total characters in the dataset'
)

There are 27 unique characters and 1889474 total characters in the dataset


In [10]:
class RNN:
    """Single-layer character-level recurrent network written with NumPy.

    The hidden state is updated with ``tanh(U x + W h_prev + b)`` and the
    output distribution is ``softmax(V h + c)``. Inputs and targets are
    sequences of integer character ids in ``range(vocab_size)``.
    """

    def __init__(self, hidden_size, vocab_size, seq_length, learning_rate):
        """Store the hyper-parameters and draw the initial weights.

        Args:
            hidden_size: number of hidden units.
            vocab_size: number of distinct characters (size of one-hot vectors).
            seq_length: stored on the instance; not read by any method here.
            learning_rate: step size used by ``update_model``.
        """
        # hyper parameters
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.seq_length = seq_length
        self.learning_rate = learning_rate
        # parameters, each drawn from a standard normal distribution
        self.U = np.random.randn(hidden_size, vocab_size)   # input to hidden
        self.W = np.random.randn(hidden_size, hidden_size)  # hidden to hidden
        self.V = np.random.randn(vocab_size, hidden_size)   # hidden to output
        self.b = np.random.randn(hidden_size, 1)            # hidden bias
        self.c = np.random.randn(vocab_size, 1)             # output bias

    def softmax(self, x):
        """Return the softmax of ``x``, normalised over all of its entries."""
        # Subtracting the max keeps np.exp from overflowing without changing
        # the result.
        exps = np.exp(x - np.max(x))
        return exps / np.sum(exps)

    def forward(self, inputs, hprev):
        """Run the network over a sequence of character ids.

        Args:
            inputs: sequence of integer character ids.
            hprev: hidden state preceding the first step; it is added to
                column vectors of shape ``(hidden_size, 1)``, so it is
                expected to have that shape.

        Returns:
            ``(xs, hs, ycap)``, dicts keyed by time step: the one-hot inputs,
            the hidden states (``hs[-1]`` is a copy of ``hprev``) and the
            softmax output probabilities.
        """
        xs, hs, logits, ycap = {}, {}, {}, {}
        hs[-1] = np.copy(hprev)
        for t in range(len(inputs)):
            # One-hot column vector for the character at step t. The size is
            # taken from the instance, not from a notebook-level global.
            xs[t] = np.zeros((self.vocab_size, 1))
            xs[t][inputs[t]] = 1
            # New hidden state from the current input and the previous state.
            hs[t] = np.tanh(np.dot(self.U, xs[t]) + np.dot(self.W, hs[t - 1]) + self.b)
            # Unnormalised log probabilities for the next character.
            logits[t] = np.dot(self.V, hs[t]) + self.c
            ycap[t] = self.softmax(logits[t])
        return xs, hs, ycap

    def loss(self, ycap, targets):
        """Return the summed negative log-likelihood of ``targets``.

        Args:
            ycap: per-step probability column vectors, as returned by ``forward``.
            targets: sequence of integer character ids, one per step.
        """
        return sum(-np.log(ycap[t][targets[t], 0]) for t in range(len(targets)))

    def backward(self, xs, hs, ycap, targets):
        """Back-propagate the loss through time.

        Args:
            xs, hs, ycap: the dicts returned by ``forward``.
            targets: sequence of integer target character ids.

        Returns:
            ``(dU, dW, dV, db, dc, h_last)``: gradients for each parameter,
            clipped element-wise to ``[-5, 5]``, and the hidden state of the
            last processed step (``hs[-1]`` when ``targets`` is empty).
        """
        dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        db, dc = np.zeros_like(self.b), np.zeros_like(self.c)
        last = len(targets) - 1
        # hs[last] always exists (hs[-1] for an empty sequence), unlike hs[0].
        dhnext = np.zeros_like(hs[last])
        for t in reversed(range(len(targets))):
            # Gradient of the loss w.r.t. the logits: probabilities minus one-hot target.
            dy = np.copy(ycap[t])
            dy[targets[t]] -= 1
            dV += np.dot(dy, hs[t].T)
            dc += dy
            # Hidden-state gradient: from the output at t plus from step t+1.
            dh = np.dot(self.V.T, dy) + dhnext
            # Back through tanh: d/dz tanh(z) = 1 - tanh(z)^2.
            dhraw = (1 - hs[t] * hs[t]) * dh
            db += dhraw
            dU += np.dot(dhraw, xs[t].T)
            dW += np.dot(dhraw, hs[t - 1].T)
            dhnext = np.dot(self.W.T, dhraw)
        # Clip in place to limit exploding gradients.
        for dparam in [dU, dW, dV, db, dc]:
            np.clip(dparam, -5, 5, out=dparam)
        return dU, dW, dV, db, dc, hs[last]

    def update_model(self, dU, dW, dV, db, dc):
        """Apply one gradient-descent step to every parameter, in place."""
        for param, dparam in zip([self.U, self.W, self.V, self.b, self.c], [dU, dW, dV, db, dc]):
            param += -self.learning_rate * dparam

    def sample(self, char_to_ix, seed):
        """Sample a sequence of character ids from the network.

        Sampling starts from an all-zero input and hidden state and stops at
        the first newline or after 50 characters. The returned list always
        ends with exactly one newline id.

        Note: this re-seeds NumPy's global random generator on every step.

        Args:
            char_to_ix: dict mapping characters to ids; must contain ``'\\n'``.
            seed: integer base seed for the per-step re-seeding.

        Returns:
            List of sampled character ids.
        """
        x = np.zeros((self.vocab_size, 1))
        a_prev = np.zeros((self.hidden_size, 1))
        indices = []
        idx = -1
        counter = 0
        new_line_character = char_to_ix['\n']

        while idx != new_line_character and counter != 50:
            a = np.tanh(np.dot(self.U, x) + np.dot(self.W, a_prev) + self.b)
            z = np.dot(self.V, a) + self.c
            y = self.softmax(z)

            np.random.seed(counter + seed)

            idx = np.random.choice(range(self.vocab_size), p=y.ravel())
            indices.append(idx)
            # Feed the sampled character back in as the next one-hot input.
            x = np.zeros((self.vocab_size, 1))
            x[idx] = 1
            a_prev = a
            counter += 1
            seed += 1
        # Terminate with a newline only if the length cap was hit without
        # sampling one; otherwise the sequence would end with two newlines.
        if idx != new_line_character:
            indices.append(new_line_character)
        return indices


In [11]:
# Instantiate the network; the vocabulary size comes from the dataset's character set.
rnn = RNN(
    hidden_size=100,
    vocab_size=unique_char_size,
    seq_length=25,
    learning_rate=1e-1,
)

In [12]:
# Draw one sequence of character ids from the network, then report its
# length and the characters it decodes to.
indices = rnn.sample(char_to_index, 1)
print(len(indices))
print([index_to_char[sampled_id] for sampled_id in indices])

5
['l', 'b', 'e', 'e', '\n']


In [15]:
# Allocate boolean containers: X holds one one-hot row per character position
# of every entry in `data` (padded to the longest entry), y one one-hot row
# per entry.
max_len = max(map(len, data))
X = np.zeros(
    (len(data), max_len, unique_char_size),
    dtype=bool,
)
y = np.zeros(
    (len(data), unique_char_size),
    dtype=bool,
)

In [16]:
# One-hot encode every entry of `data` into X (one row per character
# position) and mark the entry's final character in y.
for i, name in enumerate(data):
    if not name:
        # An empty entry (e.g. the '' that split('\n') yields after a trailing
        # newline) has no last character, so name[-1] would raise IndexError.
        # Its rows in X and y are left all-False.
        continue
    for t, char in enumerate(name):
        X[i, t, char_to_index[char]] = True
    y[i, char_to_index[name[-1]]] = True

(300000, 20, 27)
