In [9]:
import numpy as np

In [10]:
data = open("kafka.txt",'r').read()

In [11]:
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print("Data consists of vocalbulary having {} unique chars, and dataset having {} chars".format(vocab_size,data_size))

Data consists of vocalbulary having 81 unique chars, and dataset having 137628 chars


In [12]:
# convert integers to characters and vice versa which would be referenced further ahead.
char_to_ints = {ch:i for i,ch in enumerate(chars)}
ints_to_char = {i:ch for i,ch in enumerate(chars)}

In [13]:
print(char_to_ints)
print(ints_to_char)

{'M': 0, 'J': 1, 'v': 2, ' ': 3, 'O': 4, 'x': 5, 'A': 6, 'n': 7, 'l': 8, '(': 9, 'g': 10, 't': 11, ';': 12, 'p': 13, 'o': 14, 'z': 15, 'Q': 16, 'W': 17, 'H': 18, 'C': 19, 'm': 20, 'L': 21, 'i': 22, ':': 23, 'K': 24, 'S': 25, 'w': 26, "'": 27, 'R': 28, '9': 29, '3': 30, 'f': 31, 'e': 32, 'X': 33, '/': 34, '@': 35, '2': 36, 'T': 37, '8': 38, ',': 39, '0': 40, 'q': 41, 'I': 42, 'U': 43, 'k': 44, '\n': 45, '*': 46, '7': 47, '%': 48, '"': 49, 'y': 50, ')': 51, '5': 52, '!': 53, '§': 54, 'c': 55, 'Ã': 56, 'd': 57, '$': 58, '.': 59, 'D': 60, 'N': 61, '6': 62, 'b': 63, 'r': 64, 'h': 65, 'P': 66, 'V': 67, '4': 68, 'G': 69, 'u': 70, 'F': 71, '1': 72, 's': 73, '?': 74, 'a': 75, 'B': 76, 'E': 77, 'Y': 78, '-': 79, 'j': 80}
{0: 'M', 1: 'J', 2: 'v', 3: ' ', 4: 'O', 5: 'x', 6: 'A', 7: 'n', 8: 'l', 9: '(', 10: 'g', 11: 't', 12: ';', 13: 'p', 14: 'o', 15: 'z', 16: 'Q', 17: 'W', 18: 'H', 19: 'C', 20: 'm', 21: 'L', 22: 'i', 23: ':', 24: 'K', 25: 'S', 26: 'w', 27: "'", 28: 'R', 29: '9', 30: '3', 31: 'f', 

In [14]:
# vector for char a

vector_for_a = np.zeros((vocab_size,1))
vector_for_a[char_to_ints['a']]=1
print(vector_for_a.ravel())


[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  1.  0.  0.  0.  0.  0.]


In [15]:
# build a model

# hyperparameter

hidden_size = 100
seq_length = 25
learning_rate = 1e-1

In [16]:
# model parameters 

# input to hidden stage weight
Wxh = np.random.randn(hidden_size,vocab_size)*0.01

# Recurrent weight matrix, hidden to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 

# hidden state to output value
Why = np.random.randn(vocab_size, hidden_size) * 0.01 

# bias to hidden
bh = np.zeros((hidden_size, 1))

# bias to output
by = np.zeros((vocab_size, 1))

In [None]:
def loss_function(inputs,targets,hprev):
    
    # xs = one hot encoded characters
    # hs = hidden state outputs
    # ys = target values 
    # ps = probability of the outcomes for normalized probabilites for chars
    
    xs, hs, ys, ps = {}, {}, {}, {}
    
    # initialize with the previous hidden states
    hs[-1] = np.copy(hprev)
    
    # initialize loss as 0
    loss = 0
    
    
    # forward pass
    for t in range(len(inputs)):
        
        # encode in 1-of-k representation (we place a 0 vector as the t-th input)                                   
        xs[t] =  np.zeros((vocab_size,1))    
        
        # Inside that t-th input we use the integer in "inputs" list to  set the correct
        xs[t][inputs[t]] = 1
        
        # hidden state
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) 
        
        # unnormalized log probabilities for next chars                                                                                                           
        ys[t] = np.dot(Why, hs[t]) + by 
        
        # probabilities for next chars 
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))                                                                                                              
        
        # softmax (cross-entropy loss)  
        loss += -np.log(ps[t][targets[t],0])        
    
    