**Loading Dataset.**

In [125]:
import re

In [131]:
import numpy
import random

In [193]:
data = open('/content/dataset (1).txt', 'r').read()

In [195]:
# Change all uppercase letters to lowercase
data = data.lower()

In [196]:
# Split the data by '\n'
words = data.split('\n')

# Define a regular expression pattern to match special characters
pattern = re.compile('[^a-zA-Z\n]')

# Filter out words with special characters and keep '\n' as a separator
filtered_words = [word for word in words if not pattern.search(word)]

# Join the filtered words back together with '\n' as a separator
filtered_data = '\n'.join(filtered_words)

print(filtered_data)

filtered_chars = list(set(filtered_data))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [198]:
data_size, vocab_size = len(filtered_data), len(filtered_chars)

In [199]:
print('data has %d characters, %d unique.' % (data_size, vocab_size))

data has 7300255 characters, 27 unique.


In [200]:
# Convert the filtered data into a list of names
names = filtered_data.split()

# Extract a sample of 10 names
sample_size = 10
sample_names = random.sample(names, sample_size)

# Print the sample names
for name in sample_names:
    print(name)

dean
georgie
jasson
samaria
breann
regina
melanie
rebecca
hayes
evangelina


**Creating Dictionaries.**

In [201]:
char_to_ix = { ch:i for i,ch in enumerate(filtered_chars) }
ix_to_char = { i:ch for i,ch in enumerate(filtered_chars) }

In [202]:
print(char_to_ix)

{'y': 0, 'c': 1, 'q': 2, 'w': 3, 'x': 4, 'n': 5, 'z': 6, 'b': 7, 'r': 8, 'h': 9, 'p': 10, 'd': 11, 'j': 12, 'a': 13, 't': 14, 's': 15, 'i': 16, 'v': 17, 'l': 18, 'g': 19, 'u': 20, 'e': 21, 'f': 22, 'k': 23, 'o': 24, 'm': 25, '\n': 26}


In [203]:
print(ix_to_char)

{0: 'y', 1: 'c', 2: 'q', 3: 'w', 4: 'x', 5: 'n', 6: 'z', 7: 'b', 8: 'r', 9: 'h', 10: 'p', 11: 'd', 12: 'j', 13: 'a', 14: 't', 15: 's', 16: 'i', 17: 'v', 18: 'l', 19: 'g', 20: 'u', 21: 'e', 22: 'f', 23: 'k', 24: 'o', 25: 'm', 26: '\n'}


Functions for random and zero initialization

In [153]:
def random_init(num_rows, num_cols):
    return np.random.rand(num_rows, num_cols)*0.01

def zero_init(num_rows, num_cols):
    return np.zeros((num_rows, num_cols))

# RNN MODEL

**1. Initialization**

In [None]:
class RNN:
   def __init__(self, hidden_size, vocab_size, seq_length, learning_rate):
        # hyper parameters
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.seq_length = seq_length
        self.learning_rate = learning_rate
        # model parameters
        self.U = np.random.uniform(-np.sqrt(1./vocab_size), np.sqrt(1./vocab_size), (hidden_size, vocab_size))
        self.V = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (vocab_size, hidden_size))
        self.W = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (hidden_size, hidden_size))
        self.b = np.zeros((hidden_size, 1)) # bias for hidden layer
        self.c = np.zeros((vocab_size, 1)) # bias for output

        # memory vars for adagrad
        self.mU = np.zeros_like(self.U)
        self.mW = np.zeros_like(self.W)
        self.mV = np.zeros_like(self.V)
        self.mb = np.zeros_like(self.b)
        self.mc = np.zeros_like(self.c)

 It turns out that the best initialization depends on the activation function (tanh in our case) and one recommended approach is to initialize the weights randomly in the interval from [ -1/sqrt(n), 1/sqrt(n)] where n is the number of incoming connections from the previous layer.

 This is called Xavier or Glorot Initialization. The Xavier Initialization method is calculated as a random number with a uniform probability distribution (U) between the range -(1/sqrt(n)) and 1/sqrt(n), where n is the number of inputs to the node.

Adagrad (Adaptive Gradient Algorithm) is an optimization algorithm used in training neural networks. It adjusts the learning rate for each parameter based on the historical gradients of that parameter.

It is better than SGD and works well for NLP data.

**2. Forward Pass**

Softmax Code

In [None]:
   def softmax(self, x):
        p = np.exp(x- np.max(x))
        return p / np.sum(p)

Forward Propagation Code

In [None]:
   def forward(self, inputs, hprev):
            xs, hs, os, ycap = {}, {}, {}, {}
            hs[-1] = np.copy(hprev)
            for t in range(len(inputs)):
                xs[t] = zero_init(self.vocab_size,1)
                xs[t][inputs[t]] = 1 # one hot encoding , 1-of-k
                hs[t] = np.tanh(np.dot(self.U,xs[t]) + np.dot(self.W,hs[t-1]) + self.b) # hidden state
                os[t] = np.dot(self.V,hs[t]) + self.c # unnormalised log probs for next char
                ycap[t] = self.softmax(os[t]) # probs for next char
            return xs, hs, ycap


Follows the below equations-

a(t) = b + W*h(t-1) + U*x(t)

h(t) = tanh(a(t))

o(t) = c + V*h(t)

y(t)= softmax(o(t))

**3. Compute Loss**

In [None]:
   def loss(self, ps, targets):
            # calculate cross-entropy loss
            return sum(-np.log(ps[t][targets[t],0]) for t in range(self.seq_length))

Cross-entropy Loss is the Loss function used and is used to train the network with respect to the words that are outputted by the network. In multi-class classification we take the sum of log loss values for each class prediction in the observation

The most important thing to remember is that the Y vector will only be 1 at the correct class and 0 everywhere else. The intent is the increase the probability of the correct class, and to decrease the probabilities of the rival classes.

**4. Backward Pass**

In [None]:
   def backward(self, xs, hs, ps, targets):
            # backward pass: compute gradients going backwards
            dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
            db, dc = np.zeros_like(self.b), np.zeros_like(self.c)
            dhnext = np.zeros_like(hs[0])
            for t in reversed(range(self.seq_length)):
                dy = np.copy(ps[t])
                #through softmax
                dy[targets[t]] -= 1 # backprop into y
                #calculate dV, dc
                dV += np.dot(dy, hs[t].T)
                dc += dc
                #dh includes gradient from two sides, next cell and current output
                dh = np.dot(self.V.T, dy) + dhnext # backprop into h
                # backprop through tanh non-linearity
                dhrec = (1 - hs[t] * hs[t]) * dh  #dhrec is the term used in many equations
                db += dhrec
                #calculate dU and dW
                dU += np.dot(dhrec, xs[t].T)
                dW += np.dot(dhrec, hs[t-1].T)
                #pass the gradient from next cell to the next iteration.
                dhnext = np.dot(self.W.T, dhrec)
            # clip to mitigate exploding gradients
            for dparam in [dU, dW, dV, db, dc]:
                np.clip(dparam, -5, 5, out=dparam)
            return dU, dW, dV, db, dc

Gradient clipping is done to prevent gradient explosion. It involves modifying the gradients during training so that they do not exceed a predefined threshold. If the gradient of a parameter exceeds this threshold, it is scaled down proportionally to ensure that its magnitude remains within a reasonable range.

Whereas the exploding gradient can be fixed with gradient clipping technique as is used in the example code here, the vanishing gradient issue is still is major concern with an RNN. Hence, LSTM, GRU, ResNets are better solutions for NLP tasks and can avoid vanishing gradient.

**5. Update Weights**

In [None]:
   def update_model(self, dU, dW, dV, db, dc):
        # parameter update with adagrad
        for param, dparam, mem in zip([self.U, self.W, self.V, self.b, self.c],
                                  [dU, dW, dV, db, dc],
                                  [self.mU, self.mW, self.mV, self.mb, self.mc]):
            mem += dparam*dparam
            param += -self.learning_rate*dparam/np.sqrt(mem+1e-8) # adagrad update

**6. Sampling**

In [None]:
   def sample(self, h, seed_ix, n):
    ixes = []
    for _ in range(n):
        x = np.zeros((self.vocab_size, 1))
        x[seed_ix] = 1
        h = np.tanh(np.dot(self.U, x) + np.dot(self.W, h) + self.b)
        y = np.dot(self.V, h) + self.c
        p = np.exp(y - np.max(y)) / np.sum(np.exp(y - np.max(y)))
        ix = np.random.choice(range(self.vocab_size), p=p.ravel())
        seed_ix = ix  # Update seed_ix for the next iteration
        ixes.append(ix)
    return ixes

Once the RNN is trained, the sample function allows you to generate new sequences of text by iteratively predicting the next character based on the previous characters. This is useful for tasks like generating creative text, composing music, or generating code.

The sample method generates a sequence of integers from the model by iteratively predicting the next character based on the current state and sampling from the output distribution. It utilizes one-hot encoding to represent characters, ensuring compatibility with the model architecture.

**7. Train model**

In [None]:
   def train(self, data):
    # Initialize variables
    iter_num = 0
    threshold = 0.1

    # Training loop
    while True:  # Keep iterating until explicitly broken
        # Initialize hidden state
        hprev = np.zeros((self.hidden_size, 1))

        # Generate random starting point for input sequence
        start_idx = np.random.randint(0, data_size - self.seq_length)
        inputs = [char_to_ix[ch] for ch in data[start_idx:start_idx + self.seq_length]]
        targets = [char_to_ix[ch] for ch in data[start_idx + 1:start_idx + self.seq_length + 1]]

        # Forward pass
        xs, hs, ps = self.forward(inputs, hprev)
        # Backward pass
        dU, dW, dV, db, dc = self.backward(xs, hs, ps, targets)
        # Compute loss
        loss = self.loss(ps, targets)
        # Update model parameters
        self.update_model(dU, dW, dV, db, dc)
        # Update hidden state
        hprev = hs[self.seq_length - 1]

        # Print progress occasionally
        if not iter_num % 500:
            sample_ix = self.sample(hprev, inputs[0], 200)
            print(''.join(ix_to_char[ix] for ix in sample_ix))
            print("iter: %d, loss: %f" % (iter_num, loss))

        # Check if loss is below threshold, if so, break the loop
        if loss < threshold:
            break

        iter_num += 1



In [205]:
import numpy as np

class RNN:
    def __init__(self, hidden_size, vocab_size, seq_length, learning_rate):
        # Initialize parameters
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.seq_length = seq_length
        self.learning_rate = learning_rate

        # Initialize model parameters
        self.U = np.random.uniform(-np.sqrt(1./vocab_size), np.sqrt(1./vocab_size), (hidden_size, vocab_size))
        self.V = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (vocab_size, hidden_size))
        self.W = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (hidden_size, hidden_size))
        self.b = np.zeros((hidden_size, 1)) # bias for hidden layer
        self.c = np.zeros((vocab_size, 1)) # bias for output

        # memory vars for adagrad
        self.mU = np.zeros_like(self.U)
        self.mW = np.zeros_like(self.W)
        self.mV = np.zeros_like(self.V)
        self.mb = np.zeros_like(self.b)
        self.mc = np.zeros_like(self.c)

    def softmax(self, x):
        p = np.exp(x - np.max(x))
        return p / np.sum(p)

    def forward(self, inputs, hprev):
        xs, hs, os, ycap = {}, {}, {}, {}
        hs[-1] = np.copy(hprev)
        for t in range(len(inputs)):
            xs[t] = np.zeros((self.vocab_size, 1))
            xs[t][inputs[t]] = 1 # one hot encoding, 1-of-k
            hs[t] = np.tanh(np.dot(self.U, xs[t]) + np.dot(self.W, hs[t-1]) + self.b) # hidden state
            os[t] = np.dot(self.V, hs[t]) + self.c # unnormalized log probs for next char
            ycap[t] = self.softmax(os[t]) # probs for next char
        return xs, hs, ycap

    def loss(self, ps, targets):
        # calculate cross-entropy loss
        return sum(-np.log(ps[t][targets[t],0]) for t in range(self.seq_length))

    def backward(self, xs, hs, ps, targets):
        # backward pass: compute gradients going backwards
        dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        db, dc = np.zeros_like(self.b), np.zeros_like(self.c)
        dhnext = np.zeros_like(hs[0])
        for t in reversed(range(self.seq_length)):
            dy = np.copy(ps[t])
            # through softmax
            dy[targets[t]] -= 1 # backprop into y
            # calculate dV, dc
            dV += np.dot(dy, hs[t].T)
            dc += dc
            # dh includes gradient from two sides, next cell and current output
            dh = np.dot(self.V.T, dy) + dhnext # backprop into h
            # backprop through tanh non-linearity
            dhrec = (1 - hs[t] * hs[t]) * dh  # dhrec is the term used in many equations
            db += dhrec
            # calculate dU and dW
            dU += np.dot(dhrec, xs[t].T)
            dW += np.dot(dhrec, hs[t-1].T)
            # pass the gradient from next cell to the next iteration.
            dhnext = np.dot(self.W.T, dhrec)
        # clip to mitigate exploding gradients
        for dparam in [dU, dW, dV, db, dc]:
            np.clip(dparam, -5, 5, out=dparam)
        return dU, dW, dV, db, dc

    def update_model(self, dU, dW, dV, db, dc):
        # parameter update with adagrad
        for param, dparam, mem in zip([self.U, self.W, self.V, self.b, self.c],
                                  [dU, dW, dV, db, dc],
                                  [self.mU, self.mW, self.mV, self.mb, self.mc]):
            mem += dparam * dparam
            param += -self.learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    def sample(self, h, seed_ix, n):
        ixes = []
        for _ in range(n):
            x = np.zeros((self.vocab_size, 1))
            x[seed_ix] = 1
            h = np.tanh(np.dot(self.U, x) + np.dot(self.W, h) + self.b)
            y = np.dot(self.V, h) + self.c
            p = np.exp(y - np.max(y)) / np.sum(np.exp(y - np.max(y)))
            ix = np.random.choice(range(self.vocab_size), p=p.ravel())
            seed_ix = ix  # Update seed_ix for the next iteration
            ixes.append(ix)
        return ixes

    def train(self, data):
     # Initialize variables
     iter_num = 0
     threshold = 0.1

     # Training loop
     while True:  # Keep iterating until explicitly broken
        # Initialize hidden state
        hprev = np.zeros((self.hidden_size, 1))

        # Generate random starting point for input sequence
        start_idx = np.random.randint(0, data_size - self.seq_length)
        inputs = [char_to_ix[ch] for ch in data[start_idx:start_idx + self.seq_length]]
        targets = [char_to_ix[ch] for ch in data[start_idx + 1:start_idx + self.seq_length + 1]]

        # Forward pass
        xs, hs, ps = self.forward(inputs, hprev)
        # Backward pass
        dU, dW, dV, db, dc = self.backward(xs, hs, ps, targets)
        # Compute loss
        loss = self.loss(ps, targets)
        # Update model parameters
        self.update_model(dU, dW, dV, db, dc)
        # Update hidden state
        hprev = hs[self.seq_length - 1]

        # Print progress occasionally
        if not iter_num % 500:
            sample_ix = self.sample(hprev, inputs[0], 200)
            print(''.join(ix_to_char[ix] for ix in sample_ix))
            print("iter: %d, loss: %f" % (iter_num, loss))

        # Check if loss is below threshold, if so, break the loop
        if loss < threshold:
            break

        iter_num += 1



In [None]:
seq_length = 6
rnn = RNN(hidden_size=100, vocab_size=vocab_size, seq_length=seq_length, learning_rate=1e-1)
rnn.train(filtered_data)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
einn
donam
trique
brien
licki
myrah
paula
nickitly
rashina
pedriamaya
machanda
juyton
chancin
jomia
darinnettaci
dannoe
sharyn
julie
wilton
vinesterine
dania
shyl
rake
chegd
nasmanc
joram
lavenn
quinn
iter: 440000, loss: 10.396380
lix
andree
naroa
vincia
ginra
leolie
rixa
netela
yergin
francis
kara
kadell
kaitre
karl
joanna
chardon
varran
gabroylen
garrey
carlenandie
jeann
floakes
dallo
shelton
stivea
sheyne
aubrin
icalvie
jees
iter: 440500, loss: 9.699934
t
kelisa
leonna
eflen
cectionst
annela
luciosa
sudni
candido
haige
shanselt
shar
eovanne
lete
sezeman
elane
merathanett
coresa
laurnie
andre
kristile
albert
mellenne
trell
german
sabel
genekie
lauron

iter: 441000, loss: 9.496154
johusson
comathene
jedie
marith
liza
timea
kindoprinna
danall
fredon
car
dana
betther
maxus
alda
sereston
nanay
marrotnen
marqakr
tralinite
cathlynne
quint
mallina
rob
demee
jahoyla
melina
gettel
alli
iter: 441500, loss: 7.580268
lalo
rayan
sha