# Project: Character-level text generation on Shakespeare (RNN and LSTM)

## Dependencies

In [1]:
import tensorflow as tf
from keras import layers
import copy
import numpy as np
import pickle
import matplotlib.pyplot as plt

## Loading data

In [2]:
fname = '../data/shaketext.txt'

# The corpus contains non-ASCII characters (curly quotes, accented letters),
# so decode explicitly as UTF-8 instead of relying on the platform default.
with open(fname, "r", encoding="utf-8") as fid:
    data = fid.read()

# Vocabulary: every distinct character in the corpus; K is its size.
unique_chars = list(set(data))
K = len(unique_chars)
# Sorting makes the char <-> index mapping independent of set iteration order.
unique_chars_sorted = sorted(unique_chars)

# Lookup tables used for encoding text to ids and decoding ids back to text.
char_to_index = {char: index for index, char in enumerate(unique_chars_sorted)}
index_to_char = {index: char for index, char in enumerate(unique_chars_sorted)}

print("Total characters:", len(data))
print("Unique characters (K):", K)
print("Sample char to index mapping:", list(char_to_index.items())[:10])

Total characters: 5378661
Unique characters (K): 106
Sample char to index mapping: [('\t', 0), ('\n', 1), (' ', 2), ('!', 3), ('#', 4), ('$', 5), ('%', 6), ('&', 7), ("'", 8), ('(', 9)]


## Baseline RNN

In [3]:
# Encode the whole corpus as a list of integer ids, one per character.
text_as_int = list(map(char_to_index.__getitem__, data))

# Cut the id stream into windows of seq_length + 1 ids; the extra id lets each
# window be split into an input and a one-step-shifted target.
seq_length = 100
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    """Split a window into an (input, target) pair shifted by one position.

    The input is every element but the last; the target is every element but
    the first, so target[i] is the element that follows input[i].
    """
    return chunk[:-1], chunk[1:]

# Turn every window into an (input, target) pair.
dataset = sequences.map(split_input_target)

BATCH_SIZE = 64
BUFFER_SIZE = 10000
# Shuffle the pairs through a BUFFER_SIZE-element buffer, then group them into
# fixed-size batches (the incomplete final batch is dropped).
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [4]:
# Peek at a single batch: print its shapes and decode the first sequence of the
# batch back to text to check that the target is the input shifted by one.
for x_batch, y_batch in dataset.take(1):
    print(f"Input batch shape: {x_batch.shape}")
    print(f"Target batch shape: {y_batch.shape}")
    first_input = ''.join(map(index_to_char.__getitem__, x_batch[0].numpy()))
    first_target = ''.join(map(index_to_char.__getitem__, y_batch[0].numpy()))
    print(f"Decoded input: {first_input}")
    print(f"Decoded target: {first_target}")

Input batch shape: (64, 100)
Target batch shape: (64, 100)
Decoded input: r true image pictured lies,
Which in my bosom’s shop is hanging still,
That hath his windows glazed 
Decoded target:  true image pictured lies,
Which in my bosom’s shop is hanging still,
That hath his windows glazed w


In [6]:
# Width of the recurrent layer; the embedding is half as wide.
rnn_units = 100
embedding_dim = rnn_units // 2

# Embedding -> SimpleRNN (one output per time step) -> Dense producing K values
# per time step, one per character in the vocabulary.
model = tf.keras.Sequential()
model.add(layers.Embedding(input_dim=K, output_dim=embedding_dim))
model.add(layers.SimpleRNN(rnn_units, return_sequences=True))
model.add(layers.Dense(K))

# The Dense layer has no activation, so the loss is told it receives logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn)

def sample(model, start_string, generation_length=500, temperature=1.0):
    """Generate text from the Keras character model, one character at a time.

    Args:
        model: model mapping a (1, T) tensor of character ids to (1, T, K) logits.
        start_string: non-empty prompt; every character must be in char_to_index.
        generation_length: number of characters to sample after the prompt.
        temperature: logits are divided by this value before sampling
            (lower values sharpen the distribution, higher values flatten it).

    Returns:
        start_string followed by the generated characters.

    Raises:
        ValueError: if start_string is empty.
        KeyError: if start_string contains a character missing from char_to_index.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")

    # Running context: prompt ids followed by everything sampled so far.
    context_ids = [char_to_index[s] for s in start_string]
    generated = []

    for _ in range(generation_length):
        # The SimpleRNN layer is not stateful, so each call starts from a zero
        # hidden state. Feeding only the last sampled id would condition the
        # prediction on a single character; feed the whole context instead.
        # This re-runs the full context every step (quadratic in length).
        input_eval = tf.expand_dims(context_ids, 0)
        predictions = model(input_eval)
        # Only the logits of the final time step decide the next character.
        last_logits = predictions[:, -1, :] / temperature
        predicted_id = int(tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy())
        context_ids.append(predicted_id)
        generated.append(index_to_char[predicted_id])

    return start_string + ''.join(generated)

# Sample from the untrained model as a reference point (followed by a blank line).
print('Generated text pre-training:',
      sample(model, start_string="ROMEO.", generation_length=300),
      '',
      sep='\n')

# Train the baseline and keep the Keras History object for later inspection.
EPOCHS = 10
history = model.fit(dataset, epochs=EPOCHS)

model.save_weights('../data/baseline_rnn.weights.h5')

# Blank line, header, then a sample from the trained model.
print('',
      'Generated text post-training:',
      sample(model, start_string="ROMEO.", generation_length=300),
      sep='\n')

Generated text pre-training:
ROMEO.BJ:H_k'L49
™%G1Di‘Bv-.&hYÆOBy&2[a	;àh’É1sHk “dîVxRpa&kE'To,àgré#QX7S/0Id/•Ph3I
;$t&3zhë	x…?êÆ)BÉjæv(ÇAl9gMXâÀUêA;U8’[Wæ#KcPDgÇfmGâg,/xJkI	Y_ê_A)u/Yy]!oÆz—‘f
Àè(‘X/qj/ÆOêVÇK]LKQfè4v;!QUëUV
c’zM
JY/XX;?27%i m7d$,OVpO’%j(/dT‘àê’V*•zj•ÉëDQsR9nhZPê	lz,“WëE…çyâT0èQ%
G1	D‘/_SxZYc1ë)iî)f!3S*ZXvV]vB),ÀéVç;ëq

Epoch 1/10
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 30ms/step - loss: 2.7347
Epoch 2/10
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 30ms/step - loss: 1.9587
Epoch 3/10
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 28ms/step - loss: 1.8444
Epoch 4/10
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 29ms/step - loss: 1.7886
Epoch 5/10
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 30ms/step - loss: 1.7540
Epoch 6/10
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 31ms/step - loss: 1.7302
Epoch 7/10
[1m832/832[0m [32m━━━━━━━━━━━━━

## Implementing an LSTM

In [12]:
from tqdm import tqdm

# Hidden size used by the hand-written LSTM code below.
m = rnn_units

# NumPy generator seeded directly; this yields the same bit-generator state as
# creating an unseeded generator and then overwriting its state with that of a
# freshly seeded bit generator of the same type.
seed = 42
rng = np.random.default_rng(seed)
BitGen = type(rng.bit_generator)

def initialize_lstm(L, m):
    """Create randomly initialised parameters for an L-layer LSTM of width m.

    Each layer holds, for the four gates f, i, c, o: a recurrent matrix W* of
    shape (m, m), an input matrix U* of shape (m, in_dim) and a zero bias b* of
    shape (m, 1). in_dim is the vocabulary size K for the first layer and m for
    the others. Matrices are standard-normal draws from the notebook-level
    `rng`, scaled by 1/sqrt(2 * number_of_columns). The output layer has a
    matrix V of shape (K, m) scaled by 1/sqrt(m) and a zero bias c of shape (K, 1).

    Returns:
        dict with keys 'layers' (list of per-layer dicts), 'V' and 'c'.
    """
    def scaled_normal(rows, cols, fan):
        # Standard-normal matrix scaled by 1/sqrt(fan).
        return (1/np.sqrt(fan))*rng.standard_normal((rows, cols))

    stack = []
    for depth in range(L):
        in_dim = K if depth == 0 else m
        gate_params = {}
        # Gate order f, i, c, o with W drawn before U keeps the random stream
        # consumption order fixed.
        for gate in 'fico':
            gate_params['W' + gate] = scaled_normal(m, m, 2*m)
            gate_params['U' + gate] = scaled_normal(m, in_dim, 2*in_dim)
            gate_params['b' + gate] = np.zeros((m, 1))
        stack.append(gate_params)

    # Output projection from the top hidden state to the K characters.
    V = scaled_normal(K, m, m)
    c = np.zeros((K, 1))
    return {'layers': stack, 'V': V, 'c': c}

def fp_lstm(RNN, X, Y, h0s, c0s):
    """Forward pass of a stacked LSTM over one sequence.

    Args:
        RNN: dict with 'layers' (list of per-layer parameter dicts holding
            W*/U*/b* for the gates f, i, c, o), output matrix 'V' and bias 'c'.
        X: 2-D array whose column t is the input vector at time step t.
        Y: 2-D array whose column t is the one-hot target at time step t.
        h0s: list with the initial hidden state (column vector) of every layer.
        c0s: list with the initial cell state (column vector) of every layer.

    Returns:
        (cache, loss): cache holds, per layer and time step, the gate
        activations ('f', 'i', 'c_tilde', 'o'), the states ('c_states',
        'h_states', each with the initial state at index 0), plus the per-step
        'y_logits' and softmax outputs 'p'. loss is the cross-entropy averaged
        over the time steps.

    Raises:
        ValueError: if X has no time steps.
    """
    layers = RNN['layers']
    V, c_out = RNN['V'], RNN['c']
    L = len(layers)
    # Take the sequence length from the input itself rather than from a
    # notebook-level constant, so sequences of any length are supported.
    T = X.shape[1]
    if T == 0:
        raise ValueError("X must contain at least one time step")

    # prepare storages
    f = [[None]*T for _ in range(L)]
    i = [[None]*T for _ in range(L)]
    c_tilde = [[None]*T for _ in range(L)]
    o = [[None]*T for _ in range(L)]
    c_states = [[None]*(T+1) for _ in range(L)]
    h_states = [[None]*(T+1) for _ in range(L)]
    y_logits, p = [], []

    # index 0 of the state lists holds the initial states
    for l in range(L):
        h_states[l][0] = h0s[l]
        c_states[l][0] = c0s[l]

    loss = 0.0
    for t in range(T):
        # input to layer 0
        x_in = X[:, t:t+1]

        # forward through L layers
        for l in range(L):
            w = layers[l]
            h_prev = h_states[l][t]
            c_prev = c_states[l][t]

            # gates
            fl = 1/(1+np.exp(-(w['Wf']@h_prev + w['Uf']@x_in + w['bf'])))
            il = 1/(1+np.exp(-(w['Wi']@h_prev + w['Ui']@x_in + w['bi'])))
            cbarl = np.tanh(w['Wc']@h_prev + w['Uc']@x_in + w['bc'])
            cl = fl*c_prev + il*cbarl
            ol = 1/(1+np.exp(-(w['Wo']@h_prev + w['Uo']@x_in + w['bo'])))
            hl = ol * np.tanh(cl)

            # stash for the backward pass
            f[l][t] = fl
            i[l][t] = il
            c_tilde[l][t] = cbarl
            o[l][t] = ol
            c_states[l][t+1] = cl
            h_states[l][t+1] = hl

            # the hidden state is the next layer's input
            x_in = hl

        # output & loss (softmax shifted by the max logit for stability)
        logit = V @ h_states[L-1][t+1] + c_out
        exp_l = np.exp(logit - np.max(logit))
        p_t = exp_l/np.sum(exp_l)
        # Y's one-hot column picks out the probability of the target
        loss += -np.log((Y[:, t:t+1].T @ p_t)[0, 0])

        y_logits.append(logit)
        p.append(p_t)

    cache = {
      'f': f, 'i': i, 'c_tilde': c_tilde, 'o': o,
      'c_states': c_states, 'h_states': h_states,
      'y_logits': y_logits, 'p': p
    }

    return cache, loss/T

def bp_lstm(RNN, X, Y, cache):
    """Backward pass (BPTT) of a stacked LSTM for one sequence.

    All buffer shapes are derived from the parameters and the cache, so the
    function works for any hidden size, vocabulary size and sequence length
    rather than only the notebook-level ones.

    Args:
        RNN: parameter dict with 'layers', 'V' and 'c' (same layout as used by
            the forward pass).
        X: 2-D array whose column t is the input vector at time step t.
        Y: 2-D array whose column t is the one-hot target at time step t.
        cache: dict produced by the forward pass; the keys 'f', 'i',
            'c_tilde', 'o', 'c_states', 'h_states' and 'p' are read.

    Returns:
        dict with the same structure as RNN holding the gradients of the loss,
        averaged over the time steps.

    Raises:
        ValueError: if the cache contains no time steps.
    """
    layers = RNN['layers']
    V, c_out = RNN['V'], RNN['c']
    L = len(layers)

    # unpack cache
    f = cache['f']; i = cache['i']; cbar = cache['c_tilde']; o = cache['o']
    c_states = cache['c_states']; h_states = cache['h_states']
    p = cache['p']

    # one softmax output was stored per time step
    T = len(p)
    if T == 0:
        raise ValueError("cache contains no time steps")

    # gradient accumulators shaped like the parameters themselves
    grads = {
        'layers': [{k: np.zeros_like(w) for k, w in layer.items()} for layer in layers],
        'V': np.zeros_like(V),
        'c': np.zeros_like(c_out),
    }

    # per-layer gradients w.r.t. h and c flowing in from the later time step;
    # they have the shape of a layer bias, i.e. (hidden_size, 1)
    dh_time = [np.zeros_like(layer['bf']) for layer in layers]
    dc_time = [np.zeros_like(layer['bf']) for layer in layers]

    # reverse time loop
    for t in reversed(range(T)):
        # output layer: softmax + cross-entropy gradient w.r.t. the logits
        dy = p[t] - Y[:, t:t+1]
        grads['V'] += dy @ h_states[L-1][t+1].T
        grads['c'] += dy

        # seed the top layer with the gradient coming from the output
        dh_time[L-1] += V.T @ dy

        # backprop through the layers, top to bottom
        for l in reversed(range(L)):
            w = layers[l]

            # gather forward caches
            fl = f[l][t]; il = i[l][t]; cbarl = cbar[l][t]; ol = o[l][t]
            c_prev = c_states[l][t]; c_curr = c_states[l][t+1]
            h_prev = h_states[l][t]

            # dh_time[l] already holds the sum of the gradient from the later
            # time step and (for l < L-1) the one pushed down by layer l+1
            dh = dh_time[l]
            # dc from the later time step
            dc = dc_time[l]

            # gate gradients; *_raw are w.r.t. the pre-activations
            dao = dh * np.tanh(c_curr)
            dao_raw = dao * ol*(1-ol)

            dc_tot = dh*ol*(1-np.tanh(c_curr)**2) + dc

            daf = dc_tot * c_prev
            daf_raw = daf * fl*(1-fl)

            dai = dc_tot * cbarl
            dai_raw = dai * il*(1-il)

            dac = dc_tot * il
            dac_raw = dac * (1-cbarl**2)

            # input to this layer at time t: the data for layer 0, otherwise
            # the hidden state of the layer below at the same time step
            x_in = X[:, t:t+1] if l == 0 else h_states[l-1][t+1]

            # accumulate parameter gradients
            gl = grads['layers'][l]
            for gate, d_raw in (('f', daf_raw), ('i', dai_raw),
                                ('c', dac_raw), ('o', dao_raw)):
                gl['W' + gate] += d_raw @ h_prev.T
                gl['U' + gate] += d_raw @ x_in.T
                gl['b' + gate] += d_raw

            # propagate to the previous time step of the same layer
            dh_time[l] = (w['Wf'].T@daf_raw +
                          w['Wi'].T@dai_raw +
                          w['Wc'].T@dac_raw +
                          w['Wo'].T@dao_raw)
            dc_time[l] = dc_tot * fl

            # propagate down to layer l-1 at the same time step
            if l > 0:
                dh_time[l-1] += (
                  w['Uf'].T@daf_raw +
                  w['Ui'].T@dai_raw +
                  w['Uc'].T@dac_raw +
                  w['Uo'].T@dao_raw
                )

    # average over time steps
    for l in range(L):
        for k in grads['layers'][l]:
            grads['layers'][l][k] /= T
    grads['V'] /= T
    grads['c'] /= T

    return grads

def train_lstm_adam(dataset, init_RNN, params, lam = 0):
    """Train the NumPy LSTM with Adam, one update per sequence.

    Args:
        dataset: batched tf.data dataset of (input_ids, target_ids) pairs; it
            is unbatched so every sequence becomes one update step.
        init_RNN: initial parameters ('layers', 'V', 'c'); deep-copied, so the
            argument itself is left untouched.
        params: dict with 'eta' (learning rate), 'num_epochs', 'beta1',
            'beta2' and 'eps'. An 'n_batch' entry may be present but is not used.
        lam: accepted for interface compatibility; no regularisation is applied.

    Returns:
        The trained parameter dict.

    Raises:
        ValueError: if the dataset yields no sequences in an epoch.
    """
    eta, num_epochs = params['eta'], params['num_epochs']
    beta1, beta2, eps = params['beta1'], params['beta2'], params['eps']

    RNN = copy.deepcopy(init_RNN)
    L = len(RNN['layers'])
    # Sizes come from the model itself so that networks whose hidden size
    # differs from the notebook-level constant train correctly.
    vocab_size = RNN['V'].shape[0]

    def zeros_like_model():
        # Fresh zero buffers with the model's structure; the first- and
        # second-moment estimates must not share arrays.
        return {
            'layers': [{k: np.zeros_like(w) for k, w in layer.items()}
                       for layer in RNN['layers']],
            'V': np.zeros_like(RNN['V']),
            'c': np.zeros_like(RNN['c']),
        }

    m1 = zeros_like_model()   # Adam first-moment estimates
    v1 = zeros_like_model()   # Adam second-moment estimates

    def adam_update(weights, first, second, key, g, step):
        # One bias-corrected Adam step on weights[key] given its gradient g.
        first[key] = beta1*first[key] + (1-beta1)*g
        second[key] = beta2*second[key] + (1-beta2)*(g*g)
        m_hat = first[key] / (1 - beta1**step)
        v_hat = second[key] / (1 - beta2**step)
        weights[key] -= eta * m_hat / (np.sqrt(v_hat) + eps)

    single_seq_ds = dataset.unbatch()

    t_step = 0
    for epoch in range(1, num_epochs+1):
        epoch_loss = 0.0
        n_seqs = 0

        for x_seq, y_seq in tqdm(single_seq_ds):
            # x_seq, y_seq are TF tensors holding the ids of one sequence
            x_ids = x_seq.numpy()
            y_ids = y_seq.numpy()
            T = len(x_ids)

            # one-hot encode: column t has a single 1 in the row of the id at t
            steps = np.arange(T)
            X = np.zeros((vocab_size, T))
            Y = np.zeros((vocab_size, T))
            X[x_ids, steps] = 1
            Y[y_ids, steps] = 1

            # zero initial states, shaped like each layer's bias
            h0s = [np.zeros_like(layer['bf']) for layer in RNN['layers']]
            c0s = [np.zeros_like(layer['bf']) for layer in RNN['layers']]

            # forward & backward
            cache, loss = fp_lstm(RNN, X, Y, h0s, c0s)
            grads = bp_lstm(RNN, X, Y, cache)
            epoch_loss += loss
            n_seqs += 1

            # Adam update; t_step counts updates across all epochs
            t_step += 1
            for l in range(L):
                for param, g in grads['layers'][l].items():
                    adam_update(RNN['layers'][l], m1['layers'][l], v1['layers'][l],
                                param, g, t_step)

            # output layer V, c
            for key in ('V', 'c'):
                adam_update(RNN, m1, v1, key, grads[key], t_step)

        if n_seqs == 0:
            raise ValueError("dataset yielded no sequences")
        avg_loss = epoch_loss / n_seqs

        print(f"Epoch {epoch}/{num_epochs} — avg loss: {avg_loss:.4f}")

    return RNN

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)), elementwise.

    Only exp of non-positive values is evaluated, so large-magnitude inputs of
    either sign do not overflow. For x >= 0 the result is 1 / (1 + exp(-x));
    for x < 0 the algebraically equal form exp(x) / (1 + exp(x)) is used.
    """
    x = np.asarray(x)
    z = np.exp(-np.abs(x))
    return np.where(x >= 0, 1.0, z) / (1.0 + z)

def _lstm_stack_step(layers, x, h_states, c_states):
    """Advance every layer of the LSTM stack by one time step.

    h_states and c_states are updated in place; the hidden state of the top
    layer is returned.
    """
    for l, w in enumerate(layers):
        h_prev = h_states[l]
        c_prev = c_states[l]

        f_gate = sigmoid(w['Wf'] @ h_prev + w['Uf'] @ x + w['bf'])
        i_gate = sigmoid(w['Wi'] @ h_prev + w['Ui'] @ x + w['bi'])
        c_tilde = np.tanh(w['Wc'] @ h_prev + w['Uc'] @ x + w['bc'])
        c_curr = f_gate * c_prev + i_gate * c_tilde
        o_gate = sigmoid(w['Wo'] @ h_prev + w['Uo'] @ x + w['bo'])
        h_curr = o_gate * np.tanh(c_curr)

        h_states[l] = h_curr
        c_states[l] = c_curr
        # this layer's hidden state is the next layer's input
        x = h_curr
    return x


def sample_text_lstm(RNN, start_string, generation_length, temperature=1.0):
    """Generate text from the NumPy LSTM.

    The states start at zero and are primed with every prompt character except
    the last one; the last prompt character is then the first input of the
    generation loop, so each prompt character is consumed exactly once.

    Args:
        RNN: parameter dict with 'layers', 'V' and 'c'.
        start_string: non-empty prompt; every character must be in char_to_index.
        generation_length: number of characters to sample after the prompt.
        temperature: logits are divided by this value before the softmax.

    Returns:
        start_string followed by the generated characters.

    Raises:
        ValueError: if start_string is empty.
        KeyError: if start_string contains a character missing from char_to_index.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")

    layers = RNN['layers']
    V, c_out = RNN['V'], RNN['c']
    # vocabulary size is the number of rows of the output matrix
    vocab_size = V.shape[0]

    # zero hidden and cell state per layer, shaped like the layer's bias
    h_states = [np.zeros_like(w['bf']) for w in layers]
    c_states = [np.zeros_like(w['bf']) for w in layers]

    def one_hot(char_id):
        x = np.zeros((vocab_size, 1))
        x[char_id, 0] = 1
        return x

    # prime with all but the last prompt character (no sampling yet)
    for ch in start_string[:-1]:
        _lstm_stack_step(layers, one_hot(char_to_index[ch]), h_states, c_states)
    last_char_id = char_to_index[start_string[-1]]

    generated = []
    for _ in range(generation_length):
        h_top = _lstm_stack_step(layers, one_hot(last_char_id), h_states, c_states)

        # output layer + temperature-scaled softmax (shifted by the max logit)
        logits = (V @ h_top + c_out).flatten() / temperature
        exp_logits = np.exp(logits - np.max(logits))
        p = exp_logits / exp_logits.sum()

        # draw the next character id from the predicted distribution
        last_char_id = rng.choice(np.arange(vocab_size), p=p)
        generated.append(index_to_char[last_char_id])

    return start_string + ''.join(generated)

In [None]:
# Single-layer LSTM. The hidden size is the notebook-level `m` rather than a
# repeated literal, so the model always matches the size the training and
# sampling code in this notebook is configured with.
init_lstm1 = initialize_lstm(1, m)

print('Generated text pre-training:')
print(sample_text_lstm(init_lstm1, 'ROMEO.', 300))
print()

# Adam hyper-parameters read by train_lstm_adam.
params = {
    'eta': 0.001,
    'num_epochs': 1,
    'n_batch': 100,
    'beta1': 0.9,
    'beta2': 0.999,
    'eps': 1e-8
}

lstm1 = train_lstm_adam(dataset, init_lstm1, params)

print('Generated text post-training:')
print(sample_text_lstm(lstm1, 'ROMEO.', 300))

100%|█████████▉| 53247/53248 [54:07<00:00, 17.89it/s]  2025-05-10 19:07:14.804118: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
100%|██████████| 53248/53248 [54:07<00:00, 16.40it/s]

Epoch 1/1 — avg loss: 1.8653
ROMEO. a sweet to these to prose and enther?
That I’ll I to said did soft in dispies to gentlems
un lutt me to cell no my since here as imperes rease,
Since my may, what and mell beared come;
And all it not th’ angar’d so so powar.

 ANEONIO.
I smeange. Well cloudly but with but of recoy,
“Aways flowen I 





## Quantitative and qualitative comparison between LSTM and RNN

## Optimizing performance of LSTM
* Hyperparameter tuning, different ways of regularization
* Temperature and Nucleus sampling
* Data augmentation

## Word embedding

## BPE Tokenization

## Transformer