# Project

## Dependencies

In [1]:
import tensorflow as tf
from keras import layers
import copy
import numpy as np
import pickle
import matplotlib.pyplot as plt

## Loading data

In [2]:
fname = '../data/shaketext.txt'

# Decode explicitly instead of relying on the platform default encoding: the
# corpus contains non-ASCII characters (accented letters, typographic quotes),
# so a different default would change the decoded text and therefore K.
# NOTE(review): assumes the file is stored as UTF-8 - confirm for your copy.
with open(fname, "r", encoding="utf-8") as fid:
    data = fid.read()

# Vocabulary: every distinct character in the corpus. Sorting makes the
# char <-> index mapping deterministic across runs (set order is not).
unique_chars = list(set(data))
K = len(unique_chars)
unique_chars_sorted = sorted(unique_chars)

# Two lookup tables over the same sorted vocabulary, one per direction.
char_to_index = {char: index for index, char in enumerate(unique_chars_sorted)}
index_to_char = {index: char for index, char in enumerate(unique_chars_sorted)}

print("Total characters:", len(data))
print("Unique characters (K):", K)
print("Sample char to index mapping:", list(char_to_index.items())[:10])

Total characters: 5378661
Unique characters (K): 106
Sample char to index mapping: [('\t', 0), ('\n', 1), (' ', 2), ('!', 3), ('#', 4), ('$', 5), ('%', 6), ('&', 7), ("'", 8), ('(', 9)]


## Baseline RNN

In [3]:
seq_length = 25

# Encode every character of the corpus as its integer vocabulary id.
text_as_int = list(map(char_to_index.__getitem__, data))

# Stream the ids one at a time, then group them into consecutive,
# non-overlapping chunks of seq_length + 1 ids; the extra id lets each chunk
# yield an input and a target that are shifted by one position. The final
# short chunk, if any, is dropped.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element that follows
    input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

BATCH_SIZE = 32
BUFFER_SIZE = 10000

# Chunk -> (input, target) pairs, shuffled through a BUFFER_SIZE-element
# buffer, then grouped into full batches only (the last partial batch is
# dropped).
dataset = sequences.map(split_input_target)
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [4]:
# Peek at a single batch to sanity-check the shapes and the one-character
# shift between input and target. take(1) yields at most one batch, so the
# loop body runs at most once.
for x_batch, y_batch in dataset.take(1):
    print("Input batch shape:", x_batch.shape)
    print("Target batch shape:", y_batch.shape)
    # Decode the first row of each tensor back into readable text.
    first_input, first_target = (
        ''.join(map(index_to_char.__getitem__, row.numpy()))
        for row in (x_batch[0], y_batch[0])
    )
    print("Decoded input:", first_input)
    print("Decoded target:", first_target)

Input batch shape: (32, 25)
Target batch shape: (32, 25)
Decoded input: FIRST LORD.
[_Aside._] Ho
Decoded target: IRST LORD.
[_Aside._] How


In [None]:
rnn_units = 100
embedding_dim = rnn_units // 2

# Character ids -> embeddings -> one recurrent output per timestep
# (return_sequences=True) -> K unnormalised scores per timestep.
model = tf.keras.Sequential()
model.add(layers.Embedding(input_dim=K, output_dim=embedding_dim))
model.add(layers.SimpleRNN(rnn_units, return_sequences=True))
model.add(layers.Dense(K))

# The Dense head has no activation, so the loss is told to expect logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn)

def sample(model, start_string, generation_length=500, temperature=1.0,
           context_length=None):
    """Generate text from `model` by sampling one character at a time.

    Args:
        model: Callable mapping an int tensor of shape (1, T) to logits of
            shape (1, T, K), e.g. the compiled Keras model.
        start_string: Non-empty prompt; every character must be a key of
            `char_to_index`.
        generation_length: Number of characters to generate after the prompt.
        temperature: Positive divisor applied to the logits before sampling.
            Values below 1 sharpen the distribution, values above 1 flatten it.
        context_length: Optional cap on how many of the most recent characters
            are fed back to the model at each step. None (default) feeds the
            whole prompt plus everything generated so far.

    Returns:
        `start_string` followed by the generated characters.

    Raises:
        ValueError: If `start_string` is empty, `temperature` is not positive,
            or `context_length` is given but smaller than 1.
        KeyError: If `start_string` contains a character missing from
            `char_to_index`.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if context_length is not None and context_length < 1:
        raise ValueError("context_length must be at least 1 when given")

    token_ids = [char_to_index[s] for s in start_string]
    generated = []

    for _ in range(generation_length):
        # The model is called from scratch at every step and nothing here
        # carries recurrent state between calls, so feeding only the newest
        # character would condition each prediction on a single character.
        # Re-feed the accumulated history so the prompt and earlier output
        # still influence the next character.
        context = token_ids if context_length is None else token_ids[-context_length:]
        input_eval = tf.constant([context], dtype=tf.int32)

        # Only the last timestep predicts the *next* character: shape (1, K).
        logits = model(input_eval)[:, -1, :] / temperature
        predicted_id = int(tf.random.categorical(logits, num_samples=1)[0, 0].numpy())

        token_ids.append(predicted_id)
        generated.append(index_to_char[predicted_id])

    return start_string + ''.join(generated)

EPOCHS = 50

# Sample once from the untrained model as a qualitative reference point.
print('Generated text pre-training:')
print(sample(model, "ROMEO.", 300), end='\n\n')

# Train, then persist the learned weights next to the data.
history = model.fit(dataset, epochs=EPOCHS)
model.save_weights('../data/baseline_rnn.weights.h5')

# Same prompt and length as before, so the two samples are comparable.
print()
print('Generated text post-training:')
print(sample(model, "ROMEO.", 300))

Generated text pre-training:
ROMEO.)ç$oÇlyNpWQmæOS$èoLKj:—_…khëÀywa/H8ç!Y?LSBAQ6Fb™1’ëMy/h2AÆ2Çd]	AÆ…Ye 
cN1_p]KCœI	BQâDWB)CSNë!ÆM’&gy•L	é*“Vëa/’t&5Wk6oUhOwçdSêQ…É&4MMU?:“R/)ç]ÀÀaZmA…495[grK/kb‘Ye*-jÉajQYqih-UwzrV'BâgB;j2,XenîKw•îkbQL,1.UJ](—FâTMéJ…éZ“Tb‘8ëL&%gRJI8&œdéX_âNéJjr!][&Pàf!
Uxà/pœ™*îçèM_—MYv[!æ9M0ÇDF!R?wWJbT…i[/â5#38Ç™3’îL
Epoch 1/50
[1m6464/6464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 5ms/step - loss: 2.2729
Epoch 2/50
[1m6464/6464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 5ms/step - loss: 1.8236
Epoch 3/50
[1m6464/6464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 5ms/step - loss: 1.7668
Epoch 4/50
[1m6464/6464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 5ms/step - loss: 1.7408
Epoch 5/50
[1m6464/6464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 5ms/step - loss: 1.7250
Epoch 6/50
[1m6464/6464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 5ms/step - loss: 1.7156
Epoch 7/50
[1m6464/6464[0m [32m━━━━━━

## Implementing an LSTM

## Quantitative and Qualitative comparison between LSTM and RNN

## Optimizing performance of LSTM
* Hyperparameter tuning and different regularization methods
* Temperature and Nucleus sampling
* Data augmentation

## Word embedding

## BPE Tokenization

## Transformer