### Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import random
rng = np.random.default_rng()
from tqdm import tqdm
import copy

# Pick the compute backend in order of preference: CUDA, then Apple MPS,
# falling back to the CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f'Device = {device}')

### Data Preprocessing

In [None]:
class DataPreprocessing:
    """Load a text corpus and turn it into one-hot input/target arrays.

    Typical use is ``load_data()`` followed by ``preprocess()``; afterwards
    ``X_data`` and ``Y_data`` hold the one-hot inputs and the one-hot targets
    (the inputs shifted by one character), both of shape
    ``(K, num_seq * seq_length)``.

    Args:
        file_path (str): path of the text file to read
        seq_length (int): number of characters per training sequence
        batch_size (int): stored on the instance; not used by this class itself
    """

    def __init__(self, file_path, seq_length, batch_size):
        self.file_path = file_path
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.num_seq = None       # number of whole sequences (set by preprocess)
        self.data = None          # raw text (set by load_data)
        self.X_data = None        # one-hot inputs (set by preprocess)
        self.Y_data = None        # one-hot targets (set by preprocess)
        self.K = None             # vocabulary size (set by load_data)
        self.char_to_ind = None   # char -> index (set by load_data)
        self.ind_to_char = None   # index -> char (set by load_data)

    def load_data(self):
        """Read the corpus and build the character <-> index mappings.

        The vocabulary is sorted so that the mapping is identical on every
        run; iterating a ``set`` of strings directly gives an order that
        changes between interpreter sessions, which would make a saved model
        unusable with a freshly built mapping.
        """
        with open(self.file_path, "r", encoding="utf-8") as fid:
            book_data = fid.read()
        self.data = book_data

        unique_chars = sorted(set(book_data))
        self.K = len(unique_chars)
        self.char_to_ind = {char: ind for ind, char in enumerate(unique_chars)}
        self.ind_to_char = dict(enumerate(unique_chars))

    def get_one_hot_encoding(self, X_chars):
        """Encode text as a one-hot array.

        Args:
            X_chars (string): characters to encode

        Returns:
            np.ndarray: one-hot encoding of shape (K, len(X_chars))

        Raises:
            KeyError: if a character is not part of the loaded vocabulary
        """
        indices = np.array(
            [self.char_to_ind[char] for char in X_chars], dtype=np.intp
        )
        one_hot = np.zeros((self.K, indices.size))
        # Column i gets a single 1 in the row of the i-th character.
        one_hot[indices, np.arange(indices.size)] = 1
        return one_hot

    def get_decoded_one_hot(self, Y):
        """Decode a one-hot (or score) array back to text.

        Args:
            Y (np.ndarray): array of shape (K, T); the arg-max of every
                column selects the character

        Returns:
            string: the decoded text
        """
        return ''.join(
            self.ind_to_char[int(ind)] for ind in np.argmax(Y, axis=0)
        )

    def preprocess(self):
        """Build the input and target arrays from the loaded text.

        The text is cut into ``num_seq`` consecutive sequences of
        ``seq_length`` characters (the tail that does not fill a whole
        sequence is discarded). Targets are the inputs shifted one character
        to the right. ``X_data`` and ``Y_data`` are views into one shared
        encoding, so they should be treated as read-only.

        Raises:
            ValueError: if the text is too short to form a single sequence
        """
        encoded_data = self.get_one_hot_encoding(self.data)
        # -1 because the last input character still needs a target after it.
        num_sequences = (len(self.data) - 1) // self.seq_length
        if num_sequences < 1:
            raise ValueError(
                "text is too short to build a single sequence of length "
                f"{self.seq_length}"
            )
        self.num_seq = num_sequences

        used = num_sequences * self.seq_length
        self.X_data = encoded_data[:, :used]
        self.Y_data = encoded_data[:, 1:used + 1]

### Network architecture

In [None]:

class RNNModel(nn.Module):
    """Recurrent network: a stack of ``nn.RNN`` layers followed by a fully
    connected output layer. ``nn.RNN`` uses tanh as its default activation.

    Args:
        input_size (int): size of each input vector (K for one-hot text)
        hidden_size (int): neurons per recurrent layer
        num_layers (int): number of stacked recurrent layers
        output_size (int): size of each output (logit) vector
    """
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        # num_layers is forwarded so the argument actually takes effect.
        self.rnn = nn.RNN(
            input_size, hidden_size, num_layers=num_layers, batch_first=True
        )
        self.fc  = nn.Linear(hidden_size, output_size)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def forward(self, x, hprev):
        """Forward pass for the model
        Args:
            x (torch.Tensor): text (BATCH_SIZE, SEQ_LENGTH, K)
            hprev (torch.Tensor): previous hidden states
                (NUM_LAYERS, BATCH_SIZE, HIDDEN_SIZE)

        Returns:
            torch.Tensor, torch.Tensor: logits (BATCH_SIZE, SEQ_LENGTH,
            OUTPUT_SIZE) and updated hidden states
        """
        out, hnext = self.rnn(x, hprev)
        logits = self.fc(out)
        return logits, hnext

    def init_hidden(self, batch_size, hidden_size):
        """Creates a new hidden state of zeroes

        The tensor is placed on the same device (and uses the same dtype) as
        the model's parameters, so it can be fed to ``forward`` directly.

        Args:
            batch_size (int): batches
            hidden_size (int): neurons in the hidden layer

        Returns:
            torch.Tensor: a new hidden state
            (NUM_LAYERS, BATCH_SIZE, HIDDEN_SIZE)
        """
        reference = next(self.parameters())
        return torch.zeros(
            self.num_layers, batch_size, hidden_size,
            device=reference.device, dtype=reference.dtype,
        )


### Training and validation loop

In [None]:
def validation_loop(model, hprev, val_loader, device):
    """Measures the cross-entropy loss of a model on a separate dataset

    The model is switched to eval mode for the duration of the call and put
    back into whichever mode (train or eval) it was in on entry. No gradients
    are computed.

    Args:
        model (nn.Module): the trained model; called as model(inputs, hprev)
            and expected to return (logits, next hidden state)
        hprev (torch.Tensor): initial hidden states; carried over from batch
            to batch
        val_loader (dataloader): yields (inputs, one-hot targets) batches
        device (device): device the batches are moved to

    Returns:
        float, list: mean loss per batch, and the exponentially smoothed
        loss after every batch

    Raises:
        ZeroDivisionError: if val_loader yields no batches
    """
    was_training = model.training
    model.eval()
    criterion = torch.nn.CrossEntropyLoss()
    total_loss = 0.0
    smooth_loss = None
    val_loss = []

    try:
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                # One-hot targets -> class indices, shape (BATCH_SIZE, SEQ_LENGTH)
                targets_indices = torch.argmax(targets, dim=2)

                outputs, hprev = model(inputs, hprev)
                # CrossEntropyLoss wants the class dimension second:
                # (BATCH_SIZE, K, SEQ_LENGTH)
                preds = outputs.permute(0, 2, 1)
                batch_loss = criterion(preds, targets_indices).item()

                total_loss += batch_loss
                if smooth_loss is None:
                    smooth_loss = batch_loss
                else:
                    smooth_loss = 0.999 * smooth_loss + 0.001 * batch_loss
                val_loss.append(smooth_loss)
    finally:
        # Restore the caller's mode instead of forcing train mode.
        model.train(was_training)

    return total_loss / len(val_loader), val_loss

def train(model, train_params, train_loader, val_loader):
    """Trains the model with Adam and tracks the best validation epoch

    The hidden state is reset at the start of every epoch and carried (but
    detached from the graph) from batch to batch within an epoch. After each
    epoch the model is evaluated on val_loader, and a deep copy is kept
    whenever the mean validation loss improves.

    Args:
        model (nn.Module): model to train; must provide
            init_hidden(batch_size, hidden_size) and return
            (logits, next hidden state) when called
        train_params (dict): reads 'num_epochs', 'batch_size', 'hidden_size'
            and 'learning_rate'
        train_loader (dataloader): yields (inputs, one-hot targets) batches
        val_loader (dataloader): dataloader with validation data

    Returns:
        tuple: (best_model, train_loss, val_loss, val_loss_epoch,
        best_val_loss, best_epoch, best_update_step) where train_loss and
        val_loss are lists of smoothed per-batch losses and val_loss_epoch
        holds the mean validation loss of every epoch. best_model is None if
        no epoch ever improved on the initial infinite loss.
    """

    num_epochs = train_params['num_epochs']
    batch_size = train_params['batch_size']
    hidden_size = train_params['hidden_size']
    learning_rate = train_params['learning_rate']

    # Move batches to wherever the model lives rather than relying on a
    # module-level global.
    device = next(model.parameters()).device

    model.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    update_step = 0
    best_update_step = 0
    best_epoch = 0
    smooth_loss = None
    best_val_loss = float('inf')
    train_loss = []
    val_loss = []
    val_loss_epoch = []
    best_model = None

    for epoch in range(num_epochs):
        hprev = model.init_hidden(batch_size, hidden_size)

        for inputs, targets in tqdm(train_loader):
            inputs = inputs.to(device)
            targets = targets.to(device)

            # One-hot targets -> class indices, shape (BATCH_SIZE, SEQ_LENGTH)
            targets_indices = torch.argmax(targets, dim=2)
            # Truncate back-propagation at the batch boundary.
            hprev = hprev.detach()

            # FORWARD PASS
            optimizer.zero_grad()
            # Call the module (not .forward) so registered hooks run.
            outputs, hprev = model(inputs, hprev)  # outputs: (BATCH_SIZE, SEQ_LENGTH, K)
            preds = outputs.permute(0, 2, 1)  # (BATCH_SIZE, K, SEQ_LENGTH)
            loss = criterion(preds, targets_indices)

            # BACKWARD PASS
            loss.backward()

            # Update parameters with ADAM
            optimizer.step()

            batch_loss = loss.item()
            if smooth_loss is None:
                smooth_loss = batch_loss
            else:
                smooth_loss = 0.999 * smooth_loss + 0.001 * batch_loss
            train_loss.append(smooth_loss)

            update_step += 1

        # Validation loss
        validation_loss_mean, val_losses = validation_loop(model, hprev, val_loader, device)
        val_loss.extend(val_losses)
        val_loss_epoch.append(validation_loss_mean)

        # Save best model
        if validation_loss_mean < best_val_loss:
            best_epoch = epoch
            best_val_loss = validation_loss_mean
            best_update_step = update_step
            best_model = copy.deepcopy(model)

    print(f'---Training complete---\nBest model had validation smooth loss: {best_val_loss}, at epoch: {best_epoch}, best update step {best_update_step}')
    return best_model, train_loss, val_loss, val_loss_epoch, best_val_loss, best_epoch, best_update_step

### Run pipeline

In [None]:
train_params = dict(
    file_path='./shakes.txt',
    seq_length=25,
    batch_size=128,
    hidden_size=256,
    num_layers=1,
    num_epochs=25,
    learning_rate=0.001,
)

# Read the corpus and build the one-hot input/target arrays.
DP = DataPreprocessing(
    train_params['file_path'],
    train_params['seq_length'],
    train_params['batch_size'],
)
DP.load_data()
DP.preprocess()

# (K, num_seq * seq_length) -> transpose -> (num_seq, seq_length, K)
X_data = DP.X_data.T.reshape(DP.num_seq, train_params['seq_length'], DP.K)
Y_data = DP.Y_data.T.reshape(DP.num_seq, train_params['seq_length'], DP.K)

# 70% train, 15% val, 15% test: first carve off the test part, then take
# 17.6% of the remaining 85% as validation. Order is kept (no shuffling).
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data, Y_data, test_size=0.15, shuffle=False)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.176, shuffle=False)

# NumPy float64 arrays -> float32 tensors
X_train_t, Y_train_t = (torch.from_numpy(arr).float() for arr in (X_train, Y_train))
X_val_t, Y_val_t = (torch.from_numpy(arr).float() for arr in (X_val, Y_val))
X_test_t, Y_test_t = (torch.from_numpy(arr).float() for arr in (X_test, Y_test))

train_ds, val_ds, test_ds = (
    TensorDataset(inputs, targets)
    for inputs, targets in (
        (X_train_t, Y_train_t),
        (X_val_t, Y_val_t),
        (X_test_t, Y_test_t),
    )
)

# Sequential batches of full size only (incomplete last batch is dropped).
train_loader, val_loader, test_loader = (
    DataLoader(
        dataset,
        batch_size=train_params['batch_size'],
        shuffle=False,
        drop_last=True,
    )
    for dataset in (train_ds, val_ds, test_ds)
)

model = RNNModel(
    input_size=DP.K,
    hidden_size=train_params['hidden_size'],
    num_layers=train_params['num_layers'],
    output_size=DP.K,
).to(device)

(best_model, train_loss, val_loss, val_loss_epoch,
 best_smooth_loss, best_epoch, best_update_step) = train(
    model, train_params, train_loader, val_loader)
# Continue with the snapshot that scored best on the validation set.
model = best_model

### Hyperparameter search

In [None]:
train_params = {
    'file_path': './shakes.txt',
    'seq_length': 25,
    'batch_size': 128,
    'hidden_size': 256,
    'num_layers': 1,
    'num_epochs': 30,
    'learning_rate': 0.001
}

# Coarse random search: learning rate is drawn log-uniformly from
# [10**lr_min, 10**lr_max]; batch size and hidden size are picked at random
# from the choices below.
batch_choices = [64, 128]
hidden_choices = [150, 250, 500]
lr_min = -4
lr_max = -2

# The corpus, its encoding and the train/val/test split do not depend on the
# searched hyperparameters, so they are prepared once instead of once per trial.
DP = DataPreprocessing(train_params['file_path'], train_params['seq_length'], train_params['batch_size'])
DP.load_data()
DP.preprocess()

X_data = DP.X_data.T # shape (num_seq * seq_length, K)
Y_data = DP.Y_data.T # shape (num_seq * seq_length, K)
X_data = X_data.reshape(DP.num_seq, train_params['seq_length'], DP.K) # shape (num_seq, seq_length, K)
Y_data = Y_data.reshape(DP.num_seq, train_params['seq_length'], DP.K) # shape (num_seq, seq_length, K)

# 70% train, 15% val, 15% test
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data, Y_data, test_size=0.15, shuffle=False)

X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.176, shuffle=False)

X_train_t = torch.from_numpy(X_train).float()
Y_train_t = torch.from_numpy(Y_train).float()

X_val_t   = torch.from_numpy(X_val).float()
Y_val_t   = torch.from_numpy(Y_val).float()

X_test_t  = torch.from_numpy(X_test).float()
Y_test_t  = torch.from_numpy(Y_test).float()

train_ds = TensorDataset(X_train_t, Y_train_t)
val_ds   = TensorDataset(X_val_t,   Y_val_t)
test_ds  = TensorDataset(X_test_t,  Y_test_t)

for i in range(10):
    lr = lr_min + (lr_max - lr_min) * rng.random()
    lr = 10**lr
    train_params['learning_rate'] = lr
    train_params['batch_size'] = random.choice(batch_choices)
    train_params['hidden_size'] = random.choice(hidden_choices)
    # Keep the preprocessing object in sync with the batch size of this trial.
    DP.batch_size = train_params['batch_size']

    # Only the loaders (batch size) and the model (hidden size) change per trial.
    train_loader = DataLoader(
        train_ds,
        batch_size=train_params['batch_size'],
        shuffle=False,
        drop_last=True
    )

    val_loader = DataLoader(
        val_ds,
        batch_size=train_params['batch_size'],
        shuffle=False,
        drop_last=True
    )

    test_loader = DataLoader(
        test_ds,
        batch_size=train_params['batch_size'],
        shuffle=False,
        drop_last=True
    )

    model = RNNModel(
        input_size=DP.K,
        hidden_size=train_params['hidden_size'],
        num_layers=train_params['num_layers'],
        output_size=DP.K,
    ).to(device)

    best_model, train_loss, val_loss, val_loss_epoch, best_smooth_loss, best_epoch, best_update_step = train(model, train_params, train_loader, val_loader)
    print(f"Model completed - lr={train_params['learning_rate']}, batch_size={train_params['batch_size']}, hidden_size={train_params['hidden_size']}")

In [None]:
train_params = {
    'file_path': './shakes.txt',
    'seq_length': 25,
    'batch_size': 128,
    'hidden_size': 256,
    'num_layers': 1,
    'num_epochs': 20,
    'learning_rate': 0.001
}

# Fine random search: learning rate is drawn log-uniformly from a window of
# +/- 0.1 decades around the best value of the coarse search; batch size and
# hidden size are fixed (single-element choice lists).
batch_choices = [64]
hidden_choices = [150]
best_coarse_lr = 0.003059284171933464
lr_best_coarse = np.log10(best_coarse_lr)
lr_min = lr_best_coarse - 0.1
lr_max = lr_best_coarse + 0.1

# The corpus, its encoding and the train/val/test split do not depend on the
# searched hyperparameters, so they are prepared once instead of once per trial.
DP = DataPreprocessing(train_params['file_path'], train_params['seq_length'], train_params['batch_size'])
DP.load_data()
DP.preprocess()

X_data = DP.X_data.T # shape (num_seq * seq_length, K)
Y_data = DP.Y_data.T # shape (num_seq * seq_length, K)
X_data = X_data.reshape(DP.num_seq, train_params['seq_length'], DP.K) # shape (num_seq, seq_length, K)
Y_data = Y_data.reshape(DP.num_seq, train_params['seq_length'], DP.K) # shape (num_seq, seq_length, K)

# 70% train, 15% val, 15% test
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data, Y_data, test_size=0.15, shuffle=False)

X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.176, shuffle=False)

X_train_t = torch.from_numpy(X_train).float()
Y_train_t = torch.from_numpy(Y_train).float()

X_val_t   = torch.from_numpy(X_val).float()
Y_val_t   = torch.from_numpy(Y_val).float()

X_test_t  = torch.from_numpy(X_test).float()
Y_test_t  = torch.from_numpy(Y_test).float()

train_ds = TensorDataset(X_train_t, Y_train_t)
val_ds   = TensorDataset(X_val_t,   Y_val_t)
test_ds  = TensorDataset(X_test_t,  Y_test_t)

for i in range(10):
    lr = lr_min + (lr_max - lr_min) * rng.random()
    lr = 10**lr
    train_params['learning_rate'] = lr
    train_params['batch_size'] = random.choice(batch_choices)
    train_params['hidden_size'] = random.choice(hidden_choices)
    # Keep the preprocessing object in sync with the batch size of this trial.
    DP.batch_size = train_params['batch_size']

    # Only the loaders (batch size) and the model (hidden size) change per trial.
    train_loader = DataLoader(
        train_ds,
        batch_size=train_params['batch_size'],
        shuffle=False,
        drop_last=True
    )

    val_loader = DataLoader(
        val_ds,
        batch_size=train_params['batch_size'],
        shuffle=False,
        drop_last=True
    )

    test_loader = DataLoader(
        test_ds,
        batch_size=train_params['batch_size'],
        shuffle=False,
        drop_last=True
    )

    model = RNNModel(
        input_size=DP.K,
        hidden_size=train_params['hidden_size'],
        num_layers=train_params['num_layers'],
        output_size=DP.K,
    ).to(device)

    best_model, train_loss, val_loss, val_loss_epoch, best_smooth_loss, best_epoch, best_update_step = train(model, train_params, train_loader, val_loader)
    print(f"Model completed - lr={train_params['learning_rate']}, batch_size={train_params['batch_size']}, hidden_size={train_params['hidden_size']}")

### Loss graph

In [None]:
# Plot the smoothed training loss, one point per update step.
# The file name is only used by the commented-out savefig call below.
loss_fig_name = f'loss_fig_loss_{best_smooth_loss}_epoch_{best_epoch}_update_step_{best_update_step}.png'

plt.figure(figsize=(12, 4))
# Left cell of a 1x2 grid; the right cell is left empty.
plt.subplot(1, 2, 1)
plt.plot(train_loss, label='Smooth Training Loss')
plt.xlabel('update steps')
plt.ylabel('loss')
plt.legend()
# Uncomment to write the figure to disk:
# plt.savefig(loss_fig_name, bbox_inches='tight', facecolor='none', pad_inches=0.1)
plt.show()

# Plot the smoothed validation loss, one point per validation batch.
# Prints the last smoothed validation loss; raises IndexError if val_loss is empty.
print(val_loss[-1])
# The file name is only used by the commented-out savefig call below.
val_loss_fig_name = f'val_loss_fig_loss_{best_smooth_loss}_epoch_{best_epoch}_update_step_{best_update_step}.png'
plt.figure(figsize=(12, 4))
# Left cell of a 1x2 grid; the right cell is left empty.
plt.subplot(1, 2, 1)
plt.plot(val_loss, label='Smooth Validation Loss', color='orange')
# NOTE(review): the x axis counts validation batches, not optimizer updates,
# although it is labelled 'update steps' - confirm the intended label.
plt.xlabel('update steps')
plt.ylabel('loss')
plt.legend()
# Uncomment to write the figure to disk:
# plt.savefig(val_loss_fig_name, bbox_inches='tight', facecolor='none', pad_inches=0.1)
plt.show()

### Text synthesis and evaluation metrics

In [None]:
def nucleus_sampling(prob, threshold = 0.9):
    """Sample an index from the smallest set of most probable entries whose
    cumulative probability reaches the threshold (nucleus / top-p sampling)

    Args:
        prob: iterable of probabilities (list, NumPy array or 1-D tensor);
            every element is converted with float()
        threshold: cumulative probability at which the candidate set is cut
            off. If the total mass never reaches it (for example through
            rounding), all entries stay candidates.
    Return:
        the sampled index (int), referring to a position in prob
    Raises:
        ValueError: if prob is empty
    """
    # Plain floats keep the arithmetic cheap and independent of the input type.
    probs = [float(p) for p in prob]
    if not probs:
        raise ValueError("nucleus_sampling requires at least one probability")

    # 1 rank (index, probability) pairs from most to least probable
    ranked = sorted(enumerate(probs), key=lambda pair: pair[1], reverse=True)

    # 2 find the cut off point; default keeps every entry
    cumulative = 0.0
    cut_off_point = len(ranked)
    for i, (_, p) in enumerate(ranked):
        cumulative += p
        if cumulative >= threshold:
            cut_off_point = i + 1
            break

    # 3 keep the entries before the cut off point
    candidates = ranked[:cut_off_point]

    # 4 + 5 sample proportionally to the kept probabilities; scaling the
    # random draw by the kept mass is equivalent to renormalising them
    total = sum(p for _, p in candidates)
    r = random.random() * total
    cum = 0.0
    for idx, p in candidates:
        cum += p
        if r < cum:
            return idx

    # Only reachable through floating point rounding (or an all-zero input):
    # fall back to the last candidate instead of returning None.
    return candidates[-1][0]

def generate_text(model, hprev, DP, text_len, input_text, nucleus_sample=False, temperature=0.5):
    """Synthesises text_len characters that continue input_text.

    The whole prompt is fed through the model once; the logits of its last
    position give the distribution of the first generated character. Every
    sampled character is then fed back as the next (length-1) input.
    Generation runs without gradient tracking, with the model in eval mode;
    the model's previous train/eval mode is restored before returning.

    Args:
        model (nn.Module): Model from which to generate text
        hprev (torch.Tensor): previous hidden states for a batch of one;
            its device is used for all tensors created here
        DP (DataPreprocessing Class): provides get_one_hot_encoding,
            ind_to_char and K
        text_len (int): number of characters to generate
        input_text (string): start string of synthesis (must not be empty)
        nucleus_sample (bool, optional): sample with nucleus_sampling instead
            of from the full distribution. Defaults to False.
        temperature (float, optional): logits are divided by this value
            before the softmax. Defaults to 0.5.

    Returns:
        string: input_text followed by the synthesized characters

    Raises:
        ValueError: if input_text is empty
    """
    if not input_text:
        raise ValueError("input_text must contain at least one character")

    device = hprev.device

    # (K, PROMPT_LENGTH) -> (PROMPT_LENGTH, K) -> (1, PROMPT_LENGTH, K)
    prompt = DP.get_one_hot_encoding(input_text).T[np.newaxis, ...]
    input_indicies = torch.tensor(prompt, dtype=torch.float, device=device)
    generated = [input_text]

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(text_len):
                predictions, hprev = model(input_indicies, hprev)  # (1, SEQ_LENGTH, K)
                # Use the last time step: for the prompt this is the
                # prediction after its final character, afterwards the input
                # has length 1 so it is the only position.
                logits = predictions[0, -1, :] / temperature  # (K)
                probabilities = torch.softmax(logits, dim=-1)

                if nucleus_sample:
                    prediction_id = nucleus_sampling(probabilities)
                else:
                    prediction_id = torch.multinomial(probabilities, num_samples=1).item()
                generated.append(DP.ind_to_char[prediction_id])

                # The sampled character becomes the next one-hot input.
                input_indicies = torch.zeros(1, 1, DP.K, device=device)
                input_indicies[0, 0, prediction_id] = 1.0
    finally:
        model.train(was_training)

    return ''.join(generated)

In [None]:
# Sample 200 characters from a fresh hidden state for a single sequence.
# The hidden size is read from the model itself so it always matches the
# network being used, whatever train_params currently holds.
hprev = model.init_hidden(batch_size=1, hidden_size=model.hidden_size)
text = generate_text(model, hprev, DP, 200, "ROMEO: ", nucleus_sample=False, temperature=0.5)
print(text)

# Evaluate on the held-out test split; the hidden state needs one row per
# sequence in a test batch.
hprev = model.init_hidden(batch_size=test_loader.batch_size, hidden_size=model.hidden_size)

mean_test_loss, test_losses = validation_loop(model, hprev, test_loader, device)
# The reported value is the mean cross-entropy loss, not an accuracy.
print(f'\nFinal mean test loss = {mean_test_loss}')