In [43]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt

In [44]:
with open('./Data/shakespeare.txt', 'r', encoding='utf8') as f:
    text = f.read()

In [45]:
print(text[:500])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bu


In [46]:
len(text)

5445609

In [47]:
all_chars = set(text)
len(all_chars)

84

In [48]:
decoder = dict(enumerate(all_chars))

In [49]:
encoder = {char: idx for idx, char in decoder.items()}

In [50]:
encoded_text = np.array([encoder[char] for char in text])

In [51]:
encoded_text[:100]

array([21, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
       32, 32, 32, 32, 32, 73, 21, 32, 32,  0, 55, 82, 42, 32, 23, 35,  5,
       55, 17, 56, 18, 32, 63, 55, 17, 35, 18, 20, 55, 17, 56, 32, 68, 17,
       32, 58, 17, 56,  5, 55, 17, 32,  5, 10, 63, 55, 17, 35, 56, 17, 59,
       21, 32, 32, 30, 14, 35, 18, 32, 18, 14, 17, 55, 17, 16, 41, 32, 16,
       17, 35, 20, 18, 41, 45, 56, 32, 55, 82, 56, 17, 32, 42,  5])

In [52]:
decoder[0]

'F'

In [53]:
def one_hot_encoder(encoded_text, num_uni_chars):
    one_hot = np.zeros((encoded_text.size, num_uni_chars))
    one_hot = one_hot.astype(np.float32)
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.0
    one_hot = one_hot.reshape((*encoded_text.shape, num_uni_chars))
    return one_hot

In [54]:
arr = np.array([1,2,0])
one_hot_encoder(arr, 3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [55]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):
    
    '''
    Generate (using yield) batches for training.
    
    X: Encoded Text of length seq_len
    Y: Encoded Text shifted by one
    
    Example:
    
    X:
    
    [[1 2 3]]
    
    Y:
    
    [[ 2 3 4]]
    
    encoded_text : Complete Encoded Text to make batches from
    batch_size : Number of samples per batch
    seq_len : Length of character sequence
       
    '''
    
    # Total number of characters per batch
    # Example: If samp_per_batch is 2 and seq_len is 50, then 100
    # characters come out per batch.
    char_per_batch = samp_per_batch * seq_len
    
    
    # Number of batches available to make
    # Use int() to roun to nearest integer
    num_batches_avail = int(len(encoded_text)/char_per_batch)
    
    # Cut off end of encoded_text that
    # won't fit evenly into a batch
    encoded_text = encoded_text[:num_batches_avail * char_per_batch]
    
    
    # Reshape text into rows the size of a batch
    encoded_text = encoded_text.reshape((samp_per_batch, -1))
    

    # Go through each row in array.
    for n in range(0, encoded_text.shape[1], seq_len):
        
        # Grab feature characters
        x = encoded_text[:, n:n+seq_len]
        
        # y is the target shifted over by 1
        y = np.zeros_like(x)
       
        #
        try:
            y[:, :-1] = x[:, 1:]
            y[:, -1]  = encoded_text[:, n+seq_len]
            
        # FOR POTENTIAL INDEXING ERROR AT THE END    
        except:
            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, 0]
            
        yield x, y
            

In [56]:
sample_text = encoded_text[:20]
sample_text

array([21, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
       32, 32, 32])

In [57]:
batch_generator = generate_batches(sample_text,samp_per_batch=2,seq_len=5)


In [58]:
x, y = next(batch_generator)


In [59]:
x

array([[21, 32, 32, 32, 32],
       [32, 32, 32, 32, 32]])

In [60]:
y

array([[32, 32, 32, 32, 32],
       [32, 32, 32, 32, 32]])

In [61]:
class CharModel(nn.Module):
    def __init__(self, all_chars, num_hidden=256, num_layers=4, drop=0.5) -> None:
        super().__init__()
        self.drop = drop
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        
        self.all_chars = all_chars
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char: idx for idx, char in decoder.items()}
        
        self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=drop, batch_first=True)
        
        self.dropout = nn.Dropout(drop)
        
        self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))
        
    def forward(self, x, hidden):
        lstm_output, hidden = self.lstm(x, hidden)
        drop_output = self.dropout(lstm_output)
        drop_output = drop_output.contiguous().view(-1, self.num_hidden)
        final_out = self.fc_linear(drop_output)
        return final_out, hidden
    
    def hidden_state(self, batch_size):
        hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden),
                  torch.zeros(self.num_layers,batch_size,self.num_hidden))
        
        return hidden

In [62]:
model = CharModel(all_chars=all_chars,
                  num_hidden=512,
                  num_layers=3,
                  drop=0.5)

In [63]:
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

In [64]:
train_percent = 0.1
train_ind = int(len(encoded_text) * (train_percent))
train_data = encoded_text[:train_ind]
val_data = encoded_text[train_ind:]

In [65]:
## VARIABLES

# Epochs to train for
epochs = 5
# batch size 
batch_size = 128

# Length of sequence
seq_len = 100

# for printing report purposes
# always start at 0
tracker = 0

# number of characters in text
num_char = max(encoded_text)+1

In [66]:
# Set model to train
model.train()

for i in range(epochs):
    
    hidden = model.hidden_state(batch_size)
    
    
    for x,y in generate_batches(train_data,batch_size,seq_len):
        
        tracker += 1
        
        # One Hot Encode incoming data
        x = one_hot_encoder(x,num_char)
        
        # Convert Numpy Arrays to Tensor
        
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)
            
        # Reset Hidden State
        # If we dont' reset we would backpropagate through all training history
        hidden = tuple([state.data for state in hidden])
        
        model.zero_grad()
        
        lstm_output, hidden = model.forward(inputs,hidden)
        loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
        loss.backward()
        
        # POSSIBLE EXPLODING GRADIENT PROBLEM!
        # LET"S CLIP JUST IN CASE
        nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)
        
        optimizer.step()
        
        
        
        ###################################
        ### CHECK ON VALIDATION SET ######
        #################################
        
        if tracker % 25 == 0:
            
            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()
            
            for x,y in generate_batches(val_data,batch_size,seq_len):
                
                # One Hot Encode incoming data
                x = one_hot_encoder(x,num_char)
                

                # Convert Numpy Arrays to Tensor

                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)
                    
                # Reset Hidden State
                # If we dont' reset we would backpropagate through 
                # all training history
                val_hidden = tuple([state.data for state in val_hidden])
                
                lstm_output, val_hidden = model.forward(inputs,val_hidden)
                val_loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
                val_losses.append(val_loss.item())
            
            # Reset to training model after val for loop
            model.train()
            
            print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")

Epoch: 0 Step: 25 Val Loss: 3.2369189262390137
Epoch: 1 Step: 50 Val Loss: 3.2344982624053955
Epoch: 1 Step: 75 Val Loss: 3.231386184692383
