<a href="https://colab.research.google.com/github/EnriqueProjectsIA/Pytorch-teaching/blob/main/LSTM_for_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
from matplotlib import pyplot as plt

In [5]:
with open('/content/drive/MyDrive/Data_sets/sometext.txt','r', encoding='utf8') as handle:
  text = handle.read()

We now can review the text

In [6]:
text[:1000]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud buriest thy content,\n  And tender churl mak'st waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the world's due, by the grave and thee.\n\n\n                     2\n  When forty winters shall besiege thy brow,\n  And dig deep trenches in thy beauty's field,\n  Thy youth's proud livery so gazed on now,\n  Will be a tattered weed of small worth held:  \n  Then being asked, where all thy beauty lies,\n  Where all the treasure of thy lusty days;\n  To sa

In [7]:
len(text)

5445609

In this approach, we will go character level. That means we hot encode characters. 

In [8]:
allCarc = set(text)

We build and encoder an a decoder:

encoder: Letter->Number

decoder: Number->Letter

In [9]:
decoder = dict(enumerate(allCarc))
encoder = {char:ind for ind,char in decoder.items()}

In [10]:
decoder[0]

'M'

In [11]:
encoder['w']

39

Now, we can encode the text

In [12]:
encodeText = np.array([encoder[char] for char in text])

In [13]:
encodeText[:500]

array([59, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47,
       47, 47, 47, 47, 47, 72, 59, 47, 47, 68, 12, 67, 31, 47, 80, 27, 52,
       12, 60,  4,  8, 47, 58, 12, 60, 27,  8, 70, 12, 60,  4, 47, 39, 60,
       47, 30, 60,  4, 52, 12, 60, 47, 52,  6, 58, 12, 60, 27,  4, 60, 21,
       59, 47, 47, 75, 50, 27,  8, 47,  8, 50, 60, 12, 60, 17, 20, 47, 17,
       60, 27, 70,  8, 20,  7,  4, 47, 12, 67,  4, 60, 47, 31, 52, 40, 50,
        8, 47,  6, 60, 61, 60, 12, 47, 30, 52, 60, 21, 59, 47, 47, 57, 70,
        8, 47, 27,  4, 47,  8, 50, 60, 47, 12, 52, 18, 60, 12, 47,  4, 50,
       67, 70, 53, 30, 47, 17, 20, 47,  8, 52, 31, 60, 47, 30, 60, 58, 60,
       27,  4, 60, 21, 59, 47, 47, 65, 52,  4, 47,  8, 60,  6, 30, 60, 12,
       47, 50, 60, 52, 12, 47, 31, 52, 40, 50,  8, 47, 17, 60, 27, 12, 47,
       50, 52,  4, 47, 31, 60, 31, 67, 12, 20, 37, 59, 47, 47, 57, 70,  8,
       47,  8, 50, 67, 70, 47, 58, 67,  6,  8, 12, 27, 58,  8, 60, 30, 47,
        8, 67, 47,  8, 50

In [14]:
decoder[29]

'8'

From the array, one can create one hot encode of the text

In [15]:
def one_hot_encoder(encoded_text, num_uni_chars):
    '''
    encoded_text : batch of encoded text
    
    num_uni_chars = number of unique characters (len(set(text)))
    '''
    
    # METHOD FROM:
    # https://stackoverflow.com/questions/29831489/convert-encoded_textay-of-indices-to-1-hot-encoded-numpy-encoded_textay
      
    # Create a placeholder for zeros.
    one_hot = np.zeros((encoded_text.size, num_uni_chars))
    
    # Convert data type for later use with pytorch (errors if we dont!)
    one_hot = one_hot.astype(np.float32)

    # Using fancy indexing fill in the 1s at the correct index locations
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.0
    

    # Reshape it so it matches the batch sahe
    one_hot = one_hot.reshape((*encoded_text.shape, num_uni_chars))
    
    return one_hot

To check the function

In [16]:
foo = np.array([1,2,0])
one_hot_encoder(foo,3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

## Create Batch

In [17]:
def generate_batches(encoded_text, samples_per_batch = 10, sequence_len = 50):
  #X: encoded text of length = sequence_len
  #Y: encoded text shifted by one

  #How many chars per batch?
  char_per_batch = samples_per_batch*sequence_len

  #How many batches can we make, given the length of the encoded text?
  num_batches_avail = int(len(encoded_text)/char_per_batch)

  # Eliminate the section of the text that won't fit evenly into a batch
  encoded_text = encoded_text[:num_batches_avail*char_per_batch]

  encoded_text = encoded_text.reshape((samples_per_batch,-1))

  for n in range(0, encoded_text.shape[1], sequence_len):
    x = encoded_text[:,n:n+sequence_len]

    y = np.zeros_like(x)

    try:
      y[:,:-1] = x[:,1:]
      y[:,-1] = encoded_text[:,n+sequence_len]
    except:
      y[:,:-1] = x[:,1:]
      y[:,-1] = encoded_text[:,0]

    yield x,y

In [18]:
sampleText = np.arange(20)
batchGenerator = generate_batches(sampleText, 2,5)

In [19]:
x,y = next(batchGenerator)

In [20]:
x

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14]])

In [21]:
y

array([[ 1,  2,  3,  4,  5],
       [11, 12, 13, 14, 15]])

## LSTM implementation

In [22]:
class CharModel(nn.Module):
  def __init__(self,all_chars,num_hidden = 256, num_layers = 4,drop_out=0.5, use_gpu=False):

    super().__init__()
    self.drop_prob = drop_out
    self.num_layers = num_layers
    self.num_hidden = num_hidden
    self.use_gpu = use_gpu

    self.all_chars = all_chars
    self.decoder = dict(enumerate(all_chars))
    self.encoder = {char:ind for ind,char in decoder.items()}

    self.lstm = nn.LSTM(len(self.all_chars),num_hidden,num_layers, dropout = drop_out, batch_first=True)
    self.dropout = nn.Dropout(drop_out)
    self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))

  def forward(self,x,hidden):

    lstm_output, hidden = self.lstm(x,hidden)

    drop_output = self.dropout(lstm_output)

    drop_output =drop_output.contiguous().view(-1, self.num_hidden)

    final_out = self.fc_linear(drop_output)

    return final_out, hidden


  def hidden_state(self,batch_size):
    if self.use_gpu:
      hidden = (torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda(),
                torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda())
    else:
      hidden = (torch.zeros(self.num_layers, batch_size, self.num_hidden),
                torch.zeros(self.num_layers, batch_size, self.num_hidden))
    return hidden

In [23]:
model = CharModel(all_chars=allCarc,
                  num_hidden=512,
                  num_layers=3,
                  drop_out=0.5,
                  use_gpu = True)

In [24]:
total_param = [int(p.numel()) for p in model.parameters()]
total_param

[172032,
 1048576,
 2048,
 2048,
 1048576,
 1048576,
 2048,
 2048,
 1048576,
 1048576,
 2048,
 2048,
 43008,
 84]

* As a rule of thumb, one wants to have as many parameters as characters in the dataset. Because, if we have too many parameters it is possible to overfit the data.

In [25]:
sum(total_param)

5470292

In [26]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
criterion = nn.CrossEntropyLoss()

In [27]:
train_pecent = 0.9
train_ind = int(len(encodeText)*train_pecent)
train_data = encodeText[:train_ind]
val_data = encodeText[train_ind:]
len(train_data)

4901048

In [28]:
len(val_data)

544561

In [29]:
epochs = 13
batch_size = 100

seq_len = 100

tracker = 0

num_char = max(encodeText)+1

# Set model to train
model.train()


# Check to see if using GPU
if model.use_gpu:
    model.cuda()

for i in range(epochs):
    
    hidden = model.hidden_state(batch_size)
    
    
    for x,y in generate_batches(train_data,batch_size,seq_len):
        
        tracker += 1
        
        # One Hot Encode incoming data
        x = one_hot_encoder(x,num_char)
        
        # Convert Numpy Arrays to Tensor
        
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)
        
        # Adjust for GPU if necessary
        
        if model.use_gpu:
            
            inputs = inputs.cuda()
            targets = targets.cuda()
            
        # Reset Hidden State
        # If we dont' reset we would backpropagate through all training history
        hidden = tuple([state.data for state in hidden])
        
        model.zero_grad()
        
        lstm_output, hidden = model.forward(inputs,hidden)
        loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
        loss.backward()
        
        # POSSIBLE EXPLODING GRADIENT PROBLEM!
        # LET"S CLIP JUST IN CASE
        nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)
        
        optimizer.step()
        
        
        
        ###################################
        ### CHECK ON VALIDATION SET ######
        #################################
        
        if tracker % 25 == 0:
            
            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()
            
            for x,y in generate_batches(val_data,batch_size,seq_len):
                
                # One Hot Encode incoming data
                x = one_hot_encoder(x,num_char)
                

                # Convert Numpy Arrays to Tensor

                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)

                # Adjust for GPU if necessary

                if model.use_gpu:

                    inputs = inputs.cuda()
                    targets = targets.cuda()
                    
                # Reset Hidden State
                # If we dont' reset we would backpropagate through 
                # all training history
                val_hidden = tuple([state.data for state in val_hidden])
                
                lstm_output, val_hidden = model.forward(inputs,val_hidden)
                val_loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
                val_losses.append(val_loss.item())
            
            # Reset to training model after val for loop
            model.train()
            
            print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")

Epoch: 0 Step: 25 Val Loss: 3.20159912109375
Epoch: 0 Step: 50 Val Loss: 3.1937050819396973
Epoch: 0 Step: 75 Val Loss: 6.58789587020874
Epoch: 0 Step: 100 Val Loss: 3.1885204315185547
Epoch: 0 Step: 125 Val Loss: 3.08390474319458
Epoch: 0 Step: 150 Val Loss: 2.9729442596435547
Epoch: 0 Step: 175 Val Loss: 2.856128215789795
Epoch: 0 Step: 200 Val Loss: 2.716707706451416
Epoch: 0 Step: 225 Val Loss: 2.632988929748535
Epoch: 0 Step: 250 Val Loss: 2.531026840209961
Epoch: 0 Step: 275 Val Loss: 2.406292676925659
Epoch: 0 Step: 300 Val Loss: 2.30350399017334
Epoch: 0 Step: 325 Val Loss: 2.2238659858703613
Epoch: 0 Step: 350 Val Loss: 2.1682865619659424
Epoch: 0 Step: 375 Val Loss: 2.116222381591797
Epoch: 0 Step: 400 Val Loss: 2.0757057666778564
Epoch: 0 Step: 425 Val Loss: 2.0295915603637695
Epoch: 0 Step: 450 Val Loss: 2.010106325149536
Epoch: 0 Step: 475 Val Loss: 1.9710310697555542
Epoch: 1 Step: 500 Val Loss: 1.9496560096740723
Epoch: 1 Step: 525 Val Loss: 1.92460036277771
Epoch: 1 Ste

In [30]:
# model_name = 'hidden512_layers3_sha.net'
# torch.save(model.state_dict(),'/content/drive/MyDrive/Data_sets/Models/'+model_name)

In [31]:
#model.load_state_dict(torch.load('/content/drive/MyDrive/Data_sets/Models/hidden512_layers3_sha.net'))

 # Now we will predict new characters

In [32]:
def predict_next_char(model, char, hidden = None, k = 1):
  encoded_text = model.encoder[char]
  encoded_text = np.array([[encoded_text]])
  encoded_text = one_hot_encoder(encoded_text, len(model.all_chars))

  inputs = torch.from_numpy(encoded_text)

  if model.use_gpu:
    inputs = inputs.cuda()
  
  hidden = tuple([state.data for state in hidden])

  lstm_out, hidden = model(inputs,hidden)

  probs = F.softmax(lstm_out, dim = 1).data

  if model.use_gpu:

    probs = probs.cpu()

  probs, index_positions = probs.topk(k)

  index_positions = index_positions.numpy().squeeze()

  probs = probs.numpy().flatten()

  probs = probs/probs.sum()

  char = np.random.choice(index_positions, p = probs)

  return model.decoder[char], hidden

In [33]:
def generate_text(model,size, seed = 'The', k=1):

  if model.use_gpu:
    model.cuda()
  else:
    model.cpu()

  output_chars = [c for c in seed]

  hidden = model.hidden_state(1)

  for char in seed:

    char,hidden = predict_next_char(model, char, hidden, k = k)
  output_chars.append(char)

  for i in range(size):

    char,hidden = predict_next_char(model, output_chars[-1], hidden, k=k)

    output_chars.append(char)

  return ''.join(output_chars)

In [34]:
print(generate_text(model, 1000, seed = 'The', k = 3))

Thething offended
                               and a stople of the consent.
                                           Exeint all but AUFIDIUS  
    What a good fool will be too think is the trimms that that this
    hatch in him to tell me. If they see, that there is thieves and
    thruch here. What may I see the sea of his stock and this thought
    here of that time? We see all my heart and string off them to the
    thing.
  PRONPERO. Why, I do not the word to hard; that in his soul,
    And we should be to make me with him. If there be
    this that I have saint me. Is he? And I warrant you to the
    sons?
  CLOWN. I will not see her with me.
  SEBISTON. I am not with him with the words of your astainted, and,  
    that thou wild sport another still to thy first words, whereof
    he is this strange and shrink or as a man that hath she stoods.
  POINS. It would not be to be the mask that I shall show thy
    tongues.
  PROSPERO. What's the woman?
  SHELECK. I am no more thou 