Name: Arjun Bhan  UNI: AB5666

In [None]:
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

### Get the data and process
- This is *The Mysterious Island*, available from Project Gutenberg.

In [None]:
# Read the whole Project Gutenberg text file into memory.
with open('/content/1268-0.txt', encoding="utf8") as fp:
    text = fp.read()

# Locate the title marker and the footer marker, then keep only the text
# between them (the footer marker itself is excluded).
start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')
text = text[start_indx:end_indx]

# The vocabulary is simply every distinct character left in the corpus.
char_set = set(text)

print(char_set)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

# Sanity checks: the rest of the notebook expects exactly this corpus.
assert len(text) == 1130711
assert len(char_set) == 85

{'L', ' ', ')', 'G', 'V', 'H', 'Y', 'd', '2', '.', 's', '&', 'R', 'y', '(', '’', '!', '9', 'k', 'i', '?', 'C', 'm', '\n', 'z', '”', '-', 'X', 'n', 'B', 'c', 'j', 'Z', '"', 'q', 'f', 'S', 'Q', 'P', '8', 'T', '4', 'U', 'I', '3', 'a', '“', 'u', 'e', ':', '7', 'J', 'r', 'A', 'v', '%', 'M', '5', '$', 'p', 'E', 'x', 'F', '=', '*', 'b', 'w', '/', 'o', 'g', 'W', ',', 't', '‘', 'h', "'", ';', '1', '0', 'l', '6', 'K', 'O', 'N', 'D'}
Total Length: 1130711
Unique Characters: 85


### Tokenize and get other helpers
- We do this manually since everything is character based.

In [None]:
# Sort the vocabulary so that the character <-> integer mapping is deterministic.
chars_sorted = sorted(char_set)

# Forward lookup (character -> id) and reverse lookup (id -> character).
# The reverse lookup is a NumPy array so it can be indexed with whole id arrays.
char2int = dict(zip(chars_sorted, range(len(chars_sorted))))
int2char = np.array(chars_sorted)

# Encode the full corpus, one int32 id per character.
text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

# Show the first 15 characters with their ids, then decode ids 15..20 back to text.
print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(int2char[text_encoded[15:21]]))

Text encoded shape:  (1130711,)
THE MYSTERIOUS       == Encoding ==>  [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]
[37 47 40 29 42 32]  == Reverse  ==>  ISLAND


#### Examples

In [None]:
# Re-display the encoding examples: the corpus shape, the first 15 characters
# next to their integer ids, and ids 15..20 decoded back into characters.
first_span = slice(0, 15)
next_span = slice(15, 21)

print('Text encoded shape: ', text_encoded.shape)
print(text[first_span], '     == Encoding ==> ', text_encoded[first_span])
print(text_encoded[next_span], ' == Reverse  ==> ', ''.join(int2char[text_encoded[next_span]]))

Text encoded shape:  (1130711,)
THE MYSTERIOUS       == Encoding ==>  [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]
[37 47 40 29 42 32]  == Reverse  ==>  ISLAND


In [None]:
# Regression check: the first 15 encoded ids must match the known-good values.
expected_prefix = [48, 36, 33, 1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47, 1]
assert np.array_equal(text_encoded[:15], expected_prefix)

### Process the data and get the data loader

In [None]:
# Each training example is a window of `seq_length` input characters plus one
# extra character, so that the target can be the input shifted by one position.
seq_length = 40
chunk_size = seq_length + 1

# Number of *non-overlapping* chunks that would fit in the corpus. The windows
# built below overlap (stride 1), so this value is not used by them; the name
# (including its spelling) is kept as-is in case other cells refer to it.
chunckAmount = len(text_encoded)//chunk_size

# Every stride-1 window of length `chunk_size`, as one 2-D array of shape
# (len(text_encoded) - chunk_size + 1, chunk_size). Row i equals
# text_encoded[i:i + chunk_size], exactly what the previous Python loop
# appended, but this is a zero-copy, read-only view instead of roughly a
# million separate array objects, and it converts to a tensor quickly.
# NOTE: raises ValueError if the corpus is shorter than `chunk_size`.
text_chunks = np.lib.stride_tricks.sliding_window_view(text_encoded, chunk_size)


In [None]:
class TextDataset(Dataset):
    """Dataset of fixed-length token windows for next-character prediction.

    Each stored chunk holds ``n + 1`` token ids. Indexing returns the pair
    ``(chunk[:-1], chunk[1:])``: the first ``n`` ids as the input and the same
    window shifted one position to the right as the target.

    Args:
        text_chunks: Indexable collection of chunks (for example a 2-D tensor
            with one chunk per row). Each chunk must support slicing.
    """

    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        """Return the number of chunks."""
        return len(self.text_chunks)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair for the chunk at ``idx``."""
        text_chunk = self.text_chunks[idx]

        # Derive the split from the chunk itself rather than hard-coding 40/41,
        # so input and target always have equal length whatever the chunk size.
        return text_chunk[:-1], text_chunk[1:]

# Stack the windows into a single NumPy array before handing them to PyTorch:
# building a tensor directly from a Python list of many small ndarrays is very
# slow, whereas converting one contiguous array is a single bulk copy. The
# resulting tensor has the same shape, dtype and values as before.
seq_dataset = TextDataset(torch.tensor(np.array(text_chunks)))

In [None]:
# Preview the first two (input, target) pairs, decoded back to text, to confirm
# that the target is the input shifted by one character.
for i, (seq, target) in zip(range(2), seq_dataset):
    decoded_input = ''.join(int2char[seq])
    decoded_target = ''.join(int2char[target])

    print(seq.shape, target.shape)
    print('Input (x):', repr(decoded_input))
    print('Target (y):', repr(decoded_target))
    print()

torch.Size([40]) torch.Size([40])
Input (x): 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'
Target (y): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'

torch.Size([40]) torch.Size([40])
Input (x): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'
Target (y): 'E MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIO'



In [None]:
# Device that the model and the training batches are moved to; fixed to the CPU.
device = torch.device("cpu")

In [None]:
# Mini-batch size used by the loader.
batch_size = 64

# Seed the global RNG so the shuffle order is reproducible.
torch.manual_seed(1)

# Shuffle the windows and drop the final short batch so that every batch
# contains exactly `batch_size` sequences.
seq_dl = DataLoader(
    seq_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)

### Write the models

In [None]:
class RNN(nn.Module):
    """Character-level language model: embedding -> LSTM -> linear projection.

    Args:
        vocab_size: Number of distinct tokens; size of the embedding table and
            of the output logits.
        embed_dim: Dimensionality of the token embeddings fed to the LSTM.
        rnn_hidden_size: Size of the LSTM hidden and cell states.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size

        # batch_first=True: inputs and outputs are laid out (batch, seq, feature).
        self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=rnn_hidden_size, batch_first=True)

        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, text, hidden=None, cell=None):
        """Run the model over a batch of token-id sequences.

        Args:
            text: Integer token ids of shape (batch, seq).
            hidden: Optional initial LSTM hidden state. When ``None`` the LSTM
                starts from its default (zero) state and ``cell`` is ignored.
            cell: Optional initial LSTM cell state; used only when ``hidden``
                is given.

        Returns:
            ``(logits, (hidden, cell))`` where ``logits`` has one vocabulary-sized
            score vector per input position and ``(hidden, cell)`` is the LSTM
            state after the last position.
        """
        out = self.embedding(text)

        # Only pass an explicit state when the caller supplied one.
        state = (hidden, cell) if hidden is not None else None
        out, (hidden, cell) = self.rnn(out, state)

        out = self.fc(out)

        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        """Return zero-filled ``(hidden, cell)`` states for ``batch_size`` sequences.

        The states are created with the model's own device and dtype (taken
        from its parameters) instead of a module-level ``device`` global, so
        they always match the model wherever it has been moved.
        """
        reference = self.fc.weight
        shape = (self.rnn.num_layers, batch_size, self.rnn_hidden_size)
        hidden = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
        cell = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
        return hidden, cell

### Do this the right way - across all the data at once!

In [None]:
# Model hyperparameters. The vocabulary size comes from the id -> char table.
embed_dim = 256
rnn_hidden_size = 512
vocab_size = len(int2char)

# Seed before construction so the initial weights are reproducible, then build
# the network and move it to the target device in one step.
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size).to(device)
model

RNN(
  (embedding): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)

In [None]:
# Cross-entropy over the vocabulary at every position, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# NOTE: despite the name, each "epoch" below is a single mini-batch update,
# not a full pass over the dataset.
num_epochs = 10000

torch.manual_seed(1)
seq_dl_iter = iter(seq_dl)

# random_sample() leaves the model in eval mode, so switch back to training
# mode explicitly in case this cell is re-run after sampling.
model.train()

for epoch in range(num_epochs):
    # Fresh zero state for every batch: the batches are shuffled windows,
    # so there is no state to carry over between them.
    hidden, cell = model.init_hidden(batch_size)

    # Pull the next batch, restarting the loader once it is exhausted.
    try:
        seq_batch, target_batch = next(seq_dl_iter)
    except StopIteration:
        seq_dl_iter = iter(seq_dl)
        seq_batch, target_batch = next(seq_dl_iter)
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)

    optimizer.zero_grad()

    logits, _ = model(seq_batch, hidden, cell)

    # Flatten (batch, seq, vocab) -> (batch * seq, vocab) and (batch, seq) ->
    # (batch * seq,) so every position is scored as an independent
    # classification. The sizes are taken from the tensors themselves rather
    # than from the batch_size / seq_length globals.
    loss = criterion(
        logits.reshape(-1, logits.size(-1)),
        target_batch.reshape(-1).long(),
    )

    loss.backward()

    optimizer.step()

    # Keep only the Python float so the autograd graph can be freed.
    loss = loss.item()

    if epoch % 100 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 4.4364
Epoch 100 loss: 1.6833
Epoch 200 loss: 1.4937
Epoch 300 loss: 1.3673
Epoch 400 loss: 1.3395
Epoch 500 loss: 1.4198
Epoch 600 loss: 1.3841
Epoch 700 loss: 1.2793
Epoch 800 loss: 1.3617
Epoch 900 loss: 1.2560
Epoch 1000 loss: 1.3461
Epoch 1100 loss: 1.2049
Epoch 1200 loss: 1.2541
Epoch 1300 loss: 1.2249
Epoch 1400 loss: 1.2720
Epoch 1500 loss: 1.2072
Epoch 1600 loss: 1.2968
Epoch 1700 loss: 1.2175
Epoch 1800 loss: 1.2203
Epoch 1900 loss: 1.2118
Epoch 2000 loss: 1.2050
Epoch 2100 loss: 1.1825
Epoch 2200 loss: 1.1912
Epoch 2300 loss: 1.2353
Epoch 2400 loss: 1.2758
Epoch 2500 loss: 1.2641
Epoch 2600 loss: 1.1490
Epoch 2700 loss: 1.1978
Epoch 2800 loss: 1.1520
Epoch 2900 loss: 1.1824
Epoch 3000 loss: 1.2312
Epoch 3100 loss: 1.2092
Epoch 3200 loss: 1.1566
Epoch 3300 loss: 1.1856
Epoch 3400 loss: 1.1772
Epoch 3500 loss: 1.1998
Epoch 3600 loss: 1.2040
Epoch 3700 loss: 1.2268
Epoch 3800 loss: 1.1719
Epoch 3900 loss: 1.1775
Epoch 4000 loss: 1.1764
Epoch 4100 loss: 1.1541
Epoc

In [None]:
from torch.distributions.categorical import Categorical

# Fixed seed so the drawn samples are reproducible.
torch.manual_seed(1)

# Toy example: unnormalised scores for three classes.
logits = torch.tensor([[-1.0, 1.0, 3.0]])

# Turn the scores into class probabilities once and reuse them below.
probs = torch.softmax(logits, dim=-1)
print('Probabilities:', probs)

# Draw ten class indices from the resulting categorical distribution.
m = Categorical(probs=probs)
samples = m.sample(sample_shape=(10,))

print(samples.numpy())

Probabilities: tensor([[0.0159, 0.1173, 0.8668]])
[[1]
 [2]
 [2]
 [2]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


### Random decoding.
- This compounds problems: once you make a mistake, you can't undo it.

In [None]:
def random_sample(
    model,
    starting_str,
    len_generated_text=500,
):
    """Generate text by sampling one character at a time from the model.

    The prompt is fed through the model to warm up the recurrent state, then
    each new character is drawn from the categorical distribution defined by
    the model's logits and fed back in as the next input.

    Args:
        model: Network exposing ``eval()``, ``init_hidden(batch_size)`` and
            ``model(tokens, hidden, cell) -> (logits, (hidden, cell))``.
        starting_str: Non-empty prompt; every character must be a key of the
            module-level ``char2int`` mapping.
        len_generated_text: Number of characters to generate after the prompt.

    Returns:
        ``starting_str`` followed by ``len_generated_text`` sampled characters.

    Raises:
        ValueError: If ``starting_str`` is empty.
        KeyError: If ``starting_str`` contains a character not in ``char2int``.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    if not starting_str:
        raise ValueError('starting_str must contain at least one character')

    # Encode the starting string into token ids using char2int.
    token_ids = [char2int[s] for s in starting_str]

    # Put model in eval mode. This matters if we had dropout or batch / layer norms.
    model.eval()

    hidden, cell = model.init_hidden(1)

    # Shape (1, len(starting_str)), placed on the same device as the state so
    # the prompt and the recurrent state can never end up on different devices.
    encoded_input = torch.tensor(token_ids, device=hidden.device).view(1, -1)

    # Collect the pieces in a list and join once at the end.
    pieces = [starting_str]

    # Inference only: without no_grad() an autograd graph would be retained
    # through the recurrent state for every generated step.
    with torch.no_grad():
        # Warm up the state on every prompt character except the last one.
        for c in range(encoded_input.size(1) - 1):
            out = encoded_input[:, c].view(1, 1)
            _, (hidden, cell) = model(out, hidden, cell)

        # The last prompt character is the first input of the sampling loop.
        last_char = encoded_input[:, -1]

        for _ in range(len_generated_text):
            logits, (hidden, cell) = model(last_char.reshape(1, 1), hidden, cell)

            # Single position, so flatten to a vector of vocabulary scores.
            logits = logits.view(-1)

            m = torch.distributions.Categorical(logits=logits)

            last_char = m.sample()

            pieces.append(int2char[last_char.item()])

    return ''.join(pieces)

# Seed the RNG so the sampled continuation is reproducible, make sure the model
# is on the target device, then generate from the prompt and show the result.
torch.manual_seed(1)
model.to(device)
generated_text = random_sample(model, starting_str='The island')
print(generated_text)

The island is on board, either the bead displange under the water on the
rate--that was blunded birds, the maneud coupless the retreat werence forth winted to underce, for in coast, his arms was no chumntannes, indeed.

Cyrus Harding recounted them seen syout tusily, was to be sought. Leriate is breathey to any effect of a mile.

The eyes observated him all that time reasoning to the mysterious.
There the reporter and Herbert that was therefore, they prought, Cyrus Harding.

The ship of the fabrication, a
