In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!curl -O https://www.gutenberg.org/files/1268/1268-0.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1124k  100 1124k    0     0  1039k      0  0:00:01  0:00:01 --:--:-- 1039k


In [58]:
from pathlib import Path
import urllib.request

def download_shakespeare_text():
    path = Path("datasets/shakespeare/shakespeare.txt")
    if not path.is_file():
        path.parent.mkdir(parents=True, exist_ok=True)
        url = "https://homl.info/shakespeare"
        urllib.request.urlretrieve(url, path)
    return path.read_text()

shakespeare_text = download_shakespeare_text()

In [59]:
text = shakespeare_text
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 1115394
Unique Characters: 65


In [37]:
import numpy as np
with open('1268-0.txt', 'r', encoding="utf8") as fp:
    text = fp.read()
start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')
text = text[start_indx:end_indx]
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 1112310
Unique Characters: 80


In [60]:
chars_sorted = sorted(char_set)
print("".join(chars_sorted))
char2int = {ch:i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = torch.tensor([char2int[ch] for ch in text])
print('Text encoded shape:', text_encoded.shape)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Text encoded shape: torch.Size([1115394])


In [61]:
import torch
from torch.utils.data import Dataset

seq_length = 40

class TextDataset(Dataset):
    def __init__(self, encoded_text, chunk_size):
        self.encoded_text = encoded_text
        self.chunk_size = chunk_size

    def __len__(self):
        return len(self.encoded_text) - self.chunk_size

    def __getitem__(self, idx):
        #text_chunk = torch.from_numpy(self.encoded_text[idx: idx + self.chunk_size])
        #return text_chunk[:-1].long(), text_chunk[1:].long()
        end = idx + self.chunk_size
        window = self.encoded_text[idx : end]
        target = self.encoded_text[idx + 1 : end + 1]
        return window, target

seq_dataset = TextDataset(text_encoded, seq_length)
print(len(seq_dataset))

1115354


In [62]:
from torch.utils.data import DataLoader
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
print(seq_dl)

<torch.utils.data.dataloader.DataLoader object at 0x7d1c09b1f190>


In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [64]:
import torch.nn as nn
class charRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, n_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.GRU(embed_dim, rnn_hidden_size, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x):
        embeddings = self.embedding(x)
        out, _states = self.rnn(embeddings)
        return self.fc(out).permute(0, 2, 1)

    # def init_hidden(self, batch_size):
    #     hidden = torch.zeros(1, batch_size, self.rnn_hidden_size).to(device)
    #     cell = torch.zeros(1, batch_size, self.rnn_hidden_size).to(device)
    #     return hidden, cell

In [65]:
vocab_size = len(char_set)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(1)
model = charRNN(vocab_size, embed_dim, rnn_hidden_size)
model.to(device)

charRNN(
  (embedding): Embedding(65, 256)
  (rnn): GRU(256, 512, num_layers=2, batch_first=True)
  (fc): Linear(in_features=512, out_features=65, bias=True)
)

In [66]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [67]:
num_epochs = 10
torch.manual_seed(1)
for epoch in range(num_epochs):
    idx = 0
    for seq_batch, target_batch in seq_dl:
        seq_batch, target_batch = seq_batch.to(device), target_batch.to(device)
        
        optimizer.zero_grad()
        loss = 0
        pred_logits = model(seq_batch)
        loss += loss_fn(pred_logits, target_batch)
        loss.backward()
        optimizer.step()
        idx += 1
        if idx == 200:
            break
    
    loss = loss.item()
    print(f'Epoch {epoch} loss: {loss:.4f}')


Epoch 0 loss: 1.6861
Epoch 1 loss: 1.6128
Epoch 2 loss: 1.5505
Epoch 3 loss: 1.5823
Epoch 4 loss: 1.5692
Epoch 5 loss: 1.5501
Epoch 6 loss: 1.5963
Epoch 7 loss: 1.5585
Epoch 8 loss: 1.5374
Epoch 9 loss: 1.6461


In [68]:
import torch.nn.functional as F

def next_char(model, text, temperature=1):
    txt = torch.tensor([char2int[ch] for ch in text])
    encoded_text = txt.unsqueeze(dim=0).to(device)
    with torch.no_grad():
        Y_logits = model(encoded_text)
        Y_probas = F.softmax(Y_logits[0, :, -1] / temperature, dim=-1)
        predicted_char_id = torch.multinomial(Y_probas, num_samples=1).item()
    return str(char_array[predicted_char_id])

def extend_text(model, text, n_chars=80, temperature=1):
    for _ in range(n_chars):
        text += next_char(model, text, temperature)
    return text

In [70]:
print(extend_text(model, "To be or not to b", temperature=0.4))

To be or not to be our ears of the ender the often it the mother of heaven for the ender the worl
