# Prepare the data

In [358]:
import gdown
import os

# Google Drive location of the corpus (shakespeare.txt).
url = 'https://drive.google.com/uc?id=1O4PZ8wOpp6yecoy8tMuVEIFS7XgyRJy9'

data_path = '../data'
text_path = f'{data_path}/shakespeare.txt'

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(data_path, exist_ok=True)

# Download only when the file is not already on disk, so re-running the
# cell reuses the local copy.
if not os.path.exists(text_path):
  gdown.download(url, text_path, quiet=False)


In [359]:
import re

# Read the whole corpus into memory. The encoding is given explicitly so the
# decoded text does not depend on the platform's default locale encoding.
with open(text_path, encoding='utf-8') as f:
  text = f.read()

# Remove every run of digits from the corpus.
text = re.sub(r'\d+', '', text)

print(f"lenth of the text {len(text)}")

lenth of the text 5433453


In [360]:
# Show the first 1000 characters of the cleaned text as a quick sanity check.
print(text[:1000])

  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep sunken eyes,
  Were an all-ea

In [361]:
# Character-level vocabulary: the distinct characters of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
print(f"vocab size : {vocab_size}")
print("".join(chars))

vocab size : 74

 !"&'(),-.:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz|}


# Prepare tokenizer

In [362]:
# import tiktoken
# tokenizer = tiktoken.get_encoding('gpt2')
# tokens = tokenizer.encode(text)
# print(f"total tokens {len(tokens)}")
# print("decode result of \"hello world.\"", tokenizer.decode([31373, 995]))

In [363]:
class SimpleTokenizer:
  """Character-level tokenizer whose vocabulary is built from a reference text.

  Attributes:
    chars: Sorted list of the distinct characters found in ``text``.
    token2id: Mapping from a character to its integer id (its index in ``chars``).
    id2token: Inverse mapping, from an integer id back to its character.
  """

  def __init__(self, text):
    """Build the vocabulary and both lookup tables from ``text``.

    Args:
      text: String whose distinct characters make up the vocabulary.
    """
    self.chars = sorted(set(text))
    # Build the lookup tables from this instance's own vocabulary rather than
    # from a module-level variable, so the tokenizer is correct for any text
    # and does not depend on which cells were executed before it.
    self.token2id = {c: i for i, c in enumerate(self.chars)}
    self.id2token = {i: c for i, c in enumerate(self.chars)}

  def encode(self, text):
    """Return the list of token ids for ``text``, one id per character.

    Raises:
      KeyError: If ``text`` contains a character outside the vocabulary.
    """
    return [self.token2id[c] for c in text]

  def decode(self, token_ids):
    """Return the string spelled by an iterable of token ids.

    Raises:
      KeyError: If an id is not part of the vocabulary.
    """
    return "".join(self.id2token[token_id] for token_id in token_ids)
  

In [364]:
# Build the tokenizer on the full corpus and take the vocabulary size from it.
tokenizer = SimpleTokenizer(text)
vocab_size = len(tokenizer.chars)

# Sanity check: print the ids for "Hello", then decode a fixed id list to text.
print(tokenizer.encode("Hello"))
print(tokenizer.decode([23, 50, 57, 57, 60]))

[23, 50, 57, 57, 60]
Hello


# Prepare data for PyTorch

In [365]:
import torch

# Encode the whole corpus as a single tensor of token ids.
# Be careful with the dtype: it must be wide enough to hold every token id.
data = torch.tensor(
  tokenizer.encode(text),
  dtype=torch.int8,
)
print(data, data.shape, data.dtype)


tensor([ 1,  1, 21,  ..., 29, 19,  0], dtype=torch.int8) torch.Size([5433453]) torch.int8


In [366]:
# Split the token stream: the first 90% for training, the remainder for validation.
train_data_size = int(len(data) * 0.9)
train_data, val_data = data[:train_data_size].detach(), data[train_data_size:].detach()

In [367]:
from torch.utils.data import Dataset, DataLoader
class SimpleDataset(Dataset):
  """Sliding-window dataset for next-token prediction.

  Sample ``idx`` is the pair ``(x, y)`` where ``x`` is the window
  ``data[idx : idx + block_size]`` and ``y`` is the single element that
  immediately follows it, ``data[idx + block_size]``.
  """

  def __init__(self, data, block_size = 8):
    """Store the token sequence and the window length.

    Args:
      data: Indexable, sliceable sequence of token ids (for example a 1-D tensor).
      block_size: Number of context tokens in every input window.
    """
    self.data = data
    self.block_size = block_size

  def __len__(self):
    """Number of complete (window, target) pairs; 0 when ``data`` is too short."""
    # Clamp at zero: len() raises if __len__ returns a negative value.
    return max(0, len(self.data) - self.block_size)

  def __getitem__(self, idx):
    """Return ``(x, y)`` for sample ``idx``.

    Negative indices count from the end, as with Python sequences.

    Raises:
      IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
    """
    size = len(self)
    position = idx + size if idx < 0 else idx
    # Without this check a negative index would slice silently and return an
    # empty or truncated window paired with an unrelated target.
    if not 0 <= position < size:
      raise IndexError(f"index {idx} is out of range for dataset of size {size}")
    x = self.data[position: position + self.block_size]
    y = self.data[position + self.block_size]
    return x, y

In [368]:
# Wrap each split in a sliding-window dataset using the default block size.
train_dataset, val_dataset = (
  SimpleDataset(split) for split in (train_data, val_data)
)

In [369]:
# Number of samples in the training and validation datasets.
print(len(train_dataset), len(val_dataset))

4890099 543338


In [370]:
# Display the first nine token ids of the encoded corpus.
data[:9]

tensor([ 1,  1, 21, 63, 60, 58,  1, 51, 46], dtype=torch.int8)

In [371]:
# Display the first training sample: a window of block_size tokens and the token that follows it.
train_dataset[0]

(tensor([ 1,  1, 21, 63, 60, 58,  1, 51], dtype=torch.int8),
 tensor(46, dtype=torch.int8))

In [372]:
import numpy as np
class SimpleDataloader(DataLoader):
  """Minimal batch loader that stacks fixed-size batches from a map-style dataset.

  Every dataset item must be indexable as ``item[0]`` (input tensor) and
  ``item[1]`` (target tensor). Iteration yields ``(xs, ys)`` tuples built with
  ``torch.stack`` over ``batch_size`` items. Only full batches are produced:
  a trailing remainder smaller than ``batch_size`` is dropped.
  """

  def __init__(self, dataset, batch_size=4, shuffle=True, **kwargs):
    """Configure the loader.

    Args:
      dataset: Map-style dataset supporting ``len()`` and integer indexing.
      batch_size: Number of samples per yielded batch.
      shuffle: If True, visit the samples in a new random order on every pass.
      **kwargs: Forwarded unchanged to ``torch.utils.data.DataLoader``; the
        custom ``__iter__`` below does not consult them.
    """
    super().__init__(dataset, batch_size=batch_size, shuffle=shuffle, **kwargs)
    # Kept on the instance because __iter__ decides the visiting order itself.
    self.shuffle = shuffle

  def __len__(self):
    """Number of batches one pass yields (full batches only)."""
    return len(self.dataset) // self.batch_size

  def __iter__(self):
    """Yield ``(xs, ys)`` batches, each stacked from ``batch_size`` samples."""
    dataset_size = len(self.dataset)
    # Draw the order from PyTorch's RNG so that torch.manual_seed() makes the
    # shuffle reproducible.
    if self.shuffle:
      indices = torch.randperm(dataset_size)
    else:
      indices = torch.arange(dataset_size)

    for start_idx in range(0, dataset_size - self.batch_size + 1, self.batch_size):
      # Convert only the current slice to plain ints; the dataset is indexed
      # with Python integers.
      batch_indices = indices[start_idx:start_idx + self.batch_size].tolist()
      # Fetch every sample once, then split it into inputs and targets.
      samples = [self.dataset[i] for i in batch_indices]
      yield (torch.stack([sample[0] for sample in samples]),
             torch.stack([sample[1] for sample in samples]))

In [373]:
# Seed both PyTorch's and NumPy's global RNGs so that shuffling is repeatable
# after a kernel restart, whichever library draws the random order.
torch.manual_seed(0)
np.random.seed(0)
train_dataloader = SimpleDataloader(train_dataset)
val_dataloader = SimpleDataloader(val_dataset)

In [374]:
# Print the first four batches; zip stops as soon as range(4) is exhausted,
# so no fifth batch is requested from the loader.
for i, batch in zip(range(4), train_dataloader):
  print(batch)

(tensor([[50,  1, 51, 54, 67, 50,  1, 53],
        [53, 46, 65, 53,  1, 60, 51, 51],
        [68, 46, 54, 57,  1, 54, 59, 50],
        [34, 20, 35,  0,  0,  1,  1, 34]], dtype=torch.int8), tensor([66, 50, 67, 30], dtype=torch.int8))
(tensor([[70,  1, 48, 46, 63, 50,  1, 49],
        [60, 51,  1, 29, 46, 61, 57, 50],
        [70,  1, 57, 60, 63, 49, 10,  1],
        [47, 50, 46, 66, 65, 54, 50, 64]], dtype=torch.int8), tensor([63, 64, 24,  1], dtype=torch.int8))
(tensor([[ 1, 58, 50, 10,  1,  1,  1,  1],
        [64,  0,  1,  1,  1,  1, 30, 51],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [66, 63,  1, 54, 59,  1, 58, 46]], dtype=torch.int8), tensor([ 1,  1, 35, 63], dtype=torch.int8))
(tensor([[ 1, 65, 53, 70,  1, 61, 60, 48],
        [51, 60, 63, 48, 50, 49,  1, 65],
        [59, 52,  5, 64,  1, 64, 60, 59],
        [ 1, 16, 31, 20, 28, 16, 29, 35]], dtype=torch.int8), tensor([56, 60,  1, 36], dtype=torch.int8))


# Transformer GPT

In [375]:
# Display the vocabulary size.
vocab_size

74