# Tiny Shakespeare Dataset for Language Model

## Imports

In [4]:
import torch
import numpy as np
import matplotlib.pyplot as plt

import requests
import os



## Load data

In [5]:
# Source of the Tiny Shakespeare corpus, and where it is cached on disk.
DATASET_URL = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
datapath = './data/input.txt'

# exist_ok=True makes the cell safe to re-run once the directory exists.
os.makedirs(os.path.dirname(datapath), exist_ok=True)

try:
    # Prefer the local copy so re-running the notebook needs no network access.
    print('Trying to read dataset locally.')
    with open(datapath, encoding='utf-8') as f:
        data = f.read()
    print('Done.')
except FileNotFoundError:
    print('Not Found. Downloading...')
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(DATASET_URL, timeout=30)
    if response.status_code == 200:
        print('Download successfull.')
        data = response.text
        # Cache the text so the next run takes the local branch above.
        with open(datapath, 'w', encoding='utf-8') as f:
            f.write(data)
        print('Saved dataset locally.')
    else:
        raise RuntimeError(f'Failed to fetch data from {DATASET_URL}: status code: {response.status_code}')

Trying to read dataset locally.
Done.


In [6]:
# Show the first 250 characters of the loaded text as a quick sanity check.
print(data[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [7]:
# Corpus size in characters, and the sorted list of distinct characters.
n_symbols = len(data)
vocab = sorted(set(data))

# Escape control characters (e.g. the newline) so the vocabulary prints on one line.
vocab_repr = ''.join(vocab).encode('unicode_escape').decode('utf-8')
print(
    'Text data contains'
    f'\n\t{n_symbols:,} symbols'
    f'\n\tWith a vocabulary of size: {len(vocab)}',
    '\n\t' + vocab_repr,
)

Text data contains
	1,115,394 symbols
	With a vocabulary of size: 65 
	\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


## Tokenization

For now, we stick to a very simple tokenization procedure, which encodes each individual character as an integer.

This means that the number of distinct tokens is fixed in advance and equal to the vocabulary size.

In [8]:
class CharacterTokenizer:
    """Character-level tokenizer that maps each distinct character to an integer id.

    The vocabulary is the sorted set of characters found in ``data``; a
    character's id is its index in that sorted list.

    Attributes:
        vocab: Sorted list of the distinct characters in ``data``.
        vocab_size: Number of entries in ``vocab``.
        ctoi: Character -> id lookup table.
        itoc: Id -> character lookup table.
    """

    def __init__(self, data):
        """Build the vocabulary and both lookup tables from ``data``.

        Args:
            data: Iterable of characters (typically a ``str``) to derive the
                vocabulary from.
        """
        self.vocab = sorted(set(data))
        self.vocab_size = len(self.vocab)

        # Build the lookups from this instance's own vocabulary rather than a
        # module-level `vocab`, so the tokenizer is self-contained and always
        # matches the text it was constructed from.
        self.ctoi = {ch: i for i, ch in enumerate(self.vocab)}
        self.itoc = {i: ch for i, ch in enumerate(self.vocab)}

    def encode(self, s):
        """Return the list of integer ids for the characters of ``s``.

        Raises:
            KeyError: If ``s`` contains a character that is not in the vocabulary.
        """
        return [self.ctoi[c] for c in s]

    def decode(self, l):
        """Return the string obtained by mapping each id in ``l`` back to its character.

        Raises:
            KeyError: If ``l`` contains an id that is not in the vocabulary.
        """
        return ''.join(self.itoc[i] for i in l)

In [9]:
# Fit the tokenizer on the full corpus, then check that a sample string
# survives an encode -> decode round trip.
tokenizer = CharacterTokenizer(data)

test_string = 'Hello there! $.o'
test_encoded = tokenizer.encode(test_string)
test_decoded = tokenizer.decode(test_encoded)

print('Test string:', test_string)
print('Encoded:', test_encoded)
print('Decoded:', test_decoded)

Test string: Hello there! $.o
Encoded: [20, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43, 2, 1, 3, 8, 53]
Decoded: Hello there! $.o


We can now encode our entire dataset such that it can be ingested by a neural network.

In [10]:
# Encode the whole corpus and store the ids as a 64-bit integer tensor.
tokens = torch.tensor(tokenizer.encode(data), dtype=torch.long)

# Report the tensor's shape and show the first 250 token ids.
print(tokens.shape)
print(tokens[:250])

torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57

In [11]:
# Sequential split with no shuffling: the first 90% of the token stream is
# used for training and the remaining tail for validation.
split = int(0.9 * len(tokens))
train_data, val_data = tokens[:split], tokens[split:]

print(f'Train tokens: {len(train_data):,}')
print(f'Val tokens: {len(val_data):,}')

Train tokens: 1,003,854
Val tokens: 111,540


## Context Length

The data needs to be divided into chunks of a fixed size so that we can sample random parts of the text. Within each chunk, the model learns to predict every token autoregressively, i.e. from the tokens that precede it.

In [12]:
# Number of tokens per chunk used in the examples below.
context_length = 8

For a given context (chunk), we can emit a prediction for each of the tokens it contains.

In [20]:
# Inputs are the first `context_length` tokens; targets are the same window
# shifted by one position, so y[i] is the token that follows x[i].
x = train_data[:context_length]
y = train_data[1:context_length + 1]
print('x:', x.tolist())
print('y:', y.tolist())

# One chunk yields `context_length` training examples: every prefix of x is
# paired with the token that comes right after it.
for i in range(context_length):
    context = x[:i + 1]
    target = y[i].item()
    print(f't{i}: From context: {context.tolist()} -> Predict {target}')

x: [18, 47, 56, 57, 58, 1, 15, 47]
y: [47, 56, 57, 58, 1, 15, 47, 58]
t0: From context: [18] -> Predict 47
t1: From context: [18, 47] -> Predict 56
t2: From context: [18, 47, 56] -> Predict 57
t3: From context: [18, 47, 56, 57] -> Predict 58
t4: From context: [18, 47, 56, 57, 58] -> Predict 1
t5: From context: [18, 47, 56, 57, 58, 1] -> Predict 15
t6: From context: [18, 47, 56, 57, 58, 1, 15] -> Predict 47
t7: From context: [18, 47, 56, 57, 58, 1, 15, 47] -> Predict 58
