In [32]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
import re
import tiktoken # BPE

In [3]:
dataset = tfds.load(name='tiny_shakespeare')

train = dataset['train']
for text in train:
    x = text['text'].numpy().decode('utf-8')
len(x)

I0000 00:00:1739134947.135962    9029 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739134948.165579    9029 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739134948.166102    9029 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739134948.222348    9029 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739134948.222528    9029 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

1003854

In [24]:
tokens = re.split(r'([,.:;?_!"()\']|--|\s)', x)
print(tokens[:100])

vocabulary = sorted(set(tokens))

dictionary = {item:value for value, item in enumerate(vocabulary)}
dictionary["<|unk|>"] = len(dictionary)
dictionary["<|eos|>"] = len(dictionary)
#dictionary["<|bos|>"] = len(dictionary)
#dictionary["<|pad|>"] = len(dictionary)

['First', ' ', 'Citizen', ':', '', '\n', 'Before', ' ', 'we', ' ', 'proceed', ' ', 'any', ' ', 'further', ',', '', ' ', 'hear', ' ', 'me', ' ', 'speak', '.', '', '\n', '', '\n', 'All', ':', '', '\n', 'Speak', ',', '', ' ', 'speak', '.', '', '\n', '', '\n', 'First', ' ', 'Citizen', ':', '', '\n', 'You', ' ', 'are', ' ', 'all', ' ', 'resolved', ' ', 'rather', ' ', 'to', ' ', 'die', ' ', 'than', ' ', 'to', ' ', 'famish', '?', '', '\n', '', '\n', 'All', ':', '', '\n', 'Resolved', '.', '', ' ', 'resolved', '.', '', '\n', '', '\n', 'First', ' ', 'Citizen', ':', '', '\n', 'First', ',', '', ' ', 'you', ' ', 'know', ' ']


In [23]:
print(sorted(dictionary.values())[-1])

13018


In [25]:
class SimpleTockenizer:
    def __init__(self, dictionary):
        self.dictionary = dictionary
        self.dictionary_reverse = {value:item for item, value in dictionary.items()}

    def encode(self, text):
        split = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = list()
        for item in split:
            try: 
                tokens.append(self.dictionary[item])
            except:
                tokens.append(self.dictionary["<|unk|>"])
                
        return tokens

    def decode(self, tokens):
        text = "".join([self.dictionary_reverse[token] for token in tokens])
        return text

In [31]:
tokenizer = SimpleTockenizer(dictionary)
tokens = tokenizer.encode(x)
print(tokens[:10])

text = tokenizer.decode(tokens)
print(text[:100])

[864, 2, 457, 11, 0, 1, 249, 2, 12630, 2]
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [36]:
tiktok = tiktoken.get_encoding("gpt2")
integers = tiktok.encode(x, allowed_special={"<|eos|>"})
print(integers[:50])

strings = tiktok.decode(integers[:50])
print(strings)

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13, 198, 198, 5962, 22307, 25, 198, 1639, 389, 477, 12939, 2138, 284, 4656, 621, 284, 1145, 680, 30, 198, 198, 3237, 25, 198, 4965, 5634, 13]
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved.


# Model Creation