In [1]:
import os

from lib.tokenizer import Tokenizer
from lib.utils import load_txt_file

### Constants and hyperparameters

In [2]:
# Constants
# Whether the tokenizer state is written to disk (checked by the save cell)
save_state = True

# Location of the training dataset, resolved against the current working directory
dataset_path = os.path.join(
    os.getcwd(),
    'dataset',
    'input.txt',
)

# File that receives the tokenizer state when save_state is enabled
output_path = os.path.join(
    os.getcwd(),
    'checkpoints',
    'tokenizer.json',
)


In [3]:
# Hyperparameters
# Vocabulary size passed to the tokenizer's train() call
vocab_size = 1024

# Number of trailing items of the loaded text held out for validation
validation_samples = 250

### Data loading

In [4]:
# Load the text file
text = load_txt_file(dataset_path)

# Split the text into disjoint training and validation sets.
# The hold-out is taken from the end of the text. When the text is not longer
# than the requested hold-out, half of it is held out instead, so that some
# training data remains and the two sets never overlap. Slicing both sets at
# the same index also makes a hold-out of 0 produce an empty validation set
# (a bare `text[-0:]` would return the whole text).
requested_holdout = max(validation_samples, 0)
holdout_size = requested_holdout if len(text) > requested_holdout else len(text) // 2
split_index = len(text) - holdout_size
training_text = text[:split_index]
validation_text = text[split_index:]

# Print the number of samples in each set
print(f"Training text length: {len(training_text)}")
print(f"Validation text length: {len(validation_text)}")

Training text length: 1115144
Validation text length: 250


### Tokenizer

In [5]:
# Instantiate the tokenizer
# Built with no constructor arguments; the cells below train it, optionally
# save its state, and use it to encode/decode the validation text.
tokenizer = Tokenizer()

### Training the tokenizer

In [6]:
    
# Fit the tokenizer on the training split with the configured vocabulary size
tokenizer.train(
    text=training_text,
    vocab_size=vocab_size,
)

1/768 --> Merged tokens (101, 32) into token 256
2/768 --> Merged tokens (116, 104) into token 257
3/768 --> Merged tokens (116, 32) into token 258
4/768 --> Merged tokens (115, 32) into token 259
5/768 --> Merged tokens (100, 32) into token 260
6/768 --> Merged tokens (44, 32) into token 261
7/768 --> Merged tokens (111, 117) into token 262
8/768 --> Merged tokens (101, 114) into token 263
9/768 --> Merged tokens (105, 110) into token 264
10/768 --> Merged tokens (121, 32) into token 265
11/768 --> Merged tokens (97, 110) into token 266
12/768 --> Merged tokens (58, 10) into token 267
13/768 --> Merged tokens (111, 114) into token 268
14/768 --> Merged tokens (111, 32) into token 269
15/768 --> Merged tokens (101, 110) into token 270
16/768 --> Merged tokens (10, 10) into token 271
17/768 --> Merged tokens (97, 114) into token 272
18/768 --> Merged tokens (32, 257) into token 273
19/768 --> Merged tokens (111, 110) into token 274
20/768 --> Merged tokens (108, 108) into token 275
21/7

In [None]:
# Check if the state of the tokenizer should be saved
if save_state:
    # Make sure the destination directory exists first: nothing visible in this
    # notebook shows save_state() creating it, and a fresh checkout may lack it.
    checkpoint_dir = os.path.dirname(output_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # Save the state of the tokenizer
    tokenizer.save_state(output_path)

### Evaluation

In [8]:
# Encode the validation text, then decode the result again
encoded_text = tokenizer.encode(validation_text)
decoded_text = tokenizer.decode(encoded_text)

# Print each stage under an underlined heading, with a blank line between stages
sections = (
    ("Original text", validation_text),
    ("Encoded text", encoded_text),
    ("Decoded text", decoded_text),
)
for position, (heading, content) in enumerate(sections):
    if position:
        print()
    print(heading)
    print("-" * len(heading))
    print(content)

# Report the round trip explicitly instead of relying on a visual comparison
# of the two printouts above
print(f"\nRound trip matches original: {decoded_text == validation_text}")

Original text
-------------
 sleep. What is it thou didst say?
This is a strange repose, to be asleep
With eyes wide open; standing, speaking, moving,
And yet so fast asleep.

ANTONIO:
Noble Sebastian,
Thou let'st thy fortune sleep--die, rather; wink'st
Whiles thou art waking.


Encoded text
------------
[282, 321, 709, 46, 32, 87, 639, 32, 320, 32, 316, 522, 354, 441, 296, 282, 437, 481, 307, 320, 32, 320, 293, 508, 551, 738, 32, 309, 460, 305, 44, 391, 111, 620, 293, 115, 321, 709, 10, 659, 32, 627, 280, 288, 441, 101, 32, 516, 270, 59, 508, 417, 291, 44, 787, 286, 512, 44, 287, 111, 118, 291, 277, 312, 100, 32, 121, 453, 282, 111, 32, 355, 296, 293, 115, 321, 709, 278, 443, 84, 548, 73, 79, 267, 78, 111, 98, 321, 32, 83, 101, 98, 1023, 105, 266, 277, 753, 32, 321, 116, 39, 115, 116, 273, 121, 791, 116, 347, 101, 282, 321, 709, 530, 404, 101, 44, 32, 330, 368, 59, 288, 554, 39, 115, 116, 10, 360, 383, 280, 522, 32, 272, 116, 288, 97, 512, 331]

Decoded text
------------
 sleep. What 