In [None]:
import os

# Import custom modules
from src import Tokenizer

### Constants and hyperparameters

In [3]:
# Constants
# When True, the trained tokenizer state is written to a local file
save_state = True
# Location of the training dataset, resolved against the current working directory
dataset_path = os.path.join(os.getcwd(), 'dataset', 'input.txt')
# File into which the tokenizer state is written when saving is enabled
output_path = os.path.join(os.getcwd(), 'checkpoints', 'tokenizer.json')


In [4]:
# Hyperparameters
# Target vocabulary size handed to the tokenizer's training step
vocab_size = 1024
# Number of trailing characters of the dataset held out for validation
validation_samples = 250

### Data loading

In [5]:
def load_txt_file(path: str) -> str:
    """
    Read a UTF-8 encoded text file and return everything it contains.

    Parameters:
    - path (str): Location of the text file on disk.

    Returns:
    - str: The full contents of the file as a single string.

    Raises:
    - FileNotFoundError: If nothing exists at the given path.
    """

    # Happy path: the path exists, so read it in one go
    if os.path.exists(path):
        with open(path, mode='r', encoding='utf-8') as handle:
            contents = handle.read()
        return contents

    # Nothing at that location: fail with an explicit message
    raise FileNotFoundError(f'The file "{path}" does not exist.')

In [6]:
# Load the text file
text = load_txt_file(dataset_path)

# Clamp the hold-out size to [0, len(text)] so the split index is always valid.
# A single computed index is used for both slices so the two sets can never
# overlap; it also avoids the `-0` pitfall, where text[-0:] returns the whole
# text instead of an empty string.
holdout_size = min(max(validation_samples, 0), len(text))
split_index = len(text) - holdout_size

# Split the text into disjoint training and validation sets:
# everything before split_index is used for training, the tail for validation
training_text = text[:split_index]
validation_text = text[split_index:]

# Print the number of characters in each set
print(f"Training text length: {len(training_text)}")
print(f"Validation text length: {len(validation_text)}")

Training text length: 294935
Validation text length: 250


### Tokenizer

In [7]:
# Instantiate the tokenizer
# (Tokenizer is the project-local class imported from `src` at the top of the
# notebook; it is constructed here without any arguments)
tokenizer = Tokenizer()

### Training the tokenizer

In [8]:
# Train the tokenizer on the training split only, requesting `vocab_size` tokens.
# NOTE(review): the recorded output logs 768 merges with new token ids starting
# at 256, which suggests 256 base tokens plus (vocab_size - 256) merges —
# confirm against the Tokenizer implementation in `src`.
tokenizer.train(text=training_text, vocab_size=vocab_size)

1/768 --> Merged tokens (101, 32) into token 256
2/768 --> Merged tokens (97, 32) into token 257
3/768 --> Merged tokens (105, 32) into token 258
4/768 --> Merged tokens (111, 32) into token 259
5/768 --> Merged tokens (101, 114) into token 260
6/768 --> Merged tokens (99, 104) into token 261
7/768 --> Merged tokens (110, 32) into token 262
8/768 --> Merged tokens (108, 32) into token 263
9/768 --> Merged tokens (99, 111) into token 264
10/768 --> Merged tokens (97, 110) into token 265
11/768 --> Merged tokens (44, 10) into token 266
12/768 --> Merged tokens (101, 110) into token 267
13/768 --> Merged tokens (97, 114) into token 268
14/768 --> Merged tokens (44, 32) into token 269
15/768 --> Merged tokens (108, 108) into token 270
16/768 --> Merged tokens (261, 256) into token 271
17/768 --> Merged tokens (111, 114) into token 272
18/768 --> Merged tokens (115, 116) into token 273
19/768 --> Merged tokens (113, 117) into token 274
20/768 --> Merged tokens (260, 32) into token 275
21/76

In [9]:
# Check if the state of the tokenizer should be saved
if save_state:
    # Make sure the destination directory exists before saving, so the cell
    # also works on a fresh checkout where it has not been created yet.
    # NOTE(review): Tokenizer.save may already create parent directories;
    # exist_ok=True makes this call a harmless no-op in that case.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the state of the tokenizer
    tokenizer.save(output_path)

### Evaluation

In [10]:
# Encoding the text
encoded_text = tokenizer.encode(validation_text)

# Decoding the text
decoded_text = tokenizer.decode(encoded_text)

# Print the original text
print("Original text")
print("-------------")
print(validation_text)

# Print the encoded text
print("\nEncoded text")
print("------------")
print(encoded_text)

# Print the decoded text
print("\nDecoded text")
print("------------")
print(decoded_text)

# Report explicitly whether decoding the encoded text reproduces the input,
# instead of relying on a visual comparison of the two printed texts
print("\nRound-trip check")
print("----------------")
print(f"Decoded text matches original: {decoded_text == validation_text}")

Original text
-------------
, quel principio ond'elli indige,
tal era io a quella vista nova:
veder volea come si convenne
l'imago al cerchio e come vi s'indova;
se non che la mia mente fu percossa
da un fulgore in che sua voglia venne.
l'amor che move il sole e l'altre stelle.

Encoded text
------------
[44, 32, 300, 108, 32, 382, 110, 292, 326, 111, 32, 287, 100, 39, 439, 105, 32, 282, 285, 519, 266, 652, 32, 260, 97, 32, 1013, 32, 97, 32, 498, 97, 32, 937, 97, 32, 277, 624, 381, 308, 585, 32, 367, 101, 97, 32, 264, 353, 32, 336, 32, 348, 385, 329, 10, 108, 39, 1022, 672, 32, 284, 32, 553, 377, 111, 32, 101, 32, 264, 353, 32, 295, 32, 115, 39, 282, 386, 624, 309, 305, 32, 277, 110, 32, 261, 101, 32, 455, 32, 315, 97, 32, 387, 296, 32, 335, 32, 379, 264, 283, 97, 10, 100, 97, 32, 325, 32, 335, 108, 103, 272, 101, 32, 282, 32, 261, 101, 32, 294, 97, 32, 857, 97, 32, 385, 329, 276, 108, 39, 775, 32, 261, 101, 32, 380, 308, 32, 105, 108, 32, 500, 101, 32, 101, 32, 108, 39, 363, 101, 32, 