In [None]:
import os
import sys

# Add parent directory to path
sys.path.append(os.path.join(os.getcwd(), ".."))

from common.src.utils import load_txt_file
from common.src.tokenizer import Tokenizer

### Constants and hyperparameters

In [None]:
# Constants
working_dir = os.getcwd()  # Directory against which the paths below are resolved

# Whether the tokenizer state is written to a file after training
save_state = True

# Path to the training dataset
dataset_path = os.path.join(working_dir, 'dataset', 'input.txt')

# Path of the file into which the tokenizer state will be saved
output_path = os.path.join(working_dir, 'checkpoints', 'tokenizer.json')


In [None]:
# Hyperparameters

# Vocabulary size handed to the tokenizer's training call
vocab_size = 50_257

# Number of samples, taken from the end of the text, that are held out for validation
validation_samples = 0

### Data loading

In [None]:
# Load the text file
text = load_txt_file(dataset_path)

# Split the text into training and validation sets.
# A single explicit split index is used instead of negative slicing, because
# `text[-0:]` is the WHOLE text rather than an empty tail: with
# `validation_samples = 0` the validation set would silently become the full
# dataset. The requested count is clamped to [0, len(text)] so the two sets
# never overlap and always add up to the full text.
validation_count = min(max(validation_samples, 0), len(text))
split_index = len(text) - validation_count
training_text = text[:split_index]
validation_text = text[split_index:]

# Print the number of samples in each set
print(f"Training text length: {len(training_text)}")
print(f"Validation text length: {len(validation_text)}")

### Tokenizer

In [None]:
# Instantiate the tokenizer with its default constructor arguments.
# The instance is trained, saved and evaluated by the cells that follow.
tokenizer = Tokenizer()

### Training the tokenizer

In [None]:
    
# Train the tokenizer on the training split, passing the configured vocabulary size.
# NOTE(review): presumably training stops once the vocabulary reaches `vocab_size`
# entries — confirm against Tokenizer.train.
tokenizer.train(text=training_text, vocab_size=vocab_size)

In [None]:
# Check if the state of the tokenizer should be saved
if save_state:
    # Make sure the destination directory exists before writing. On a fresh
    # checkout the directory may be missing; creating it here is idempotent and
    # harmless if `save_state` already takes care of it (not visible from here).
    output_dir = os.path.dirname(output_path)
    if output_dir:  # A bare filename has no directory component to create
        os.makedirs(output_dir, exist_ok=True)

    # Save the state of the tokenizer
    tokenizer.save_state(output_path)

### Evaluation

In [None]:
# Round-trip the validation text through the tokenizer: encode it, then decode
# the result back so the two can be compared.
encoded_text = tokenizer.encode(validation_text)
decoded_text = tokenizer.decode(encoded_text)

# Print the original text
print("Original text")
print("-------------")
print(validation_text)

# Print the encoded text
print("\nEncoded text")
print("------------")
print(encoded_text)

# Print the decoded text
print("\nDecoded text")
print("------------")
print(decoded_text)

# Report explicitly whether decoding reproduced the input, instead of relying
# on a visual comparison of the two printouts above.
print("\nRound-trip check")
print("----------------")
print(f"Decoded text matches original: {decoded_text == validation_text}")