In [1]:
import tqdm
import torch
from torch import nn
import wandb
import os
import tokenizers
from matplotlib import pyplot as plt
import numpy as np
import json

# Query the current git commit through a context manager so the pipe opened
# by os.popen is closed (and the child process reaped) deterministically
# instead of being left dangling. The value is whatever the command wrote to
# stdout, stripped; it is an empty string if the command wrote nothing.
with os.popen("git rev-parse HEAD") as _git_pipe:
    _git_head = _git_pipe.read().strip()

# Hyperparameters and run metadata used throughout the notebook.
config = {
    "learning_rate": 1e-3,
    "sae_learning_rate": 5e-5,
    "model_embedding_layer": 6,
    "eval_interval": 500,
    "max_iters": 60000,
    "H": 32, # hidden dimension size
    "B": 64,
    "T": 256,
    "C": 256,
    "feedforward_factor": 3,
    "n_heads": 8,
    "n_layers": 12,
    "tokenizer_vocab_size": 2**13,
    "git_hash": _git_head,  # stripped stdout of `git rev-parse HEAD`
}

# Expose every config entry as a module-level variable (e.g. `H`, `n_layers`)
# so later cells can use the bare names.
# Write through globals() explicitly: assigning via locals() only works when
# this runs at module scope (where locals() is the same namespace as
# globals()) and would silently do nothing if this code were ever moved into
# a function.
for k, v in config.items():
    globals()[k] = v


In [2]:
# Corpus file(s): used both as the raw text source and as tokenizer training input.
paths = ['tinystories-train.txt']

# Read the whole corpus into memory as a single string.
with open(paths[0], mode='r', encoding='utf-8') as f:
    text = f.read()

# Train a byte-level BPE tokenizer on the corpus, sized by the config value.
tokenizer = tokenizers.ByteLevelBPETokenizer()
tokenizer.train(
    files=paths,
    vocab_size=tokenizer_vocab_size,
    min_frequency=2,
)

# Persist the trained model in the current directory under the
# 'tiny-stories-bpe' prefix.
tokenizer.save_model('.', 'tiny-stories-bpe')

# Round-trip a sample sentence; the decoded string is the cell's displayed value.
enc = tokenizer.encode("She sells sea shells by the sea shore!")
tokenizer.decode(enc.ids)







'She sells sea shells by the sea shore!'

In [3]:

def encode(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer`` and return the resulting ``ids``."""
    encoding = tokenizer.encode(text)
    return encoding.ids
def decode(encoded_text):
    """Convert token ids back to text using the notebook-level ``tokenizer``."""
    decoded = tokenizer.decode(encoded_text)
    return decoded

from tqdm import tqdm

def batch_encode(text, batch_size):
    """Tokenize ``text`` in chunks of roughly ``batch_size`` characters.

    Plain fixed-width slicing would cut a word in half at almost every chunk
    boundary, and the two halves would then be tokenized independently. To
    avoid that, each chunk is extended forward until it ends just before a
    whitespace character (or at the end of ``text``), so no run of
    non-whitespace characters is split between two ``encode`` calls.
    NOTE(review): for a tokenizer that pre-splits on whitespace this should
    make the result match encoding the text in one piece -- confirm against
    the tokenizer in use.

    Args:
        text: The string to tokenize.
        batch_size: Target number of characters per chunk; must be positive.
            A chunk can be longer when its nominal end falls inside a word.

    Returns:
        A flat list holding the output of ``encode`` for every chunk, in order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Pre-compute the chunk spans so tqdm knows the total number of steps.
    total = len(text)
    spans = []
    start = 0
    while start < total:
        stop = min(start + batch_size, total)
        # Slide the cut forward until it sits right before whitespace.
        while stop < total and not text[stop].isspace():
            stop += 1
        spans.append((start, stop))
        start = stop

    tokens = []
    for start, stop in tqdm(spans):
        tokens.extend(encode(text[start:stop]))
    return tokens


# Smoke test: encode a short word, then print the ids followed by the text
# obtained by decoding them.
hello_encoded = encode("hello")
print(hello_encoded, decode(hello_encoded), sep="\n")

# Vocabulary size as reported by the tokenizer.
vocab_size = tokenizer.get_vocab_size()
print("vocab size: ", vocab_size)

[6099]
hello
vocab size:  8192


In [4]:
# Tokenize the whole corpus in 200,000-character chunks (batch_encode shows a
# progress bar while it runs).
encoded_text = batch_encode(text, 200000)

# Prefer the GPU, but fall back to the CPU so this cell also runs on hosts
# without CUDA instead of raising.
data_device = 'cuda' if torch.cuda.is_available() else 'cpu'
data = torch.tensor(encoded_text, dtype=torch.int64, device=data_device)
print(data.dtype)
print(data.size())
print(data.device)

# Write the token tensor to disk. NOTE: it is saved on whatever device it
# currently lives on; loading a CUDA-saved file on a CPU-only host needs
# `map_location`.
torch.save(data, 'tiny-stories-train.pt')

# Drop the reference to the Python list of ids so its memory can be reclaimed.
encoded_text = None


  4%|▎         | 348/9614 [00:35<15:44,  9.81it/s]


KeyboardInterrupt: 