In [None]:
from transformers import GPT2Tokenizer


In [None]:
# Load the pretrained tokenizer published as "Qilex/tinyStories-10k-tokenizer";
# downloaded files are cached under ./custom-tokenizer/.
tokenizer = GPT2Tokenizer.from_pretrained(
    "Qilex/tinyStories-10k-tokenizer",
    cache_dir="custom-tokenizer/",
)


In [None]:
import pickle

In [None]:
# Load the training data that was serialized to disk beforehand.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only open dataset files you produced yourself.
with open("data/dataset.pkl", "rb") as dataset_file:
    arr = pickle.load(dataset_file)

In [None]:
# Display the number of items in the loaded dataset.
len(arr)

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
from mingpt.utils import set_seed, setup_logging, CfgNode as CN

In [None]:
class BatchEmitterDataset(Dataset):
    """
    Sliding-window dataset over a flat sequence of token ids.

    Item ``idx`` is a pair ``(x, y)``: ``x`` holds ``block_size`` consecutive
    tokens starting at position ``idx`` and ``y`` holds the same window shifted
    one position to the right, i.e. the next-token targets for ``x``.
    """

    @staticmethod
    def get_default_config():
        """Return a fresh config node with ``block_size`` set to 512."""
        C = CN()
        C.block_size = 512
        return C

    def __init__(self, config, data):
        """
        Args:
            config: object exposing a ``block_size`` attribute (window length).
            data: sliceable sequence of hashable token ids.
        """
        self.config = config
        # Only the number of distinct tokens is needed, so there is no point in
        # sorting the unique values first.
        data_size, vocab_size = len(data), len(set(data))
        print('data has %d tokens, %d unique.' % (data_size, vocab_size))
        self.vocab_size = vocab_size
        self.data = data

    def get_vocab_size(self):
        """Return the number of distinct tokens observed in ``data``."""
        return self.vocab_size

    def get_block_size(self):
        """Return the window length taken from the config."""
        return self.config.block_size

    def __len__(self):
        """Return the number of full ``block_size + 1`` windows in the data."""
        # Clamp at zero: a negative value would make len() raise ValueError
        # when the data is no longer than a single block.
        return max(0, len(self.data) - self.config.block_size)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensors for the window starting at ``idx``."""
        # grab a chunk of (block_size + 1) tokens from the data
        chunk = self.data[idx:idx + self.config.block_size + 1]
        # inputs drop the last token, targets drop the first
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y

In [None]:
# Wrap the loaded sequence in the sliding-window dataset with its default config.
dataset = BatchEmitterDataset(
    config=BatchEmitterDataset.get_default_config(),
    data=arr,
)

In [None]:
# NOTE(review): set_seed is already imported from `mingpt.utils` in an earlier
# cell; this re-import goes through the `minGPT.mingpt` package path instead.
# Both spellings must be importable for the notebook to run — consider
# settling on one.
from minGPT.mingpt.utils import set_seed
# Fix the seed before the model is built — presumably this seeds the RNGs used
# for weight init and sampling; confirm in mingpt.utils.
set_seed(0)

In [None]:
# create a GPT instance
from minGPT.mingpt.model import GPT
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-mini'
# NOTE(review): tokenizer.vocab_size does not count tokens added on top of the
# base vocabulary; if this tokenizer defines any, len(tokenizer) is the safe
# size — confirm.
model_config.vocab_size = tokenizer.vocab_size
# Take the context length from the dataset rather than repeating the literal
# 512, so the model and the training windows cannot drift apart.
model_config.block_size = dataset.get_block_size()
model = GPT(model_config)



In [None]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
# One iteration per window in the dataset. len(dataset) is the data length
# minus the block size, so this no longer repeats the literal 512.
train_config.max_iters = len(dataset)
train_config.num_workers = 0
#train_config.batch_size = 2
trainer = Trainer(train_config, model, dataset)

In [None]:
def batch_end_callback(trainer):
    """
    Progress hook: log every 1000 iterations and checkpoint every 10000.

    Reads ``iter_num``, ``iter_dt``, ``loss`` and ``optimizer`` from ``trainer``
    and the notebook-level ``model``, ``train_config`` and ``model_config``.
    Iteration 0 neither logs nor saves. Checkpoints are written to
    ``tiny46<iter_num>.pt`` in the working directory.
    """
    log_every, save_every = 1000, 10000
    step = trainer.iter_num

    # nothing to report before the first real iteration
    if step == 0:
        return

    if step % log_every == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")

    if step % save_every != 0:
        return

    # everything needed to rebuild the model and resume training
    checkpoint = {
        'epoch': step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': trainer.optimizer.state_dict(),
        'loss': trainer.loss.item(),
        'train_config': train_config,
        'model_config': model_config,
    }
    torch.save(checkpoint, "tiny46" + str(step) + ".pt")
    print("Saving model")
        
# Register batch_end_callback for the trainer's 'on_batch_end' event.
trainer.set_callback('on_batch_end', batch_end_callback)

# Start training with the configuration set up above.
trainer.run()

In [None]:
def generate(prompt='', num_samples=1, steps=200, do_sample=True):
    """
    Sample continuations of ``prompt`` from the trained ``model`` and print them.

    Reads the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: text to condition on; an empty string is replaced by the
            tokenizer's end-of-sequence token.
        num_samples: number of continuations produced in one batch.
        steps: number of new tokens generated per sample.
        do_sample: forwarded to ``model.generate``.

    Returns:
        None. Each decoded sample is printed after an 80-dash separator line.
    """
    if prompt == '':
        prompt = tokenizer.eos_token
    encoded_input = tokenizer(prompt, return_tensors='pt').to(device)
    x = encoded_input['input_ids']

    # we'll process all desired num_samples in a batch, so expand out the batch dim
    x = x.expand(num_samples, -1)

    # Put the model in inference mode so that training-only layers such as
    # dropout are disabled while sampling.
    model.eval()

    # forward the model `steps` times to get samples, in a batch
    y = model.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=None, temperature=1)

    for i in range(num_samples):
        out = tokenizer.decode(y[i].cpu().squeeze())
        print('-'*80)
        print(out)

In [None]:
# Use the GPU when one is present and fall back to the CPU otherwise, so the
# prompt tensors can still be created on machines without CUDA.
# NOTE(review): assumes the trainer placed `model` on the GPU whenever one is
# available — confirm against the trainer's device setting.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
generate("There were two mice")

In [None]:
#https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-a-general-checkpoint-for-inference-and-or-resuming-training


# map_location='cpu' lets a checkpoint written on a GPU be opened on any
# machine. weights_only=False is required because the checkpoint also stores
# the pickled train/model config objects, which the weights-only loader
# rejects. Only load checkpoint files you created yourself.
checkpoint = torch.load('tiny46900000.pt', map_location='cpu', weights_only=False)
mconfig = checkpoint['model_config']
tconfig = checkpoint['train_config']
# NOTE(review): presumably the saved config already carries the explicit layer
# sizes, so the named preset has to be cleared before rebuilding — confirm
# against GPT.__init__.
mconfig.model_type = None
new_model = GPT(mconfig)
new_model.load_state_dict(checkpoint['model_state_dict'])
# The reloaded model is used for sampling below; switch off training-only
# layers such as dropout.
new_model.eval()
#trainer.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
trainer.iter_num = checkpoint['epoch']





In [None]:
# generate() and beam() move the tokenized prompt to this device before sampling.
device = 'cpu'

In [None]:
# NOTE(review): on a top-to-bottom run `generate` here is still the earlier
# definition that samples from `model`; the version that uses `new_model` is
# only defined in a later cell. Run that definition first if the reloaded
# checkpoint is what you want to sample from.
generate("A little boy named Sam")

In [None]:
# NOTE(review): `beam` is defined in a later cell, so this call raises
# NameError on Restart & Run All unless that cell has been executed first.
beam("A little boy named Sam")

In [None]:
def generate(prompt='', num_samples=1, steps=200, do_sample=True):
    """
    Sample continuations of ``prompt`` from the reloaded ``new_model`` and print them.

    This definition replaces the earlier ``generate`` in the notebook, which
    sampled from ``model``. Reads the notebook-level ``tokenizer``,
    ``new_model`` and ``device``.

    Args:
        prompt: text to condition on; an empty string is replaced by the
            tokenizer's end-of-sequence token.
        num_samples: number of continuations produced in one batch.
        steps: number of new tokens generated per sample.
        do_sample: forwarded to ``new_model.generate``.

    Returns:
        None. Each decoded sample is printed after an 80-dash separator line.
    """
    if prompt == '':
        prompt = tokenizer.eos_token
    encoded_input = tokenizer(prompt, return_tensors='pt').to(device)
    x = encoded_input['input_ids']

    # we'll process all desired num_samples in a batch, so expand out the batch dim
    x = x.expand(num_samples, -1)

    # Put the model in inference mode so that training-only layers such as
    # dropout are disabled while sampling.
    new_model.eval()

    # forward the model `steps` times to get samples, in a batch
    y = new_model.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=None, temperature=1)

    for i in range(num_samples):
        out = tokenizer.decode(y[i].cpu().squeeze())
        print('-'*80)
        print(out)

In [None]:
def beam(prompt='', num_samples=1, steps=200, do_sample=True):
    """
    Decode continuations of ``prompt`` with ``new_model.beam`` and print them.

    Reads the notebook-level ``tokenizer``, ``new_model`` and ``device``.

    Args:
        prompt: text to condition on; an empty string is replaced by the
            tokenizer's end-of-sequence token.
        num_samples: number of rows the prompt is expanded to and the number
            of decoded outputs printed.
        steps: number of new tokens requested from the model.
        do_sample: accepted for signature parity with ``generate``; not used.

    Returns:
        None. Each decoded output is printed after an 80-dash separator line.
    """
    if prompt == '':
        prompt = tokenizer.eos_token
    encoded_input = tokenizer(prompt, return_tensors='pt').to(device)
    x = encoded_input['input_ids']

    # we'll process all desired num_samples in a batch, so expand out the batch dim
    x = x.expand(num_samples, -1)

    # Put the model in inference mode so that training-only layers such as
    # dropout are disabled while decoding.
    new_model.eval()

    # beam search with a fixed width of 5
    y = new_model.beam(x, max_new_tokens=steps, beam_size=5)

    for i in range(num_samples):
        out = tokenizer.decode(y[i].cpu().squeeze())
        print('-'*80)
        print(out)