In [1]:
from gpt import GPTConfig, GPT, generate
import torch
import os
from transformers import AutoTokenizer


block_size: 512
n_layer: 14
n_head: 16
n_embd: 512
feed_forward_factor: 2.5
vocab_size: 8192
data_dir: dataset
expt_name: restart_good_3hr_search
batch_size: 128
max_lr: 0.002
min_lr: 0.0001
beta_1: 0.9
beta_2: 0.99
warmup_steps: 50
max_steps: 60000
max_runtime_seconds: 10800
weight_decay: 0.12
need_epoch_reshuffle: True
matmul_precision: high
smoke_test: False


In [11]:
def get_value(key, line, default):
    """Parse the number that follows ``key`` in a log line.

    The value is taken to be the second space-separated token of the text
    that follows the first occurrence of ``key`` (the first token is the
    empty string in front of the separating space), e.g. ``'0.3645'`` in
    ``'... | byte loss 0.3645 | ds 691.5s'`` for ``key='byte loss'``.

    Args:
        key: Label to look for in ``line`` (for example ``'ds'``).
        line: One line of log text.
        default: Value returned when the label or a parseable number is
            missing.

    Returns:
        The parsed value as a ``float``, or ``default`` if ``key`` is not in
        ``line``, no token follows it, or the token is not a number.
    """
    try:
        token = line.split(key)[1].split(' ')[1]
        # Drop every 's' character so values such as '691.5s' parse as floats.
        return float(token.replace('s', ''))
    except (IndexError, ValueError):
        # Only "not found" / "not a number" fall back to the default; a bare
        # except would also swallow KeyboardInterrupt and genuine bugs.
        return default

def print_leaderboard_and_get_best_model(time_limit = None):
    """Print a leaderboard of experiments and pick a checkpoint of the best one.

    Every entry of the ``logs`` directory whose name does not contain
    'smoke' is scanned: ``logs/<expt>/log.txt`` is read and, per experiment,
    the line containing both 'val' and 'byte loss' with the lowest byte loss
    is kept. Experiments are then printed in ascending order of that loss.

    Args:
        time_limit: Optional upper bound on the ``ds`` value of a log line.
            The logs print it with an 's' suffix, so it is presumably
            elapsed seconds (confirm against the training logger). Lines
            whose value exceeds the bound, or cannot be parsed, are ignored.
            ``None`` disables the filter.

    Returns:
        ``(best_expt, best_model_path)``: the name of the top-ranked
        experiment and the file name inside its directory that contains
        'model' and sorts last by its second '_'-separated field. With names
        like ``model_06399.pt`` that is the highest-numbered checkpoint,
        which need not be the step of the best-scoring line.

    Raises:
        IndexError: If no experiment has a usable validation line, or the
            best experiment's directory holds no checkpoint file.
    """
    no_loss_yet = 10000    # sentinel: no accepted validation line so far
    unparseable = 100000   # fallback handed to get_value for bad fields

    candidates = []
    for expt in os.listdir('logs'):
        if 'smoke' in expt:
            continue

        log_path = f'logs/{expt}/log.txt'
        # Stray files and runs that have not written a log yet are skipped
        # instead of aborting the whole leaderboard.
        if not os.path.isfile(log_path):
            continue

        best_loss_for_expt = no_loss_yet
        best_line = ''

        # Context manager so the log file handle is closed promptly.
        with open(log_path) as log_file:
            for line in log_file:
                if 'val' not in line or 'byte loss' not in line:
                    continue

                if time_limit is not None:
                    elapsed = get_value('ds', line, unparseable)
                    if elapsed > time_limit:
                        continue

                byte_loss = get_value('byte loss', line, unparseable)
                if byte_loss < best_loss_for_expt:
                    best_loss_for_expt = byte_loss
                    best_line = line

        if best_loss_for_expt < no_loss_yet:
            candidates.append((best_loss_for_expt, expt, best_line))

    if not candidates:
        raise IndexError(
            f"no experiment under 'logs' has a usable validation byte loss "
            f"(time_limit={time_limit!r})"
        )

    candidates = sorted(candidates, key=lambda x: x[0])
    for best_loss, expt, line in candidates:
        print(expt.ljust(50), f'{best_loss:.3} {line.strip()}')

    best_expt = candidates[0][1]
    best_model_paths = [model_path for model_path in os.listdir(f'logs/{best_expt}') if 'model' in model_path]
    if not best_model_paths:
        raise IndexError(f"no checkpoint file containing 'model' in logs/{best_expt}")

    # Names without an underscore get an empty sort key instead of raising.
    best_model_path = sorted(
        best_model_paths,
        key=lambda x: x.split('_')[1] if '_' in x else '',
    )[-1]
    return best_expt, best_model_path

# Rank the experiments using only validation lines whose logged `ds` value is
# at most 720 (the logs print it with an 's' suffix, so presumably seconds —
# confirm), and remember the winner for the checkpoint-loading cell below.
best_expt, best_model_path = print_leaderboard_and_get_best_model(time_limit=720)



lucky_diego_1                                      0.364 step 3750 | val loss 1.4910 | byte loss 0.3645 | ds 691.5s
16l_16h_512d_hour_good_max                         0.366 step 6250 | val loss 1.4960 | byte loss 0.3657 | ds 708.9s
16l_16h_512d_hour                                  0.369 step 6250 | val loss 1.5093 | byte loss 0.3690 | ds 708.7s
diego_is_12m_man                                   0.37 step 4299 | val loss 1.5155 | byte loss 0.3705 | ds 376.3s
16l_16h_512d_3_hour_good_max_hig_prec              0.375 step 4000 | val loss 1.5341 | byte loss 0.3750 | ds 686.7s
16l_16h_512d_3_hour_good_max_med_prec_fixed_shuffle 0.376 step 4000 | val loss 1.5363 | byte loss 0.3756 | ds 687.0s
lucky_diego_2                                      0.378 step 3750 | val loss 1.5461 | byte loss 0.3780 | ds 691.5s
lucky_diego_5                                      0.379 step 3750 | val loss 1.5490 | byte loss 0.3787 | ds 693.7s
lucky_diego_6                                      0.379 step 3750 | val

In [9]:
# TODO: this duplicated function is clumsy; make the time limit a parameter of the existing print_leaderboard_and_get_best_model function instead.


def print_leaderboard_and_get_best_model_fast():
    """Print a leaderboard of experiments (no time filter) and pick a checkpoint.

    Every entry of the ``logs`` directory whose name does not contain
    'smoke' is scanned: ``logs/<expt>/log.txt`` is read and, per experiment,
    the line containing both 'val' and 'byte loss' with the lowest byte loss
    is kept. Experiments are then printed in ascending order of that loss.

    NOTE(review): this largely duplicates
    ``print_leaderboard_and_get_best_model`` called without a time limit;
    consider keeping only one of the two.

    Returns:
        ``(best_expt, best_model_path)``: the name of the top-ranked
        experiment and the file name inside its directory that contains
        'model' and sorts last by its second '_'-separated field.

    Raises:
        IndexError: If no experiment has a usable validation line, or the
            best experiment's directory holds no checkpoint file.
    """
    no_loss_yet = 10000    # sentinel: no accepted validation line so far
    unparseable = 100000   # loss assigned to lines whose value cannot be read

    def parse_byte_loss(line):
        # The value is the second space-separated token after 'byte loss'.
        after_bl_str = line.split('byte loss')[1]
        try:
            return float(after_bl_str.split(' ')[1])
        except (ValueError, IndexError):
            # Truncated or malformed line: rank it behind every real loss.
            return unparseable

    candidates = []
    for expt in os.listdir('logs'):
        if 'smoke' in expt:
            continue

        log_path = f'logs/{expt}/log.txt'
        # Stray files and runs that have not written a log yet are skipped
        # instead of aborting the whole leaderboard.
        if not os.path.isfile(log_path):
            continue

        best_loss_for_expt = no_loss_yet
        best_line = ''

        # Context manager so the log file handle is closed promptly.
        with open(log_path) as log_file:
            for line in log_file:
                if 'val' in line and 'byte loss' in line:
                    byte_loss = parse_byte_loss(line)
                    if byte_loss < best_loss_for_expt:
                        best_loss_for_expt = byte_loss
                        best_line = line

        if best_loss_for_expt < no_loss_yet:
            candidates.append((best_loss_for_expt, expt, best_line))

    if not candidates:
        raise IndexError("no experiment under 'logs' has a usable validation byte loss")

    candidates = sorted(candidates, key=lambda x: x[0])
    for best_loss, expt, line in candidates:
        print(expt.ljust(50), f'{best_loss:.3} {line.strip()}')

    best_expt = candidates[0][1]
    best_model_paths = [model_path for model_path in os.listdir(f'logs/{best_expt}') if 'model' in model_path]
    if not best_model_paths:
        raise IndexError(f"no checkpoint file containing 'model' in logs/{best_expt}")

    # Names without an underscore get an empty sort key instead of raising.
    best_model_path = sorted(
        best_model_paths,
        key=lambda x: x.split('_')[1] if '_' in x else '',
    )[-1]
    return best_expt, best_model_path
# Last expression of the cell: prints the unfiltered leaderboard and lets the
# notebook display the returned (experiment, checkpoint file name) tuple.
print_leaderboard_and_get_best_model_fast()

lucky_diego_1                                      0.364 step 3750 | val loss 1.4910 | byte loss 0.3645 | ds 691.5s
16l_16h_512d_hour_good_max                         0.366 step 6250 | val loss 1.4960 | byte loss 0.3657 | ds 708.9s
16l_16h_512d_hour                                  0.369 step 6250 | val loss 1.5093 | byte loss 0.3690 | ds 708.7s
16l_16h_512d_3_hour_good_max_hig_prec              0.375 step 4000 | val loss 1.5341 | byte loss 0.3750 | ds 686.7s
16l_16h_512d_3_hour_good_max_med_prec_fixed_shuffle 0.376 step 4000 | val loss 1.5363 | byte loss 0.3756 | ds 687.0s
lucky_diego_2                                      0.378 step 3750 | val loss 1.5461 | byte loss 0.3780 | ds 691.5s
lucky_diego_5                                      0.379 step 3750 | val loss 1.5490 | byte loss 0.3787 | ds 693.7s
lucky_diego_6                                      0.379 step 3750 | val loss 1.5497 | byte loss 0.3788 | ds 691.9s
16l_16h_512d_8_hour_good_max                       0.38 step 6250 | val

('lucky_diego_1', 'model_06399.pt')

In [3]:
# Path of the checkpoint picked by the leaderboard cell (best_expt / best_model_path).
choosen_model = f'logs/{best_expt}/{best_model_path}'
# Manual override: uncomment to load one specific checkpoint instead.
# choosen_model = f'logs/16l_16h_512d_quick/model_00323.pt'
# NOTE(review): torch.load relies on pickle; unless restricted with
# weights_only=True it can execute code embedded in the file, so only load
# checkpoints from trusted sources.
full_checkpoint = torch.load(choosen_model)
# The checkpoint is indexed like a mapping; 'config' is read here and 'model'
# (the saved weights) further down in this cell.
config = full_checkpoint['config']
# Build a fresh GPT from the stored config; weights are loaded below.
m = GPT(config)

def remove_orig_mod_prefix(state_dict):
    """Return a copy of ``state_dict`` with '_orig_mod.' removed from every key.

    Every occurrence of the substring is dropped, wherever it appears in the
    key (presumably it is the prefix added by torch.compile-wrapped modules
    — confirm). Values are passed through untouched and the input mapping is
    not modified.

    Args:
        state_dict: Mapping from parameter names (``str``) to values.

    Returns:
        A new ``dict`` with the cleaned keys, in the original iteration order.
    """
    cleaned = {}
    for name, value in state_dict.items():
        cleaned[name.replace('_orig_mod.', '')] = value
    return cleaned

# Load the saved weights after stripping '_orig_mod.' from their key names so
# they match the parameter names of the freshly built model.
m.load_state_dict(remove_orig_mod_prefix(full_checkpoint['model']))
# Move the model to the GPU; as the cell's last expression, the returned
# module is what the notebook displays.
m.to('cuda')

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(8192, 512)
    (wpe): Embedding(512, 512)
    (h): ModuleList(
      (0-15): 16 x Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=512, out_features=1536, bias=True)
          (c_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=512, out_features=1024, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=1024, out_features=512, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=512, out_features=8192, bias=False)
)

In [4]:
# Load the 'activated-ai/tiny-stories-8k-tokenizer' tokenizer through Hugging
# Face transformers (its files are downloaded on first use).
enc = AutoTokenizer.from_pretrained('activated-ai/tiny-stories-8k-tokenizer')

tokenizer_config.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/324k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

In [5]:
# Sample continuations of the prompt from the loaded model using the tokenizer.
# NOTE(review): the meaning of the positional arguments 255 and 4 is defined in
# gpt.generate, which is not visible here (presumably a length limit and the
# number of samples) — confirm.
generate(m, enc, "Lily went to the park and saw a friendly dog.", 255, 4)

sample 0: Lily went to the park and saw a friendly dog. She said, "Hello, doggy! Do you want to play with me?" The dog barked happily. Lily and the dog played together for a long time.

Suddenly, Lily's mom came to get her. She said, "Lily, it's time to go home. What's in your hand?" Lily showed her mom the penny and said, "Look, Mommy! I found this in the park! Is this a penny?" Her mom explained, "Yes, that's a penny. It's a lot of money. We can use it to buy something at the store someday."

Lily was happy to have met a friendly dog and a penny. She knew she would always remember the fun day at the park with her mom.
sample 1: Lily went to the park and saw a friendly dog. The dog was very polite and didn't bark too loud. Lily thought the dog was nice and she admired his shiny collar. 

After playing in the park, Lily went home and told her mom about the dog she admired. Her mom was happy that Lily had a fun time and was polite to the dog. Lily also thought about the nice dog she saw