In [49]:
# # load dataset

# from datasets import load_dataset
# from tokenizers import ByteLevelBPETokenizer

# tokenizer = ByteLevelBPETokenizer()
# dataset = load_dataset("roneneldan/TinyStories")

# # Specify the split you want to save (e.g., "train", "validation", "test")
# split = "train"

# # Get the desired split from the dataset
# subset = dataset[split]

# # Save the subset to a text file
# subset.to_csv("tinystories-train.txt", sep="\t", index=False)


In [50]:
#----- imports --------

import tqdm
import torch
from torch import nn
import wandb
import os
import tokenizers
from matplotlib import pyplot as plt
import numpy as np
import json


device= 'cuda' if torch.cuda.is_available() else 'cpu'
torch.set_default_device(device)
assert device == 'cuda', "This notebook is not optimized for CPU"

config = {
    "learning_rate": 1e-3,
    "sae_learning_rate": 5e-5,
    "model_embedding_layer": 6,
    "eval_interval": 500,
    "max_iters": 60000, 
    "H": 32, # hidden dimension size
    "B": 64,
    "T": 256,
    "C": 256,
    "feedforward_factor": 3,
    "n_heads": 8,
    "n_layers": 12,
    "tokenizer_vocab_size": 2**13,
    "git_hash": os.popen("git rev-parse HEAD").read().strip()
}

# initial
for k,v in config.items():
    locals ()[k] = v
model_name = "tiny-stories-model-kurtosis-regularize-0.44-loss.pt"
model_path = f'models/{model_name}'
# model_name = "tiny-stories-model-kurtosis-regularize-0.44-loss.pt"


#wandb.init(
#    project = "tinystories",
#    config = config,
#)

In [51]:
# Create the ./residuals directory if it doesn't already exist
import os


if not os.path.exists('./residuals'):
    os.makedir('./residuals')
    print("Created ./residuals directory")
else:
    print("./residuals directory already exists")

if not os.path.exists(f'./residuals/{model_name}'):
    os.mkdir(f'./residuals/{model_name}')
    print(f'created model {model_name} subdir')
else:
    print(f"./residuals/{model_name}")

dir = f"./residuals/{model_name}"

./residuals directory already exists
created model tiny-stories-model-kurtosis-regularize-0.44-loss.pt subdir


In [52]:

# stories_data = []
# data_dir = './data'
# for filename in os.listdir(data_dir):
#     file_path = os.path.join(data_dir, filename)
#     if filename.endswith('.json'):
#         with open(file_path, 'r', encoding='utf-8') as f:
#             data = json.load(f)
#             stories_data.extend(data)






In [53]:
# # load the tinystories tokenizer
# tokenizer = tokenizers.ByteLevelBPETokenizer(
#     "./tiny-stories-bpe-vocab.json", 
#     "./tiny-stories-bpe-merges.txt"
# )



# def encode(text):
#     return torch.tensor(tokenizer.encode(text).ids, dtype=torch.int64)
# def decode(encoded_text):
#     return tokenizer.decode(encoded_text.tolist())

# from tqdm import tqdm

# encoded_stories = [encode(story['story']) for story in tqdm(stories_data, desc="Encoding stories")]



In [54]:
# # save the encoded stories to a file
# torch.save(encoded_stories, 'encoded-stories.pt')

In [55]:

with open('tinystories-train.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [56]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1922767089


In [57]:
1916206969/4

479051742.25

In [58]:
print("length of dataset in lines: ", len(text.split('\n')))

length of dataset in lines:  14815490


In [59]:
print(text[:1000])

One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.
Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."
Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.
<|endoftext|>
Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.
One day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were

In [60]:
# paths = ['tinystories-train.txt']
# tokenizer = tokenizers.ByteLevelBPETokenizer()

# tokenizer.train(files=paths, vocab_size=tokenizer_vocab_size, min_frequency=2)

# tokenizer.save_model('.', 'tiny-stories-bpe')



# enc = tokenizer.encode("She sells sea shells by the sea shore!")
# tokenizer.decode(enc.ids)



In [61]:
tokenizer = tokenizers.ByteLevelBPETokenizer(
    "./tiny-stories-bpe-vocab.json", 
    "./tiny-stories-bpe-merges.txt"
)


In [62]:

def encode(text):
    return tokenizer.encode(text).ids
def decode(encoded_text):
    return tokenizer.decode(encoded_text)

from tqdm import tqdm

def batch_encode(text, batch_size):
    tokens = []
    for i in tqdm(range(0, len(text), batch_size)):
        tokens.extend(encode(text[i:i+batch_size]))
    return tokens


hello_encoded = encode("hello")
print(hello_encoded)
print(decode(hello_encoded))
vocab_size = tokenizer.get_vocab_size()
print("vocab size: ", vocab_size)

[6099]
hello
vocab size:  8192


In [63]:
sample_text = text[:200000]
sample_encoded = batch_encode(sample_text, 20000)

# get the amount of memory used by sample_encoded
def recursive_memory_usage(python_obj):
    if isinstance(python_obj, (str, int, float)):
        return python_obj.__sizeof__()
    if isinstance(python_obj, dict):
        return sum([recursive_memory_usage(v) for v in python_obj.values()])
    if isinstance(python_obj, list):
        return sum([recursive_memory_usage(v) for v in python_obj])
    return python_obj.__sizeof__()

print("memory used by sample_encoded: ", recursive_memory_usage(sample_encoded) / 1024**2, "MB")


100%|██████████| 10/10 [00:00<00:00, 59.81it/s]

memory used by sample_encoded:  1.2849769592285156 MB





In [64]:
print("length of dataset in characters: ", len(text[:10000]))
print("length of dataset in tokens: ", len(encode(text[:10000])))
chars_per_token = len(text[:10000]) / len(encode(text[:10000]))
print("characters per token: ", chars_per_token)

length of dataset in characters:  10000
length of dataset in tokens:  2440
characters per token:  4.098360655737705


In [65]:
# encoded_text = batch_encode(text, 200000)
# # data = torch.tensor(encode(text), dtype=torch.int64)
# data = torch.tensor(encoded_text, dtype=torch.int64, device='cuda')
# print(data.dtype)
# print(data.size())
# print(data.device)
# torch.save(data, 'tiny-stories-train.pt')
# encoded_text = None


In [66]:
# load data from tiny-stories-train.pt
data = torch.load('tiny-stories-train.pt', map_location='cuda')


In [67]:
len(data)

468163695

In [68]:
n = int(0.9*len(data))

train_data = data[:n]
val_data = data[n:]

In [69]:
train_data.size()

torch.Size([421347325])

In [70]:
train_data[:T+1]

tensor([ 427,  357,   11,  258,  405,  452,  507,  365,  600,  258, 3729,  316,
         308,  763,   13,  312,  708,  303,  281, 2965,  265,  360,  342,  303,
         792,  303,  281, 2120,   13,  365,  450,  265,  953,  262, 3729,  342,
         308,  367,   11,  350,  338,  466, 5179,  258, 2227,  345,  308, 2498,
          13,  198,  343,  475,  265,  308,  367,  264,  326,   11,  328,  775,
          11,  335,  600,  745, 3729,   13, 1283,  346,  953,  303,  342,  525,
         264, 5179,  656, 2498,  484,  870,  367,  505,  264,  326,   11,  328,
         835,   11,  365,   11,  368,  478,  953,  262, 3729,  264, 1307,  633,
        2498,  421,  198, 4611,   11,  364, 1658,  262, 3729,  264, 7866,  262,
        2227,  345,  365,  374, 2498,   13,  415,  281,  393, 2965,  369,  454,
         792,  364,  435, 2500,  264, 1763,  761,  576,   13, 1454,  364, 1444,
          11,  365,  863,  308,  367,  369, 2500,  262, 3729,  264, 5132,  308,
        2498,   13,  320,  900,  520,  4

In [71]:
decode(train_data[:T+1].cpu().numpy())

'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.\n<|endoftext|>\nOnce upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.\nOne day, Beep was driving in the park when he saw a big tree. The tree had many leaves th

In [72]:
x = train_data[:T]
y = train_data[1:T+1]
for t in range(T):
    context = x[:t+1]
    target = y[t]
    # print("when we see the text", context, "we predict the next character is", target)

In [73]:

import torch
import torch.nn as nn
from torch.nn import functional as F
# torch.manual_seed(1337)


class Head(nn.Module):
    '''One Head of self-attention'''
    def __init__(self, H):
        super().__init__()
        self.query = nn.Linear(C, H, bias=False)
        self.key = nn.Linear(C, H, bias=False)
        self.value = nn.Linear(C, H, bias=False)
        # self.output = nn.Linear(H, C, bias=False) # output matrix
        self.register_buffer('tril', torch.tril(torch.ones(T, T)))

    def forward(self, x):
        # Query and Key matrices for the attention mechanism
        # x: 8 tokens
        # Q: 16 tall (arbitrary), 32 long channels
        # K: 16 tall (arbitrary), 32 long channels

        query_vectors = self.query(x)
        key_vectors = self.key(x)


        # Attention masking(so we can't look into the past):

        tril = self.tril
        wei = torch.zeros(T, T) 
        wei = wei.masked_fill(tril == 0, float('-inf')) # set the upper triangular to -inf
        # xbow = wei @ x # apply the mask to the input, bag of words because simple avg.

        # multiply the two to get the attention weights
        attention_pattern = query_vectors @ key_vectors.transpose(-2, -1) # T, T
        attention_pattern = attention_pattern / (H ** 0.5) # scale the attention pattern for numerical stability
        attention_weights = F.softmax(attention_pattern + wei, dim=-1) # T, T (the row dimension is the query)

        value_vectors = self.value(x) # the direction we should go in the embedding space for each token (ie more blue) T, H

        # apply the attention weights to the value vectors
        context = attention_weights @ value_vectors # T, H

        # project back into original space from value space
        # return self.output(context)
        return context

x = torch.randn(B,T,C)
head = Head(H)
# head(x)


In [74]:
class MultiHeadAttention(nn.Module):
    '''Multiple heads of self-attention'''
    def __init__(self, H, C, n_heads): # H is head embedding space size, n_heads is number of heads
        super().__init__()
        self.heads = nn.ModuleList([Head(H) for _ in range(n_heads)])
        self.combine_heads = nn.Linear(H*n_heads, C)


    def forward(self,x):
        x = torch.cat([head(x) for head in self.heads], dim=-1)
        x = self.combine_heads(x)  # T, C
        return x

In [75]:
head = MultiHeadAttention(H, C, n_heads)
head.heads[0].forward(x).shape


torch.Size([64, 256, 32])

In [76]:
class FeedForward(nn.Module):
    '''Feed-forward neural network'''
    def __init__(self, C):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(C, C * feedforward_factor),
            nn.ReLU(),
            nn.Linear(C * feedforward_factor, C),
        )

    def forward(self, x):
        return self.net(x)

In [77]:
class LayerNorm(nn.Module):
    '''Layer normalization'''
    def __init__(self, C, use_affine=True):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(C)) if use_affine else None
        self.beta = nn.Parameter(torch.zeros(C)) if use_affine else None

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        if self.gamma is not None and self.beta is not None:
            return self.gamma * (x - mean) / (std + 1e-6) + self.beta
        else:
            return (x - mean) / (std + 1e-6)

In [78]:
class Block(nn.Module):
    '''Transformer block'''
    def __init__(self, H, C, n_heads):
        super().__init__()
        self.attention = MultiHeadAttention(H, C, n_heads)
        self.ff = FeedForward(C)
        self.norm1 = LayerNorm(C, use_affine=True)
        self.norm2 = LayerNorm(C, use_affine=True)

    def forward(self, x):
        x = x + self.attention(self.norm1(x))
        x = x + self.ff(self.norm2(x))
        return x

In [79]:
class GPT(nn.Module):

    def __init__(self, n_layers):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, C) 
        self.position_embedding_table = nn.Embedding(T, C)
        self.lm_head = nn.Linear(C, vocab_size)
        self.layers = nn.ModuleList([Block(H, C, n_heads) for _ in range(n_layers)])
    
    def forward(self, idx, targets=None, return_residuals=None):
        B, T = idx.shape
        token_emb = self.token_embedding_table(idx) # batch_dim, sequence_dim, embedding_dim
        pos_emb = self.position_embedding_table(torch.arange(T))
        x = token_emb + pos_emb # token identities and positions contained

        if return_residuals == "first_embedding":
            return x

        def excess_kurtosis(emb):
            mean = torch.mean(emb, dim=-1, keepdim=True) # BxTx1
            std = torch.std(emb, dim=-1, keepdim=True) # BxTx1

            centralized = emb - mean #BxTxC
            fourth_moment = torch.mean(centralized**4, dim=-1, keepdim=True) # BxTx1
            kurtosis = torch.squeeze(fourth_moment / std**4, dim=-1) # BxT
            # view as a 1d vector
            kurtosis = kurtosis.view(-1) - 3
            # make each one min 0
            kurtosis = torch.maximum(kurtosis, torch.tensor(0.0))
            # sum over the vector
            kurtosis = torch.sum(kurtosis)
            return kurtosis


        kurtosis_sum = torch.tensor(0.0)
        for i, layer in enumerate(self.layers):
            x = layer(x)
            kurtosis_sum += excess_kurtosis(x)
            if return_residuals is not None and i == return_residuals:
                return x
        
        kurtosis_avg = kurtosis_sum / (len(self.layers) * T * B)

        logits = self.lm_head(x) # batch_dim, sequence_dim, vocab_size

        batch_dim, sequence_dim, embedding_dim = logits.size()

        # loss = F.cross_entropy(logits, targets) this won't work because we need 1d logits and 1d targets
        # one-hot-vectors are a line in the x-dimension, so the shape of shape of the logits should be (-1, vocab_size).

        if targets is None:
            return logits, None, kurtosis_avg
        else:
            # a list of all the predictions, reguardles of batch.
            # xdim: probabilities of each character in the vocab (embedding_dim=vocab_size)
            # ydim: all predictions for all batches flattened (batch_dim*sequence_dim)
            logits_loss_view = logits.view(-1, vocab_size) 
            # targets loss view
            # xdim: all targets for all batches flattened (batch_dim*sequence_dim)
            # so this would be like, [1,4,5,1,2,3, ...]
            # where each number is the correct next index of the one hot vector
            targets_loss_view = targets.view(-1)
            loss = F.cross_entropy(logits_loss_view, targets_loss_view)
            return logits, loss, kurtosis_avg

    def generate(self, idx, max_new_tokens, temperature=0.5):
        for _ in range(max_new_tokens):
            logits, loss = self(idx[:,-T:])
            # get the predictions of the last token
            last_token_logits = logits[:, -1, :] # all batches, last token, all probabilities
            # apply temperature
            last_token_logits = last_token_logits / temperature
            # softmax to get probabilities
            probabilities = F.softmax(last_token_logits, dim=-1)
            # sample from the probabilities
            next_token = torch.multinomial(probabilities, num_samples=1)
            # add the new token to the idx tensor
            idx = torch.cat((idx, next_token), dim=1)
        return idx
    def prompt_model(self, prompt, max_new_tokens, temperature=0.5):
        autoregressive_seq = encode(prompt)
        for _ in range(max_new_tokens):
            prediction_index = len(autoregressive_seq)-1

            model_input = torch.tensor(autoregressive_seq)
            
            while model_input.shape[0] < T:
                pad_token = torch.tensor(encode("\n"))
                model_input = torch.cat((model_input, pad_token), dim=0)

            model_input
            model_input = model_input.unsqueeze(0)

            logits, loss, kurtosis_avg = model(model_input)
            prediction_token = logits[:, prediction_index, :] / temperature
            probabilities = F.softmax(prediction_token, dim=-1)
            next_token = torch.multinomial(probabilities, num_samples=1)
            next_token = next_token.item()

            autoregressive_seq.append(next_token)
        # get the autoregressive sequence
        return decode(autoregressive_seq)
    def get_embedding(self, prompt, override_model_embedding_layer=None):
        if override_model_embedding_layer is None:
            selected_model_embedding_layer = model_embedding_layer
        else:
            selected_model_embedding_layer = override_model_embedding_layer
        sequence = encode(prompt)
        model_input = torch.tensor(sequence)
        sequence_index = len(sequence) - 1
        while model_input.shape[0] < T:
            pad_token = torch.tensor(encode("\n"))
            model_input = torch.cat((model_input, pad_token), dim=0)
        model_input = model_input.unsqueeze(0)
        embedding = self.forward(model_input, return_residuals=selected_model_embedding_layer)
        # remove the batch dimension
        embedding = embedding.squeeze(0)[sequence_index]
        return embedding



    

model = GPT(n_layers)

In [80]:
# load the model
model.load_state_dict(torch.load(model_path))



<All keys matched successfully>

In [81]:
print(model.prompt_model(
    "<|endoftext|>",
    100,
    0.3
))

<|endoftext|>

Once upon a time, there was a little girl called Lucy. She was only three years old and loved to play. One day, she was playing in the garden when she noticed something strange. There was a big, round object in the grass.
Lucy was curious and wanted to see what it was. She slowly walked closer to the object and saw that it was a big, round object. She was so excited, she wanted to touch it.
Suddenly, a voice called out


# saving embeddings

In [82]:
# Freeze model parameters and disable building compute backprop graph
for param in model.parameters():
    param.requires_grad = False

torch.set_grad_enabled(False)


<torch.autograd.grad_mode.set_grad_enabled at 0x7f92c35ee1a0>

In [83]:


def get_context_window(split, ix):
    data = train_data if split == 'train' else val_data
    x = torch.stack([data[i:i+T] for i in ix]) # random sequences
    y = torch.stack([data[i+1:i+T+1] for i in ix]) # next character for each random sequence

    return x, y


accumulated_residuals = []
residuals_per_save = 2_000
save_idx = 0
ratio_residuals_save = 0.1

with torch.no_grad():
    for split in ['train', 'val']:
        data = train_data if split == 'train' else val_data
        tokens_in_batch = B*T
        for i in tqdm(range(0, len(data)-tokens_in_batch, tokens_in_batch)): # the - tokens_in_batch is there so we skip the last, potentially unfull batch
            # B ixs, step is T, start at i
            ixs = torch.arange(i, i+tokens_in_batch, T)
            xb, yb = get_context_window(split, ixs)
            residuals = model(xb, return_residuals=model_embedding_layer)
            residuals_flattened = residuals.view(-1, T)
            indices = torch.randperm(tokens_in_batch)[:int(tokens_in_batch*ratio_residuals_save)]
            sampled_residuals = residuals_flattened[indices].clone()
            del residuals
            del residuals_flattened

            accumulated_residuals.append(sampled_residuals)
            if len(accumulated_residuals) >= residuals_per_save:
                torch.save(accumulated_residuals, f"{dir}/residuals_{split}_{save_idx}.pt")
                accumulated_residuals = []
                save_idx += 1
        torch.save(accumulated_residuals, f"{dir}/residuals_{split}_{save_idx}.pt")




        


  0%|          | 3/25716 [00:00<14:36, 29.34it/s]

100%|██████████| 25716/25716 [16:58<00:00, 25.26it/s]  
100%|██████████| 2857/2857 [02:09<00:00, 21.98it/s] 


# Residuals Kurtosis evaluation

In [85]:
def load_tensor(filepath):
    # load the .pt tensor
    tensor = torch.load(filepath)
    tensor = torch.cat(tensor, dim=0)
    tensor = tensor.to(device)
    return tensor
    
residuals = load_tensor(f"{dir}/residuals_train_1.pt")

In [86]:
residuals.shape

torch.Size([3276000, 256])

In [87]:
# story1='''Once upon a time, in a big forest, there lived a rhinoceros named Roxy. Roxy loved to climb. She climbed trees, rocks, and hills. One day, Roxy found an icy hill. She had never seen anything like it before. It was shiny and cold, and she wanted to climb it.
# Roxy tried to climb the icy hill, but it was very slippery. She tried again and again, but she kept falling down. Roxy was sad. She wanted to climb the icy hill so much. Then, she saw a little bird named Billy. Billy saw that Roxy was sad and asked, "Why are you sad, Roxy?"
# Roxy told Billy about the icy hill and how she couldn't climb it'''

# assume BxTxC
def excess_kurtosis(emb):
    mean = torch.mean(emb, dim=-1, keepdim=True) # BxTx1
    std = torch.std(emb, dim=-1, keepdim=True) # BxTx1

    centralized = emb - mean #BxTxC
    fourth_moment = torch.mean(centralized**4, dim=-1, keepdim=True) # BxTx1
    kurtosis = torch.squeeze(fourth_moment / std**4, dim=-1) # BxT
    return kurtosis - 3

excess_kurtosis(residuals[0])



# emb1 = model.get_embedding("Tim and Lily saw a big dog", override_model_embedding_layer=6)
# emb2 = model.get_embedding("Tim and Lily noticed a cat", override_model_embedding_layer=6)


# import matplotlib.pyplot as plt
# import numpy as np



# # Plot emb1 and emb2 in the same plot
# # plt.figure(figsize=(10, 5))
# # plt.plot(np.square(emb1.cpu().detach().numpy()), label='emb1', color='blue')
# # plt.plot(np.square(emb2.cpu().detach().numpy()), label='emb2', color='red')
# # plt.xlabel('Index')
# # plt.ylabel('Value')
# # plt.title('emb1 and emb2 Plot')
# # plt.legend()
# # plt.show()



# # get the index of the highest value
# # Assuming emb1 and emb2 are tensors
# highest_value_index_emb1 = torch.argmax(emb1).item()
# highest_value_index_emb2 = torch.argmax(emb2).item()

# lowest_value_index_emb1 = torch.argmin(emb1).item()
# lowest_value_index_emb2 = torch.argmin(emb2).item()

# print(f"Index of the highest value in emb1: {highest_value_index_emb1}")
# print(f"Index of the highest value in emb2: {highest_value_index_emb2}")
# print(f"Index of the lowest value in emb1: {lowest_value_index_emb1}")
# print(f"Index of the lowest value in emb2: {lowest_value_index_emb2}")

# print(f"emb1 excess kurtosis: {excess_kurtosis(emb1)}")
# print(f"emb2 excess kurtosis: {excess_kurtosis(emb2)}")

# # dot product between emb1 and emb2
# emb1_l2 = F.normalize(emb1, p=2, dim=-1)
# emb2_l2 = F.normalize(emb2, p=2, dim=-1)
# print(f"Dot product between emb1 and emb2: {torch.dot(emb1_l2, emb2_l2)}")



tensor(1.8295, device='cuda:0')