In [7]:
import tiktoken
import torch

This notebook is a scratchpad for rapid prototyping of new ideas.

In [2]:
import requests

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# Fetch the tiny-Shakespeare text file. The timeout stops a stalled connection
# from hanging the kernel indefinitely, and raise_for_status() turns an HTTP
# error (404, 500, ...) into an exception instead of silently saving the error
# page body as if it were the dataset.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Save to a file
with open('input.txt', 'w', encoding='utf-8') as f:
    f.write(response.text)

print("Downloaded successfully!")

Downloaded successfully!


In [3]:
# Read the file back with the same encoding it was written with. Without an
# explicit encoding, open() uses the platform's locale default, which is not
# guaranteed to be UTF-8.
with open("input.txt", 'r', encoding="utf-8") as f:
    text = f.read()

# Keep only the first 1000 characters and preview the first 100 of them.
data = text[:1000]
print(data[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [17]:
# Look up the encoding that tiktoken registers under the name "gpt2" and
# convert the text sample into a list of integer token ids.
enc = tiktoken.get_encoding("gpt2")
tokens = enc.encode(data)

# Show the first 24 token ids.
preview = tokens[:24]
print(preview)

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13]


Token `198` is the newline character `\n`; spotting it in the output is a quick sanity check that the encoding is correct.

In [18]:
# create inputs and outputs using .view operations
batch_size, seq_len = 4, 6  # rows in the batch, tokens per row

# Take one token more than batch_size * seq_len so that every input position
# has a following token to use as its target.
buf = torch.tensor(tokens[:batch_size * seq_len + 1])

x = buf[:-1].view(batch_size, seq_len)
# output is input shifted by 1
y = buf[1:].view(batch_size, seq_len)

print(f"input batch = {x} \noutput batch = {y}")

input batch = tensor([[ 5962, 22307,    25,   198,  8421,   356],
        [ 5120,   597,  2252,    11,  3285,   502],
        [ 2740,    13,   198,   198,  3237,    25],
        [  198,  5248,   461,    11,  2740,    13]]) 
output batch = tensor([[22307,    25,   198,  8421,   356,  5120],
        [  597,  2252,    11,  3285,   502,  2740],
        [   13,   198,   198,  3237,    25,   198],
        [ 5248,   461,    11,  2740,    13,   198]])


### Weight tying - exploring the OpenAI weights for the lm_head and token embedding (wte) tensors

In [1]:
from transformers import GPT2LMHeadModel

# Load the pretrained GPT-2 checkpoint by its canonical lowercase model id.
hf = GPT2LMHeadModel.from_pretrained("gpt2") # 124M, use "gpt2-xl" for the actual 1.5B model
sd_hf = hf.state_dict() # mapping from parameter name to tensor

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [3]:
# Print each entry of the state dict as "<parameter name> | <tensor shape>".
for name, tensor in sd_hf.items():
    print(f"{name} | {tensor.shape}")

transformer.wte.weight | torch.Size([50257, 768])
transformer.wpe.weight | torch.Size([1024, 768])
transformer.h.0.ln_1.weight | torch.Size([768])
transformer.h.0.ln_1.bias | torch.Size([768])
transformer.h.0.attn.c_attn.weight | torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias | torch.Size([2304])
transformer.h.0.attn.c_proj.weight | torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias | torch.Size([768])
transformer.h.0.ln_2.weight | torch.Size([768])
transformer.h.0.ln_2.bias | torch.Size([768])
transformer.h.0.mlp.c_fc.weight | torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias | torch.Size([3072])
transformer.h.0.mlp.c_proj.weight | torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias | torch.Size([768])
transformer.h.1.ln_1.weight | torch.Size([768])
transformer.h.1.ln_1.bias | torch.Size([768])
transformer.h.1.attn.c_attn.weight | torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias | torch.Size([2304])
transformer.h.1.attn.c_proj.weight | torch.Size([768, 7

In [5]:
# Print the shapes of the wte and lm_head weight tensors, one per line.
for key in ('transformer.wte.weight', 'lm_head.weight'):
    print(sd_hf[key].shape)

torch.Size([50257, 768])
torch.Size([50257, 768])


Let's verify whether they match in the OpenAI pretrained weights.

In [8]:
# Compare the two weight tensors element-wise (within allclose's default
# tolerances); the result is displayed as the cell output.
wte_weight = sd_hf['transformer.wte.weight']
lm_head_weight = sd_hf['lm_head.weight']
torch.allclose(wte_weight, lm_head_weight)

True

See! So let's go back to `sec1.py` and enforce this condition. This is called weight tying, and it saves us 50257*768 ≈ 38.6M parameters. [Here is a long post](https://www.reddit.com/r/MachineLearning/comments/1eqm0lr/r_why_and_when_tying_embedding_a_story/) on _why it works_. 

### effect of initialization on residual streams

In [9]:
# standard deviation grows inside the residual stream
n_embd = 768  # length of the simulated residual-stream vector
n = 100 # e.g. 100 layers

# A dedicated, seeded generator makes the printed numbers reproducible on every
# run without touching torch's global RNG state.
rng = torch.Generator().manual_seed(0)

x = torch.zeros(n_embd)  # sum of contributions scaled by 1/sqrt(n)
y = torch.zeros(n_embd)  # sum of unscaled contributions

for i in range(n):
    num = torch.randn(n_embd, generator=rng)
    x += n**-0.5 * num
    y += num

# The scale factor applied above is n**-0.5, where n is the number of layers,
# so the label names n_layers rather than in_features.
print(f'without init: {y.std()} \nwith init factor 1/sqrt(n_layers): {x.std()}')

without init: 10.133536338806152 
with init factor 1/sqrt(in_features): 1.013353705406189


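__What happened__: each `num` is sampled from a standard normal distribution, but summing `n` independent samples makes the standard deviation of the running sum grow to $\sqrt{n}$ (here $\sqrt{100} = 10$), where `n` is the number of layers adding into the residual stream. So we scale each sample by $1/\sqrt{n}$ before adding it to `x`, which keeps the standard deviation of `x` close to 1.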