In [1]:
# from models.tokenizer import Tokenizer
from models.model import *

In [None]:
# Sequence lengths to record: 10, 20, ..., 100.
lengths = list(range(10, 101, 10))

# Save the lengths to results.csv. The header declares two columns, so every
# row carries two fields as well; `accuracy` is left empty because no accuracy
# values exist at this point.
with open('results.csv', 'w') as f:
    f.write('length,accuracy\n')
    f.writelines(f'{length},\n' for length in lengths)

In [2]:
import torch
import os

In [50]:
# Repr of a larger configuration, kept for reference:
# ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_batch_size=32, max_seq_len=2048)
#
# Other (dim, n_layers) pairs previously tried here: (1024, 16) and (2048, 24).
small_config = dict(
    vocab_size=32768,
    dim=512,
    n_layers=6,
    n_heads=16,
    ffn_dim_multiplier=2,
)
args = ModelArgs(**small_config)
model = Transformer(args)
# Last expression: display the resolved arguments, including defaults.
args

ModelArgs(dim=512, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=32768, multiple_of=1, ffn_dim_multiplier=2, norm_eps=1e-05, rope_theta=500000, max_batch_size=32, max_seq_len=2048)

In [39]:
def _numel(tensors):
    """Return the total number of elements across an iterable of tensors."""
    return sum(t.numel() for t in tensors)


# Token-embedding table; reported separately from the rest of the model.
embedding_weight = model.tok_embeddings.weight

# Sizes in bytes.
total_bytes = sum(w.numel() * w.element_size() for w in model.parameters())
embedding_bytes = embedding_weight.numel() * embedding_weight.element_size()
model_bytes = total_bytes - embedding_bytes

print(f'Model size:     {model_bytes:,} bytes')
print(f'Embedding size: {embedding_bytes:,} bytes')
print(f'Total size:     {total_bytes:,} bytes')
print()

# The same split, counted in parameters.
total_params = _numel(model.parameters())
embedding_params = embedding_weight.numel()
model_params = total_params - embedding_params

print(f'Model params:     {model_params:,}')
print(f'Embedding params: {embedding_params:,}')
print(f'Total params:     {total_params:,}')
print()

# Trainable parameters only, split by rank: tensors with at least two
# dimensions go to `decay_params`, the rest to `nodecay_params`.
param_dict = {
    name: weight
    for name, weight in model.named_parameters()
    if weight.requires_grad
}
decay_params = [weight for weight in param_dict.values() if weight.dim() >= 2]
nodecay_params = [weight for weight in param_dict.values() if weight.dim() < 2]

print(f'Decay params:   {_numel(decay_params):13,}')
print(f'Nodecay params: {_numel(nodecay_params):13,}')

Model size:     130,050,048 bytes
Embedding size: 67,108,864 bytes
Total size:     197,158,912 bytes

Model params:     32,512,512
Embedding params: 16,777,216
Total params:     49,289,728

Decay params:      49,283,072
Nodecay params:         6,656


In [51]:
# Move the model onto CUDA device index 5 (hard-coded; later cells in this
# notebook also refer to device 5).
model = model.cuda(5)

In [41]:
EMB_KEY = 'tok_embeddings.weight'

sd = model.state_dict()
# Write in place through the tensor held by the state dict; the print of the
# model's own weight below shows whether the model sees this write.
sd[EMB_KEY][0][0] = 1
# Rebind only this dictionary entry to a CPU copy of the tensor.
sd[EMB_KEY] = sd[EMB_KEY].cpu()

cpu_embedding = sd[EMB_KEY]
print(cpu_embedding)
print(cpu_embedding.device)
print(cpu_embedding.cpu().device)
# The model's own weight, for comparison with the dictionary entry above.
print(model.tok_embeddings.weight[0][0])
# Last expression: a freshly built state dict, to compare its device with the
# rebound entry in `sd`.
model.state_dict()

tensor([[ 1.0000,  0.2869,  1.1269,  ...,  0.3213, -1.5769,  1.9017],
        [-1.3611, -0.5551,  1.9559,  ..., -0.7281,  1.2629, -1.1705],
        [-1.3733,  0.2272,  1.1484,  ..., -1.3380,  0.8493, -0.3278],
        ...,
        [-1.5700, -1.1161, -0.2484,  ..., -0.9940,  0.3863, -1.2000],
        [-0.8732, -0.3420, -0.2484,  ..., -0.1483, -0.8331, -0.2186],
        [-1.8066,  1.9525,  0.5955,  ..., -0.6876, -0.6302, -0.1115]])
cpu
cpu
tensor(1., device='cuda:5', grad_fn=<SelectBackward0>)


OrderedDict([('tok_embeddings.weight',
              tensor([[ 1.0000,  0.2869,  1.1269,  ...,  0.3213, -1.5769,  1.9017],
                      [-1.3611, -0.5551,  1.9559,  ..., -0.7281,  1.2629, -1.1705],
                      [-1.3733,  0.2272,  1.1484,  ..., -1.3380,  0.8493, -0.3278],
                      ...,
                      [-1.5700, -1.1161, -0.2484,  ..., -0.9940,  0.3863, -1.2000],
                      [-0.8732, -0.3420, -0.2484,  ..., -0.1483, -0.8331, -0.2186],
                      [-1.8066,  1.9525,  0.5955,  ..., -0.6876, -0.6302, -0.1115]],
                     device='cuda:5')),
             ('layers.0.attention.wq.weight',
              tensor([[ 0.0003, -0.0128,  0.0400,  ...,  0.0423,  0.0324,  0.0247],
                      [-0.0348, -0.0405, -0.0411,  ..., -0.0153, -0.0129, -0.0347],
                      [-0.0010, -0.0343, -0.0246,  ..., -0.0222, -0.0367,  0.0193],
                      ...,
                      [-0.0088,  0.0218, -0.0368,  ...,  0.0203,

In [None]:
# Copy every tensor to the CPU before saving, so the checkpoint holds CPU
# tensors regardless of where the model currently lives.
state_dict = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
torch.save(state_dict, 'model.pt')

# Read the checkpoint back once and reuse it, both for restoring the model and
# for display, rather than deserialising the same file twice.
reloaded_state = torch.load('model.pt', weights_only=True)
model.load_state_dict(reloaded_state)
reloaded_state

{'tok_embeddings.weight': tensor([[ 1.0000,  1.0622, -0.2877,  ..., -0.4027,  0.4179, -0.9018],
         [-1.5353, -1.1946, -0.5567,  ..., -0.7425, -0.6992, -1.2020],
         [ 0.3854, -0.1506,  0.3458,  ...,  0.9315,  1.2356, -2.6614],
         ...,
         [ 0.6540,  2.3294, -2.2962,  ..., -0.1253,  0.1848, -0.4712],
         [ 0.8149, -1.7697,  0.6968,  ..., -0.3101, -0.2428,  0.7096],
         [ 0.7534, -0.0732,  1.1926,  ...,  0.0278, -0.7336, -1.3699]]),
 'layers.0.attention.wq.weight': tensor([[-0.0216, -0.0114,  0.0061,  ...,  0.0302, -0.0062,  0.0137],
         [ 0.0136, -0.0049, -0.0195,  ...,  0.0208, -0.0383, -0.0147],
         [-0.0010,  0.0347, -0.0438,  ..., -0.0066, -0.0179,  0.0043],
         ...,
         [-0.0353, -0.0021,  0.0440,  ...,  0.0237,  0.0080, -0.0329],
         [ 0.0193, -0.0149,  0.0413,  ...,  0.0029,  0.0067,  0.0271],
         [ 0.0234,  0.0396,  0.0275,  ...,  0.0082, -0.0126,  0.0352]]),
 'layers.0.attention.wk.weight': tensor([[-0.0290,  0.0091,

In [4]:
import pandas as pd

# Load the iris dataset from the seaborn-data repository and print it back
# out as CSV text (without the index column).
IRIS_CSV_URL = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
example_df = pd.read_csv(IRIS_CSV_URL)
iris_csv_text = example_df.to_csv(index=False)
print(iris_csv_text)

sepal_length,sepal_width,petal_length,petal_width,species
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5.0,3.4,1.5,0.2,setosa
4.4,2.9,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
5.4,3.7,1.5,0.2,setosa
4.8,3.4,1.6,0.2,setosa
4.8,3.0,1.4,0.1,setosa
4.3,3.0,1.1,0.1,setosa
5.8,4.0,1.2,0.2,setosa
5.7,4.4,1.5,0.4,setosa
5.4,3.9,1.3,0.4,setosa
5.1,3.5,1.4,0.3,setosa
5.7,3.8,1.7,0.3,setosa
5.1,3.8,1.5,0.3,setosa
5.4,3.4,1.7,0.2,setosa
5.1,3.7,1.5,0.4,setosa
4.6,3.6,1.0,0.2,setosa
5.1,3.3,1.7,0.5,setosa
4.8,3.4,1.9,0.2,setosa
5.0,3.0,1.6,0.2,setosa
5.0,3.4,1.6,0.4,setosa
5.2,3.5,1.5,0.2,setosa
5.2,3.4,1.4,0.2,setosa
4.7,3.2,1.6,0.2,setosa
4.8,3.1,1.6,0.2,setosa
5.4,3.4,1.5,0.4,setosa
5.2,4.1,1.5,0.1,setosa
5.5,4.2,1.4,0.2,setosa
4.9,3.1,1.5,0.2,setosa
5.0,3.2,1.2,0.2,setosa
5.5,3.5,1.3,0.2,setosa
4.9,3.6,1.4,0.1,setosa
4.4,3.0,1.3,0.2,setosa
5.1,3.4,1.5,0.2,setosa
5.0,3.5,1.3,0.3,setosa

In [None]:
# model.tok_embeddings.weight
# model(torch.randint(0, 32768, (4, 10)).cuda(5)).shape

Parameter containing:
tensor([[ 1.0000,  1.0622, -0.2877,  ..., -0.4027,  0.4179, -0.9018],
        [-1.5353, -1.1946, -0.5567,  ..., -0.7425, -0.6992, -1.2020],
        [ 0.3854, -0.1506,  0.3458,  ...,  0.9315,  1.2356, -2.6614],
        ...,
        [ 0.6540,  2.3294, -2.2962,  ..., -0.1253,  0.1848, -0.4712],
        [ 0.8149, -1.7697,  0.6968,  ..., -0.3101, -0.2428,  0.7096],
        [ 0.7534, -0.0732,  1.1926,  ...,  0.0278, -0.7336, -1.3699]],
       device='cuda:5', requires_grad=True)

In [43]:
# Re-read the saved checkpoint from disk and display its contents.
checkpoint_path = 'model.pt'
state_dict = torch.load(checkpoint_path, weights_only=True)
state_dict

OrderedDict([('tok_embeddings.weight',
              tensor([[ 1.0000,  0.2869,  1.1269,  ...,  0.3213, -1.5769,  1.9017],
                      [-1.3611, -0.5551,  1.9559,  ..., -0.7281,  1.2629, -1.1705],
                      [-1.3733,  0.2272,  1.1484,  ..., -1.3380,  0.8493, -0.3278],
                      ...,
                      [-1.5700, -1.1161, -0.2484,  ..., -0.9940,  0.3863, -1.2000],
                      [-0.8732, -0.3420, -0.2484,  ..., -0.1483, -0.8331, -0.2186],
                      [-1.8066,  1.9525,  0.5955,  ..., -0.6876, -0.6302, -0.1115]],
                     device='cuda:5')),
             ('layers.0.attention.wq.weight',
              tensor([[ 0.0003, -0.0128,  0.0400,  ...,  0.0423,  0.0324,  0.0247],
                      [-0.0348, -0.0405, -0.0411,  ..., -0.0153, -0.0129, -0.0347],
                      [-0.0010, -0.0343, -0.0246,  ..., -0.0222, -0.0367,  0.0193],
                      ...,
                      [-0.0088,  0.0218, -0.0368,  ...,  0.0203,

In [35]:
# The return value of .cuda() is dropped here; the print below checks where
# the model's embedding weight ends up.
model.cuda(5)
print(model.tok_embeddings.weight.device)
# Move the module back to the CPU; the state dict built here is discarded.
model.cpu().state_dict()
# Last expression: device of the embedding weight after the round trip.
model.tok_embeddings.weight.device

cuda:5


device(type='cpu')

In [8]:
# List each trainable parameter that has fewer than two dimensions, with its
# element count right-aligned in a 13-character column.
for name, param in param_dict.items():
    if param.dim() >= 2:
        continue
    print(f'{name:80} {param.numel():13,}')

layers.0.attention_norm.weight                                                           2,048
layers.0.ffn_norm.weight                                                                 2,048
layers.1.attention_norm.weight                                                           2,048
layers.1.ffn_norm.weight                                                                 2,048
layers.2.attention_norm.weight                                                           2,048
layers.2.ffn_norm.weight                                                                 2,048
layers.3.attention_norm.weight                                                           2,048
layers.3.ffn_norm.weight                                                                 2,048
layers.4.attention_norm.weight                                                           2,048
layers.4.ffn_norm.weight                                                                 2,048
layers.5.attention_norm.weight                    

In [6]:
# Smoke test: run a batch of 2 random sequences of 512 token ids (drawn from
# [0, 32000)) through the model and show the output shape.
dummy_tokens = torch.randint(0, 32000, (2, 512))
model(dummy_tokens, start_pos=0).shape

torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
torch.Size([2, 512, 1024]) tensor(0)
t

torch.Size([2, 512, 32768])

In [7]:
# Display the feed-forward sub-module of the first layer to inspect its
# linear layer sizes.
model.layers[0].feed_forward

FeedForward(
  (w1): Linear(in_features=1024, out_features=2048, bias=False)
  (w2): Linear(in_features=2048, out_features=1024, bias=False)
  (w3): Linear(in_features=1024, out_features=2048, bias=False)
)

# Delete later

In [10]:
# NOTE(review): presumably a deliberate barrier so that "Run All" stops before
# the scratch cells below — confirm, and remove together with those cells.
raise Exception

Exception: 

In [None]:
import torch
import torch.nn as nn

class Model(nn.Module):
    """Toy module: a 512x512 linear layer followed by a fixed all-ones matmul.

    The all-ones matrix is registered as a non-persistent buffer named ``t``
    instead of being stored as a plain tensor attribute.
    """

    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(512, 512)
        # Registered as a buffer (rather than `self.t = torch.ones(...)`);
        # persistent=False asks PyTorch to leave it out of the state dict.
        ones = torch.ones(512, 512)
        self.register_buffer('t', ones, persistent=False)

    def forward(self, x):
        """Apply the linear layer, then right-multiply by the ``t`` buffer."""
        hidden = self.l1(x)
        return hidden @ self.t

In [None]:
def _print_devices(module):
    """Print the device of the `t` buffer and of the linear layer's weight."""
    print(module.t.device, module.l1.weight.device)


# Compare where the buffer and the weight live before and after .to('cuda:0').
model = Model()
_print_devices(model)
model = model.to('cuda:0')
_print_devices(model)
# Show which entries the state dict contains.
print(model.state_dict().keys())


cpu cpu
cuda:0 cuda:0
odict_keys(['l1.weight', 'l1.bias'])


In [None]:
from transformers import AutoTokenizer

mistral

In [None]:
# os.environ['RANK'] = '0'
# os.environ['WORLD_SIZE'] = '1'
# os.environ['MASTER_ADDR'] = 'localhost'
# os.environ['MASTER_PORT'] = '29500'
# torch.distributed.init_process_group()

# from fairscale.nn.model_parallel.initialize import initialize_model_parallel
# initialize_model_parallel(1)

In [1]:
import tiktoken
# Load the encoding that tiktoken registers under the name "gpt2".
enc = tiktoken.get_encoding("gpt2")

In [3]:
# Display the `n_vocab` attribute of the loaded encoding.
enc.n_vocab

50257