In [1]:
import os
import sys
from pathlib import Path
sys.path.insert(1, os.path.realpath(os.path.pardir))


import safetensors
import torch
import torch.nn.functional as F
from accelerate import notebook_launcher
from einops import rearrange
from einops.layers.torch import Rearrange
from simple_parsing import ArgumentParser
import einops

from models import brainformer
from utils.data_utils import BrainDataset, get_tokenizer
from utils.train_utils import TrainConfig, run_train_model, count_parameters

from torch import nn
from models.brainformer import Encoder, CrossBlock, build_complex_rope_cache, Config


In [2]:
from transformers import GPT2Tokenizer
from models.gpt2_model import GPT

import tiktoken

from contextlib import nullcontext


In [3]:
device = 'cuda'

device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
dtype = 'float32'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)


In [4]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT.from_pretrained('gpt2', dict(dropout=0.0))
model.train().to(device)

print('initing completed')

loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0
number of parameters: 123.65M
initing completed


In [5]:
start = '<|endoftext|>i love you so much <|endoftext|>'

input_ids = gpt2_tokenizer(start,  return_tensors="pt")['input_ids']
input_ids = input_ids.to(device)

prefix = torch.randn(1, 32, model.config.n_embd, dtype=ptdtype, device=device)

print('Input shapes', input_ids.shape, prefix.shape)

loss, logits = model.forward(idx=input_ids, targets=input_ids, prefix=prefix, )

print(loss)

Input shapes torch.Size([1, 8]) torch.Size([1, 32, 768])
tensor(6.8662, device='cuda:0', grad_fn=<NllLossBackward0>)


In [12]:
model.train().to(device)

enc = tiktoken.get_encoding("gpt2")
encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
decode = lambda l: enc.decode(l)


start = "Russian is the best"
start_ids = encode(start)
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

prefix = torch.randn(1, 16, model.config.n_embd, dtype=ptdtype, device=device)

max_new_tokens = 15
temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 20

with torch.no_grad():
    with ctx:
        for k in range(3):
            y = model.generate(x, max_new_tokens, prefix=prefix, temperature=temperature, top_k=top_k)
            # print(y)
            print(decode(y[0].tolist()))
            print('---------------')

Russian is the best
torch.Size([1, 4])
Russian is the best- the and (, and and as ", in a for in,
---------------
Russian is the best
, for " the, the with::., is, a
---------------
Russian is the best with of a ", the (, and as for the, " and
---------------


### loss calculation 

## Let's add context vectors into model

- forward get into account idxs and also context vectors I did smt similar actually. 

N-fixed number of brain_tokens. 

- add into beggining of the sne

## Cut our model: remove layers.

This is approach to reduce number of layers. which allows to tune models with fewer GPU clusters. Like distillation.


So we can distill model for our task. 

In [7]:
all_blocks = list(model.transformer.h.children())
cut_blocks = all_blocks[:4] + all_blocks[4:8]
model.transformer.h = nn.Sequential(*cut_blocks)
count_parameters(model)


Total: 96.09M, Trainable: 96.09M


(96088320, 96088320)

In [8]:
max_new_tokens = 15
temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 10

with torch.no_grad():
    with ctx:
        for k in range(3):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            # print(y)
            print(decode(y[0].tolist()))
            print('---------------')

Russian is the best possible thing imaginable. Thereupon, however, thereuponuponuponuponupon
---------------
Russian is the best thing happening hereabouts. Especially since thereabouts are nowadaysadays only two
---------------
Russian is the best thing ever happening in regards regards regards regards regards regards regards regards regards #define
---------------


In [9]:
l1 = [20, 30, 50]
l2 = [50, 60]

l1 + l2

[20, 30, 50, 50, 60]

In [10]:
len(list(amodel.transformer.h[:10].children()))

NameError: name 'amodel' is not defined

In [None]:
start = '<|endoftext|>i love you so much'
print(encode(start))

input_ids = gpt2_tokenizer(start,  return_tensors="pt")['input_ids']
input_ids


[50256, 72, 1842, 345, 523, 881]


tensor([[50256,    72,  1842,   345,   523,   881]])

In [None]:
project_name = 'brainformer'

train_config = TrainConfig(exp_name='brainformer_simple', 
                           mixed_precision=False, 
                           batch_size=16)

data_path = Path(r"D:\Work\brain-to-text-competition\data\competitionData")

# train_dataset = BrainDataset(data_path / 'train')

test_dataset = BrainDataset(data_path / 'test')

# submit_dataset = BrainDataset(data_path / 'competitionHoldOut')



Runed processing of the  D:\Work\brain-to-text-competition\data\competitionData\test
bad_samples [15, 17, 18, 22]


In [None]:
test_dataset.targets

['Theocracy reconsidered.',
 'Rich purchased several signed lithographs.',
 'So rules we made, in unabashed collusion.',
 "Lori's costume needed black gloves to be completely elegant.",
 "The tooth fairy forgot to come when Roger's tooth fell out.",
 'That stinging vapor was caused by chloride vaporization.',
 "Before Thursday's exam, review every formula.",
 'Wildfire near Sunshine forces park closures.',
 "The word means it won't boil away easily, nothing else.",
 "Would a blue feather in a man's hat make him happy all day?",
 'He talked about unauthentic storylines too.',
 'Most young rise early every morning.',
 'With this no loyal citizen can quarrel.',
 'Primitive tribes have an upbeat attitude.',
 'And it was not a judge, but a justice of the peace who made the decision.',
 'Sometimes, desperate measures can also turn into solid moves for the future.',
 'Academic aptitude guarantees your diploma.',
 'To some extent predispositions are shaped by exposure to group environments.',


In [None]:
def get_unique_words(lines):
    unique_words = set()
    for line in lines:
        unique_words.update(line.lower().replace('.', '').split())
    return unique_words

In [None]:
train_set = get_unique_words(train_dataset.targets)
test_set = get_unique_words(test_dataset.targets)

intersection = train_set.intersection(test_set)

print(len(train_set))
print(len(test_set))
print(len(intersection))


7014
1566
1217


In [None]:
train_dataset.targets

NameError: name 'train_dataset' is not defined

In [None]:
from models import gpt2_model
