In [5]:
! ls

assets		  model.py		scaling_laws.ipynb
bench.py	  out			src
BranchyGPT.ipynb  out-shakespeare-char	train.py
config		  __pycache__		transformer_sizing.ipynb
configurator.py   README.md		wandb
data		  rejectOption.py
LICENSE		  sample.py


# Branchy GPT

In this notebook we will try to train a custom BranchyGPT for experiment on Shakespeare_char dataset for experimental purposes, It might scale further to openwebtext after.

First please run 

    python data/shakespeare_char/prepare.py


In [6]:
import torch
import os
import numpy as np
import time
from contextlib import nullcontext

from model import GPTConfig, GPT
torch.manual_seed(1337)

<torch._C.Generator at 0x7f01701129b0>

In [7]:
# Setting up checkpoint saving directory
out_dir = "./BranchyGPT_save"
dataset = "shakespeare_char"
dtype = torch.float16

# Get device between GPU or CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
ctx = nullcontext() if device == 'cpu' else torch.amp.autocast(device_type=device, dtype=dtype)
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

#Prepare dataset
gradient_accumulation_steps = 1 # used to simulate larger batch sizes
batch_size = 128
data_dir = os.path.join('data', dataset)
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')


In [8]:
# Get default conf, model is GPT2
gptconf = GPTConfig()
gptconf.block_size = 256
gptconf.n_layer = 6
gptconf.n_head = 6
gptconf.n_embd = 384
model = GPT(gptconf)
model.to(device)
print(gptconf)

number of parameters: 29.96M
GPTConfig(block_size=256, vocab_size=50304, n_layer=6, n_head=6, n_embd=384, dropout=0.0, bias=True)


In [9]:
# adamw optimizer
learning_rate = 1e-3 # max learning rate
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.99
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0

optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device)

if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model) # requires PyTorch 2.0


num decayed parameter tensors: 26, with 30,031,872 parameters
num non-decayed parameter tensors: 50, with 30,720 parameters
using fused AdamW: True
compiling the model... (takes a ~minute)


In [10]:
def get_batch(split, block_size, batch_size, device):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    if device == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

X, Y = get_batch('train', gptconf.block_size, batch_size, device) # fetch the very first batch

eval_iters = 200 # how many iterations to average loss over when evaluating
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, gptconf.block_size, batch_size, device)
            with ctx:
                logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [11]:
# Training loop
iter_num = 0
eval_interval = 100
best_val_loss = 0
max_iters = 2000
log_interval = 10
t0 = time.time()
while True:
    
    lr = learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    
    if iter_num % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': gptconf,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
    for micro_step in range(gradient_accumulation_steps):
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
                # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = get_batch('train', gptconf.block_size, batch_size, device)
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)
    
    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0:
        # get loss as float. note: this is a CPU-GPU sync point
        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
        lossf = loss.item() * gradient_accumulation_steps
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")

    iter_num += 1

    if iter_num > max_iters:
        break


  Pointwise(
    'cuda',
    torch.float32,
    tmp0 = load(arg77_1, i1 + 256 * i0)
    tmp1 = load(arg26_1, i2 + 384 * (tmp0))
    tmp2 = index_expr(i1, dtype=torch.int64)
    tmp3 = load(arg27_1, i2 + 384 * (tmp2))
    tmp4 = tmp1 + tmp3
    tmp5 = load(buf1, i1 + 256 * i0)
    tmp6 = tmp4 - tmp5
    tmp7 = load(buf2, i1 + 256 * i0)
    tmp8 = index_expr(384, torch.float32)
    tmp9 = tmp7 / tmp8
    tmp10 = constant(1e-05, torch.float32)
    tmp11 = tmp9 + tmp10
    tmp12 = rsqrt(tmp11)
    tmp13 = tmp6 * tmp12
    return tmp13
    ,
    ranges=[128, 256, 384],
    origins={mul, embedding, arg26_1, add, iota, add_1, sub, arg77_1, unsqueeze, var_mean, arg27_1, embedding_1, rsqrt}
  )
))
  Pointwise(
    'cuda',
    torch.float32,
    tmp0 = load(arg77_1, i1 + 256 * i0)
    tmp1 = load(arg26_1, i2 + 384 * (tmp0))
    tmp2 = index_expr(i1, dtype=torch.int64)
    tmp3 = load(arg27_1, i2 + 384 * (tmp2))
    tmp4 = tmp1 + tmp3
    tmp5 = load(buf1, i1 + 256 * i0)
    tmp6 = tmp4 - tmp5
  

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.07 GiB (GPU 0; 31.75 GiB total capacity; 29.20 GiB already allocated; 1.22 GiB free; 29.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
import pickle

start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 10 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
dtype = 'bfloat16' # 'float32' or 'bfloat16' or 'float16'
compile = False # use PyTorch 2.0 to compile the model to be faster

meta_path = "./data/shakespeare_char/meta.pkl"
print(f"Loading meta from {meta_path}...")
with open(meta_path, 'rb') as f:
    meta = pickle.load(f)
# TODO want to make this more general to arbitrary encoder/decoder schemes
stoi, itos = meta['stoi'], meta['itos']
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# encode the beginning of the prompt
if start.startswith('FILE:'):
    with open(start[5:], 'r', encoding='utf-8') as f:
        start = f.read()
start_ids = encode(start)
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

# run generation
with torch.no_grad():
    with ctx:
        for k in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(decode(y[0].tolist()))
            print('---------------')


In [None]:
import torch
a = {0:0, 1:0, 2:0, 3:0}
for i in range (10000):
    a[torch.multinomial(torch.tensor([0.25, 0.25, 0.25, 0.25]),1).item()] += 1
print(a)

In [None]:
import torch
device_type = 'cuda'
batch_size = 64
device = torch.device(device_type)

train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
calib_data = train_data[:1000]

def get_batch(split):
    if split == 'train':
        data = train_data
    elif split == 'val':
        data = val_data
    elif split == 'calib':
        data = calib_data
    else:
        raise ValueError(f"invalid split: {split}")
    ix = torch.randint(len(data) - 128, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+128]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+128]).astype(np.int64)) for i in ix])
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

In [None]:
from rejectOption import RejectOption
from model import BranchyGPT, GPTConfig
import numpy as np
import os
import torch

data_dir = "data/shakespeare_char"

X,Y = get_batch('calib')

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2
block_size = 256 # context of up to 256 previous characters
bias = False # do we use bias inside LayerNorm and Linear layers?

model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
                  bias=bias, vocab_size=65, dropout=dropout)
gptconf = GPTConfig(**model_args)
model = BranchyGPT(gptconf).to(torch.device('cuda:0'))
model = torch.compile(model)
print(X.device)
print(next(model.parameters()).device)
print(model(X)[0].shape)
reject_option = RejectOption(dataset=X, model=model)



# Reject Option sample test

In [None]:
import os
import torch
import pickle
from contextlib import nullcontext

from model import GPTConfig, BranchyGPT
from rejectOption import get_device, LLMRejectOption

out_dir = "out-shakespeare-char"
model_name = "mini-gpt"

device = get_device()

# Load model from checkpoint

ckpt_path = os.path.join(out_dir, 'ckpt_' + model_name + '.pt') 
checkpoint = torch.load(ckpt_path , map_location=device)

gptconf = GPTConfig(**checkpoint['model_args'])
model = BranchyGPT(gptconf).to(device)

model.load_state_dict(checkpoint['model'])

# load encoder and decoder

meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
if os.path.exists(meta_path):
    print(f"Loading meta from {meta_path}...")
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    # TODO want to make this more general to arbitrary encoder/decoder schemes
    stoi, itos = meta['stoi'], meta['itos']
    encode = lambda s: [stoi[c] for c in s]
    decode = lambda l: ''.join([itos[i] for i in l])
    
# load reject option
reject_option_path = os.path.join(out_dir, 'reject_option_' + model_name + '.pt')
reject_repartition = torch.load(reject_option_path)
reject_option = LLMRejectOption()
reject_option.calibration_set = reject_repartition.T


dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
num_samples = 1
max_new_tokens = 50
temperature = 1
top_k = 40
epsilon = 0.9

seed = 42
torch.manual_seed(seed)

if "cuda" in str(device):
    ctx = torch.cuda.amp.autocast(dtype=ptdtype, device_type="cuda")
else:
    ctx = nullcontext()



# actual inference
with torch.no_grad():
    with ctx:
        for epsilon in torch.linspace(0.1, 1., 9):
            x = (torch.tensor(encode("The war"), dtype=torch.long, device=device)[None, ...])
            print(epsilon)
            for k in range(num_samples):
                y = model.generate(x, max_new_tokens, reject_option, temperature=temperature, top_k=top_k, epsilon=epsilon, decoder=decode)
                print(decode(y[0].tolist()))
                print('---------------')


In [None]:
import pandas as pd
import glob
# read from csv
for file in sorted(glob.glob("results*.csv")):
    df = pd.read_csv(file)
    # print only head, decoded_token
    print(file.split(".csv")[0].split("results")[1])
    #print(df[['head', 'decoded_token']])
    print(f"Budget without reject = {len(df)*6}, with reject : {(df[['head']]+1).sum().to_numpy()}, percentage : {(df[['head']]+1).sum().to_numpy()/(len(df)*6)}")


In [None]:
import torch
from contextlib import nullcontext
dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
num_samples = 1
max_new_tokens = 50
temperature = 1
top_k = 40
epsilon = 0.9

seed = 42
torch.manual_seed(seed)

if "cuda" in str(device):
    ctx = torch.cuda.amp.autocast(dtype=ptdtype, device_type="cuda")
else:
    ctx = nullcontext()



# actual inference
with torch.no_grad():
    with ctx:
        for epsilon in torch.linspace(0.9, 1., 9):
            x = (torch.tensor(encode("The war"), dtype=torch.long, device=device)[None, ...])
            print(epsilon)
            for k in range(num_samples):
                y = model.generate(x, max_new_tokens, reject_option, temperature=temperature, top_k=top_k, epsilon=epsilon, decoder=decode)
                print(decode(y[0].tolist()))
                print('---------------')


# Evaluate Branchy GPT on dataset

In [None]:
# With perplexity

dataset = "shakespeare_char"
data_dir = os.path.join("data", dataset)

val_data = np.memmap(os.path.join(data_dir, "val.bin"), dtype=np.uint16, mode="r")


In [None]:
from transformers import AutoModelForCausalLM, LlamaPreTrainedModel
from transformers.utils.generic import ModelOutput
import torch
import torch.nn as nn
from typing import List, Optional, Tuple, Union
from dataclasses import dataclass


class BranchyLlama(LlamaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.model.return_dict = True
        
        self.auxiliary_outputs = nn.ModuleList(nn.Linear(self.model.config.hidden_size, self.model.config.vocab_size, bias=False) for _ in range(len(self.model.config.num_hidden_layers)))

    
    def forward(self, input_ids, labels: Optional[torch.LongTensor] = None, **kwargs):
        
        output = self.model(input_ids, **kwargs)
        
        head_outputs = torch.empty((0, input_ids.shape[0], input_ids.shape[1], self.model.config.vocab_size), device=input_ids.device)
        
        loss_fct = nn.CrossEntropyLoss()
        loss = 0
        
        for state in output.hidden_states:
            head_output = self.auxiliary_outputs[i](state)
            head_outputs = torch.cat((head_outputs, head_output.unsqueeze(0)), dim=0)
            if labels is not None:
                shift_logits = head_output[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                # Flatten the tokens
                shift_logits = shift_logits.view(-1, self.model.config.vocab_size)
                shift_labels = shift_labels.view(-1)
                # Enable model parallelism
                shift_labels = shift_labels.to(shift_logits.device)
                # sum of losses
                loss += loss_fct(shift_logits, shift_labels)
        head_loss = loss
        loss = 0
        last_hidden_states = output[0]
        logits = self.model.lm_head(last_hidden_states)
        if labels is not None:
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                # Flatten the tokens
                shift_logits = shift_logits.view(-1, self.model.config.vocab_size)
                shift_labels = shift_labels.view(-1)
                # Enable model parallelism
                shift_labels = shift_labels.to(shift_logits.device)
                # sum of losses
                loss = loss_fct(shift_logits, shift_labels)        
        
        # return both the original output and the intermediate outputs
        return CausalBranchyLMOutputWithPast(
            loss=loss,
            head_loss=head_loss,
            logits=logits,
            head_outputs=head_outputs,
        )
    
@dataclass
class CausalBranchyLMOutputWithPast(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    head_loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    head_outputs: Optional[Tuple[torch.FloatTensor]] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    
branchyllama = BranchyLlama("openlm-research/open_llama_3b_v2")


TypeError: super(type, obj): obj must be an instance or subtype of type

In [1]:
from src.branchymodel import BranchyLlama

branchyllamaconf = BranchyLlama.config_class.from_pretrained("openlm-research/open_llama_3b_v2")
# TODO Check if this is the right way to do it, because config here is not changing anything while inference
branchyllamaconf.self_supervision = True

model = BranchyLlama.from_pretrained("openlm-research/open_llama_3b_v2", config=branchyllamaconf)
print(model.config)

Some weights of BranchyLlama were not initialized from the model checkpoint at openlm-research/open_llama_3b_v2 and are newly initialized: ['auxiliary_outputs.16.weight', 'auxiliary_outputs.19.weight', 'auxiliary_outputs.0.weight', 'auxiliary_outputs.24.weight', 'auxiliary_outputs.14.weight', 'auxiliary_outputs.4.weight', 'auxiliary_outputs.12.weight', 'auxiliary_outputs.18.weight', 'auxiliary_outputs.15.weight', 'auxiliary_outputs.6.weight', 'auxiliary_outputs.10.weight', 'auxiliary_outputs.8.weight', 'auxiliary_outputs.5.weight', 'auxiliary_outputs.22.weight', 'auxiliary_outputs.23.weight', 'auxiliary_outputs.25.weight', 'auxiliary_outputs.2.weight', 'auxiliary_outputs.1.weight', 'auxiliary_outputs.21.weight', 'auxiliary_outputs.11.weight', 'auxiliary_outputs.9.weight', 'auxiliary_outputs.20.weight', 'auxiliary_outputs.17.weight', 'auxiliary_outputs.13.weight', 'auxiliary_outputs.3.weight', 'auxiliary_outputs.7.weight']
You should probably TRAIN this model on a down-stream task to be

BranchyLlamaConfig {
  "_name_or_path": "openlm-research/open_llama_3b_v2",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 3200,
  "initializer_range": 0.02,
  "intermediate_size": 8640,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 26,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "self_supervision": true,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.30.2",
  "use_cache": true,
  "vocab_size": 32000
}



In [2]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from torch.utils.data.dataset import Dataset
import torch
import random
from accelerate import Accelerator
from torch.utils.data.dataloader import DataLoader


default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

training_args = TrainingArguments(
    per_device_train_batch_size=64,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    **default_args,
)

class RandomIntDataset(Dataset):
    def __init__(self, length):
        self.len = length

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        return torch.randint(0, 32000, (1,))

dataloader = DataLoader(RandomIntDataset(training_args.per_device_train_batch_size), batch_size=training_args.per_device_train_batch_size, pin_memory=True)

#if training_args.gradient_checkpointing:
#    model.gradient_checkpointing_enable()

#accelerator = Accelerator()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
#model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
model.to("cuda")
model.train()
model.lm_head.requires_grad_(False)
model.model.requires_grad_(False)
model.auxiliary_outputs.requires_grad_(True)
batch = next(iter(dataloader)).to("cuda")
for step in range(10000):
    print(f"sending batch {batch.shape}")
    output = model(batch)
    loss = output.loss
    batch = output.logits[-1]
    loss = loss / training_args.gradient_accumulation_steps
    loss.backward()
    #accelerator.backward(loss)
    if step % training_args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
    if step % 100 == 0:
        print(f"Step {step} : {loss.item()}")

sending batch torch.Size([64, 1])
True


OutOfMemoryError: CUDA out of memory. Tried to allocate 392.00 MiB (GPU 0; 31.75 GiB total capacity; 28.12 GiB already allocated; 82.94 MiB free; 28.21 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
for batch in dataloader:
    print(batch)
    break  # only print the first batch for demonstration

IndexError: index 7185 is out of bounds for dimension 0 with size 1

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers import LlamaTokenizer, AutoModelForCausalLM
import torch

device = torch.device("cpu")

batch_size = 8
tokenizer = LlamaTokenizer.from_pretrained("openlm-research/open_llama_3b_v2")

base_prompts = ["The "] * batch_size
base_sentence = torch.tensor([torch.tensor(tokenizer.encode(prompt), dtype=torch.long, device=device).tolist() for prompt in base_prompts])
print(f"first sentence: {base_sentence[0]}")
for i in range(100):
    outputs = model(base_sentence)
    output = outputs.logits
    print(outputs.loss)
    print(torch.argmax(torch.softmax(output[-1][:, -1, :], dim=-1).squeeze(), dim=-1).shape)

    print(torch.multinomial(torch.softmax(output[-1][:, -1, :], dim=-1).squeeze(), 1).shape)
    base_sentence = torch.cat((base_sentence.squeeze(), torch.argmax(torch.softmax(output[-1][:, -1, :], dim=-1).squeeze(), dim=-1).unsqueeze(-1)), dim=-1)
    for i in range(batch_size):
        print(tokenizer.decode(base_sentence.squeeze()[i].tolist()))


first sentence: tensor([    1,   364, 29500])
True
tensor(8.6826, grad_fn=<DivBackward0>)
torch.Size([8])
torch.Size([8, 1])
<s>The 2
<s>The 2
<s>The 2
<s>The 2
<s>The 2
<s>The 2
<s>The 2
<s>The 2
True
tensor(9.4311, grad_fn=<DivBackward0>)
torch.Size([8])
torch.Size([8, 1])
<s>The 20
<s>The 20
<s>The 20
<s>The 20
<s>The 20
<s>The 20
<s>The 20
<s>The 20
True
tensor(9.3685, grad_fn=<DivBackward0>)
torch.Size([8])
torch.Size([8, 1])
<s>The 201
<s>The 201
<s>The 201
<s>The 201
<s>The 201
<s>The 201
<s>The 201
<s>The 201
True


KeyboardInterrupt: 

In [None]:
print(torch.tensor([tokenizer.encode(prompt) for prompt in base_prompts], device=device, dtype=torch.long))
print(tokenized_base_prompts.logits.shape)

logit = tokenized_base_prompts.logits[-1]
# for each batch
print(torch.tensor([tokenizer.encode(prompt) for prompt in base_prompts], device=device, dtype=torch.long)[0])
print(torch.argmax(torch.softmax(logit[:,-1,:], dim=-1), dim=-1).squeeze().tolist())


tensor([[    1,   364,   950,  1247,   296,   531,  5629,   325, 29500],
        [    1,   364,   950,  1247,   296,   531,  5629,   325, 29500],
        [    1,   364,   950,  1247,   296,   531,  5629,   325, 29500],
        [    1,   364,   950,  1247,   296,   531,  5629,   325, 29500],
        [    1,   364,   950,  1247,   296,   531,  5629,   325, 29500],
        [    1,   364,   950,  1247,   296,   531,  5629,   325, 29500],
        [    1,   364,   950,  1247,   296,   531,  5629,   325, 29500],
        [    1,   364,   950,  1247,   296,   531,  5629,   325, 29500]])
torch.Size([27, 8, 9, 32000])
tensor([    1,   364,   950,  1247,   296,   531,  5629,   325, 29500])
[29532, 29532, 29532, 29532, 29532, 29532, 29532, 29532]


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b_v2")
model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_7b_v2")

KeyboardInterrupt: 

In [None]:
import torch 
base_sentence = "The book is on the"
device = torch.device("cpu")
model.to(device)
base_sentence = torch.tensor(tokenizer.encode(base_sentence), dtype=torch.long, device=device)[None, ...].to(device)
for i in range(100):
    output = model(base_sentence)
    base_sentence = torch.cat((base_sentence.squeeze(), torch.multinomial(torch.softmax(output[0][:, -1, :], dim=-1).squeeze(), num_samples=1)), dim=-1).unsqueeze(0)
    print(tokenizer.decode(base_sentence.squeeze().tolist()))


In [None]:
# Load model directly
from transformers import LlamaTokenizer, AutoModelForCausalLM

tokenizer = LlamaTokenizer.from_pretrained("openlm-research/open_llama_3b_v2")
#model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_3b_v2", output_hidden_states=True)

In [None]:
from transformers import LlamaTokenizer, AutoModelForCausalLM

print(model.auxiliary_outputs[0].weight)
device = torch.device("cpu")

tokenizer = LlamaTokenizer.from_pretrained("openlm-research/open_llama_3b_v2")

output = model.model(torch.tensor(tokenizer.encode("The book"), dtype=torch.long, device=device)[None, ...], output_hidden_states=True)
print(output.__dict__.keys())
print(len(output[2]))
print(output.hidden_states[0].shape)
print(len(output.hidden_states))
print(output.last_hidden_state.shape)

Parameter containing:
tensor([[-0.0084, -0.0039, -0.0094,  ..., -0.0258,  0.0275,  0.0083],
        [-0.0084,  0.0041, -0.0120,  ..., -0.0012,  0.0023, -0.0426],
        [-0.0181, -0.0111, -0.0202,  ...,  0.0223, -0.0132,  0.0280],
        ...,
        [-0.0600,  0.0631,  0.0010,  ..., -0.0073, -0.0218,  0.0165],
        [ 0.0204,  0.0008,  0.0184,  ..., -0.0318,  0.0433, -0.0039],
        [ 0.0121,  0.0035, -0.0192,  ...,  0.0313, -0.0073,  0.0310]],
       requires_grad=True)
dict_keys(['last_hidden_state', 'past_key_values', 'hidden_states', 'attentions'])
27
torch.Size([1, 3, 3200])
27
torch.Size([1, 3, 3200])


In [None]:
import torch 
device = torch.device("cpu")
#output = model(torch.tensor(tokenizer.encode("The book"), dtype=torch.long, device=device)[None, ...])
#print(output)
branchyoutput = branchyllama.model(torch.tensor(tokenizer.encode("The book"), dtype=torch.long, device=device)[None, ...])
#assert torch.allclose(output.logits, branchyoutput.logits, atol=1e-3)
print(branchyoutput.head_outputs)
print(tokenizer.decode(branchyoutput.logits.argmax(dim=-1).squeeze().tolist()))

AttributeError: 'CausalLMOutputWithPast' object has no attribute 'head_outputs'

In [None]:
print(len(output.hidden_states))