### Import Libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass
from typing import Optional
import time
from pathlib import Path
import json
from sentencepiece import SentencePieceProcessor
from tqdm import tqdm

### Defining Model Arguments

In [2]:
@dataclass
class ModelArgs:
    dim: int = 4096
    n_layers: int = 32
    n_heads: int = 32
    n_kv_heads: Optional[int] = None
    vocab_size: int = -1 # Later set in the build method
    multiple_of: int = 256
    ffn_dim_multiplier: Optional[float] = None
    norm_eps: float = 1e-5

    # Needed for KV cache
    max_batch_size: int = 32
    max_seq_len: int = 2048

    device: str = None

# @dataclass
# class ModelArgs:
#     dim: int = 256
#     n_layers: int = 2
#     n_heads: int = 8
#     n_kv_heads: Optional[int] = 4
#     vocab_size: int = -1 # Later set in the build method
#     multiple_of: int = 256
#     ffn_dim_multiplier: Optional[float] = None
#     norm_eps: float = 1e-5

#     # Needed for KV cache
#     max_batch_size: int = 32
#     max_seq_len: int = 512

#     device: str = None


### RoPE Functions

In [3]:
def precompute_theta_pos_frequencies(head_dim: int, seq_len: int, device: str, theta: float = 10000.0):
    # As written in the paragraph 3.2.2 of the paper
    # >> In order to generalize our results in 2D to any xi ∈ Rd where **d is even**, [...]
    assert head_dim % 2 == 0, "Dimension must be divisible by 2"
    # Build the theta parameter
    # According to the formula theta_i = 10000^(-2(i-1)/dim) for i = [1, 2, ... dim/2]
    # Shape: (Head_Dim / 2)
    theta_numerator = torch.arange(0, head_dim, 2).float()
    # Shape: (Head_Dim / 2)
    theta = 1.0 / (theta ** (theta_numerator / head_dim)).to(device) # (Dim / 2)
    # Construct the positions (the "m" parameter)
    # Shape: (Seq_Len)
    m = torch.arange(seq_len, device=device)
    # Multiply each theta by each position using the outer product.
    # Shape: (Seq_Len) outer_product* (Head_Dim / 2) -> (Seq_Len, Head_Dim / 2)
    freqs = torch.outer(m, theta).float()
    # We can compute complex numbers in the polar form c = R * exp(m * theta), where R = 1 as follows:
    # (Seq_Len, Head_Dim / 2) -> (Seq_Len, Head_Dim / 2)
    freqs_complex = torch.polar(torch.ones_like(freqs), freqs)
    return freqs_complex

def apply_rotary_embeddings(x: torch.Tensor, freqs_complex: torch.Tensor, device: str):
    # Separate the last dimension pairs of two values, representing the real and imaginary parts of the complex number
    # Two consecutive values will become a single complex number
    # (B, Seq_Len, H, Head_Dim) -> (B, Seq_Len, H, Head_Dim/2)
    x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    # Reshape the freqs_complex tensor to match the shape of the x_complex tensor. So we need to add the batch dimension and the head dimension
    # (Seq_Len, Head_Dim/2) --> (1, Seq_Len, 1, Head_Dim/2)
    freqs_complex = freqs_complex.unsqueeze(0).unsqueeze(2)
    # Multiply each complex number in the x_complex tensor by the corresponding complex number in the freqs_complex tensor
    # Which results in the rotation of the complex number as shown in the Figure 1 of the paper
    # (B, Seq_Len, H, Head_Dim/2) * (1, Seq_Len, 1, Head_Dim/2) = (B, Seq_Len, H, Head_Dim/2)
    x_rotated = x_complex * freqs_complex
    # Convert the complex number back to the real number
    # (B, Seq_Len, H, Head_Dim/2) -> (B, Seq_Len, H, Head_Dim/2, 2)
    x_out = torch.view_as_real(x_rotated)
    # (B, Seq_Len, H, Head_Dim/2, 2) -> (B, Seq_Len, H, Head_Dim)
    x_out = x_out.reshape(*x.shape)
    return x_out.type_as(x).to(device)


In [18]:
x = precompute_theta_pos_frequencies(head_dim=256, seq_len=512, device="cpu", theta = 10000.0)
x.shape

torch.Size([512, 128])

In [19]:
ip = torch.randn(3,512,4,256)
op = apply_rotary_embeddings(ip, freqs_complex=x, device="cpu")
op.shape

torch.Size([3, 512, 4, 256])

In [20]:
torch.view_as_complex(op.float().reshape(*op.shape[:-1],-1,2))

tensor([[[[-0.6525+1.6389e-01j, -0.9444+7.5510e-02j, -0.1971+6.2354e-01j,
            ...,  1.4109+6.3330e-01j, -0.8280+1.0972e+00j,
           -0.4271+5.1419e-01j],
          [ 0.8535-1.5828e+00j,  0.8824-3.4369e-01j, -0.7555-1.0106e+00j,
            ...,  0.2799-3.9069e-01j, -0.2931+2.8359e-01j,
           -0.8935+4.1683e-01j],
          [ 0.2616-7.2232e-01j, -1.1195-1.1315e-01j, -0.0701-6.1718e-01j,
            ...,  0.7299+8.2295e-01j,  0.7013-4.7570e-01j,
            1.1760+3.6191e-01j],
          [-0.5551+8.6049e-01j, -0.3708-1.3458e+00j, -1.0049+3.6710e-01j,
            ..., -0.8024-2.9945e-01j, -1.4321-5.5686e-01j,
            0.7922+1.0281e+00j]],

         [[ 1.7885-1.1433e+00j,  0.1194-2.4063e-02j, -0.1473+8.3258e-01j,
            ...,  1.1070+1.0431e+00j,  1.6521+8.7505e-01j,
           -0.3700+1.7823e+00j],
          [ 0.4724+1.0763e+00j, -0.8132+2.0125e+00j, -0.2473-1.2535e+00j,
            ...,  0.1257+6.5832e-01j, -0.3953-1.0079e+00j,
            0.8387+1.6762e+00j],
  

In [21]:
a = torch.tensor([1,2,3,4,5,6,7,8])
print(a.shape)
b = a.reshape(-1,2)
torch.view_as_complex(b.float()).shape

torch.Size([8])


torch.Size([4])

### RMSNorm

In [4]:
class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # The gamma parameter
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x: torch.Tensor):
        # (B, Seq_Len, Dim) * (B, Seq_Len, 1) = (B, Seq_Len, Dim)
        # rsqrt: 1 / sqrt(x)
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor):
        # (Dim) * (B, Seq_Len, Dim) = (B, Seq_Len, Dim)
        return self.weight * self._norm(x.float()).type_as(x)

### Self-Attention

In [5]:
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    batch_size, seq_len, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    return (
        # (B, Seq_Len, N_KV_Heads, 1, Head_Dim)
        x[:, :, :, None, :]
        # (B, Seq_Len, N_KV_Heads, N_Rep, Head_Dim)
        .expand(batch_size, seq_len, n_kv_heads, n_rep, head_dim)
        # (B, Seq_Len, N_KV_Heads * N_Rep, Head_Dim)
        .reshape(batch_size, seq_len, n_kv_heads * n_rep, head_dim)
    )


class SelfAttention(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()

        # Indicates the number of heads for the Keys and Values
        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
        # Indicates the number of heads for the Queries
        self.n_heads_q = args.n_heads
        # Indicates how many times the Keys and Values should be repeated
        self.n_rep = self.n_heads_q // self.n_kv_heads
        # Indicates the dimension of each head, that is, the part of the embedding that each head will be responsible for
        self.head_dim = args.dim // args.n_heads

        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)

        self.cache_k = torch.zeros((args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim))
        self.cache_v = torch.zeros((args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim))

    def forward(
        self,
        x: torch.Tensor,
        start_pos: int,
        freqs_complex: torch.Tensor
    ):
        batch_size, seq_len, _ = x.shape  # (B, 1, Dim)
        print("x ",x.shape)
        # (B, 1, Dim) -> (B, 1, H_Q * Head_Dim)
        xq = self.wq(x)
        print("xq ",xq.shape)
        # (B, 1, Dim) -> (B, 1, H_KV * Head_Dim)
        xk = self.wk(x)
        print("xk ",xk.shape)
        # (B, 1, Dim) -> (B, 1, H_KV * Head_Dim)
        xv = self.wv(x)
        print("xv ",xv.shape)

        # (B, 1, H_Q * Head_Dim) -> (B, 1, H_Q, Head_Dim)
        xq = xq.view(batch_size, seq_len, self.n_heads_q, self.head_dim)
        print("xq ",xq.shape)
        # (B, 1, H_KV * Head_Dim) -> (B, 1, H_KV, Head_Dim)
        xk = xk.view(batch_size, seq_len, self.n_kv_heads, self.head_dim)
        print("xk ",xk.shape)
        # (B, 1, H_KV * Head_Dim) -> (B, 1, H_KV, Head_Dim)
        xv = xv.view(batch_size, seq_len, self.n_kv_heads, self.head_dim)
        print("xk ",xk.shape)

        # (B, 1, H_Q, Head_Dim) --> (B, 1, H_Q, Head_Dim)
        xq = apply_rotary_embeddings(xq, freqs_complex, device=x.device)
        print("xq ROPE: ",xq.shape)
        # (B, 1, H_KV, Head_Dim) --> (B, 1, H_KV, Head_Dim)
        xk = apply_rotary_embeddings(xk, freqs_complex, device=x.device)
        print("xk ROPE: ",xk.shape)

        # Replace the entry in the cache
        self.cache_k[:batch_size, start_pos : start_pos + seq_len] = xk
        self.cache_v[:batch_size, start_pos : start_pos + seq_len] = xv

        # (B, Seq_Len_KV, H_KV, Head_Dim)
        keys = self.cache_k[:batch_size, : start_pos + seq_len]
        print("keys ",keys.shape)
        # (B, Seq_Len_KV, H_KV, Head_Dim)
        values = self.cache_v[:batch_size, : start_pos + seq_len]
        print("values ",values.shape)
        # Since every group of Q shares the same K and V heads, just repeat the K and V heads for every Q in the same group.

        # (B, Seq_Len_KV, H_KV, Head_Dim) --> (B, Seq_Len_KV, H_Q, Head_Dim)
        keys = repeat_kv(keys, self.n_rep)
        print("keys after repetition",keys.shape)
        # (B, Seq_Len_KV, H_KV, Head_Dim) --> (B, Seq_Len_KV, H_Q, Head_Dim)
        values = repeat_kv(values, self.n_rep)
        print("values after repetition",values.shape)

        # (B, 1, H_Q, Head_Dim) -> (B, H_Q, 1, Head_Dim)
        xq = xq.transpose(1, 2)
        print("xq transpose ",xq.shape)
        # (B, Seq_Len_KV, H_Q, Head_Dim) -> (B, H_Q, Seq_Len_KV, Head_Dim)
        keys = keys.transpose(1, 2)
        print("keys transpose ",keys.shape)
        # (B, Seq_Len_KV, H_Q, Head_Dim) -> (B, H_Q, Seq_Len_KV, Head_Dim)
        values = values.transpose(1, 2)
        print("values transpose ",values.shape)

        # (B, H_Q, 1, Head_Dim) @ (B, H_Q, Head_Dim, Seq_Len_KV) -> (B, H_Q, 1, Seq_Len_KV)
        scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
        print("scores ",scores.shape)
        # (B, H_Q, 1, Seq_Len_KV) -> (B, H_Q, 1, Seq_Len_KV)
        scores = F.softmax(scores.float(), dim=-1).type_as(xq)

        # (B, H_Q, 1, Seq_Len) @ (B, H_Q, Seq_Len_KV, Head_Dim) -> (B, H_Q, 1, Head_Dim)
        output = torch.matmul(scores, values)
        print("output before wo ",output.shape)
        # (B, H_Q, 1, Head_Dim) -> (B, 1, H_Q, Head_Dim) -> (B, 1, Dim)
        output = (output.transpose(1, 2).contiguous().view(batch_size, seq_len, -1))
        print("output after reshape ",output.shape)
        output = self.wo(output)
        print("output after wo ",output.shape) # (B, 1, Dim) -> (B, 1, Dim)
        return output 


In [24]:
args = ModelArgs()
sa = SelfAttention(args)
freqs_complex_complete = precompute_theta_pos_frequencies(args.dim // args.n_heads, args.max_seq_len * 2, device=args.device)
seq_len = 10
for start_pos in range(seq_len):
    print("START_POSITION: ",start_pos)
    freqs_complex = freqs_complex_complete[start_pos:start_pos+1]
    x = torch.randn(1,1,256)
    op = sa(x,start_pos,freqs_complex)
    print("*"*50)

START_POSITION:  0
x  torch.Size([1, 1, 256])
xq  torch.Size([1, 1, 256])
xk  torch.Size([1, 1, 128])
xv  torch.Size([1, 1, 128])
xq  torch.Size([1, 1, 8, 32])
xk  torch.Size([1, 1, 4, 32])
xk  torch.Size([1, 1, 4, 32])
xq ROPE:  torch.Size([1, 1, 8, 32])
xk ROPE:  torch.Size([1, 1, 4, 32])
keys  torch.Size([1, 1, 4, 32])
values  torch.Size([1, 1, 4, 32])
keys after repetition torch.Size([1, 1, 8, 32])
values after repetition torch.Size([1, 1, 8, 32])
xq transpose  torch.Size([1, 8, 1, 32])
keys transpose  torch.Size([1, 8, 1, 32])
values transpose  torch.Size([1, 8, 1, 32])
scores  torch.Size([1, 8, 1, 1])
output before wo  torch.Size([1, 8, 1, 32])
output after reshape  torch.Size([1, 1, 256])
output after wo  torch.Size([1, 1, 256])
**************************************************
START_POSITION:  1
x  torch.Size([1, 1, 256])
xq  torch.Size([1, 1, 256])
xk  torch.Size([1, 1, 128])
xv  torch.Size([1, 1, 128])
xq  torch.Size([1, 1, 8, 32])
xk  torch.Size([1, 1, 4, 32])
xk  torch.Siz

### FeedForward

In [6]:
class FeedForward(nn.Module):
    def __init__(
        self,
        args: ModelArgs
    ):
        super().__init__()

        hidden_dim = 4 * args.dim
        hidden_dim = int(2 * hidden_dim / 3)
        if args.ffn_dim_multiplier is not None:
            hidden_dim = int(args.ffn_dim_multiplier * hidden_dim)
        # Round the hidden_dim to the nearest multiple of the multiple_of parameter
        hidden_dim = args.multiple_of * ((hidden_dim + args.multiple_of - 1) // args.multiple_of)

        self.w1 = nn.Linear(args.dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, args.dim, bias=False)
        self.w3 = nn.Linear(args.dim, hidden_dim, bias=False)

    def forward(self, x: torch.Tensor):
        # (B, Seq_Len, Dim) --> (B, Seq_Len, Hidden_Dim)
        swish = F.silu(self.w1(x))
        # (B, Seq_Len, Dim) --> (B, Seq_Len, Hidden_Dim)
        x_V = self.w3(x)
        # (B, Seq_Len, Hidden_Dim) * (B, Seq_Len, Hidden_Dim) --> (B, Seq_Len, Hidden_Dim)
        x = swish * x_V
        # (B, Seq_Len, Hidden_Dim) --> (B, Seq_Len, Dim)
        x = self.w2(x)
        return x



### Encoder

In [7]:
class EncoderBlock(nn.Module):

    def __init__(self, args: ModelArgs):
        super().__init__()

        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads

        self.attention = SelfAttention(args)
        self.feed_forward = FeedForward(args)

        # Normalization BEFORE the attention block
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        # Normalization BEFORE the feed forward block
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
    
    def forward(self, x: torch.Tensor, start_pos: int, freqs_complex: torch.Tensor):
        # (B, Seq_Len, Dim) + (B, Seq_Len, Dim) --> (B, Seq_Len, Dim)
        h = x + self.attention.forward(
            self.attention_norm(x), start_pos, freqs_complex
        )
        # (B, Seq_Len, Dim) + (B, Seq_Len, Dim) --> (B, Seq_Len, Dim)
        out = h + self.feed_forward.forward(self.ffn_norm(h))
        return out


### LLaMA-2 Architecture

In [8]:
class Transformer(nn.Module):

    def __init__(self, args: ModelArgs):
        super().__init__()

        assert args.vocab_size != -1, "Vocab size must be set"

        self.args = args
        self.vocab_size = args.vocab_size
        self.n_layers = args.n_layers
        self.tok_embeddings = nn.Embedding(self.vocab_size, args.dim)

        self.layers = nn.ModuleList()
        for layer_id in range(args.n_layers):
            self.layers.append(EncoderBlock(args))

        self.norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.output = nn.Linear(args.dim, self.vocab_size, bias=False)

        self.freqs_complex = precompute_theta_pos_frequencies(self.args.dim // self.args.n_heads, self.args.max_seq_len * 2, device=self.args.device)

    def forward(self, tokens: torch.Tensor, start_pos: int):
        # (B, Seq_Len)
        batch_size, seq_len = tokens.shape
        assert seq_len == 1, "Only one token at a time can be processed"

        # (B, Seq_Len) -> (B, Seq_Len, Dim)
        h = self.tok_embeddings(tokens)
        # print(h.shape)
        # print(self.freqs_complex.shape)
        # Retrieve the pairs (m, theta) corresponding to the positions [start_pos, start_pos + seq_len]
        freqs_complex = self.freqs_complex[start_pos:start_pos + seq_len]
        # print(freqs_complex.shape)
        # print(freqs_complex)
        # Consecutively apply all the encoder layers
        for layer in self.layers:
            h = layer(h, start_pos, freqs_complex)
            # print(h.shape)
        h = self.norm(h)
        output = self.output(h).float()
        print("op ",output.shape)
        return output

#### Rough

In [21]:
model_args = ModelArgs(
    max_seq_len=512,
    max_batch_size=3,
    device="cpu",
)
model_args.vocab_size = tokenizer.vocab_size()
llama_model = Transformer(model_args)

In [22]:
x = torch.tensor([[150,237]])
op = llama_model.forward(x,0)

torch.Size([1, 2, 256])
torch.Size([1024, 32])
torch.Size([2, 32])
tensor([[1.0000+0.0000e+00j, 1.0000+0.0000e+00j, 1.0000+0.0000e+00j,
         1.0000+0.0000e+00j, 1.0000+0.0000e+00j, 1.0000+0.0000e+00j,
         1.0000+0.0000e+00j, 1.0000+0.0000e+00j, 1.0000+0.0000e+00j,
         1.0000+0.0000e+00j, 1.0000+0.0000e+00j, 1.0000+0.0000e+00j,
         1.0000+0.0000e+00j, 1.0000+0.0000e+00j, 1.0000+0.0000e+00j,
         1.0000+0.0000e+00j, 1.0000+0.0000e+00j, 1.0000+0.0000e+00j,
         1.0000+0.0000e+00j, 1.0000+0.0000e+00j, 1.0000+0.0000e+00j,
         1.0000+0.0000e+00j, 1.0000+0.0000e+00j, 1.0000+0.0000e+00j,
         1.0000+0.0000e+00j, 1.0000+0.0000e+00j, 1.0000+0.0000e+00j,
         1.0000+0.0000e+00j, 1.0000+0.0000e+00j, 1.0000+0.0000e+00j,
         1.0000+0.0000e+00j, 1.0000+0.0000e+00j],
        [0.5403+8.4147e-01j, 0.7318+6.8156e-01j, 0.8460+5.3317e-01j,
         0.9124+4.0931e-01j, 0.9504+3.1098e-01j, 0.9720+2.3492e-01j,
         0.9842+1.7689e-01j, 0.9911+1.3296e-01j, 0.9950

In [39]:
op.shape

torch.Size([1, 1, 32000])

### Inference

In [9]:
class LLaMA:

    def __init__(self, model: Transformer, tokenizer: SentencePieceProcessor, model_args: ModelArgs):
        self.model = model
        self.tokenizer = tokenizer
        self.args = model_args

    @staticmethod
    def build(checkpoints_dir: str, tokenizer_path: str, load_model: bool, max_seq_len: int, max_batch_size: int, device: str):
        prev_time = time.time()
        if load_model:
            checkpoints = sorted(Path(checkpoints_dir).glob("*.pth"))
            assert len(checkpoints) > 0, f"no checkpoint files found in {checkpoints_dir}"
            ckpt_path = checkpoints[0]
            print(f'Loading checkpoint "{ckpt_path}"')
            checkpoint = torch.load(ckpt_path, map_location="cpu")
            print(f"Loaded checkpoint in {time.time() - prev_time:.2f}s")
            prev_time = time.time()
        with open(Path(checkpoints_dir) / "params.json", "r") as f:
            params = json.loads(f.read())

        model_args: ModelArgs = ModelArgs(
            max_seq_len=max_seq_len,
            max_batch_size=max_batch_size,
            device=device,
            **params
        )

        tokenizer = SentencePieceProcessor()
        tokenizer.load(tokenizer_path)
        model_args.vocab_size = tokenizer.vocab_size()
        
        if device == "cuda":
            torch.set_default_tensor_type(torch.cuda.HalfTensor)
        else:
            torch.set_default_tensor_type(torch.BFloat16Tensor)
        
        model = Transformer(model_args).to(device)

        if load_model:
            # The only unmatched key in the checkpoint is rope.freqs. Remove it
            del checkpoint['rope.freqs']
            model.load_state_dict(checkpoint, strict=True)
            print(f"Loaded state dict in {time.time() - prev_time:.2f}s")
        
        return LLaMA(model, tokenizer, model_args)

    def text_completion(self, prompts: list[str], temperature: float = 0.6, top_p: float = 0.9, max_gen_len: Optional[int] = None):
        if max_gen_len is None:
            max_gen_len = self.args.max_seq_len - 1
        # Convert each prompt into tokens
        prompt_tokens = [self.tokenizer.encode(prompt, out_type=int, add_bos=True, add_eos=False) for prompt in prompts]
        # Make sure the batch size is not too large
        batch_size = len(prompt_tokens) # Number of Prompts
        assert batch_size <= self.args.max_batch_size, f"batch size must be less than or equal to {self.args.max_batch_size}"
        max_prompt_len = max(len(prompt) for prompt in prompt_tokens)
        # Make sure the prompt length is not larger than the maximum sequence length
        assert max_prompt_len <= self.args.max_seq_len, f"prompt length must be less than or equal to {self.args.max_seq_len}"
        total_len = min(self.args.max_seq_len, max_gen_len + max_prompt_len)

        # Create the list that will contain the generated tokens, along with the initial prompt tokens
        pad_id = self.tokenizer.pad_id() ## pad_id = -1
        tokens = torch.full((batch_size, total_len), pad_id, dtype=torch.long, device=device)
        for k, t in enumerate(prompt_tokens):
            # Populate the initial tokens with the prompt tokens
            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device=device)
        
        eos_reached = torch.tensor([False] * batch_size, device=device)
        prompt_tokens_mask = tokens != pad_id # True if the token is a prompt token, False otherwise
        cur_iterator = tqdm(range(1, total_len), desc="Generating tokens")
        for cur_pos in cur_iterator:
            with torch.no_grad():
                logits = self.model.forward(tokens[:, cur_pos-1:cur_pos], cur_pos)
                print(logits.shape)
            if temperature > 0:
                # The temperature is applied before the softmax
                probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
                next_token = self._sample_top_p(probs, top_p)
            else:
                # Greedily select the token with the max probability
                next_token = torch.argmax(logits[:, -1], dim=-1)

            next_token = next_token.reshape(-1)
            # Only replace token if it is a padding token
            next_token = torch.where(prompt_tokens_mask[:, cur_pos], tokens[:, cur_pos], next_token)
            tokens[:, cur_pos] = next_token
            # EOS is reached only if we found an EOS token for a padding position
            eos_reached |= (~prompt_tokens_mask[:, cur_pos]) & (next_token == self.tokenizer.eos_id)
            if all(eos_reached):
                break

        out_tokens = []
        out_text = []
        for prompt_index, current_prompt_tokens in enumerate(tokens.tolist()):
            # Cut to the EOS token, if present
            if self.tokenizer.eos_id in current_prompt_tokens:
                eos_idx = current_prompt_tokens.index(self.tokenizer.eos_id)
                current_prompt_tokens = current_prompt_tokens[:eos_idx]
            out_tokens.append(current_prompt_tokens)
            out_text.append(self.tokenizer.decode(current_prompt_tokens))
        return (out_tokens, out_text)
    
    def _sample_top_p(self, probs, p):
        # (B, vocab_size)
        probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
        # (B, vocab_size)
        probs_sum = torch.cumsum(probs_sort, dim=-1)
        # (B, vocab_size)
        # (Substracting "probs_sort" shifts the cumulative sum by 1 position to the right before masking)
        mask = probs_sum - probs_sort > p 
        # Zero out all the probabilities of tokens that are not selected by the Top P
        probs_sort[mask] = 0.0 
        # Redistribute the probabilities so that they sum up to 1.
        probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
        # Sample a token (its index) from the top p distribution
        next_token = torch.multinomial(probs_sort, num_samples=1)
        # Get the token position in the vocabulary corresponding to the sampled index
        next_token = torch.gather(probs_idx, -1, next_token)
        return next_token


In [10]:
torch.manual_seed(0)
allow_cuda = False
device = "cuda" if torch.cuda.is_available() and allow_cuda else "cpu"


prompts = [
    "who are you"
]


model = LLaMA.build(
    checkpoints_dir = "/kaggle/input/llama2-7b-model/",
    tokenizer_path = '/kaggle/input/llama2-7b-model/tokenizer.model',
    load_model = True,
    max_seq_len = 1024,
    max_batch_size=3,
    device=device
)

Loading checkpoint "/kaggle/input/llama2-7b-model/consolidated.00.pth"
Loaded checkpoint in 69.64s


  _C._set_default_tensor_type(t)


Loaded state dict in 89.02s


In [19]:
del model

In [28]:
# llama_model = model.model
# tokenizer = model.tokenizer
# model_args = model.args

In [29]:
# llama = LLLaMA(llama_model,tokenizer,model_args)

In [13]:
prompts = [
    # "who are you",
    # "Explain the meaning of LLaMA-2 Architecture in simple language",
    "Explain the concept of self-attention in 2 lines"
]

In [14]:
# Inference the model
out_tokens,out_text = (model.text_completion(prompts,max_gen_len=64,temperature=0.1))
assert len(out_text) == len(prompts)
for i in range(len(out_text)):
    print(f'{out_text[i]}')
    print("* "* 50)

Generating tokens:   0%|          | 0/77 [00:00<?, ?it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 2, 32, 128])
values  torch.Size([1, 2, 32, 128])
keys after repetition torch.Size([1, 2, 32, 128])
values after repetition torch.Size([1, 2, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 2, 128])
values transpose  torch.Size([1, 32, 2, 128])
scores  torch.Size([1, 32, 1, 2])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 3

Generating tokens:   1%|▏         | 1/77 [00:01<01:32,  1.22s/it]

output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 2, 32, 128])
values  torch.Size([1, 2, 32, 128])
keys after repetition torch.Size([1, 2, 32, 128])
values after repetition torch.Size([1, 2, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 2, 128])
values transpose  torch.Size([1, 32, 2, 128])
scores  torch.Size([1, 32, 1, 2])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 

Generating tokens:   3%|▎         | 2/77 [00:02<01:19,  1.07s/it]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 3, 32, 128])
values  torch.Size([1, 3, 32, 128])
keys after repetition torch.Size([1, 3, 32, 128])
values after repetition torch.Size([1, 3, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 3, 128])
values transpose  torch.Size([1, 32, 3, 128])
scores  torch.Size([1, 32, 1, 3])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 3

Generating tokens:   4%|▍         | 3/77 [00:03<01:16,  1.03s/it]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 4, 32, 128])
values  torch.Size([1, 4, 32, 128])
keys after repetition torch.Size([1, 4, 32, 128])
values after repetition torch.Size([1, 4, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 4, 128])
values transpose  torch.Size([1, 32, 4, 128])
scores  torch.Size([1, 32, 1, 4])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 3

Generating tokens:   5%|▌         | 4/77 [00:04<01:13,  1.01s/it]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 6, 32, 128])
values  torch.Size([1, 6, 32, 128])
keys after repetition torch.Size([1, 6, 32, 128])
values after repetition torch.Size([1, 6, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 6, 128])
values transpose  torch.Size([1, 32, 6, 128])
scores  torch.Size([1, 32, 1, 6])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  t

Generating tokens:   6%|▋         | 5/77 [00:05<01:11,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 6, 32, 128])
values  torch.Size([1, 6, 32, 128])
keys after repetition torch.Size([1, 6, 32, 128])
values after repetition torch.Size([1, 6, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 6, 128])
values transpose  torch.Size([1, 32, 6, 128])
scores  torch.Size([1, 32, 1, 6])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 3

Generating tokens:   8%|▊         | 6/77 [00:06<01:09,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 7, 32, 128])
values  torch.Size([1, 7, 32, 128])
keys after repetition torch.Size([1, 7, 32, 128])
values after repetition torch.Size([1, 7, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 7, 128])
values transpose  torch.Size([1, 32, 7, 128])
scores  torch.Size([1, 32, 1, 7])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 3

Generating tokens:   9%|▉         | 7/77 [00:07<01:08,  1.02it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 9, 32, 128])
values  torch.Size([1, 9, 32, 128])
keys after repetition torch.Size([1, 9, 32, 128])
values after repetition torch.Size([1, 9, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 9, 128])
values transpose  torch.Size([1, 32, 9, 128])
scores  torch.Size([1, 32, 1, 9])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  t

Generating tokens:  10%|█         | 8/77 [00:08<01:07,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 9, 32, 128])
values  torch.Size([1, 9, 32, 128])
keys after repetition torch.Size([1, 9, 32, 128])
values after repetition torch.Size([1, 9, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 9, 128])
values transpose  torch.Size([1, 32, 9, 128])
scores  torch.Size([1, 32, 1, 9])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 3

Generating tokens:  12%|█▏        | 9/77 [00:08<01:06,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 10, 32, 128])
values  torch.Size([1, 10, 32, 128])
keys after repetition torch.Size([1, 10, 32, 128])
values after repetition torch.Size([1, 10, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 10, 128])
values transpose  torch.Size([1, 32, 10, 128])
scores  torch.Size([1, 32, 1, 10])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  13%|█▎        | 10/77 [00:09<01:05,  1.03it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 12, 32, 128])
values  torch.Size([1, 12, 32, 128])
keys after repetition torch.Size([1, 12, 32, 128])
values after repetition torch.Size([1, 12, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 12, 128])
values transpose  torch.Size([1, 32, 12, 128])
scores  torch.Size([1, 32, 1, 12])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  14%|█▍        | 11/77 [00:10<01:03,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 12, 32, 128])
values  torch.Size([1, 12, 32, 128])
keys after repetition torch.Size([1, 12, 32, 128])
values after repetition torch.Size([1, 12, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 12, 128])
values transpose  torch.Size([1, 32, 12, 128])
scores  torch.Size([1, 32, 1, 12])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  16%|█▌        | 12/77 [00:11<01:03,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 13, 32, 128])
values  torch.Size([1, 13, 32, 128])
keys after repetition torch.Size([1, 13, 32, 128])
values after repetition torch.Size([1, 13, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 13, 128])
values transpose  torch.Size([1, 32, 13, 128])
scores  torch.Size([1, 32, 1, 13])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  17%|█▋        | 13/77 [00:12<01:02,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 14, 32, 128])
values  torch.Size([1, 14, 32, 128])
keys after repetition torch.Size([1, 14, 32, 128])
values after repetition torch.Size([1, 14, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 14, 128])
values transpose  torch.Size([1, 32, 14, 128])
scores  torch.Size([1, 32, 1, 14])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  18%|█▊        | 14/77 [00:13<01:01,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 15, 32, 128])
values  torch.Size([1, 15, 32, 128])
keys after repetition torch.Size([1, 15, 32, 128])
values after repetition torch.Size([1, 15, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 15, 128])
values transpose  torch.Size([1, 32, 15, 128])
scores  torch.Size([1, 32, 1, 15])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  19%|█▉        | 15/77 [00:14<01:00,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 16, 32, 128])
values  torch.Size([1, 16, 32, 128])
keys after repetition torch.Size([1, 16, 32, 128])
values after repetition torch.Size([1, 16, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 16, 128])
values transpose  torch.Size([1, 32, 16, 128])
scores  torch.Size([1, 32, 1, 16])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  21%|██        | 16/77 [00:15<00:59,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 17, 32, 128])
values  torch.Size([1, 17, 32, 128])
keys after repetition torch.Size([1, 17, 32, 128])
values after repetition torch.Size([1, 17, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 17, 128])
values transpose  torch.Size([1, 32, 17, 128])
scores  torch.Size([1, 32, 1, 17])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  22%|██▏       | 17/77 [00:16<00:58,  1.03it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 19, 32, 128])
values  torch.Size([1, 19, 32, 128])
keys after repetition torch.Size([1, 19, 32, 128])
values after repetition torch.Size([1, 19, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 19, 128])
values transpose  torch.Size([1, 32, 19, 128])
scores  torch.Size([1, 32, 1, 19])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  23%|██▎       | 18/77 [00:17<00:57,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 19, 32, 128])
values  torch.Size([1, 19, 32, 128])
keys after repetition torch.Size([1, 19, 32, 128])
values after repetition torch.Size([1, 19, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 19, 128])
values transpose  torch.Size([1, 32, 19, 128])
scores  torch.Size([1, 32, 1, 19])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  25%|██▍       | 19/77 [00:18<00:56,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 20, 32, 128])
values  torch.Size([1, 20, 32, 128])
keys after repetition torch.Size([1, 20, 32, 128])
values after repetition torch.Size([1, 20, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 20, 128])
values transpose  torch.Size([1, 32, 20, 128])
scores  torch.Size([1, 32, 1, 20])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  26%|██▌       | 20/77 [00:19<00:55,  1.03it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 22, 32, 128])
values  torch.Size([1, 22, 32, 128])
keys after repetition torch.Size([1, 22, 32, 128])
values after repetition torch.Size([1, 22, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 22, 128])
values transpose  torch.Size([1, 32, 22, 128])
scores  torch.Size([1, 32, 1, 22])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  27%|██▋       | 21/77 [00:20<00:54,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 22, 32, 128])
values  torch.Size([1, 22, 32, 128])
keys after repetition torch.Size([1, 22, 32, 128])
values after repetition torch.Size([1, 22, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 22, 128])
values transpose  torch.Size([1, 32, 22, 128])
scores  torch.Size([1, 32, 1, 22])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  29%|██▊       | 22/77 [00:21<00:53,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 23, 32, 128])
values  torch.Size([1, 23, 32, 128])
keys after repetition torch.Size([1, 23, 32, 128])
values after repetition torch.Size([1, 23, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 23, 128])
values transpose  torch.Size([1, 32, 23, 128])
scores  torch.Size([1, 32, 1, 23])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  30%|██▉       | 23/77 [00:22<00:52,  1.03it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 25, 32, 128])
values  torch.Size([1, 25, 32, 128])
keys after repetition torch.Size([1, 25, 32, 128])
values after repetition torch.Size([1, 25, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 25, 128])
values transpose  torch.Size([1, 32, 25, 128])
scores  torch.Size([1, 32, 1, 25])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  31%|███       | 24/77 [00:23<00:51,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 25, 32, 128])
values  torch.Size([1, 25, 32, 128])
keys after repetition torch.Size([1, 25, 32, 128])
values after repetition torch.Size([1, 25, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 25, 128])
values transpose  torch.Size([1, 32, 25, 128])
scores  torch.Size([1, 32, 1, 25])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  32%|███▏      | 25/77 [00:24<00:50,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 26, 32, 128])
values  torch.Size([1, 26, 32, 128])
keys after repetition torch.Size([1, 26, 32, 128])
values after repetition torch.Size([1, 26, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 26, 128])
values transpose  torch.Size([1, 32, 26, 128])
scores  torch.Size([1, 32, 1, 26])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  34%|███▍      | 26/77 [00:25<00:49,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 27, 32, 128])
values  torch.Size([1, 27, 32, 128])
keys after repetition torch.Size([1, 27, 32, 128])
values after repetition torch.Size([1, 27, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 27, 128])
values transpose  torch.Size([1, 32, 27, 128])
scores  torch.Size([1, 32, 1, 27])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  35%|███▌      | 27/77 [00:26<00:48,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 28, 32, 128])
values  torch.Size([1, 28, 32, 128])
keys after repetition torch.Size([1, 28, 32, 128])
values after repetition torch.Size([1, 28, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 28, 128])
values transpose  torch.Size([1, 32, 28, 128])
scores  torch.Size([1, 32, 1, 28])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  36%|███▋      | 28/77 [00:27<00:47,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 29, 32, 128])
values  torch.Size([1, 29, 32, 128])
keys after repetition torch.Size([1, 29, 32, 128])
values after repetition torch.Size([1, 29, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 29, 128])
values transpose  torch.Size([1, 32, 29, 128])
scores  torch.Size([1, 32, 1, 29])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  38%|███▊      | 29/77 [00:28<00:46,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 30, 32, 128])
values  torch.Size([1, 30, 32, 128])
keys after repetition torch.Size([1, 30, 32, 128])
values after repetition torch.Size([1, 30, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 30, 128])
values transpose  torch.Size([1, 32, 30, 128])
scores  torch.Size([1, 32, 1, 30])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  39%|███▉      | 30/77 [00:29<00:45,  1.03it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 32, 32, 128])
values  torch.Size([1, 32, 32, 128])
keys after repetition torch.Size([1, 32, 32, 128])
values after repetition torch.Size([1, 32, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 32, 128])
values transpose  torch.Size([1, 32, 32, 128])
scores  torch.Size([1, 32, 1, 32])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  40%|████      | 31/77 [00:30<00:44,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 32, 32, 128])
values  torch.Size([1, 32, 32, 128])
keys after repetition torch.Size([1, 32, 32, 128])
values after repetition torch.Size([1, 32, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 32, 128])
values transpose  torch.Size([1, 32, 32, 128])
scores  torch.Size([1, 32, 1, 32])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  42%|████▏     | 32/77 [00:31<00:43,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 33, 32, 128])
values  torch.Size([1, 33, 32, 128])
keys after repetition torch.Size([1, 33, 32, 128])
values after repetition torch.Size([1, 33, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 33, 128])
values transpose  torch.Size([1, 32, 33, 128])
scores  torch.Size([1, 32, 1, 33])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  43%|████▎     | 33/77 [00:32<00:42,  1.03it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 35, 32, 128])
values  torch.Size([1, 35, 32, 128])
keys after repetition torch.Size([1, 35, 32, 128])
values after repetition torch.Size([1, 35, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 35, 128])
values transpose  torch.Size([1, 32, 35, 128])
scores  torch.Size([1, 32, 1, 35])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  44%|████▍     | 34/77 [00:33<00:41,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 35, 32, 128])
values  torch.Size([1, 35, 32, 128])
keys after repetition torch.Size([1, 35, 32, 128])
values after repetition torch.Size([1, 35, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 35, 128])
values transpose  torch.Size([1, 32, 35, 128])
scores  torch.Size([1, 32, 1, 35])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  45%|████▌     | 35/77 [00:34<00:40,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 36, 32, 128])
values  torch.Size([1, 36, 32, 128])
keys after repetition torch.Size([1, 36, 32, 128])
values after repetition torch.Size([1, 36, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 36, 128])
values transpose  torch.Size([1, 32, 36, 128])
scores  torch.Size([1, 32, 1, 36])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  47%|████▋     | 36/77 [00:35<00:39,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 37, 32, 128])
values  torch.Size([1, 37, 32, 128])
keys after repetition torch.Size([1, 37, 32, 128])
values after repetition torch.Size([1, 37, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 37, 128])
values transpose  torch.Size([1, 32, 37, 128])
scores  torch.Size([1, 32, 1, 37])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  48%|████▊     | 37/77 [00:36<00:38,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 38, 32, 128])
values  torch.Size([1, 38, 32, 128])
keys after repetition torch.Size([1, 38, 32, 128])
values after repetition torch.Size([1, 38, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 38, 128])
values transpose  torch.Size([1, 32, 38, 128])
scores  torch.Size([1, 32, 1, 38])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  49%|████▉     | 38/77 [00:37<00:37,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 39, 32, 128])
values  torch.Size([1, 39, 32, 128])
keys after repetition torch.Size([1, 39, 32, 128])
values after repetition torch.Size([1, 39, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 39, 128])
values transpose  torch.Size([1, 32, 39, 128])
scores  torch.Size([1, 32, 1, 39])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  51%|█████     | 39/77 [00:38<00:36,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 40, 32, 128])
values  torch.Size([1, 40, 32, 128])
keys after repetition torch.Size([1, 40, 32, 128])
values after repetition torch.Size([1, 40, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 40, 128])
values transpose  torch.Size([1, 32, 40, 128])
scores  torch.Size([1, 32, 1, 40])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  52%|█████▏    | 40/77 [00:39<00:35,  1.03it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 42, 32, 128])
values  torch.Size([1, 42, 32, 128])
keys after repetition torch.Size([1, 42, 32, 128])
values after repetition torch.Size([1, 42, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 42, 128])
values transpose  torch.Size([1, 32, 42, 128])
scores  torch.Size([1, 32, 1, 42])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  53%|█████▎    | 41/77 [00:40<00:35,  1.03it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 42, 32, 128])
values  torch.Size([1, 42, 32, 128])
keys after repetition torch.Size([1, 42, 32, 128])
values after repetition torch.Size([1, 42, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 42, 128])
values transpose  torch.Size([1, 32, 42, 128])
scores  torch.Size([1, 32, 1, 42])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  55%|█████▍    | 42/77 [00:41<00:34,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 43, 32, 128])
values  torch.Size([1, 43, 32, 128])
keys after repetition torch.Size([1, 43, 32, 128])
values after repetition torch.Size([1, 43, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 43, 128])
values transpose  torch.Size([1, 32, 43, 128])
scores  torch.Size([1, 32, 1, 43])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  56%|█████▌    | 43/77 [00:42<00:33,  1.02it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 45, 32, 128])
values  torch.Size([1, 45, 32, 128])
keys after repetition torch.Size([1, 45, 32, 128])
values after repetition torch.Size([1, 45, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 45, 128])
values transpose  torch.Size([1, 32, 45, 128])
scores  torch.Size([1, 32, 1, 45])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  57%|█████▋    | 44/77 [00:43<00:32,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 45, 32, 128])
values  torch.Size([1, 45, 32, 128])
keys after repetition torch.Size([1, 45, 32, 128])
values after repetition torch.Size([1, 45, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 45, 128])
values transpose  torch.Size([1, 32, 45, 128])
scores  torch.Size([1, 32, 1, 45])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  58%|█████▊    | 45/77 [00:44<00:31,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 46, 32, 128])
values  torch.Size([1, 46, 32, 128])
keys after repetition torch.Size([1, 46, 32, 128])
values after repetition torch.Size([1, 46, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 46, 128])
values transpose  torch.Size([1, 32, 46, 128])
scores  torch.Size([1, 32, 1, 46])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  60%|█████▉    | 46/77 [00:44<00:30,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 47, 32, 128])
values  torch.Size([1, 47, 32, 128])
keys after repetition torch.Size([1, 47, 32, 128])
values after repetition torch.Size([1, 47, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 47, 128])
values transpose  torch.Size([1, 32, 47, 128])
scores  torch.Size([1, 32, 1, 47])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  61%|██████    | 47/77 [00:45<00:29,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 48, 32, 128])
values  torch.Size([1, 48, 32, 128])
keys after repetition torch.Size([1, 48, 32, 128])
values after repetition torch.Size([1, 48, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 48, 128])
values transpose  torch.Size([1, 32, 48, 128])
scores  torch.Size([1, 32, 1, 48])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  62%|██████▏   | 48/77 [00:46<00:28,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 49, 32, 128])
values  torch.Size([1, 49, 32, 128])
keys after repetition torch.Size([1, 49, 32, 128])
values after repetition torch.Size([1, 49, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 49, 128])
values transpose  torch.Size([1, 32, 49, 128])
scores  torch.Size([1, 32, 1, 49])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  64%|██████▎   | 49/77 [00:47<00:27,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 50, 32, 128])
values  torch.Size([1, 50, 32, 128])
keys after repetition torch.Size([1, 50, 32, 128])
values after repetition torch.Size([1, 50, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 50, 128])
values transpose  torch.Size([1, 32, 50, 128])
scores  torch.Size([1, 32, 1, 50])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  65%|██████▍   | 50/77 [00:48<00:26,  1.02it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 52, 32, 128])
values  torch.Size([1, 52, 32, 128])
keys after repetition torch.Size([1, 52, 32, 128])
values after repetition torch.Size([1, 52, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 52, 128])
values transpose  torch.Size([1, 32, 52, 128])
scores  torch.Size([1, 32, 1, 52])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  66%|██████▌   | 51/77 [00:49<00:25,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 52, 32, 128])
values  torch.Size([1, 52, 32, 128])
keys after repetition torch.Size([1, 52, 32, 128])
values after repetition torch.Size([1, 52, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 52, 128])
values transpose  torch.Size([1, 32, 52, 128])
scores  torch.Size([1, 32, 1, 52])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  68%|██████▊   | 52/77 [00:50<00:24,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 53, 32, 128])
values  torch.Size([1, 53, 32, 128])
keys after repetition torch.Size([1, 53, 32, 128])
values after repetition torch.Size([1, 53, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 53, 128])
values transpose  torch.Size([1, 32, 53, 128])
scores  torch.Size([1, 32, 1, 53])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  69%|██████▉   | 53/77 [00:51<00:23,  1.02it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 55, 32, 128])
values  torch.Size([1, 55, 32, 128])
keys after repetition torch.Size([1, 55, 32, 128])
values after repetition torch.Size([1, 55, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 55, 128])
values transpose  torch.Size([1, 32, 55, 128])
scores  torch.Size([1, 32, 1, 55])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  70%|███████   | 54/77 [00:52<00:22,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 55, 32, 128])
values  torch.Size([1, 55, 32, 128])
keys after repetition torch.Size([1, 55, 32, 128])
values after repetition torch.Size([1, 55, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 55, 128])
values transpose  torch.Size([1, 32, 55, 128])
scores  torch.Size([1, 32, 1, 55])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  71%|███████▏  | 55/77 [00:53<00:21,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 56, 32, 128])
values  torch.Size([1, 56, 32, 128])
keys after repetition torch.Size([1, 56, 32, 128])
values after repetition torch.Size([1, 56, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 56, 128])
values transpose  torch.Size([1, 32, 56, 128])
scores  torch.Size([1, 32, 1, 56])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  73%|███████▎  | 56/77 [00:54<00:20,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 57, 32, 128])
values  torch.Size([1, 57, 32, 128])
keys after repetition torch.Size([1, 57, 32, 128])
values after repetition torch.Size([1, 57, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 57, 128])
values transpose  torch.Size([1, 32, 57, 128])
scores  torch.Size([1, 32, 1, 57])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  74%|███████▍  | 57/77 [00:55<00:19,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 58, 32, 128])
values  torch.Size([1, 58, 32, 128])
keys after repetition torch.Size([1, 58, 32, 128])
values after repetition torch.Size([1, 58, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 58, 128])
values transpose  torch.Size([1, 32, 58, 128])
scores  torch.Size([1, 32, 1, 58])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  75%|███████▌  | 58/77 [00:56<00:18,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 59, 32, 128])
values  torch.Size([1, 59, 32, 128])
keys after repetition torch.Size([1, 59, 32, 128])
values after repetition torch.Size([1, 59, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 59, 128])
values transpose  torch.Size([1, 32, 59, 128])
scores  torch.Size([1, 32, 1, 59])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  77%|███████▋  | 59/77 [00:57<00:17,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 60, 32, 128])
values  torch.Size([1, 60, 32, 128])
keys after repetition torch.Size([1, 60, 32, 128])
values after repetition torch.Size([1, 60, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 60, 128])
values transpose  torch.Size([1, 32, 60, 128])
scores  torch.Size([1, 32, 1, 60])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  78%|███████▊  | 60/77 [00:58<00:16,  1.02it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 62, 32, 128])
values  torch.Size([1, 62, 32, 128])
keys after repetition torch.Size([1, 62, 32, 128])
values after repetition torch.Size([1, 62, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 62, 128])
values transpose  torch.Size([1, 32, 62, 128])
scores  torch.Size([1, 32, 1, 62])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  79%|███████▉  | 61/77 [00:59<00:15,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 62, 32, 128])
values  torch.Size([1, 62, 32, 128])
keys after repetition torch.Size([1, 62, 32, 128])
values after repetition torch.Size([1, 62, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 62, 128])
values transpose  torch.Size([1, 32, 62, 128])
scores  torch.Size([1, 32, 1, 62])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  81%|████████  | 62/77 [01:00<00:14,  1.02it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 63, 32, 128])
values  torch.Size([1, 63, 32, 128])
keys after repetition torch.Size([1, 63, 32, 128])
values after repetition torch.Size([1, 63, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 63, 128])
values transpose  torch.Size([1, 32, 63, 128])
scores  torch.Size([1, 32, 1, 63])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  82%|████████▏ | 63/77 [01:01<00:13,  1.02it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 65, 32, 128])
values  torch.Size([1, 65, 32, 128])
keys after repetition torch.Size([1, 65, 32, 128])
values after repetition torch.Size([1, 65, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 65, 128])
values transpose  torch.Size([1, 32, 65, 128])
scores  torch.Size([1, 32, 1, 65])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  83%|████████▎ | 64/77 [01:02<00:12,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 65, 32, 128])
values  torch.Size([1, 65, 32, 128])
keys after repetition torch.Size([1, 65, 32, 128])
values after repetition torch.Size([1, 65, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 65, 128])
values transpose  torch.Size([1, 32, 65, 128])
scores  torch.Size([1, 32, 1, 65])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  84%|████████▍ | 65/77 [01:03<00:11,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 66, 32, 128])
values  torch.Size([1, 66, 32, 128])
keys after repetition torch.Size([1, 66, 32, 128])
values after repetition torch.Size([1, 66, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 66, 128])
values transpose  torch.Size([1, 32, 66, 128])
scores  torch.Size([1, 32, 1, 66])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  86%|████████▌ | 66/77 [01:04<00:10,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 67, 32, 128])
values  torch.Size([1, 67, 32, 128])
keys after repetition torch.Size([1, 67, 32, 128])
values after repetition torch.Size([1, 67, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 67, 128])
values transpose  torch.Size([1, 32, 67, 128])
scores  torch.Size([1, 32, 1, 67])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  87%|████████▋ | 67/77 [01:05<00:09,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 69, 32, 128])
values  torch.Size([1, 69, 32, 128])
keys after repetition torch.Size([1, 69, 32, 128])
values after repetition torch.Size([1, 69, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 69, 128])
values transpose  torch.Size([1, 32, 69, 128])
scores  torch.Size([1, 32, 1, 69])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  88%|████████▊ | 68/77 [01:06<00:08,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 69, 32, 128])
values  torch.Size([1, 69, 32, 128])
keys after repetition torch.Size([1, 69, 32, 128])
values after repetition torch.Size([1, 69, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 69, 128])
values transpose  torch.Size([1, 32, 69, 128])
scores  torch.Size([1, 32, 1, 69])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  90%|████████▉ | 69/77 [01:07<00:07,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 70, 32, 128])
values  torch.Size([1, 70, 32, 128])
keys after repetition torch.Size([1, 70, 32, 128])
values after repetition torch.Size([1, 70, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 70, 128])
values transpose  torch.Size([1, 32, 70, 128])
scores  torch.Size([1, 32, 1, 70])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  91%|█████████ | 70/77 [01:08<00:06,  1.01it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 72, 32, 128])
values  torch.Size([1, 72, 32, 128])
keys after repetition torch.Size([1, 72, 32, 128])
values after repetition torch.Size([1, 72, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 72, 128])
values transpose  torch.Size([1, 32, 72, 128])
scores  torch.Size([1, 32, 1, 72])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  92%|█████████▏| 71/77 [01:09<00:05,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 72, 32, 128])
values  torch.Size([1, 72, 32, 128])
keys after repetition torch.Size([1, 72, 32, 128])
values after repetition torch.Size([1, 72, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 72, 128])
values transpose  torch.Size([1, 32, 72, 128])
scores  torch.Size([1, 32, 1, 72])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  94%|█████████▎| 72/77 [01:10<00:04,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 73, 32, 128])
values  torch.Size([1, 73, 32, 128])
keys after repetition torch.Size([1, 73, 32, 128])
values after repetition torch.Size([1, 73, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 73, 128])
values transpose  torch.Size([1, 32, 73, 128])
scores  torch.Size([1, 32, 1, 73])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  95%|█████████▍| 73/77 [01:11<00:03,  1.01it/s]

op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 75, 32, 128])
values  torch.Size([1, 75, 32, 128])
keys after repetition torch.Size([1, 75, 32, 128])
values after repetition torch.Size([1, 75, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 75, 128])
values transpose  torch.Size([1, 32, 75, 128])
scores  torch.Size([1, 32, 1, 75])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens:  96%|█████████▌| 74/77 [01:12<00:02,  1.01it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 75, 32, 128])
values  torch.Size([1, 75, 32, 128])
keys after repetition torch.Size([1, 75, 32, 128])
values after repetition torch.Size([1, 75, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 75, 128])
values transpose  torch.Size([1, 32, 75, 128])
scores  torch.Size([1, 32, 1, 75])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  97%|█████████▋| 75/77 [01:13<00:01,  1.00it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 76, 32, 128])
values  torch.Size([1, 76, 32, 128])
keys after repetition torch.Size([1, 76, 32, 128])
values after repetition torch.Size([1, 76, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 76, 128])
values transpose  torch.Size([1, 32, 76, 128])
scores  torch.Size([1, 32, 1, 76])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens:  99%|█████████▊| 76/77 [01:14<00:00,  1.00it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 77, 32, 128])
values  torch.Size([1, 77, 32, 128])
keys after repetition torch.Size([1, 77, 32, 128])
values after repetition torch.Size([1, 77, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 77, 128])
values transpose  torch.Size([1, 32, 77, 128])
scores  torch.Size([1, 32, 1, 77])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
op  torch.Size([1, 1, 32000])
torch.Size([1, 1, 32000])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128]

Generating tokens: 100%|██████████| 77/77 [01:15<00:00,  1.00it/s]

x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([1, 1, 32, 128])
xk ROPE:  torch.Size([1, 1, 32, 128])
keys  torch.Size([1, 78, 32, 128])
values  torch.Size([1, 78, 32, 128])
keys after repetition torch.Size([1, 78, 32, 128])
values after repetition torch.Size([1, 78, 32, 128])
xq transpose  torch.Size([1, 32, 1, 128])
keys transpose  torch.Size([1, 32, 78, 128])
values transpose  torch.Size([1, 32, 78, 128])
scores  torch.Size([1, 32, 1, 78])
output before wo  torch.Size([1, 32, 1, 128])
output after reshape  torch.Size([1, 1, 4096])
output after wo  torch.Size([1, 1, 4096])
x  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 4096])
xk  torch.Size([1, 1, 4096])
xv  torch.Size([1, 1, 4096])
xq  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xk  torch.Size([1, 1, 32, 128])
xq ROPE:  torch.Size([

Generating tokens: 100%|██████████| 77/77 [01:15<00:00,  1.02it/s]

Explain the concept of self-attention in 2 lines.
Self-attention is a neural network architecture that allows the model to learn to attend to different parts of the input sequence. It is a type of attention mechanism that allows the model to focus on specific parts of the input sequence, such as words or phrases, and to use that information to make predictions.
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 





#### Rough

In [14]:
tokenizer = SentencePieceProcessor()
tokenizer.load('/kaggle/input/llama2-7b-model/tokenizer.model')

True

In [15]:
tokenizer.encode(prompts[0])

[3439, 17632, 1925, 29892, 278, 6368, 310, 14215, 537, 5922, 393, 29871]

In [19]:
tokenizer.encode(prompts[0],add_bos=True,add_eos=False)

[1, 3439, 17632, 1925, 29892, 278, 6368, 310, 14215, 537, 5922, 393, 29871]

In [20]:
tokenizer.vocab_size()

32000

In [21]:
tokenizer.pad_id()

-1

In [24]:
x = torch.full((5,100),-1)
(x!=-1).shape

torch.Size([5, 100])

In [15]:
dim: int = 4096
n_layers: int = 32
n_heads: int = 32
n_kv_heads: Optional[int] = None
vocab_size: int = -1 # Later set in the build method
multiple_of: int = 256
ffn_dim_multiplier: Optional[float] = None
norm_eps: float = 1e-5

# Needed for KV cache
max_batch_size: int = 32
max_seq_len: int = 2048

device: str = None
max_gen_len = 64

In [16]:
# if max_gen_len is None:
#     max_gen_len = max_seq_len - 1
# Convert each prompt into tokens
prompt_tokens = [tokenizer.encode(prompt, out_type=int, add_bos=True, add_eos=False) for prompt in prompts]
# Make sure the batch size is not too large
batch_size = len(prompt_tokens) # Number of Prompts
assert batch_size <= max_batch_size, f"batch size must be less than or equal to {max_batch_size}"
max_prompt_len = max(len(prompt) for prompt in prompt_tokens)
# Make sure the prompt length is not larger than the maximum sequence length
assert max_prompt_len <= max_seq_len, f"prompt length must be less than or equal to {max_seq_len}"
total_len = min(max_seq_len, max_gen_len + max_prompt_len)

# Create the list that will contain the generated tokens, along with the initial prompt tokens
pad_id = tokenizer.pad_id() ## pad_id = -1
tokens = torch.full((batch_size, total_len), pad_id, dtype=torch.long, device=device)
for k, t in enumerate(prompt_tokens):
    # Populate the initial tokens with the prompt tokens
    tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device=device)

eos_reached = torch.tensor([False] * batch_size, device=device)
prompt_tokens_mask = tokens != pad_id # True if the token is a prompt token, False otherwise

In [46]:
tokens

tensor([[    1,  3439, 17632,  1925, 29892,   278,  6368,   310, 14215,   537,
          5922,   393, 29871,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1]])

In [40]:
tokens[0,74:75]

tensor([-1])

In [38]:
print(total_len)
cur_iterator = tqdm(range(1, total_len), desc="Generating tokens")
for cur_pos in cur_iterator:
    print(cur_pos-1," ",cur_pos)
    # with torch.no_grad():
    #     logits = self.model.forward(tokens[:, cur_pos-1:cur_pos], cur_pos)

77


Generating tokens: 100%|██████████| 76/76 [00:00<00:00, 41571.09it/s]

0   1
1   2
2   3
3   4
4   5
5   6
6   7
7   8
8   9
9   10
10   11
11   12
12   13
13   14
14   15
15   16
16   17
17   18
18   19
19   20
20   21
21   22
22   23
23   24
24   25
25   26
26   27
27   28
28   29
29   30
30   31
31   32
32   33
33   34
34   35
35   36
36   37
37   38
38   39
39   40
40   41
41   42
42   43
43   44
44   45
45   46
46   47
47   48
48   49
49   50
50   51
51   52
52   53
53   54
54   55
55   56
56   57
57   58
58   59
59   60
60   61
61   62
62   63
63   64
64   65
65   66
66   67
67   68
68   69
69   70
70   71
71   72
72   73
73   74
74   75
75   76





In [43]:
tokens[:,13:14]

tensor([[-1]])

In [48]:
for cur_pos in cur_iterator:
    print(cur_pos)
    with torch.no_grad():
        logits = model.model.forward(tokens[:, cur_pos-1:cur_pos], cur_pos)
        print(logits.shape)
    # if temperature > 0:
    #     # The temperature is applied before the softmax
    #     probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
    #     next_token = self._sample_top_p(probs, top_p)
    # else:
    #     # Greedily select the token with the max probability
    #     next_token = torch.argmax(logits[:, -1], dim=-1)
    next_token = torch.argmax(logits[:, -1], dim=-1)
    # print(next_token.shape)
    next_token = next_token.reshape(-1)
    # Only replace token if it is a padding token
    next_token = torch.where(prompt_tokens_mask[:, cur_pos], tokens[:, cur_pos], next_token)
    tokens[:, cur_pos] = next_token
    # EOS is reached only if we found an EOS token for a padding position
    eos_reached |= (~prompt_tokens_mask[:, cur_pos]) & (next_token == tokenizer.eos_id)
    if all(eos_reached):
        break

1
torch.Size([1, 1, 32000])
2
torch.Size([1, 1, 32000])
3
torch.Size([1, 1, 32000])
4
torch.Size([1, 1, 32000])
5
torch.Size([1, 1, 32000])
6
torch.Size([1, 1, 32000])
7
torch.Size([1, 1, 32000])
8
torch.Size([1, 1, 32000])
9
torch.Size([1, 1, 32000])
10
torch.Size([1, 1, 32000])
11
torch.Size([1, 1, 32000])
12
torch.Size([1, 1, 32000])
13
torch.Size([1, 1, 32000])
14
torch.Size([1, 1, 32000])
15
torch.Size([1, 1, 32000])
16
torch.Size([1, 1, 32000])
17
torch.Size([1, 1, 32000])
18
torch.Size([1, 1, 32000])
19
torch.Size([1, 1, 32000])
20
torch.Size([1, 1, 32000])
21
torch.Size([1, 1, 32000])
22
torch.Size([1, 1, 32000])
23
torch.Size([1, 1, 32000])
24
torch.Size([1, 1, 32000])
25
torch.Size([1, 1, 32000])
26
torch.Size([1, 1, 32000])
27
torch.Size([1, 1, 32000])
28
torch.Size([1, 1, 32000])
29
torch.Size([1, 1, 32000])
30
torch.Size([1, 1, 32000])
31
torch.Size([1, 1, 32000])
32
torch.Size([1, 1, 32000])
33
torch.Size([1, 1, 32000])
34
torch.Size([1, 1, 32000])
35
torch.Size([1, 1, 32

In [45]:
model.args.vocab_size

32000

Generating tokens: 100%|██████████| 114/114 [04:13<00:00,  2.22s/it]

Simply put, the theory of relativity states that 1) the speed of light is the same for all observers, and 2) the laws of physics are the same for all observers.
The theory of relativity is a theory of physics that describes the relationship between space and time. It is a theory that describes the behavior of matter and energy on a large scale.
The theory of relativity is a theory of physics that describes the relationship between space and time. It is a theory that describes the behavior of matter and energy on a large
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
If Google was an Italian company founded in Milan, it would be called Google Italia.
Google Italia is the Italian version of the world’s most popular search engine. It was launched in 2006 and has since become the most popular search engine in Italy.
Google Italia is a subsidiary of Google Inc., which is based in Mountain View, California. The company was founded in 1998


