In [1]:
from torch import nn
import torch.nn.init as init
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
os.environ["https_proxy"] = "http://127.0.0.1:7890/"
hf_token = "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"

model_id = "meta-llama/Llama-3.2-1B-Instruct"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_id,  padding_side = "left")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.bfloat16, device_map = device, use_auth_token = hf_token)
model.eval()



LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [7]:
text_ = ("Hi this is my first test!")
inputs = tokenizer(text_, return_tensors="pt").to(device)

In [28]:
inputs

{'input_ids': tensor([[128000,  13347,    420,    374,    856,   1176,   1296,      0]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [47]:
input_embed = model.get_input_embeddings()
input_vectors = input_embed(inputs["input_ids"]).to(device)

In [43]:
# NOTE: so in torch, each word is a row.
# NOTE: in_dim counts how many features per example, i.e. the number of columns
# NOTE： out_dim counts how many features per example, i.e. the number of columns in the output.
class Nin(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        # NOTE: couple of notes here, the nn.Parameter tells the model to update the parameter 
        # during gradient descent
        self.W = nn.Parameter(torch.rand(in_dim, out_dim))
        init.xavier_uniform(self.W)
        # NOTE: biases are not typicall xavier inited.
        self.b = nn.Parameter(torch.zeros(out_dim))


    def forward(self, input):
        # NOTE: below b stands for batch, h stands for how many words, w stands for input_dim
        # NOTE: so its gonne ba word_size * embedding_dim(self.in_dim) for input
        # NOTE: the self.W is inited to embedding_dim(in_dim) * output_dim
        # NOTE: which returns a word_size * output_dim matrix
        # NOTE self.b will be brocasted as output[b, h, o] += bias[o] along the last axis.
        output = torch.einsum("bhw, wo -> bho", input, self.W) 
        return output


    


In [None]:
class attention(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.K_W = Nin(in_dim, out_dim)
        self.Q_W = Nin(in_dim, out_dim)
        self.V_W = Nin(in_dim, out_dim)

    def forward(self, input):
        K = self.K_W(input)
        Q = self.Q_W(input)
        V = self.V_W(input)

        similarity_matrix = torch.softmax( torch.einsum("bij, bkj -> bik", K, Q), dim=-1)
        
        
        return similarity_matrix

In [49]:
print(input_vectors)

tensor([[[ 0.0027,  0.0031, -0.0068,  ...,  0.0011,  0.0008,  0.0015],
         [-0.0173,  0.0327,  0.0117,  ...,  0.0154, -0.0248,  0.0415],
         [-0.0042, -0.0027,  0.0352,  ...,  0.0183, -0.0223, -0.0205],
         ...,
         [ 0.0282, -0.0060, -0.0275,  ...,  0.0037, -0.0047,  0.0170],
         [ 0.0031,  0.0109,  0.0305,  ...,  0.0439,  0.0271, -0.0195],
         [ 0.0031,  0.0178,  0.0210,  ..., -0.0052, -0.0420, -0.0334]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<EmbeddingBackward0>)


In [57]:
myattention = attention(in_dim= 2048, out_dim=512)
myattention.to(device)
myattention.to(torch.bfloat16)
output = myattention(input_vectors)

  init.xavier_uniform(self.W)


In [71]:
#output[0] - torch.transpose(output[0],8,8)
torch.softmax(output, dim=-1)


tensor([[[0.1270, 0.1230, 0.1250, 0.1245, 0.1270, 0.1240, 0.1250, 0.1250],
         [0.1260, 0.1235, 0.1250, 0.1235, 0.1260, 0.1250, 0.1260, 0.1245],
         [0.1250, 0.1270, 0.1240, 0.1260, 0.1260, 0.1250, 0.1250, 0.1230],
         [0.1240, 0.1245, 0.1245, 0.1250, 0.1260, 0.1260, 0.1260, 0.1240],
         [0.1245, 0.1260, 0.1260, 0.1260, 0.1250, 0.1240, 0.1250, 0.1235],
         [0.1250, 0.1245, 0.1240, 0.1270, 0.1260, 0.1270, 0.1230, 0.1235],
         [0.1211, 0.1250, 0.1240, 0.1250, 0.1279, 0.1260, 0.1260, 0.1250],
         [0.1240, 0.1250, 0.1245, 0.1250, 0.1270, 0.1250, 0.1235, 0.1250]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<SoftmaxBackward0>)