# We explore the `Qwen2.5-0.5B-Instruct` Model

In [1]:
from m2_cw.qwen import load_qwen
import torch
import inspect

# Load the model with its tokenizer and token map, requesting the
# reduced-vocabulary variant from the project helper.
model, tokenizer, token_map = load_qwen(small_vocabulary=True)

# Show the module tree first, then the concrete model class, one per line.
print(model, type(model), sep="\n")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(13, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbeddin

## Model Summary:
- Step 0: Notation
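    - $\mathcal{V}$ = Vocabulary, here $|\mathcal{V}| = 151{,}936$ for the full model (the model loaded above with `small_vocabulary=True` prints a reduced 13-row embedding table instead). All possible tokens that our model can recognise. 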
        - eg: "1", "cat", "ham", "?", ...
    - $D$ = Dimension of the embedding space. Each token in $\mathcal{V}$ is mapped to a learned embedding vector in $\mathbb{R}^D$.
        - eg: "cat" -> $(0.321, 0.341, 2.3, -0.5, ...) \in \mathbb{R}^D$.
    - $N$ = Length of the input sequence (in tokens). 
        - eg:  "The cat is fat." -> ["The ", "cat ", "is ", "fat", "."], $N=5$.
- Step 1: Embedding
    - Sequence of tokens -> Sparse one-hot encoded matrix -> embedding matrix
        - $N$ tokens -> $N \times |\mathcal{V}|$ Matrix -> $N \times D$ Matrix of embedding vectors.
- Step 2: 24 Decoder layers
    - Theory:
        - input saved as residual 1.
            - input layernorm RMSNorm.
            - self attention.
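                - 14 query heads (head dimension $896 / 14 = 64$) sharing 2 key/value heads (the k/v projections output $128 = 2 \times 64$ features), with rotary positional encoding applied to queries and keys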
        - residual 1 added. Saved as residual 2
            - post attention layernorm RMSNorm.
            - MLP
        - residual 2 added.
- Step 3: Language Model Head


### Step 1: Embedding

In [112]:
# Token-embedding table of the model (an `nn.Embedding`).
embedding = model.model.embed_tokens

# Derive the example token ids from the size of the embedding table so the
# lookup is valid for whichever vocabulary the model was loaded with.
# Hard-coded ids such as 151000 index past the end of a reduced table
# (e.g. the 13-row table printed when loading with small_vocabulary=True)
# and make this cell fail on a fresh kernel.
vocab_size = embedding.num_embeddings
tokens = torch.tensor([vocab_size - 1, vocab_size // 2, 0])

# Look up one embedding vector per token id: (3,) -> (3, embedding_dim).
embeddings = embedding(tokens)

print(type(embedding))

print(tokens.shape)
print(embeddings.shape)

<class 'torch.nn.modules.sparse.Embedding'>
torch.Size([3])
torch.Size([3, 896])


### Step 2: Self-Attention Layer x 24

In [89]:
# Take the first decoder layer and its attention sub-module.
decoder_layer = model.model.layers[0]
self_attn = decoder_layer.self_attn

# Query / key / value projection layers of that attention block.
q, k, v = self_attn.q_proj, self_attn.k_proj, self_attn.v_proj

# Print the path of the source file that defines the attention forward pass,
# so the implementation can be read directly.
print(inspect.getabsfile(self_attn.forward))

/Users/chrismiller/venvs/m2_venvs/m2_coursework/lib/python3.13/site-packages/transformers/models/qwen2/modeling_qwen2.py


In [101]:
hidden_size = 896

# Toy stand-in for a projected query tensor holding the integers 1..16.
# Each row plays the role of one token and each column one feature, i.e. the
# layout is (Number of Tokens, Embedding Dimension).
test_q = torch.arange(1, 17).view(4, 4)
print(test_q.shape) # >>> (4, 4)

# Split the last (feature) axis into heads of width 2; -1 lets torch infer the
# number of heads, giving (tokens, heads, head_dim) = (4, 2, 2).
new_shape = (4, -1, 2)
new_q = test_q.view(new_shape)
print(new_q.shape)

# Head 0 for every token: the first two features of each row.
print(new_q[:, 0, :])

torch.Size([4, 4])
torch.Size([4, 2, 2])
tensor([[ 1,  2],
        [ 5,  6],
        [ 9, 10],
        [13, 14]])


In [105]:
# Step 3: fetch the language-model head stored on `model.lm_head` and print
# its layer summary (input/output feature sizes and whether it has a bias).
head = model.lm_head
print(head)

Linear(in_features=896, out_features=151936, bias=True)


In [111]:
# Full module tree of the loaded model.
print(model)

# Path of the source file that implements `model.generate`, for reading the
# generation loop directly.
generate_source_file = inspect.getabsfile(model.generate)
print(generate_source_file)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe