In [4]:
import torch
import torch.nn as nn 

In [5]:
# Resolve the LayerNorm class from torch.nn by its string name; the value shown
# is the class object itself, not an instance.
getattr(torch.nn, "LayerNorm")

torch.nn.modules.normalization.LayerNorm

In [12]:
# Toy input: 3 sequences, 3 tokens each, 3 features per token.
batch, sentence_length, embedding_dim = (3, 3, 3)
embedding = torch.randn((batch, sentence_length, embedding_dim))
print(embedding)

# Normalise each token vector over its last (feature) dimension.
layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)
layer_norm(embedding)

tensor([[[-0.1779, -0.1302,  0.9696],
         [-0.3571, -0.5792, -0.2485],
         [ 0.8429, -2.2337, -0.1311]],

        [[ 0.4909, -1.9780, -0.0793],
         [-0.2605, -0.7634,  1.1383],
         [-0.7953, -0.5048, -1.6523]],

        [[-0.0388,  1.5982, -0.5791],
         [-0.4683,  1.3370, -0.4797],
         [ 2.3081,  1.6312,  0.3573]]])


tensor([[[-0.7516, -0.6616,  1.4132],
         [ 0.2748, -1.3385,  1.0637],
         [ 1.0517, -1.3447,  0.2930]],

        [[ 0.9598, -1.3794,  0.4196],
         [-0.3712, -0.9962,  1.3674],
         [ 0.3876,  0.9840, -1.3716]],

        [[-0.3949,  1.3735, -0.9786],
         [-0.7004,  1.4142, -0.7138],
         [ 1.0830,  0.2460, -1.3291]]], grad_fn=<NativeLayerNormBackward0>)

In [24]:
def build_rope_cache(
    seq_len: int, n_elem: int, device = None, base: int = 10000, condense_ratio: int = 1
) -> "tuple[torch.Tensor, torch.Tensor]":
    """Precompute the cosine and sine tables used by Rotary Position Embedding (RoPE).

    Follows the frequency schedule of "Enhanced Transformer with Rotary Position
    Embedding".

    Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
    transformers/rope/__init__.py. MIT License:
    https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.

    Args:
        seq_len: Number of positions; one table row is produced for each of the
            positions ``0 .. seq_len - 1``.
        n_elem: Width of the returned tables. Must be even: one frequency is
            generated per pair of columns and the half-width block is then
            duplicated, so an odd value would yield ``n_elem + 1`` columns.
        device: Device on which the tensors are created (``None`` lets PyTorch
            pick its default).
        base: Base of the geometric progression of frequencies.
        condense_ratio: Divisor applied to the position indexes before they are
            multiplied by the frequencies.

    Returns:
        A ``(cos, sin)`` pair of tensors, each of shape ``(seq_len, n_elem)``.
        Columns ``j`` and ``j + n_elem // 2`` hold the same angle.

    Raises:
        ValueError: If ``n_elem`` is odd.
    """
    if n_elem % 2 != 0:
        raise ValueError(f"n_elem must be even to form rotary pairs, got {n_elem}")

    # $\Theta = {\theta_i = base^{-\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
    theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem))

    # Create position indexes `[0, 1, ..., seq_len - 1]`, scaled down by `condense_ratio`
    seq_idx = torch.arange(seq_len, device=device) / condense_ratio

    # Calculate the product of position index and $\theta_i$; the half-width
    # result is tiled twice along the last dimension to reach `n_elem` columns.
    idx_theta = torch.outer(seq_idx, theta).repeat(1, 2)

    return torch.cos(idx_theta), torch.sin(idx_theta)

In [25]:
# Model dimensions, and the fraction of each head's width used as `n_elem`.
rope_pct = 0.25
n_embed = 512
n_head = 8

# Per-head width, then the slice of it handed to the RoPE cache builder.
head_size = n_embed // n_head
n_elem = int(head_size * rope_pct)
print(n_elem)

# Cos/sin tables for the first 10 positions.
build_rope_cache(seq_len=10, n_elem=n_elem)

16


(tensor([[ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
           1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000],
         [ 0.5403,  0.9504,  0.9950,  0.9995,  0.9999,  1.0000,  1.0000,  1.0000,
           0.5403,  0.9504,  0.9950,  0.9995,  0.9999,  1.0000,  1.0000,  1.0000],
         [-0.4161,  0.8066,  0.9801,  0.9980,  0.9998,  1.0000,  1.0000,  1.0000,
          -0.4161,  0.8066,  0.9801,  0.9980,  0.9998,  1.0000,  1.0000,  1.0000],
         [-0.9900,  0.5828,  0.9553,  0.9955,  0.9996,  1.0000,  1.0000,  1.0000,
          -0.9900,  0.5828,  0.9553,  0.9955,  0.9996,  1.0000,  1.0000,  1.0000],
         [-0.6536,  0.3011,  0.9211,  0.9920,  0.9992,  0.9999,  1.0000,  1.0000,
          -0.6536,  0.3011,  0.9211,  0.9920,  0.9992,  0.9999,  1.0000,  1.0000],
         [ 0.2837, -0.0103,  0.8776,  0.9875,  0.9988,  0.9999,  1.0000,  1.0000,
           0.2837, -0.0103,  0.8776,  0.9875,  0.9988,  0.9999,  1.0000,  1.0000],
         [

In [30]:
import torch

In [29]:
# Bind the LayerNorm class itself (not an instance) to a name, then construct a
# 512-feature module from it. Note that `layer_norm` held a module instance in
# an earlier cell; from here on it refers to the class.
# NOTE(review): eps=1.0 is far larger than PyTorch's default of 1e-5 - confirm it is intended.
layer_norm = torch.nn.LayerNorm
layer_norm(normalized_shape=512, eps=1.0)

LayerNorm((512,), eps=1.0, elementwise_affine=True)

In [36]:
# Random input with 8 features in the last dimension.
inp = torch.randn((5, 10, 8))

In [37]:
import torch.nn as nn

In [38]:
# Linear projection from 8 input features to 5 output features.
inp_layer = nn.Linear(in_features=8, out_features=5)

In [40]:
# Apply the linear layer to the random input.
op = inp_layer(inp)

In [41]:
# Shape of the input passed to the linear layer: (5, 10, 8).
inp.shape

torch.Size([5, 10, 8])

In [42]:
# Only the last dimension changes (8 -> 5); the leading (5, 10) dimensions of
# the input are carried through unchanged.
op.shape

torch.Size([5, 10, 5])

In [43]:
# Display the raw input values.
inp

tensor([[[-2.0198,  0.3577, -0.5207, -0.1539,  0.3153, -0.6795,  0.2200,
           0.5134],
         [ 0.4865, -0.1810, -0.2653, -0.4134, -0.7792, -0.2879, -0.3637,
          -0.5716],
         [-0.3169, -0.3722,  0.0184,  1.5840,  0.2090,  0.6374, -0.7977,
           1.5537],
         [-0.0475, -0.9240, -0.2025,  1.4259,  1.9119, -0.3163,  0.4386,
          -0.0115],
         [ 1.8534,  0.8006, -0.3712, -0.4632,  0.1588,  1.1213, -0.4442,
           1.0335],
         [ 0.3740, -0.6893, -0.5329, -0.0106,  1.8452, -0.4108,  1.1671,
           0.5873],
         [ 0.0759,  1.8401, -0.6289,  0.3394, -1.5147, -1.5296, -0.6283,
           0.1567],
         [-0.9164, -0.3853,  0.0749, -0.0648,  0.9237,  0.7397, -1.6918,
          -0.3539],
         [ 0.9429,  2.0696,  0.2259, -1.0724, -0.0069, -0.4512, -0.3075,
           1.2850],
         [ 1.7650,  0.5711,  0.5113,  0.6858,  1.2570,  0.9052,  1.8211,
          -0.7861]],

        [[ 0.4796, -0.2595, -0.4435, -0.2692, -2.4653,  0.9393,  1.9

In [44]:
# Display the values produced by the linear layer.
op

tensor([[[-0.2382,  1.2814,  0.4812,  0.6993,  0.3840],
         [ 0.3007, -0.3421, -0.5420,  0.2970, -0.5170],
         [-0.9252, -0.0696,  0.0864,  0.4234,  0.1360],
         [-0.8374, -0.0107, -0.3267,  0.2979,  0.6358],
         [-0.3415, -0.1580, -0.0230,  0.0275, -0.6359],
         [-0.7547,  0.4643, -0.1063,  0.5789, -0.0361],
         [ 0.1354,  0.7427, -0.7805,  0.0535, -0.6395],
         [-0.0532, -0.1849,  0.8555,  0.8299,  0.8211],
         [-0.5045,  0.7879,  0.3762,  0.4318, -1.0083],
         [-0.5817, -0.4732, -0.8947, -1.3382,  0.0570]],

        [[ 0.3030, -0.2322, -1.3049, -0.9431, -1.1370],
         [-0.3908,  1.7777,  0.4137,  0.5385, -0.1016],
         [-0.7657, -0.0157, -0.9643, -1.5109,  0.3757],
         [-0.8375, -0.1582, -0.4734,  1.0624, -0.6310],
         [ 0.1777,  0.5551,  0.3654, -0.4976,  0.1714],
         [-0.6529, -0.1334,  0.1244,  0.3624, -0.7437],
         [-0.4757, -1.0268, -1.3639, -0.4294, -0.6449],
         [-0.6276,  0.0293,  0.4315,  0.6240, 

In [48]:
# List every learnable parameter of the linear layer by name: a (5, 8) weight
# matrix and a length-5 bias, matching nn.Linear(8, 5).
for name, param in inp_layer.named_parameters():
    print(name, param)

weight Parameter containing:
tensor([[-0.0385,  0.0064, -0.2588, -0.1811, -0.2059,  0.0646, -0.0766, -0.3341],
        [-0.3284,  0.3485, -0.3106, -0.1192,  0.1465, -0.1673,  0.1596,  0.2262],
        [-0.3073,  0.0724,  0.1839, -0.3316,  0.2906,  0.1636, -0.2439,  0.2170],
        [-0.1010, -0.3238,  0.0081, -0.3178,  0.1464, -0.3321, -0.3470,  0.2954],
        [-0.3328,  0.1112, -0.2162,  0.2379,  0.3264,  0.2564, -0.0960, -0.2863]],
       requires_grad=True)
bias Parameter containing:
tensor([-0.1837,  0.0024, -0.1589,  0.2194, -0.1644], requires_grad=True)
