# Layer

In [2]:
import torch
from torch import nn
from torch.nn.modules.transformer import _get_activation_fn, Module, Tensor, Optional, MultiheadAttention, Linear, Dropout, LayerNorm
from einops import rearrange

from layer import TransformerEncoderLayer
from layer_new import *
from encoders import * 
from transformer import *

## setup

In [10]:
# passed into train() in train.py
emsize=512 #yes, same in the paper
nhead=4 #yes, same in the paper
nhid=2*emsize # #yes, same in the paper: 1024
nlayers=1 # nlayers=6 # hmm, paper says 12

# my param:
num_features=20

# for TransformerEncoderLayer()
d_model=emsize
nhead=nhead

encoder = StyleEncoder(num_features, emsize)
y_encoder = StyleEmbEncoder(1, emsize)
style_encoder = None
global_att_embeddings = None 
full_attention = False
efficient_eval_masking = True

## layer - manually

### data

In [21]:
input = (torch.rand([100,20]),torch.randint(0,10,[100,1])) 
single_eval_pos = 78

src=input
src_mask=single_eval_pos

### run

In [22]:
encoder_layer = TransformerEncoderLayer(d_model=emsize, nhead=nhead)

In [23]:
# (1)
assert isinstance(src, tuple), 'inputs (src) have to be given as (x,y) or (style,x,y) tuple'

# (2)
if len(src) == 2: # (x,y) and no style
    src = (None,) + src

# (3)
style_src, x_src, y_src = src
# (4)
x_src = encoder(x_src)
y_src = y_encoder(y_src.unsqueeze(-1) if len(y_src.shape) < len(x_src.shape) else y_src)
style_src = style_encoder(style_src).unsqueeze(0) if style_encoder else \
    torch.tensor([], device=x_src.device)
global_src = torch.tensor([], device=x_src.device) if global_att_embeddings is None else \
    global_att_embeddings.weight.unsqueeze(1).repeat(1, x_src.shape[1], 1)

if src_mask is not None: assert global_att_embeddings is None or isinstance(src_mask, tuple)
if src_mask is None: # this is RUN: default src_mask=None not changed it seems
    if global_att_embeddings is None: # this is RUN: global_att_embeddings=None it seems
        # (5)
        full_len = len(x_src) + len(style_src)
        if full_attention: # NOT RUN: full_attention=False
            src_mask = bool_mask_to_att_mask(torch.ones((full_len, full_len), dtype=torch.bool)).to(x_src.device)
        elif efficient_eval_masking: # this is RUN: efficient_eval_masking=True
            src_mask = single_eval_pos + len(style_src)
        else:
            src_mask = generate_D_q_matrix(full_len, len(x_src) - single_eval_pos).to(x_src.device)
    else:
        src_mask_args = (global_att_embeddings.num_embeddings,
                            len(x_src) + len(style_src),
                            len(x_src) + len(style_src) - single_eval_pos)
        src_mask = (generate_global_att_globaltokens_matrix(*src_mask_args).to(x_src.device),
                    generate_global_att_trainset_matrix(*src_mask_args).to(x_src.device),
                    generate_global_att_query_matrix(*src_mask_args).to(x_src.device))
# (6)
train_x = x_src[:single_eval_pos] + y_src[:single_eval_pos]
# (7)
src = torch.cat([global_src, style_src, train_x, x_src[single_eval_pos:]], 0)

print(src.shape)

torch.Size([100, 512])


In [24]:
output = encoder_layer(src=src, src_mask=single_eval_pos)

ValueError: not enough values to unpack (expected 3, got 2)

## layer

### data

In [55]:
input = torch.rand(10,100,emsize)

### run

In [56]:
encoder_layer = TransformerEncoderLayer(d_model=emsize, nhead=nhead)
output = encoder_layer(input, src_mask=78)

print(output.shape)
print(output)

torch.Size([10, 100, 512])
tensor([[[ 1.3044,  0.4125, -1.3854,  ..., -0.3428,  0.0858,  1.7267],
         [ 0.4566, -0.7945,  0.2055,  ...,  0.5292, -1.2625,  1.2702],
         [ 1.4319,  1.4634,  1.1549,  ..., -0.1578, -1.3186,  0.5462],
         ...,
         [ 0.5837, -1.1374,  1.1215,  ...,  0.1268, -0.7942, -0.7186],
         [ 0.0514,  1.0702,  0.3732,  ..., -1.0067, -0.4521,  1.0087],
         [ 1.5101, -0.4314,  0.5100,  ..., -0.9074, -0.0667,  1.3394]],

        [[ 0.1783,  0.8957, -0.7370,  ..., -0.5013,  0.2503,  0.6662],
         [ 0.9626,  0.5682, -0.2310,  ..., -1.0270, -0.7095, -0.5113],
         [ 1.0692,  0.2713,  0.5007,  ...,  0.5971,  0.8838,  0.3847],
         ...,
         [ 1.3070,  0.1561,  0.9569,  ..., -1.0775,  1.3538,  1.2830],
         [ 1.8336, -0.5232, -1.5707,  ..., -0.1876, -1.0324, -0.4413],
         [ 1.2467,  1.6991, -0.2167,  ...,  0.2264,  1.4520,  0.9081]],

        [[ 1.6673,  0.9933, -1.4003,  ...,  0.1119,  1.1545,  2.0656],
         [ 0.3479,

In [57]:
encoder_layer_new = TransformerEncoderLayer_new(d_model=emsize, nhead=nhead)
output_new = encoder_layer_new(input, src_mask=78)

print(output_new.shape)
print(output_new)

ValueError: too many values to unpack (expected 3)

# Layer NEW 1

In [11]:
d_model=512
dropout=0.0
device=None
dtype=None

batch_first=False
factory_kwargs = {'device': device, 'dtype': dtype}

In [21]:
self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first)

# Implementation of Feedforward model
        
############################## Inter-feature attention ############################################
pre_linear1 = Linear(1, d_model)
pre_linear2 = Linear(1, d_model)

pre_linear3 = Linear(d_model, 1)
pre_linear4 = Linear(d_model, 1)

inter_feature_attn_1 = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
                                    **factory_kwargs)
inter_feature_attn_2 = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
                                    **factory_kwargs)
####################################################################################################

## forward()

In [29]:
torch.manual_seed(1)

# src = torch.rand(2000,emsize) # input that's originally comming 
src = torch.rand(2000,100)
src_mask = 75

In [30]:
src_ = src

In [31]:
%%script echo skipping

single_eval_position = src_mask
            
src1 = src_.unsqueeze(-1)

# Linear Layers()
# Multihead attention for the right and left

################### The interfeature implementation ###########################
src_left_ = pre_linear1(src1[:single_eval_position]) # 
src_right_ = pre_linear2(src1[single_eval_position:]) # <- linear layers
            
src_left_ = inter_feature_attn_1(src_left_, src_left_, src_left_)[0] #
src_right_ = inter_feature_attn_2(src_right_, src_right_, src_right_)[0] # <- interfeature attnetion

src_left_ = pre_linear3(src_left_) # 
src_right_ = pre_linear4(src_right_) # <- linear layers to squeeze everything back up

src_left_ = torch.squeeze(src_left_)
src_right_ = torch.squeeze(src_right_, -1) # <- set it back to usual [points, features] tensor
###############################################################################


src_left = self_attn(src_left_, src_left_, src_left_)[0]
src_right = self_attn(src_right_, src_left_, src_left_)[0]

src2 = torch.cat([src_left, src_right], dim=0)

skipping


steps

In [32]:
single_eval_position = src_mask

print(single_eval_position)

75


In [33]:
src1 = src_.unsqueeze(-1)

print(src_.shape)
print(src1.shape)

torch.Size([2000, 100])
torch.Size([2000, 100, 1])


In [34]:
src_left_ = pre_linear1(src1[:single_eval_position]) # 
src_right_ = pre_linear2(src1[single_eval_position:]) # <- linear layers

print(src_left_.shape)
print(src_right_.shape)

torch.Size([75, 100, 512])
torch.Size([1925, 100, 512])


In [None]:
src_left_ = inter_feature_attn_1(src_left_, src_left_, src_left_)[0] #
src_right_ = inter_feature_attn_2(src_right_, src_right_, src_right_)[0] # <- interfeature attnetion

# Layer NEW 2

In [56]:
d_model=512
dropout=0.0
layer_norm_eps=1e-5
device=None
dtype=None

batch_first=False
factory_kwargs = {'device': device, 'dtype': dtype}

In [58]:
self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first)

############################## Inter-feature attention ############################################
pre_linear1 = Linear(1, d_model)
pre_linear2 = Linear(d_model, 1)

inter_feature_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
                                    **factory_kwargs)

pre_norm_ = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
pre_dropout = Dropout(dropout)
####################################################################################################


## forward()

In [91]:
torch.manual_seed(1)

# src = torch.rand(2000,emsize) # input that's originally comming 
src = torch.rand(100,200,100)
src_mask = 75

In [92]:
src_ = src

In [93]:
%%script echo skipping

################### The interfeature implementation ###########################
            
src_left_ = src_[:single_eval_position]
src_right_ = src_[single_eval_position:] # <- split the data

src_left_ = rearrange(src_left_, 'b h w -> w (b h) 1') #
src_right_ = rearrange(src_right_, 'b h w -> w (b h) 1') # <- rearrange for Interfeature attention

print(f'Before 1st linear layer dimensions of src_left_ and right_: {src_left_.size(), src_right_.size()}')

src_left_ = self.pre_linear1(src_left_) # <- linear layers
src_right_ = self.pre_linear1(src_right_) # <- linear layers

print(f'After 1st linear layer dimensions of src_left_ and right_: {src_left_.size(), src_right_.size()}')

src_left_ = self.inter_feature_attn(src_left_, src_left_, src_left_)[0] #
src_right_ = self.inter_feature_attn(src_right_, src_right_, src_right_)[0] # <- interfeature attnetion

print(f'After 1st attention layer dimensions of src_left_ and right_: {src_left_.size(), src_right_.size()}')

src_left_ = self.pre_linear2(src_left_) # 
src_right_ = self.pre_linear2(src_right_) # <- linear layers to squeeze everything back up

print(f'After 2st linear layer dimensions of src_left_ and right_: {src_left_.size(), src_right_.size()}')

src_left_ = rearrange(src_left_, 'w (b h) 1 -> b h w', b = single_eval_position)
src_right_ = rearrange(src_right_, 'w (b h) 1 -> b h w', b = src_.size()[0] - single_eval_position)

print(f'After squeeze dimensions of src_left_ and right_: {src_left_.size(), src_right_.size()}')

src_left_ = self.pre_norm_(src_[:single_eval_position] + self.pre_dropout(src_left_))
src_right_ = self.pre_norm_(src_[single_eval_position:] + self.pre_dropout(src_right_)) # <- residual layer
###############################################################################

skipping


steps

In [94]:
src_left_ = src_[:single_eval_position]
src_right_ = src_[single_eval_position:] # <- split the data

print(src_left_.shape)
print(src_right_.shape)

torch.Size([75, 200, 100])
torch.Size([25, 200, 100])


In [95]:
src_left_ = rearrange(src_left_, 'b h w -> w (b h) 1') #
src_right_ = rearrange(src_right_, 'b h w -> w (b h) 1') # <- rearrange for Interfeature attention

print(src_left_.shape)
print(src_right_.shape)

torch.Size([100, 15000, 1])
torch.Size([100, 5000, 1])


### test

In [44]:
torch.manual_seed(1)
tensor = torch.randint(0,10,[5,4])
linear = Linear(1, 3)

print(tensor)
print(tensor.unsqueeze(-1))


tensor([[5, 9, 4, 8],
        [3, 3, 1, 1],
        [9, 2, 8, 9],
        [6, 3, 3, 0],
        [2, 1, 2, 6]])
tensor([[[5],
         [9],
         [4],
         [8]],

        [[3],
         [3],
         [1],
         [1]],

        [[9],
         [2],
         [8],
         [9]],

        [[6],
         [3],
         [3],
         [0]],

        [[2],
         [1],
         [2],
         [6]]])


In [46]:
print(linear(tensor.unsqueeze(-1).float()).shape)
print(linear(tensor.unsqueeze(-1).float()))

torch.Size([5, 4, 3])
tensor([[[-2.2934, -3.3403, -0.7902],
         [-3.7830, -5.7561, -1.4607],
         [-1.9210, -2.7363, -0.6226],
         [-3.4106, -5.1521, -1.2930]],

        [[-1.5486, -2.1323, -0.4550],
         [-1.5486, -2.1323, -0.4550],
         [-0.8037, -0.9244, -0.1197],
         [-0.8037, -0.9244, -0.1197]],

        [[-3.7830, -5.7561, -1.4607],
         [-1.1762, -1.5284, -0.2873],
         [-3.4106, -5.1521, -1.2930],
         [-3.7830, -5.7561, -1.4607]],

        [[-2.6658, -3.9442, -0.9578],
         [-1.5486, -2.1323, -0.4550],
         [-1.5486, -2.1323, -0.4550],
         [-0.4313, -0.3204,  0.0479]],

        [[-1.1762, -1.5284, -0.2873],
         [-0.8037, -0.9244, -0.1197],
         [-1.1762, -1.5284, -0.2873],
         [-2.6658, -3.9442, -0.9578]]], grad_fn=<AddBackward0>)


In [51]:
x = linear(tensor.unsqueeze(-1).float())
query, key, value = x, x, x

print(query.shape)

torch.Size([5, 4, 3])


In [52]:
query, key, value = [x.transpose(1, 0) for x in (query, key, value)]

print(query.shape)

torch.Size([4, 5, 3])


In [18]:
torch.manual_seed(0)
x = torch.randint(0,10,[1152, 1])
x[0:5]

tensor([[4],
        [9],
        [3],
        [0],
        [3]])

In [39]:
torch.manual_seed(0)
tensor1 = torch.rand([5,1,6])
tensor2 = torch.rand([5,6])
#tensor1 = torch.randint(0,10,[5,1,6])

print(tensor2.shape)

torch.Size([5, 6])


In [40]:
query, key, value = tensor1, tensor1, tensor1
multihead_attn = nn.MultiheadAttention(6, 2)
attn_output1, attn_output_weights = multihead_attn(query, key, value)

print(attn_output1.shape)

torch.Size([5, 1, 6])


In [43]:
%%script exho skipping

query2, key2, value2 = tensor2, tensor2, tensor2
multihead_attn2 = nn.MultiheadAttention(6, 2)
attn_output2, attn_output_weights2 = multihead_attn(query2, key2, value2)

print(attn_output2.shape)

Couldn't find program: 'exho'


In [48]:
torch.manual_seed(1)
tensor = torch.randint(0,10,[4,3,1])
print(tensor.shape)
print(tensor)
print(tensor.squeeze(2).shape)
print(tensor.squeeze(2))

torch.Size([4, 3, 1])
tensor([[[5],
         [9],
         [4]],

        [[8],
         [3],
         [3]],

        [[1],
         [1],
         [9]],

        [[2],
         [8],
         [9]]])
torch.Size([4, 3])
tensor([[5, 9, 4],
        [8, 3, 3],
        [1, 1, 9],
        [2, 8, 9]])
