In [1]:
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and encoder are loaded from the same pretrained checkpoint so
# that the vocabulary matches the model weights.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_bert_embeddings(text):
    """Return one mean-pooled BERT embedding per input text.

    Args:
        text: A string or a list of strings. Inputs are tokenized with
            padding enabled and truncated to at most 512 tokens.

    Returns:
        A tensor of shape [batch, hidden_size] holding, for each text, the
        average of the last-layer hidden states over its real (non-padding)
        tokens.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = bert_model(**inputs)
    hidden = outputs.last_hidden_state  # [batch, seq_len, hidden_size]

    # With padding=True, shorter texts in a batch are filled with [PAD]
    # tokens. Weight every position by the attention mask so those padded
    # positions do not dilute the average. For a single text the mask is all
    # ones and this reduces to a plain mean over the sequence axis.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)  # [batch, seq_len, 1]
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = token_sum / token_count
    return embeddings


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Inputs: a pooled sentence embedding for the instruction and a saved latent image.
text_embedding = get_bert_embeddings("The image contains rain near the subject, derain it")  # [Batch, TextEmbeddingDim]
latent_image = np.load("D:/FYP/latent_vector.npy")  # assumed layout: [Batch, H, W, Channels]

# Move channels in front of the spatial axes, as PyTorch pooling expects.
latent_vector_tensor = torch.from_numpy(latent_image).permute(0, 3, 1, 2)  # [Batch, Channels, H, W]

# Shrink the spatial grid to a fixed 8x8 so the sequence length is constant.
pooled_image = F.adaptive_avg_pool2d(latent_vector_tensor, (8, 8))

# Collapse the 8x8 grid into one axis and put it before the channel axis,
# giving an attention-style layout of [Batch, SeqLen, Channels].
image_features = pooled_image.flatten(start_dim=2).permute(0, 2, 1)

print(text_embedding.shape)
print(image_features.shape)


torch.Size([1, 768])
torch.Size([1, 64, 3])


In [4]:
# Cross-attention: the pooled text embedding is the query, the image patch
# features are the keys and values.

# text_embedding is [Batch, EmbedDim]; the sequence axis belongs at dim 1
# (after the batch axis), giving [Batch, 1, EmbedDim]. Inserting it at dim 0
# only happens to give the same shape when Batch == 1.
query = text_embedding.unsqueeze(1)
key_value = image_features  # [Batch, SeqLen, Channels]

embed_dim = query.size(-1)
seq_len = key_value.size(1)

# Step 1: Define a linear projection that lifts the image channels to the
# text embedding width, with sizes read from the tensors instead of hard-coded.
projection_layer = nn.Linear(in_features=key_value.size(-1), out_features=embed_dim)

# Step 2: Apply the projection. nn.Linear acts on the last dimension, so no
# reshape is needed; this also avoids .view() failing on the non-contiguous
# (permuted) image_features when Batch > 1.
tensor2_projected = projection_layer(key_value)  # [Batch, SeqLen, EmbedDim]

# Step 3: Expand the query to match the sequence length of tensor2_projected.
# NOTE: every expanded row is the same pooled text vector, so all SeqLen rows
# of the attention output are identical; the expansion only fixes the shape.
tensor1_expanded = query.expand(-1, seq_len, -1)  # [Batch, SeqLen, EmbedDim]

print(tensor1_expanded.shape)
print(tensor2_projected.shape)

query = tensor1_expanded
key_value = tensor2_projected

# Scaled dot-product scores: dividing by sqrt(EmbedDim) keeps the logits in a
# range where the softmax does not collapse to a near one-hot distribution.
attention_scores = torch.bmm(tensor1_expanded, tensor2_projected.transpose(1, 2)) / (embed_dim ** 0.5)
print(attention_scores.shape)
attention_weights = F.softmax(attention_scores, dim=-1)

# Compute the weighted sum of values
attention_output = torch.bmm(attention_weights, tensor2_projected)

print(attention_output)
print(attention_output.shape)

torch.Size([1, 64, 768])
torch.Size([1, 64, 768])
torch.Size([1, 64, 64])
tensor([[[-0.3339, -1.0060, -0.7532,  ...,  0.3036, -0.1059, -0.2459],
         [-0.3339, -1.0060, -0.7532,  ...,  0.3036, -0.1059, -0.2459],
         [-0.3339, -1.0060, -0.7532,  ...,  0.3036, -0.1059, -0.2459],
         ...,
         [-0.3339, -1.0060, -0.7532,  ...,  0.3036, -0.1059, -0.2459],
         [-0.3339, -1.0060, -0.7532,  ...,  0.3036, -0.1059, -0.2459],
         [-0.3339, -1.0060, -0.7532,  ...,  0.3036, -0.1059, -0.2459]]],
       grad_fn=<BmmBackward0>)
torch.Size([1, 64, 768])


In [5]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor, then apply dropout.

    Args:
        d_model: Size of the feature (last) dimension. Both even and odd
            values are supported.
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so trim the cosine term to fit. For even
        # d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Stored as [max_len, 1, d_model] so it broadcasts over the batch
        # axis. Registered as a buffer: it moves with the module across
        # devices but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Add the encoding for the first x.size(0) positions and apply dropout.

        Args:
            x: Tensor whose first axis indexes positions and whose last axis
                has size d_model, i.e. [seq_len, batch, d_model].

        Returns:
            Tensor of the same shape as ``x``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerEncoderModel(nn.Module):
    """Positional encoding followed by a stack of Transformer encoder layers.

    Args:
        feature_size: Width of the input features (the encoder's d_model).
        nhead: Number of attention heads per encoder layer.
        nhid: Hidden width of each layer's feed-forward sub-network.
        nlayers: Number of stacked encoder layers.
        dropout: Dropout probability used by the positional encoding and by
            every encoder layer.
    """

    def __init__(self, feature_size, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.feature_size = feature_size
        self.pos_encoder = PositionalEncoding(feature_size, dropout)
        layer = nn.TransformerEncoderLayer(
            d_model=feature_size,
            nhead=nhead,
            dim_feedforward=nhid,
            dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, nlayers)

    def forward(self, src):
        """Scale the input by sqrt(feature_size), add positions, and encode it."""
        scaled = src * math.sqrt(self.feature_size)
        return self.transformer_encoder(self.pos_encoder(scaled))

# Hyper-parameters of the Transformer encoder
feature_size = 768  # embedding width of each sequence element
nhead = 8           # attention heads per encoder layer
nhid = 768          # hidden width of the feed-forward sub-network
nlayers = 6         # number of stacked encoder layers
dropout = 0.2       # dropout probability

model = TransformerEncoderModel(
    feature_size=feature_size,
    nhead=nhead,
    nhid=nhid,
    nlayers=nlayers,
    dropout=dropout,
)

# The attention result is laid out as [batch size, sequence length, features]
input_tensor = attention_output

# The encoder is sequence-first, so swap the first two axes:
# [batch size, sequence length, features] -> [sequence length, batch size, features]
input_tensor_transposed = input_tensor.permute(1, 0, 2)  # Shape: [64, 1, 768]

output = model(input_tensor_transposed)
print(output.shape)  # Should be [64, 1, 768] (sequence length, batch size, features)


torch.Size([64, 1, 768])


