In [1]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

from PIL import Image

In [2]:
# Device configuration
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available() # For macOS
    else "cpu"
)

print(f"Using {device}")

Using cuda


In [3]:
class PatchEmbedding(nn.Module):
    """Patch the image (needs to be square) and performs a linear projection of the patchs see : """
    def __init__(self, img_size, patch_size, in_channels=3, embedding_dim=512):
        super().__init__()
        self.img_size = img_size
        self.n_patches = (self.img_size // patch_size) ** 2
        self.proj_layer = nn.Conv2d(in_channels, embedding_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        """x : [n_batches, in_channels, img_size, img_size]
            output : [n_batches, embedding_dim, n_batches]
        """

        x = self.proj_layer(x) #[n_bathces, embedding_dim, sqrt(n_patches), sqrt(n_pathces)]
        x = x.flatten(2) #[n_batches, embedding_dim, n_patches]

        return x

class EncoderDecoderBlock(nn.Module):

    def __init__(self, dim, n_heads, mlp_ratio=4, p_dropout=0.5):
        super(EncoderDecoderBlock, self).__init__()

        self.dim = dim
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.mlp_ratio = mlp_ratio
        self.norm1 = nn.LayerNorm(self.dim)
        self.norm2 = nn.LayerNorm(self.dim)
        self.norm3 = nn.LayerNorm(self.dim)
        self.cross_attention = nn.MultiheadAttention(self.dim, self.n_heads, dropout=self.p_dropout, batch_first=True).to(device)
        self.first_attention = nn.MultiheadAttention(self.dim, self.n_heads, dropout=self.p_dropout, batch_first=True).to(device)
        self.MLP = nn.Sequential(
            nn.Linear(self.dim, self.dim * mlp_ratio),
            nn.GELU(),
            nn.Dropout(self.p_dropout),
            nn.Linear(self.dim * mlp_ratio, self.dim),
            nn.Dropout(self.p_dropout)
        )

    def forward(self, x, features):
        """
        x : [n_samples, n_patches + 1, embedding_dim]
        output : [n_samples, n_patches + 1, embedding_dim]
        """
        attention_out, attn1_weights = self.first_attention(x, x, x)
        first_out = self.norm1(attention_out + x)
        cross_attention, attn2_weights = self.cross_attention(first_out.to(device), features.to(device), features.to(device))
        second_out = self.norm2(first_out + cross_attention)
        mlp_out = self.MLP(second_out)
        output = self.norm3(mlp_out + second_out)

        return output



In [4]:
# Load captions from the text file
with open(os.path.join('./', 'captions.txt'), 'r') as f:
    next(f)
    captions_doc = f.read()

In [5]:
# Create mapping of image to captions
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    tokens = line.split(',')
    if len(line) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    image_id = image_id.split('.')[0]
    caption = " ".join(caption)
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption)

  0%|          | 0/40456 [00:00<?, ?it/s]

In [6]:
# Clean the captions
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            caption = captions[i]
            caption = caption.lower()
            caption = caption.replace('[^A-Za-z]', '')
            caption = caption.replace('\s+', ' ')
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word)>1]) + ' endseq'
            captions[i] = caption

In [7]:
# Preprocess the text
clean(mapping)

In [8]:
all_captions = [caption for captions in mapping.values() for caption in captions]
len(all_captions)

40455

In [9]:
import torchtext
from torchtext.data import get_tokenizer

tokenizer = get_tokenizer("basic_english")

# Tokenize the text
tokenized_text = [tokenizer(caption) for caption in all_captions]

# Build vocabulary : Mapping every token to an integer index
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_text)
vocab_size = len(vocab)
print(vocab_size)

8896


In [10]:
max_length = max(len(caption.split()) for caption in all_captions)
print(max_length)

35


In [11]:

def one_hot(a, num_classes):

    out = np.zeros(num_classes)
    out[a] = 1
    return out


In [12]:
class CaptioningDataset(Dataset):
  def __init__(self, data_keys, mapping, transform, tokenizer, max_length):
    self.data_keys = data_keys
    self.mapping = mapping
    self.transform = transform
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.data_keys)

  def __getitem__(self, idx):
      key = self.data_keys[idx]
      captions = self.mapping[key]

      caption = captions[np.random.choice(len(captions))]
      input2, y = torch.zeros(self.max_length).int(), torch.zeros((self.max_length, vocab_size))

      tokens = self.tokenizer(caption)
      caption_indices = [vocab[token] for token in tokens]

      image = Image.open('Images/' + self.data_keys[idx] + '.jpg')
      image = self.transform(image)
      # print("1", input2.shape)
      for i in range(1, len(caption_indices)):
          in_seq, out_seq = caption_indices[i-1], caption_indices[i]

          out_seq = int(out_seq)

          #in_seq = in_seq[:self.max_length] + [0] * max(0, self.max_length - len(in_seq))
          out_seq = one_hot(out_seq, num_classes=vocab_size)
          input2[i-1] = int(in_seq)
          # print("2", input2.shape)

          y[i-1] = torch.as_tensor(out_seq)
      # print("3", input2.shape)
      # y = pad_sequence([y, dummy3])[:, 0, :]
      # input1 = pad_sequence([torch.tensor(input1), dummy1])[:, 0, :]
      # input2 = pad_sequence([torch.tensor(input2), dummy2])[:, 0, :]
      # print("4", input2.shape)
      # print(input2)
      return image.transpose(0,2), input2, y




In [13]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [14]:
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.75)
train = image_ids[:split]
test = image_ids[split:]

In [15]:
batch_size = 32
train_dataset = CaptioningDataset(train, mapping, transform, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)

In [16]:

test_dataset = CaptioningDataset(test, mapping, transform, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

In [17]:
print(train_dataset.__getitem__(0)[0].size())
print(train_dataset.__getitem__(0)[1].size())
print(train_dataset.__getitem__(0)[2].size())

torch.Size([224, 224, 3])
torch.Size([35])
torch.Size([35, 8896])


In [18]:
class ImageCaptioningModel(nn.Module):
  def __init__(self, vit, encoder_decoder, vocab_size, embedding_dim):
    super().__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim

    self.vit = vit
    self.transformer = encoder_decoder.to(device)

    self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
    self.decoder = nn.Linear(embedding_dim, vocab_size)

  def forward(self, image, input2):
    vit_out = self.vit(image.cpu().numpy()).numpy()
    vit_out = torch.from_numpy(vit_out)
    embedding_out = self.embedding(input2)

    output = self.transformer(embedding_out, vit_out)

    output = self.decoder(output)

    return output




In [19]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor


image_encoder_model = "google/vit-base-patch16-224-in21k"
feature_extractor = AutoFeatureExtractor.from_pretrained(image_encoder_model)

2023-12-14 16:23:21.056067: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
for batch in train_loader:
    image, inputs2, targets = batch
    features = feature_extractor(image)
    break

It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


In [21]:
from vit_keras import vit
vit_model = vit.vit_b32(
        image_size = (224,224),
        activation = 'softmax',
        pretrained = True,
        include_top = False,
        pretrained_top = False,
        )



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

2023-12-14 16:23:24.517062: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-14 16:23:24.606449: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://git

In [22]:

feat = vit_model.predict(image.numpy())
feat

2023-12-14 16:23:29.331248: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8902




2023-12-14 16:23:29.960777: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


array([[ 0.64806175,  0.74852777, -0.3822827 , ...,  0.06661464,
         0.7493057 , -1.1575644 ],
       [ 0.26736924,  1.7556778 ,  0.77989626, ..., -0.19642167,
         0.5085754 , -0.9234603 ],
       [ 0.21296726, -0.21295676, -1.0947556 , ...,  0.46254605,
        -0.86662513, -0.4105462 ],
       ...,
       [ 0.4376147 ,  1.9498951 , -2.073971  , ...,  0.4937921 ,
         0.30152392,  0.0212844 ],
       [ 0.8694054 ,  1.1882443 ,  0.5216992 , ...,  0.5407412 ,
        -0.44117576,  0.03475773],
       [ 0.18222791,  0.91645265, -2.2571015 , ...,  0.26988328,
        -1.2860185 , -0.42763546]], dtype=float32)

In [23]:
feat.shape

(64, 768)

In [24]:
import tensorflow as tf


new_input = vit_model.input
hidden_layer = vit_model.layers[-2].output
vision_transformer_model = tf.keras.Model(new_input, hidden_layer)

In [25]:
featt = vision_transformer_model(image.numpy())

In [26]:
# Instantiate the model
embedding_size = 768
hidden_size = 256
n_heads = 12
depth = 1
dropout = 0.5

transformer = EncoderDecoderBlock(embedding_size,n_heads)


model = ImageCaptioningModel(vision_transformer_model, transformer , vocab_size, embedding_size)
model = model.to(device)

In [27]:
criterion = nn.CrossEntropyLoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [28]:
def loss_function(real, pred):
    mask = torch.logical_not(torch.eq(real, torch.zeros_like(real)))
    print(mask.size())
    loss_ = criterion(real[mask], pred[mask])
    mask = mask.to(dtype=loss_.dtype)
    loss_ *= mask

    return torch.sum(loss_) / torch.sum(mask)

In [None]:
targets.size()

In [29]:
targets[torch.logical_not(torch.eq(targets, torch.zeros_like(targets)))].size()

torch.Size([637])

In [30]:
# Train the model
num_epochs = 50

for epoch in range(num_epochs):
    total_loss = 0
    model.train()

    for batch in train_loader:
        image, inputs2, targets = batch

        image, inputs2, targets = image.to(device), inputs2.to(device), targets.to(device)
        # Generate output sequence from the model
        output = model(image, inputs2)

        mask = torch.logical_not(torch.eq(targets, torch.zeros_like(targets)))

        # Calculate the loss
        loss = criterion(output[mask], targets[mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}')

Epoch [1/50], Loss: 4405.0879
Epoch [2/50], Loss: 4279.7670
Epoch [3/50], Loss: 4271.7992
Epoch [4/50], Loss: 4320.0488
Epoch [5/50], Loss: 4262.4734
Epoch [6/50], Loss: 4263.0032
Epoch [7/50], Loss: 4272.9275
Epoch [8/50], Loss: 4282.2784
Epoch [9/50], Loss: 4303.9345
Epoch [10/50], Loss: 4285.5711
Epoch [11/50], Loss: 4286.8969
Epoch [12/50], Loss: 4295.9728
Epoch [13/50], Loss: 4287.2005
Epoch [14/50], Loss: 4267.2739
Epoch [15/50], Loss: 4276.3354
Epoch [16/50], Loss: 4283.2896
Epoch [17/50], Loss: 4287.9636
Epoch [18/50], Loss: 4259.7562
Epoch [19/50], Loss: 4285.7737
Epoch [20/50], Loss: 4312.4996
Epoch [21/50], Loss: 4281.1047
Epoch [22/50], Loss: 4267.7041
Epoch [23/50], Loss: 4269.2603
Epoch [24/50], Loss: 4245.9423
Epoch [25/50], Loss: 4255.5041
Epoch [26/50], Loss: 4265.8577
Epoch [27/50], Loss: 4268.9237
Epoch [28/50], Loss: 4267.8221
Epoch [29/50], Loss: 4256.5229
Epoch [30/50], Loss: 4267.9430
Epoch [31/50], Loss: 4277.2751
Epoch [32/50], Loss: 4295.8732
Epoch [33/50], Lo

In [None]:
# Evaluation
def idx_to_word(index):
    try:
        return vocab.get_itos()[index]
    except:
        return None

def predict_caption(model, image_path, max_length):
    model.eval()
    image = Image.open(image_path)
    image = transform(image).transpose(0,2)
    input2 = torch.zeros(1).int()
    in_text = 'startseq'
    for _ in range(max_length):
        input2[0] = torch.as_tensor(vocab[in_text.split(' ')[-1]], dtype=torch.int64)
        input2 = input2.to(device)

        outputs = model(image, input2)

        outputs = F.softmax(outputs, dim=1)

        y_pred = torch.argmax(outputs, dim=1).squeeze(0).item()

        word = idx_to_word(y_pred)
        in_text += ' ' + word

        if word is None or word == 'endseq' :
            break

    return in_text

In [None]:
predict_caption(model, "Image/3736786640_70df13be2c", 35)