In [1]:
!wget https://storage.googleapis.com/4995-dlcv-project-data/Flickr8k.zip

--2023-12-05 07:22:17--  https://storage.googleapis.com/4995-dlcv-project-data/Flickr8k.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.10.207, 142.251.12.207, 172.217.194.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.10.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1112850179 (1.0G) [application/zip]
Saving to: ‘Flickr8k.zip’


2023-12-05 07:23:14 (18.9 MB/s) - ‘Flickr8k.zip’ saved [1112850179/1112850179]



In [2]:
!wget https://storage.googleapis.com/4995-dlcv-project-data/season_images.zip

--2023-12-05 07:23:16--  https://storage.googleapis.com/4995-dlcv-project-data/season_images.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.12.207, 172.217.194.207, 172.253.118.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.12.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5493694674 (5.1G) [application/zip]
Saving to: ‘season_images.zip’


2023-12-05 07:28:11 (17.8 MB/s) - ‘season_images.zip’ saved [5493694674/5493694674]



In [3]:
!mkdir -p "/content/flickr8k"

In [4]:
!mkdir -p "/content/season_images"

In [5]:
!unzip -q "/content/Flickr8k.zip" -d "/content/flickr8k"

In [6]:
!unzip -q "/content/season_images.zip" -d "/content/season_images"

In [7]:
import os
from collections import defaultdict
import glob
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [8]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Use {DEVICE} device")

Use cuda device


In [9]:
# Count data
flickr8k_data = glob.glob('/content/flickr8k/Images/*.jpg')
print(f"count of Flickr8k images :  {len(flickr8k_data)}")

count of Flickr8k images :  8091


In [11]:
# Load the features and IDs
loaded_features_list = np.load('/content/features.npy', allow_pickle=True)
loaded_ids_list = np.load('/content/ids.npy', allow_pickle=True)
# Recreate the dictionary
loaded_features_dict = dict(zip(loaded_ids_list, loaded_features_list))

In [12]:
print(len(loaded_features_dict), list(loaded_features_dict.items())[0])

8091 ('102455176_5f8ead62d5.jpg', array([0.36299232, 0.19369601, 0.22639096, ..., 0.15519994, 0.28031123,
       0.05308247]))


In [13]:
# Create a dictionary that has image name as key and all 5 captions as value
def read_image_captions(filename):
    image_descriptions = defaultdict(list)
    with open(filename,'r') as file_list:
        next(file_list)
        for line in file_list:
            line = line.strip()
            img_caption_list = line.split(".jpg,")
            img_name, captions = img_caption_list[0] + ".jpg", img_caption_list[1]
            caption_list = ["<START>"] + captions.lower().split(" ") + ["<END>"]
            image_descriptions[img_name].append(caption_list)
    return image_descriptions

In [14]:
descriptions = read_image_captions("/content/flickr8k/captions.txt")

In [15]:
print(descriptions["1001773457_577c3a7d70.jpg"])

[['<START>', 'a', 'black', 'dog', 'and', 'a', 'spotted', 'dog', 'are', 'fighting', '<END>'], ['<START>', 'a', 'black', 'dog', 'and', 'a', 'tri-colored', 'dog', 'playing', 'with', 'each', 'other', 'on', 'the', 'road', '.', '<END>'], ['<START>', 'a', 'black', 'dog', 'and', 'a', 'white', 'dog', 'with', 'brown', 'spots', 'are', 'staring', 'at', 'each', 'other', 'in', 'the', 'street', '.', '<END>'], ['<START>', 'two', 'dogs', 'of', 'different', 'breeds', 'looking', 'at', 'each', 'other', 'on', 'the', 'road', '.', '<END>'], ['<START>', 'two', 'dogs', 'on', 'pavement', 'moving', 'toward', 'each', 'other', '.', '<END>']]


In [16]:
# Split the dataset so that train : validation : test is 70 : 15 : 15
image_names = list(descriptions.keys())
random.shuffle(image_names)
total_images = len(image_names)

train_end = int(0.7 * total_images)
validation_end = train_end + int(0.15 * total_images)

train_names = image_names[: train_end]
val_names = image_names[train_end : validation_end]
test_names = image_names[validation_end :]

In [100]:
# Create a list of image names in the order
image_names = list(loaded_features_dict.keys())

# Use the image name subsets to create training, validation and test sets
train_features = {name: loaded_features_dict[name] for name in train_names}
val_features = {name: loaded_features_dict[name] for name in val_names}
test_features = {name: loaded_features_dict[name] for name in test_names}


In [18]:
list(train_features.items())[0]

('102455176_5f8ead62d5.jpg',
 array([0.36299232, 0.19369601, 0.22639096, ..., 0.15519994, 0.28031123,
        0.05308247]))

In [102]:
max_length = max(len(description) for name in train_names for description in descriptions[name])
print("Maximum length of a sequence: ", max_length)

Maximum length of a sequence:  40


In [20]:
# Create mapping for unique words in training data
train_tokens = set()
for name in train_names:
    captions = descriptions[name]
    for caption in captions:
        for token in caption:
            train_tokens.add(token)
train_tokens_sorted = sorted(list(train_tokens))

id_to_word = {}
word_to_id = {}
for i, token in enumerate(train_tokens_sorted):
    id_to_word[i] = token
    word_to_id[token] = i

In [22]:
print(word_to_id["dog"], id_to_word[2125])

2125 dog


In [24]:
class TextDataset(Dataset):
    def __init__(self, train_list, descriptions, word_to_id, max_len, vocab_size, image_features):
        self.data = []
        self.image_features = image_features
        for img_name in train_list:
            captions = descriptions[img_name]
            for caption in captions:
                for i in range(1, len(caption)):
                    encoded_input = [get_word_id(w, word_to_id) for w in caption[:i]]
                    if len(encoded_input) < max_len:
                        encoded_input += [0] * (max_len - len(encoded_input))
                    encoded_output = get_word_id(caption[i], word_to_id)

                    # Get the corresponding image feature
                    img_feature = self.image_features[img_name]

                    # Append a tuple of the encoded_input, encoded_output and the image_feature
                    self.data.append((encoded_input, encoded_output, img_feature))

        self.vocab_size = vocab_size

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_seq, output_word, img_feature = self.data[idx]
        return torch.tensor(input_seq, dtype=torch.long), torch.tensor(output_word, dtype=torch.long), torch.tensor(img_feature, dtype=torch.float)

In [25]:
def get_word_id(word, word_to_id):
    return word_to_id.get(word, word_to_id["<UNK>"])

In [37]:
MAX_LEN = max_length
vocab_size = len(word_to_id)

train_dataset = TextDataset(train_names, descriptions, word_to_id, MAX_LEN, vocab_size, train_features)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_dataset = TextDataset(val_names, descriptions, word_to_id, MAX_LEN, vocab_size, val_features)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

In [67]:
val_names[5]

'1569687608_0e3b3ad044.jpg'

In [70]:
train_names[10]

'3121521593_18f0ec14f7.jpg'

In [71]:
descriptions["3121521593_18f0ec14f7.jpg"]

[['<START>',
  'a',
  'smiling',
  'girl',
  'in',
  'glasses',
  'and',
  'a',
  'blue-gray',
  'scarf',
  'with',
  'a',
  'smiling',
  'girl',
  'in',
  'a',
  'black',
  'coat',
  'leaning',
  'on',
  'her',
  'shoulder',
  '.',
  '<END>'],
 ['<START>',
  'a',
  'woman',
  'leaning',
  'her',
  'head',
  'on',
  'the',
  'shoulder',
  'of',
  'another',
  'woman',
  '.',
  '<END>'],
 ['<START>',
  '"two',
  'girls',
  ',',
  'one',
  'wearing',
  'glasses',
  ',',
  'dressed',
  'in',
  'cold',
  'weather',
  'clothing',
  'smile',
  'for',
  'the',
  'camera',
  '."',
  '<END>'],
 ['<START>',
  'two',
  'warmly',
  'dress',
  'girls',
  'posing',
  'for',
  'a',
  'picture',
  'outdoors',
  '<END>'],
 ['<START>',
  'two',
  'young',
  'woman',
  'hug',
  'and',
  'pose',
  'together',
  'for',
  'a',
  'picture',
  '.',
  '<END>']]

In [None]:
# features with the word embeddings and feeds them into the LSTM version
class ImageCaptioningModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, image_feature_dim):
        super(ImageCaptioningModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)
        # Add a layer to transform the image features
        self.image_fc = nn.Linear(image_feature_dim, hidden_dim)

    def forward(self, x, image_features):
        # Transform the image features
        image_features = self.image_fc(image_features).unsqueeze(1)

        # Embed the input sequence
        x = self.embedding(x)

        # Repeat the image features to match the sequence length
        image_features = image_features.repeat(1, x.size(1), 1)
        x = torch.cat([x, image_features], dim=-1)

        # LSTM layer output: (batch_size, max_len, hidden_dim * 2)
        x, _ = self.lstm(x)
        # We only use the output of the last time step
        x = x[:, -1, :]
        # Fully connected layer output: (batch_size, vocab_size)
        x = self.fc(x)

        return x

In [79]:
# feature in initial state for LSTM version
class ImageCaptioningModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, image_feature_dim):
        super(ImageCaptioningModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)
        # Add a layer to transform the image features into a suitable initial state for the LSTM
        self.image_fc = nn.Linear(image_feature_dim, hidden_dim)

    def forward(self, x, image_features):
        x = self.embedding(x)
        # Use the transformed image features as the initial hidden state of the LSTM
        h0 = self.image_fc(image_features).unsqueeze(0).repeat(2, 1, 1)
        c0 = torch.zeros_like(h0)
        x, _ = self.lstm(x, (h0, c0))
        x = x[:, -1, :]
        x = self.fc(x)
        return x

In [39]:
image_feature_dim = 2048  # set this to the size of your image features
embedding_dim = 200
hidden_dim = 300

In [103]:
model

ImageCaptioningModel(
  (embedding): Embedding(7811, 200, padding_idx=0)
  (lstm): LSTM(200, 300, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=600, out_features=7811, bias=True)
  (image_fc): Linear(in_features=2048, out_features=300, bias=True)
)

In [80]:
import torch.nn.functional as F

model = ImageCaptioningModel(vocab_size, embedding_dim, hidden_dim, image_feature_dim)

# Use the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5

# Training loop
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    for input_seq, output_word, img_feature in train_loader:
        # If using GPU, move data to GPU
        input_seq = input_seq.to(device)
        output_word = output_word.to(device)
        img_feature = img_feature.to(device)

        # Forward pass
        outputs = model(input_seq, img_feature)

        # Compute loss
        loss = criterion(outputs, output_word)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * input_seq.size(0)

    epoch_loss = running_loss / len(train_dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}')

    # Validation loop
    model.eval()  # Set model to evaluation mode
    running_val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for val_input_seq, val_output_word, val_img_feature in val_loader:
            # If using GPU, move data to GPU
            val_input_seq = val_input_seq.to(device)
            val_output_word = val_output_word.to(device)
            val_img_feature = val_img_feature.to(device)

            # Forward pass
            val_outputs = model(val_input_seq, val_img_feature)

            # Compute loss
            val_loss = criterion(val_outputs, val_output_word)
            running_val_loss += val_loss.item() * val_input_seq.size(0)

            # Compute accuracy
            _, predicted = torch.max(val_outputs.data, 1)
            total += val_output_word.size(0)
            correct += (predicted == val_output_word).sum().item()

    epoch_val_loss = running_val_loss / len(val_dataset)
    accuracy = correct / total * 100  # Compute the accuracy
    print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {epoch_val_loss}, Accuracy: {accuracy}')

Epoch 1/5, Loss: 4.664137127982076
Epoch 1/5, Validation Loss: 4.392096644088375, Accuracy: 28.239490314159806
Epoch 2/5, Loss: 4.180260459329399
Epoch 2/5, Validation Loss: 4.171144671843194, Accuracy: 29.732104780243212
Epoch 3/5, Loss: 4.053153982923867
Epoch 3/5, Validation Loss: 4.178437808234081, Accuracy: 29.75536630438997
Epoch 4/5, Loss: 4.131807929132734
Epoch 4/5, Validation Loss: 4.210902051707148, Accuracy: 29.5899510215686
Epoch 5/5, Loss: 4.026902520270426
Epoch 5/5, Validation Loss: 4.240431427658302, Accuracy: 29.295305049043048


In [81]:
def sample_decoder(img_feature):
    seq = ["<START>"]
    while len(seq) < MAX_LEN and seq[-1] != "<END>":
        encoded_input = [word_to_id[w] for w in seq]
        encoded_input = torch.tensor([encoded_input]).to(device)
        if len(encoded_input[0]) < MAX_LEN:
            padding = torch.zeros((1, MAX_LEN - len(encoded_input[0]))).long().to(device)
            encoded_input = torch.cat((encoded_input, padding), dim=1)

        # Forward pass through the model
        with torch.no_grad():
            outputs = model(encoded_input, img_feature)

        # The output is a distribution over the vocabulary.
        # Use the softmax function to convert it to probabilities
        probs = F.softmax(outputs, dim=-1)

        # Sample a word from the distribution
        sampled_word = torch.multinomial(probs[0], 1)

        pred_word = id_to_word[sampled_word.item()]
        seq.append(pred_word)
    return seq

In [118]:
def sample_decoder(img_feature, start_seq):
    seq = [id_to_word[id.item()] for id in start_seq if id != 0]  # 0 is usually the padding value
    while len(seq) < MAX_LEN and seq[-1] != "<END>":
        encoded_input = [word_to_id[w] for w in seq]
        encoded_input = torch.tensor([encoded_input]).to(device)
        img_feature = img_feature.to(device)

        # Forward pass through the model
        with torch.no_grad():
            outputs = model(encoded_input, img_feature)

        # The output is a distribution over the vocabulary.
        # Use the softmax function to convert it to probabilities
        probs = F.softmax(outputs, dim=-1)

        # Sample a word from the distribution
        sampled_word = torch.multinomial(probs[0], 1)

        pred_word = id_to_word[sampled_word.item()]
        seq.append(pred_word)
    return seq

# Get an input sequence and the associated image feature from the validation dataset
input_seq, _, img_feature = val_dataset[0]

# Generate a caption for the image
caption = sample_decoder(img_feature, input_seq)

# Print the caption
print(' '.join(caption))

<START> sled white kitchen others on bat <END>
