In [1]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from PIL import Image

In [2]:
# Device configuration
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available() # For macOS
    else "cpu"
)

print(f"Using {device}")

Using cuda


In [3]:
# Load VGG16 Model
model = models.vgg16(weights='DEFAULT')
 # Remove the last fully connected layer
model = nn.Sequential(*list(model.features.children()))
model = model.to(device)
model.eval()

# Summarize
print(model)

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_si

In [4]:
# Extract features from images
features = {}
directory = 'Images'

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

for img_name in tqdm(os.listdir(directory)):
    img_path = os.path.join(directory, img_name)
    image = Image.open(img_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    feature = model(image)
    feature = feature.view(feature.size(0), -1).detach().cpu().numpy()[0]
    image_id = img_name.split('.')[0]
    features[image_id] = feature

  0%|          | 0/8091 [00:00<?, ?it/s]

In [5]:
# Store features in pickle
with open(os.path.join('./', 'features.pkl'), 'wb') as f:
    pickle.dump(features, f)

In [4]:
# Load features from pickle
with open(os.path.join('./', 'features.pkl'), 'rb') as f:
    features = pickle.load(f)

In [5]:
print(features['3250076419_eb3de15063'])
print(features['3250076419_eb3de15063'].shape)

[0.66494083 2.135181   0.9853073  ... 7.3553123  5.759862   4.8131127 ]
(25088,)


In [6]:
# Load captions from the text file
with open(os.path.join('./', 'captions.txt'), 'r') as f:
    next(f)
    captions_doc = f.read()

In [7]:
# Create mapping of image to captions
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    tokens = line.split(',')
    if len(line) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    image_id = image_id.split('.')[0]
    caption = " ".join(caption)
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption)

  0%|          | 0/40456 [00:00<?, ?it/s]

In [8]:
# Clean the captions
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            caption = captions[i]
            caption = caption.lower()
            caption = caption.replace('[^A-Za-z]', '')
            caption = caption.replace('\s+', ' ')
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word)>1]) + ' endseq'
            captions[i] = caption

In [9]:
# Before preprocess of text
print(mapping['1000268201_693b08cb0e'])

['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .', 'A little girl climbing into a wooden playhouse .', 'A little girl climbing the stairs to her playhouse .', 'A little girl in a pink dress going into a wooden cabin .']


In [10]:
# Preprocess the text
clean(mapping)

# After preprocess of text
print(mapping['1000268201_693b08cb0e'])

['startseq child in pink dress is climbing up set of stairs in an entry way endseq', 'startseq girl going into wooden building endseq', 'startseq little girl climbing into wooden playhouse endseq', 'startseq little girl climbing the stairs to her playhouse endseq', 'startseq little girl in pink dress going into wooden cabin endseq']


In [11]:
all_captions = [caption for captions in mapping.values() for caption in captions]
len(all_captions)

40455

In [12]:
all_captions[:10]

['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq',
 'startseq black dog and spotted dog are fighting endseq',
 'startseq black dog and tri-colored dog playing with each other on the road endseq',
 'startseq black dog and white dog with brown spots are staring at each other in the street endseq',
 'startseq two dogs of different breeds looking at each other on the road endseq',
 'startseq two dogs on pavement moving toward each other endseq']

In [13]:
import torchtext
from torchtext.data import get_tokenizer

tokenizer = get_tokenizer("basic_english")

# Tokenize the text
tokenized_text = [tokenizer(caption) for caption in all_captions]

# Build vocabulary : Mapping every token to an integer index
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_text)
vocab_size = len(vocab)
print(vocab_size)

8896


In [14]:
print(vocab['you'])

1377


In [15]:
max_length = max(len(caption.split()) for caption in all_captions)
print(max_length)

35


In [16]:
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.75)
train = image_ids[:64]
test = image_ids[split:]

In [17]:
def one_hot(a, num_classes):
  return np.squeeze(np.eye(num_classes)[a.reshape(-1)])

In [18]:
class CaptionDataset(Dataset):
    def __init__(self, data_keys, mapping, features, tokenizer, max_length):
        self.data_keys = data_keys
        self.mapping = mapping
        self.features = features
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data_keys)

    def __getitem__(self, idx):
        key = self.data_keys[idx]
        captions = self.mapping[key]

        caption = captions[np.random.choice(len(captions))]
        input1, input2, y = [], [], []
        
        tokens = self.tokenizer(caption)
        caption_indices = [vocab[token] for token in tokens]
        
        for i in range(1, len(caption_indices)):
            in_seq, out_seq = caption_indices[:i], caption_indices[i]
            
            out_seq = int(out_seq)
            
            in_seq = in_seq[:self.max_length] + [0] * max(0, self.max_length - len(in_seq))
            out_seq = one_hot(torch.tensor(out_seq), num_classes=vocab_size)
            
            input1.append(features[key])
            input2.append(torch.tensor(in_seq))
            y.append(torch.Tensor(out_seq))
        
        # y = y[:self.max_length] + [0 for _ in range(vocab_size)]*max(0, self.max_length - len(y))
        print([0 for _ in range(vocab_size)]*max(0, self.max_length - len(y)))
        print(y[:self.max_length] + [0 for _ in range(vocab_size)]*max(0, self.max_length - len(y)))
        return input1, input2, y



In [19]:
batch_size = 32
train_dataset = CaptionDataset(train, mapping, features, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

In [20]:
class ImageCaptioningModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super(ImageCaptioningModel, self).__init__()
        
        # Image feature layers
        self.image_feature_layer = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(25088, hidden_size),
            nn.ReLU()
        )
        
        # Sequence feature layers
        self.sequence_feature_layer = nn.Sequential(
            nn.Embedding(vocab_size, embedding_size, padding_idx=0),
            nn.Dropout(0.4),
            nn.LSTM(embedding_size, hidden_size)
        )
        
        # Decoder layers
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocab_size),
            nn.Softmax(dim=1)
        )

    def forward(self, image_input, sequence_input):
        fe = self.image_feature_layer(image_input)
        se, _ = self.sequence_feature_layer(sequence_input)
        se = se[:, -1, :]
        combined = torch.cat((fe, se), dim=1)
        output = self.decoder(combined)
        return output

In [21]:
# Instantiate the model
embedding_size = 256
hidden_size = 256

model = ImageCaptioningModel(vocab_size, embedding_size, hidden_size)
model = model.to(device)

In [22]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [23]:
# # Train the model
# num_epochs = 1

# for epoch in range(num_epochs):
#     total_loss = 0
#     model.train()

#     for batch in train_loader:
#         inputs1, inputs2, targets = batch
#         inputs1, inputs2, targets = inputs1.to(device), inputs2.to(device), targets.to(device)

#         # Generate output sequence from the model
#         output = model(inputs1, inputs2)

#         # # Reshape the output and targets to have the same batch size
#         # output = output.view(-1, vocab_size)
#         # targets = targets.view(-1)

#         # Calculate the loss

#         loss = criterion(output, targets)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#     average_loss = total_loss / len(train_loader)
#     print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}')

In [None]:
# Save the model
# torch.save(model.state_dict(), os.path.join('./', 'best_model.pth'))