In [None]:
import os
import pickle
import re

import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader

from PIL import Image

In [None]:
# Device configuration: prefer CUDA, then Apple's MPS backend, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():  # For macOS
    device = "mps"
else:
    device = "cpu"

print(f"Using {device}")

In [None]:
# Load the pretrained VGG16 and keep only the layers under `features`;
# everything else in the network is discarded.
model = nn.Sequential(*models.vgg16(weights='DEFAULT').features.children())
model = model.to(device)
model.eval()  # inference mode for the feature extractor

# Summarize
print(model)

In [None]:
# Extract features from images
features = {}
directory = 'Images'

# Resize to 224x224 and normalise with the per-channel mean/std given below
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Pure inference: disabling autograd avoids building (and holding on to) a graph
# for every image, which cuts memory use and speeds up the loop.
with torch.no_grad():
    for img_name in tqdm(os.listdir(directory)):
        img_path = os.path.join(directory, img_name)
        # Use a context manager so the underlying file handle is released promptly
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')
        image = transform(image).unsqueeze(0).to(device)
        feature = model(image)
        # Flatten to one vector per image and move it back to host memory
        feature = feature.view(feature.size(0), -1).cpu().numpy()[0]
        # The image id is the file name up to the first dot
        image_id = img_name.split('.')[0]
        features[image_id] = feature

In [None]:
# Persist the extracted feature dictionary to features.pkl
features_path = os.path.join('./', 'features.pkl')
with open(features_path, 'wb') as out_file:
    pickle.dump(features, out_file)

In [None]:
# Reload the feature dictionary from features.pkl
# NOTE: pickle.load executes arbitrary code; only load files you produced yourself.
features_path = os.path.join('./', 'features.pkl')
with open(features_path, 'rb') as in_file:
    features = pickle.load(in_file)

In [None]:
# Inspect one stored feature vector and its shape
sample_feature = features['3250076419_eb3de15063']
print(sample_feature)
print(sample_feature.shape)

In [None]:
# Read captions.txt into one string, skipping its first (header) line
captions_path = os.path.join('./', 'captions.txt')
with open(captions_path, 'r') as captions_file:
    next(captions_file)
    captions_doc = captions_file.read()

In [None]:
# Build {image_id: [caption, ...]} from the comma-separated caption lines
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # Skip blank / degenerate lines
    if len(line) < 2:
        continue
    # First field is the image file name; the remaining fields form the caption
    raw_id, *caption_parts = line.split(',')
    image_id = raw_id.split('.')[0]
    mapping.setdefault(image_id, []).append(" ".join(caption_parts))

In [None]:
# Clean the captions
def clean(mapping):
    """Normalise every caption in ``mapping`` in place.

    Each caption is lower-cased, stripped of everything except letters and
    whitespace, reduced to words longer than one character, and wrapped in
    the ``startseq`` / ``endseq`` markers.

    Args:
        mapping: dict mapping an image id to a list of caption strings. The
            lists are modified in place.

    Returns:
        None.
    """
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            caption = caption.lower()
            # str.replace() matches its argument literally, so a regex needs re.sub.
            # Whitespace is preserved here so that words stay separated.
            caption = re.sub(r'[^a-z\s]', '', caption)
            # split() also collapses any run of whitespace into a single separator
            words = caption.split()
            # Drop markers left by a previous call so re-running the cell is idempotent
            if len(words) >= 2 and words[0] == 'startseq' and words[-1] == 'endseq':
                words = words[1:-1]
            words = [word for word in words if len(word) > 1]
            captions[i] = 'startseq ' + " ".join(words) + ' endseq'

In [None]:
# Captions of one sample image before preprocessing
raw_sample = mapping['1000268201_693b08cb0e']
print(raw_sample)

In [None]:
# Run the caption preprocessing; clean() rewrites the lists inside `mapping` in place
clean(mapping)

# Captions of the same sample image after preprocessing
cleaned_sample = mapping['1000268201_693b08cb0e']
print(cleaned_sample)

In [None]:
# Flatten the per-image caption lists into a single list, then show how many there are
all_captions = [caption for key in mapping for caption in mapping[key]]
len(all_captions)

In [None]:
# Preview the first ten captions
all_captions[0:10]

In [None]:
import torchtext
from torchtext.data import get_tokenizer

tokenizer = get_tokenizer("basic_english")

# Tokenize the text
tokenized_text = [tokenizer(caption) for caption in all_captions]

# Build vocabulary : Mapping every token to an integer index.
# Index 0 is used as the padding value for input sequences and as the embedding's
# padding_idx, so it must not belong to a real word: reserve it for '<pad>'.
# '<unk>' gives unknown tokens a valid index instead of raising on lookup.
vocab = torchtext.vocab.build_vocab_from_iterator(
    tokenized_text,
    specials=['<pad>', '<unk>'],
)
vocab.set_default_index(vocab['<unk>'])
vocab_size = len(vocab)
print(vocab_size)

In [None]:
# Look up the integer index assigned to a sample word
you_index = vocab['you']
print(you_index)

In [None]:
# Longest caption, measured in whitespace-separated words
max_length = max(map(len, map(str.split, all_captions)))
print(max_length)

In [None]:
# First 75% of the image ids go to training, the remainder to testing
image_ids = list(mapping)
split = int(len(image_ids) * 0.75)
train, test = image_ids[:split], image_ids[split:]

In [None]:
# # Create data generator
# class CaptionDataset(Dataset):
#     def __init__(self, data_keys, mapping, features, tokenizer, max_length):
#         self.data_keys = data_keys
#         self.mapping = mapping
#         self.features = features
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.data_keys)

#     def __getitem__(self, idx):
#         key = self.data_keys[idx]
#         captions = self.mapping[key]

#         caption = captions[np.random.choice(len(captions))]

#         # for caption in captions :
#         tokens = self.tokenizer(caption)
#         caption_indices = [vocab[token] for token in tokens]
#         caption_indices = caption_indices[:self.max_length] + [0] * max(0, self.max_length - len(caption_indices))
#         image_features = torch.tensor(self.features[key], dtype=torch.float32)
#         caption_indices = torch.tensor(caption_indices, dtype=torch.long)

#         return image_features, caption_indices

from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import one_hot

class CustomDataset(Dataset):
    """Per-image dataset of next-word prediction samples.

    One item corresponds to one image id. Every caption of that image is
    expanded into (prefix, next word) pairs: for a caption encoded as
    ``[w0, w1, ..., wk]`` the samples are ``([w0], w1)``, ``([w0, w1], w2)``, ...

    Args:
        data_keys: sequence of image ids served by this dataset.
        mapping: dict mapping an image id to a list of caption strings.
        features: dict mapping an image id to that image's feature vector.
        tokenizer: callable turning a caption string into a list of tokens.
        max_length: length every input prefix is padded/truncated to.
        vocab_size: number of entries in the vocabulary (stored for callers).
        vocab: object mapping a token to its integer index via ``vocab[token]``.
    """

    def __init__(self, data_keys, mapping, features, tokenizer, max_length, vocab_size, vocab):
        self.data_keys = data_keys
        self.mapping = mapping
        self.features = features
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.vocab = vocab

    def __len__(self):
        """Return the number of image ids in the dataset."""
        return len(self.data_keys)

    def __getitem__(self, idx):
        """Return all training samples generated from one image.

        Returns:
            A tuple ``(X1, X2, y)`` where, with N the number of samples:
            ``X1`` is a float tensor holding the image feature repeated N times,
            ``X2`` is a long tensor of shape (N, max_length) with the
            zero-padded input prefixes, and ``y`` is a long tensor of shape (N,)
            with the index of the word following each prefix. Class indices
            (not one-hot vectors) are returned because that is the target
            format ``nn.CrossEntropyLoss`` consumes.
        """
        key = self.data_keys[idx]
        feature = torch.as_tensor(self.features[key], dtype=torch.float32)

        in_seqs, targets = [], []
        for caption in self.mapping[key]:
            seq = [self.vocab[word] for word in self.tokenizer(caption)]
            for i in range(1, len(seq)):
                # Prefix of the caption, truncated then right-padded with 0
                prefix = seq[:i][:self.max_length]
                in_seqs.append(prefix + [0] * (self.max_length - len(prefix)))
                targets.append(seq[i])

        n_samples = len(targets)
        # Explicit shapes keep the empty case (no caption with 2+ tokens) well-formed
        X2 = torch.tensor(in_seqs, dtype=torch.long).reshape(n_samples, self.max_length)
        y = torch.tensor(targets, dtype=torch.long)
        if n_samples:
            X1 = torch.stack([feature] * n_samples)
        else:
            X1 = feature.new_zeros((0, *feature.shape))

        return X1, X2, y



In [None]:
def concat_collate(batch):
    """Merge the per-image sample groups of a batch into flat tensors.

    Every dataset item is a tuple of tensors whose first dimension is the
    number of samples generated for one image, and that number differs from
    image to image. The default collate function tries to stack the items and
    fails on the mismatched sizes, so the groups are concatenated along
    dimension 0 instead.

    Args:
        batch: list of tuples of tensors, one tuple per dataset item.

    Returns:
        A tuple with one concatenated tensor per tuple position.
    """
    return tuple(torch.cat(parts, dim=0) for parts in zip(*batch))


train_dataset = CustomDataset(train, mapping, features, tokenizer, max_length, vocab_size, vocab)
# batch_size counts dataset items (images); the number of samples per batch varies
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=concat_collate)

In [None]:
class ImageCaptioningModel(nn.Module):
    """Predicts the next caption word from an image feature and a word prefix.

    Args:
        vocab_size: number of entries in the vocabulary (output classes).
        embedding_size: dimensionality of the word embeddings.
        hidden_size: width of the LSTM state and of the hidden dense layers.
        image_feature_size: length of the flattened image feature vector.
            Defaults to 25088, the value that was previously hard-coded.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, image_feature_size=25088):
        super(ImageCaptioningModel, self).__init__()

        # Image feature layers
        self.image_feature_layer = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(image_feature_size, hidden_size),
            nn.ReLU()
        )

        # Sequence feature layers. batch_first=True because inputs arrive as
        # (batch, seq_len); without it the LSTM would treat the batch axis as time.
        self.sequence_feature_layer = nn.Sequential(
            nn.Embedding(vocab_size, embedding_size, padding_idx=0),
            nn.Dropout(0.4),
            nn.LSTM(embedding_size, hidden_size, batch_first=True)
        )

        # Decoder layers. The output is raw logits: nn.CrossEntropyLoss applies
        # log-softmax itself, so a Softmax layer here would be applied twice.
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocab_size)
        )

    def forward(self, image_input, sequence_input):
        """Compute next-word logits.

        Args:
            image_input: float tensor of shape (batch, image_feature_size).
            sequence_input: long tensor of shape (batch, seq_len) holding word
                indices, right-padded with 0.

        Returns:
            Float tensor of shape (batch, vocab_size) with unnormalised scores.
        """
        fe = self.image_feature_layer(image_input)
        se, _ = self.sequence_feature_layer(sequence_input)  # (batch, seq_len, hidden)

        # Sequences are right-padded, so the final timestep is usually padding.
        # Read the LSTM output at the last non-zero token of each row instead
        # (position 0 when a row holds no non-zero token).
        positions = torch.arange(sequence_input.size(1), device=sequence_input.device)
        last_step = ((sequence_input != 0) * positions).max(dim=1).values
        rows = torch.arange(se.size(0), device=se.device)
        se = se[rows, last_step]

        combined = torch.cat((fe, se), dim=1)
        output = self.decoder(combined)
        return output

In [None]:
# Build the captioning model and move it to the selected device.
# Note: this rebinds `model`, which earlier in the notebook held the VGG16 feature extractor.
embedding_size = hidden_size = 256

model = ImageCaptioningModel(vocab_size, embedding_size, hidden_size).to(device)

In [None]:
# Loss function and optimiser used by the training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Train the model
num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    running_loss = 0

    for image_feats, input_seqs, targets in train_loader:
        # Move the whole batch to the training device
        image_feats = image_feats.to(device)
        input_seqs = input_seqs.to(device)
        targets = targets.to(device)

        # Forward pass, flattened so predictions and targets line up row by row
        predictions = model(image_feats, input_seqs).view(-1, vocab_size)
        loss = criterion(predictions, targets.view(-1))

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean loss per batch for this epoch
    average_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}')

In [None]:
# Write the trained weights (state dict only) to best_model.pth
checkpoint_path = os.path.join('./', 'best_model.pth')
torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Evaluation
def idx_to_word(index, tokenizer, vocabulary=None):
    """Translate a vocabulary index back into its token.

    Args:
        index: integer index predicted by the model.
        tokenizer: unused; kept so existing call sites keep working.
        vocabulary: object providing ``len()`` and ``lookup_token(index)``.
            Defaults to the notebook-level ``vocab``.

    Returns:
        The token string, or None when ``index`` is outside the vocabulary.
    """
    if vocabulary is None:
        vocabulary = vocab
    if not 0 <= index < len(vocabulary):
        return None
    return vocabulary.lookup_token(index)

def predict_caption(model, image, tokenizer, max_length, vocabulary=None):
    """Greedily generate a caption for one image feature vector.

    Starting from ``'startseq'``, the text generated so far is tokenised,
    converted to indices, right-padded with 0 to ``max_length`` and fed to the
    model together with the image feature; the highest-scoring word is
    appended. Generation stops at ``'endseq'``, at an unknown index, or after
    ``max_length`` steps.

    Args:
        model: module called as ``model(image_batch, sequence_batch)`` and
            returning one row of scores over the vocabulary per input row.
        image: image feature vector (array-like); it is reshaped to one row.
        tokenizer: callable turning a string into a list of tokens.
        max_length: maximum number of generated words and padded input length.
        vocabulary: object providing ``vocabulary[token]``, ``len()`` and
            ``lookup_token(index)``. Defaults to the notebook-level ``vocab``.

    Returns:
        The generated text, beginning with ``'startseq'`` and without ``'endseq'``.
    """
    if vocabulary is None:
        vocabulary = vocab
    model.eval()

    # Run on whatever device the model's weights live on
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    image_batch = torch.as_tensor(image, dtype=torch.float32, device=target_device).reshape(1, -1)

    in_text = 'startseq'

    with torch.no_grad():  # inference only
        for _ in range(max_length):
            token_ids = [vocabulary[token] for token in tokenizer(in_text)][:max_length]
            token_ids = token_ids + [0] * (max_length - len(token_ids))
            sequence_batch = torch.tensor([token_ids], dtype=torch.long, device=target_device)

            outputs = model(image_batch, sequence_batch)  # (1, vocab_size)
            y_pred = int(torch.argmax(outputs, dim=1).item())

            word = idx_to_word(y_pred, tokenizer, vocabulary)
            if word is None or word == 'endseq':
                break

            in_text += ' ' + word

    return in_text

In [None]:

# BLEU Score Calculation
from nltk.translate.bleu_score import corpus_bleu

# Collect tokenised reference captions and the tokenised prediction for each test image
actual, predicted = [], []

for key in tqdm(test):
    references = [caption.split() for caption in mapping[key]]
    hypothesis = predict_caption(model, features[key], tokenizer, max_length).split()

    actual.append(references)
    predicted.append(hypothesis)

In [None]:
# Corpus-level BLEU with unigram-only weights and with equal unigram/bigram weights
unigram_weights = (1.0, 0, 0, 0)
bigram_weights = (0.5, 0.5, 0, 0)
bleu1 = corpus_bleu(actual, predicted, weights=unigram_weights)
bleu2 = corpus_bleu(actual, predicted, weights=bigram_weights)

print(f"BLEU-1: {bleu1:.4f}")
print(f"BLEU-2: {bleu2:.4f}")

In [None]:
# Generate caption for an image
def generate_caption(image_name):
    """Print the stored and the predicted captions for an image and show it.

    Args:
        image_name: file name of an image inside the ``Images`` folder; the
            part before the first dot is used as the image id.
    """
    image_id = image_name.split('.')[0]
    image = Image.open(os.path.join('./', "Images", image_name))

    # Reference captions stored for this image
    print('---------------------Actual---------------------')
    for reference in mapping[image_id]:
        print(reference)

    # Caption produced by the model
    prediction = predict_caption(model, features[image_id], tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(prediction)

    plt.imshow(image)


In [None]:
# Caption one sample image from the Images folder
sample_image_name = "1001773457_577c3a7d70.jpg"
generate_caption(sample_image_name)