In [None]:
import os
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

In [None]:
# Pick the compute backend: CUDA if present, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device}")

In [None]:
from ultralytics import YOLO

# Load the yolov8n.pt checkpoint, keep only its first ten layers as the image
# feature extractor, move them to the selected device and switch to eval mode.
yolo_model = YOLO("yolov8n.pt").model.model[:10].to(device)
yolo_model.eval()
print(yolo_model)

In [None]:
# Extract features from images
features = {}
directory = 'Images'

# Resize every image to 224x224, convert to a tensor and normalise the channels.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Pure inference: disabling autograd avoids building a graph for every image.
with torch.no_grad():
    for img_name in tqdm(os.listdir(directory)):
        img_path = os.path.join(directory, img_name)
        # The context manager closes the file handle once the pixels are copied.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        image = transform(image).unsqueeze(0).to(device)
        feature = yolo_model(image)
        # Flatten the backbone output to one 1-D vector per image.
        feature = feature.view(feature.size(0), -1).detach().cpu().numpy()[0]
        # Image id = file name up to the first dot (same rule as the caption parser).
        image_id = img_name.split('.')[0]
        features[image_id] = feature

In [None]:
# Store features in pickle so the extraction cell does not have to be re-run
features_path = os.path.join('./', 'features.pkl')
with open(features_path, 'wb') as features_file:
    pickle.dump(features, features_file)

In [None]:
# Load features from pickle (only unpickle files produced by this notebook)
features_path = os.path.join('./', 'features.pkl')
with open(features_path, 'rb') as features_file:
    features = pickle.load(features_file)

In [None]:
# Inspect two stored feature vectors, then derive the feature dimensionality
sample_feature = features['3250076419_eb3de15063']
print(sample_feature)
print(features['3747543364_bf5b548527'])

print(sample_feature.shape)
print(len(features))
# Length of one flattened feature vector; used to size the model input.
feature_size = sample_feature.shape[0]

In [None]:
# Load captions from the text file
captions_path = os.path.join('./', 'captions.txt')
with open(captions_path, 'r') as captions_file:
    # Discard the first line (presumably a column header -- confirm against the file)
    next(captions_file)
    captions_doc = captions_file.read()

In [None]:
# Create mapping of image to captions: image id -> list of caption strings
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # Skip empty / one-character lines
    if len(line) < 2:
        continue
    # First comma-separated field is the file name, the rest is the caption
    image_file, *caption_parts = line.split(',')
    image_id = image_file.split('.')[0]
    # Commas inside the caption were split apart above; rejoin with spaces
    mapping.setdefault(image_id, []).append(" ".join(caption_parts))

In [None]:
# Clean the captions
def clean(mapping):
    """Normalise every caption in ``mapping`` in place.

    Each caption is lower-cased, stripped of every character that is not a
    letter or whitespace, reduced to words longer than one character, and
    wrapped in the ``startseq`` / ``endseq`` markers.

    Args:
        mapping: dict of image id -> list of caption strings. The lists are
            modified in place.

    Returns:
        None.
    """
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            # Already wrapped by an earlier run of this cell: leave it alone so
            # re-running the cell does not stack markers.
            if caption.startswith('startseq ') and caption.endswith(' endseq'):
                continue
            caption = caption.lower()
            # str.replace treats its pattern literally, so a real regex
            # substitution is required to drop punctuation and digits.
            caption = re.sub(r'[^a-z\s]', '', caption)
            # split()/join also collapses runs of whitespace to single spaces.
            words = [word for word in caption.split() if len(word) > 1]
            captions[i] = 'startseq ' + " ".join(words) + ' endseq'

In [None]:
# Before preprocess of text
# Show the raw captions of one image so they can be compared with the cleaned ones.
print(mapping['33108590_d685bfe51c'])

In [None]:
# Preprocess the text
# clean() rewrites the caption lists inside `mapping` in place and returns nothing.
clean(mapping)

# After preprocess of text
# Same image as in the "before" cell, now lower-cased and wrapped in startseq/endseq.
print(mapping['33108590_d685bfe51c'])

In [None]:
# Flatten the per-image caption lists into a single list of captions
all_captions = []
for captions in mapping.values():
    all_captions.extend(captions)
len(all_captions)

In [None]:
# Preview the first ten cleaned captions.
all_captions[:10]

In [None]:
import torchtext
from torchtext.data import get_tokenizer

tokenizer = get_tokenizer("basic_english")

# Tokenize the text
tokenized_text = [tokenizer(caption) for caption in all_captions]

# Reserve index 0 for an explicit padding token. The dataset pads input
# sequences with zeros and the decoder embedding is built with padding_idx=0,
# so index 0 must not be assigned to a real word.
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_text, specials=['<pad>'])
vocab_size = len(vocab)
print(vocab_size)

In [None]:
# Sanity check: look up the index assigned to one token.
print(vocab['you'])

In [None]:
# Longest caption measured in tokenizer tokens. The dataset fills its
# fixed-size buffers from the tokenizer output, so the length must be measured
# in the same units rather than with a plain whitespace split.
max_length = max(len(tokens) for tokens in tokenized_text)
print(max_length)

In [None]:
# All image ids, in the insertion order of `mapping`.
image_ids = list(mapping.keys())
# Index of a 75% cut; NOTE(review): not used by the slices in this notebook.
split = int(len(image_ids)*0.75)
# Training subset: only the first 256 images.
train = image_ids[:256]
# train_captions = all_captions[:32*5]

In [None]:
# Evaluation subset: every image id from position 7900 onwards.
test = image_ids[7900:]
# test_captions = all_captions[7900*5:]

In [None]:
def one_hot(a, num_classes):
    """Return a float vector of length ``num_classes`` set to 1 at index ``a`` and 0 elsewhere."""
    encoded = np.zeros(num_classes)
    encoded[a] = 1
    return encoded
    

In [None]:
class CaptionDataset(Dataset):
    """Pairs each image's feature vector with a next-token view of its first caption.

    Every item is a tuple ``(input1, input2, y, idx)``:

    * ``input1`` -- float32 tensor ``(1, feature_length)`` holding the image feature.
    * ``input2`` -- int32 tensor ``(max_length,)`` of input token ids, zero-padded.
    * ``y``      -- float tensor ``(max_length, vocab_size)``; row ``t`` is the one-hot
      encoding of the token that follows ``input2[t]``; padded rows are all zero.
    * ``idx``    -- the index that was requested.

    Args:
        data_keys: sequence of image ids.
        mapping: dict of image id -> list of captions; only the first caption is used.
        features: dict of image id -> 1-D feature vector.
        tokenizer: callable turning a caption string into a list of tokens.
        max_length: number of time steps per item; longer captions are truncated.
        vocab: optional token -> index lookup that supports ``len()``. When omitted,
            the notebook-level ``vocab`` is used.
    """

    def __init__(self, data_keys, mapping, features, tokenizer, max_length, vocab=None):
        self.data_keys = data_keys
        self.mapping = mapping
        self.features = features
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.vocab = vocab

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.data_keys)

    def __getitem__(self, idx):
        """Build the (feature, input ids, one-hot targets, idx) tuple for item ``idx``."""
        key = self.data_keys[idx]

        # caption = captions[np.random.choice(len(captions))]
        caption = self.mapping[key][0]

        # Prefer the vocabulary given to the constructor; fall back to the global one.
        lookup = self.vocab if self.vocab is not None else vocab
        token_ids = [int(lookup[token]) for token in self.tokenizer(caption)]

        # Use the instance's own feature store and length, not notebook globals.
        input1 = torch.as_tensor(self.features[key], dtype=torch.float32).reshape(1, -1).clone()
        input2 = torch.zeros(self.max_length, dtype=torch.int32)
        y = torch.zeros((self.max_length, len(lookup)))

        # One (input, target) pair per adjacent token pair, capped at max_length
        # so an over-long caption is truncated instead of raising IndexError.
        steps = min(len(token_ids) - 1, self.max_length)
        for t in range(steps):
            input2[t] = token_ids[t]
            # Write the one-hot entry directly instead of building a dense vector.
            y[t, token_ids[t + 1]] = 1.0

        return input1, input2, y, idx

    # def __getitem__(self, idx):
    #     caption = self.data_keys[idx]
    #     key = self.mapping[idx//5]
        
    #     input1, input2, y = torch.zeros((1, feature_size)), torch.zeros(max_length).int(), torch.zeros((max_length, vocab_size))
    #     input1[0] = torch.as_tensor(features[key])
    #     tokens = self.tokenizer(caption)
    #     caption_indices = [vocab[token] for token in tokens]

    #     for i in range(1, len(caption_indices)):
    #         in_seq, out_seq = caption_indices[i-1], caption_indices[i]

    #         out_seq = int(out_seq)
    #         out_seq = one_hot(out_seq, num_classes=vocab_size)
         
    #         input2[i-1] = int(in_seq)


    #         y[i-1] = torch.as_tensor(out_seq)

    #     return input1, input2, y, idx


In [None]:
# Number of images per optimisation step.
batch_size = 32
# train_dataset = CaptionDataset(train_captions, train, features, tokenizer, max_length)
# Dataset over the training image ids, wrapped in a shuffling loader with 4 worker processes.
train_dataset = CaptionDataset(train, mapping, features, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

In [None]:
# Print the shapes of the three tensors (feature, input ids, targets) of the first sample
first_sample = train_dataset[0]
for sample_tensor in first_sample[:3]:
    print(sample_tensor.size())

In [None]:
class Encoder(nn.Module):
    """Projects an image feature vector down to the decoder's hidden size.

    The projection is two fully connected layers
    (``feature_size -> 4096 -> hidden_size``), each followed by a ReLU.
    """

    def __init__(self, feature_size, hidden_size):
        super().__init__()
        projection_layers = [
            # nn.Dropout(0.4),
            nn.Linear(feature_size, 4096),
            nn.ReLU(),
            nn.Linear(4096, hidden_size),
            nn.ReLU(),
        ]
        self.image_feature_layer = nn.Sequential(*projection_layers)

    def forward(self, image_input):
        """Return the projected features; leading dimensions of the input are preserved."""
        projected = self.image_feature_layer(image_input)
        return projected


class Decoder(nn.Module):
    """LSTM-cell caption decoder.

    Token ids are embedded (index 0 is the padding index), fed through an
    ``nn.LSTMCell`` whose hidden state has the image features ``fe`` added
    before every step, and the new hidden state is mapped to vocabulary logits.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()

        self.emb = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.lstm_cell = nn.LSTMCell(embedding_size, hidden_size)

        output_layers = [
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocab_size),
        ]
        self.decoder = nn.Sequential(*output_layers)

    def forward(self, fe, sequence_input, h, c):
        """Run the decoder over ``sequence_input``.

        When the embedded input is 3-D, every position along dimension 1 is
        decoded in turn and the logits are stacked along that dimension;
        otherwise a single step is executed. Returns ``(logits, h, c)`` with
        the final hidden and cell states.
        """
        embedded = self.emb(sequence_input)

        # Single-step path (used for token-by-token generation)
        if embedded.dim() != 3:
            h, c = self.lstm_cell(embedded.squeeze(0), (h + fe, c))
            return self.decoder(h), h, c

        # Full-sequence path: one LSTM step per position along dimension 1
        step_logits = []
        for t in range(embedded.size(1)):
            h, c = self.lstm_cell(embedded[:, t, :], (h + fe, c))
            step_logits.append(self.decoder(h))
        return torch.stack(step_logits, dim=1), h, c

class ImageCaptioningModel(nn.Module):
    """Encoder-decoder captioning model.

    The encoder projects the image feature to ``hidden_size``; the decoder then
    predicts vocabulary logits for every input token.

    ``forward`` supports two call patterns:

    * Training: ``image_input`` is 3-D (batch, 1, feature) and ``sequence_input``
      holds whole sequences; ``h`` and ``c`` are left as ``None``.
    * Step-wise generation: ``image_input`` is a single feature vector and
      ``sequence_input`` a single token; the ``h`` / ``c`` returned by the
      previous call are passed back in so the LSTM state is carried forward.
    """

    def __init__(self, feature_size, vocab_size, embedding_size, hidden_size):
        super(ImageCaptioningModel, self).__init__()
        self.encoder = Encoder(feature_size, hidden_size)
        self.decoder = Decoder(vocab_size, embedding_size, hidden_size)
        self.hidden_size = hidden_size

    def forward(self, image_input, sequence_input, h=None, c=None):
        """Return ``(logits, h, c)`` from the decoder.

        Args:
            image_input: image feature tensor fed to the encoder.
            sequence_input: token ids fed to the decoder.
            h: previous hidden state, or ``None`` to start from the encoded image.
            c: previous cell state, or ``None`` to start from zeros.
        """
        fe = self.encoder(image_input)
        if fe.dim() == 3:
            # (batch, 1, hidden) -> (batch, hidden)
            fe = fe.permute(1, 0, 2).squeeze(0)
        else:
            fe = fe.squeeze(0)

        # The first step starts from the encoded image as hidden state.
        if h is None:
            h = fe
        # Honour a cell state supplied by the caller instead of overwriting it;
        # start from zeros (created on fe's device/dtype) so that evaluation is
        # deterministic and no global `device` is needed.
        if c is None:
            c = torch.zeros_like(fe)
        return self.decoder(fe, sequence_input, h, c)

In [None]:
# Instantiate the model and move it to the selected device
embedding_size = 256
hidden_size = 256

model = ImageCaptioningModel(feature_size, vocab_size, embedding_size, hidden_size).to(device)

In [None]:
# Cross-entropy loss over the vocabulary, optimised with Adam (learning rate 1e-3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Multiply the learning rate by 0.1 when the monitored value has stopped
# decreasing for 10 consecutive scheduler steps.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=10,
    verbose=True,
)

In [None]:
# Evaluation
def idx_to_word(index):
    """Return the vocabulary token stored at ``index``, or ``None`` if the index is not valid.

    Only lookup failures are turned into ``None``; any other exception
    (including ``KeyboardInterrupt``) propagates instead of being swallowed.
    """
    try:
        return vocab.get_itos()[index]
    except (IndexError, TypeError):
        return None

def predict_caption(model, feature, max_length):
    """Generate a caption for one image feature vector by sampling word by word.

    Starting from ``startseq``, the model is called once per step with the last
    generated word; the next word is sampled from the softmax distribution
    (so repeated calls can return different captions). Generation stops after
    ``max_length`` steps, when ``endseq`` is produced, or when the sampled
    index has no vocabulary entry.

    Args:
        model: captioning model called as ``model(feature, token, h, c)`` and
            returning ``(logits, h, c)``.
        feature: 1-D image feature vector of length ``feature_size``.
        max_length: maximum number of words to generate.

    Returns:
        The generated text, beginning with ``startseq``.
    """
    model.eval()
    input1 = torch.zeros((1, feature_size)).to(device)
    input1[0] = torch.as_tensor(feature)
    input1 = input1.squeeze(0)
    hidden, c = None, None
    input2 = torch.zeros(1).int().to(device)
    in_text = 'startseq'
    last_word = 'startseq'

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for _ in range(max_length):
            input2[0] = vocab[last_word]
            outputs, hidden, c = model(input1, input2, hidden, c)

            outputs = F.softmax(outputs, dim=0)

            # y_pred = torch.argmax(outputs, dim=1).squeeze(0).item()
            y_pred = torch.multinomial(outputs, 1).squeeze(0).item()

            word = idx_to_word(y_pred)
            # Check for a missing word before concatenating: appending None to
            # the text would raise TypeError.
            if word is None:
                break
            in_text += ' ' + word
            last_word = word

            if word == 'endseq':
                break

    return in_text



In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Train the model
num_epochs = 150

for epoch in range(num_epochs):
    total_loss_train = 0
    total_loss_test = 0
    model.train()

    for inputs1, inputs2, targets, _ in train_loader:
        inputs1 = inputs1.to(device)
        inputs2 = inputs2.to(device)
        targets = targets.to(device)

        # Start every batch from a fresh decoder state.
        hidden, c = None, None
        output, hidden, c = model(inputs1, inputs2, hidden, c)

        # Positions whose one-hot target row is all zero are padding: drop them.
        mask = (torch.sum(targets, dim=-1) != 0).view(-1)
        output_masked = output.view(-1, vocab_size)[mask]
        targets_masked = targets.view(-1, vocab_size)[mask]
        loss = criterion(output_masked, targets_masked)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()

    # Mean loss per batch for this epoch; also drives the LR scheduler.
    average_loss_train = total_loss_train / len(train_loader)
    scheduler.step(average_loss_train)

    # if epoch % 30 == 0 or epoch == num_epochs - 1 :
    #     actual, predicted = [], []
    #     for key in train :
    #         captions = mapping[key]
    #         y_pred = predict_caption(model, features[key], max_length)

    #         actual_captions = [caption.split() for caption in captions]
    #         y_pred = y_pred.split()
    #         actual.append(actual_captions)
    #         predicted.append(y_pred)

    #     bleu1_train = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))

        # print(f'Epoch [{epoch + 1}/{num_epochs}], Loss Train: {average_loss_train:.4f}, BLEU-1 Score Test: {bleu1_train:.4f}')
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss Train: {average_loss_train:.4f}')


In [None]:

# Collect tokenised reference captions and a tokenised prediction per test image
actual, predicted = [], []

for key in tqdm(test):
    references = [caption.split() for caption in mapping[key]]
    hypothesis = predict_caption(model, features[key], max_length).split()
    actual.append(references)
    predicted.append(hypothesis)

# Calculate BLEU score (unigram only, and unigram + bigram equally weighted)
bleu1 = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
bleu2 = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))

print(f"BLEU-1: {bleu1}")
print(f"BLEU-2: {bleu2}")


In [None]:
# Generate caption for an image
def generate_caption(image_name):
    """Print the reference captions and a predicted caption for ``image_name``, then show the image.

    ``image_name`` is a file name inside the ``Images`` folder; the part before
    the first dot is used as the key into ``mapping`` and ``features``.
    Nothing is returned.
    """
    image_id = image_name.split('.')[0]
    image = Image.open(os.path.join('./', "Images", image_name))

    references = mapping[image_id]
    print('---------------------Actual---------------------')
    for reference in references:
        print(reference)

    prediction = predict_caption(model, features[image_id], max_length)
    print('--------------------Predicted--------------------')
    print(prediction)

    plt.imshow(image)

In [None]:
# generate_caption prints its own output and returns nothing, so it is called
# directly; wrapping it in print() would emit a stray "None".
generate_caption(f'{train[1]}.jpg')

In [None]:
# generate_caption prints its own output and returns nothing, so it is called
# directly; wrapping it in print() would emit a stray "None".
generate_caption(f'{train[2]}.jpg')

In [None]:
# generate_caption prints its own output and returns nothing, so it is called
# directly; wrapping it in print() would emit a stray "None".
generate_caption(f'{train[3]}.jpg')

In [None]:
# generate_caption prints its own output and returns nothing, so it is called
# directly; wrapping it in print() would emit a stray "None".
generate_caption(f'{train[4]}.jpg')

In [None]:
# generate_caption prints its own output and returns nothing, so it is called
# directly; wrapping it in print() would emit a stray "None".
generate_caption(f'{test[25]}.jpg')

In [None]:
# generate_caption prints its own output and returns nothing, so it is called
# directly; wrapping it in print() would emit a stray "None".
generate_caption(f'{test[56]}.jpg')