In [None]:
from transformers import BertTokenizer, TFBertModel, ViTModel
from tensorflow.keras.layers import Dense, Dropout, concatenate
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LSTM, concatenate, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import os
from PIL import Image

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Mount Google Drive at /content/drive so the captions file and image folder
# referenced by later cells are readable. Requires a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path to your captions file in Google Drive
file_path = '/content/drive/My Drive/captions.txt'

# Read the whole captions file into memory. The context manager closes the
# file handle as soon as the text has been read, and the explicit encoding
# keeps the result independent of the runtime's locale settings.
with open(file_path, 'r', encoding='utf-8') as captions_file:
    captions = captions_file.read()

# Create a dictionary mapping image names to captions

def load_captions(captions):
    """Parse raw captions text into a mapping of image id -> list of captions.

    Each non-empty line is expected to look like ``<image_id>,<caption>``.
    Anything after a ``#`` in the image id (e.g. ``img.jpg#0``) is dropped.
    Lines without a comma are skipped.

    Args:
        captions: Full text of the captions file.

    Returns:
        dict mapping each image id to the list of its captions, in file order.
        A header row such as ``image,caption`` is parsed like any other line.
    """
    mapping = {}
    for line in captions.strip().split('\n'):
        # Split on the FIRST comma only: the caption text itself may contain
        # commas, and splitting on every comma would silently truncate it.
        tokens = line.strip().split(',', 1)
        if len(tokens) < 2:
            continue
        image_id, caption = tokens
        image_id = image_id.split('#')[0]
        mapping.setdefault(image_id, []).append(caption)
    return mapping

# Parse the raw text into {image_id: [caption, ...]}.
# NOTE(review): if the file starts with a header row, it is counted here as an
# extra "image" (a later cell pops the 'image' key) — confirm against the file.
all_captions_mapping = load_captions(captions)
print(f"Total images: {len(all_captions_mapping)}")

Total images: 2088


In [None]:
# Keep at most the first MAX_IMAGES images, in file order.
# (The previous comment said 1500, but the slice actually used is 6000.)
MAX_IMAGES = 6000

# Drop the entry produced by a header row whose first column is 'image', if any.
all_captions_mapping.pop('image', None)
captions_mapping = dict(list(all_captions_mapping.items())[:MAX_IMAGES])

In [None]:
from tensorflow.keras.applications.vgg16 import preprocess_input

def preprocess_image(image_path):
    """Load an image and prepare it as a 224x224 RGB array for a VGG16-style CNN.

    Note: a later cell defines another ``preprocess_image``; when the notebook
    is run top to bottom that later definition replaces this one.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The result of ``preprocess_input`` applied to the resized image array.
    """
    picture = Image.open(image_path)

    # Anything that is not already RGB (e.g. RGBA) is converted first
    picture = picture if picture.mode == 'RGB' else picture.convert('RGB')

    # Fixed 224x224 size regardless of the original dimensions
    resized = picture.resize((224, 224))

    return preprocess_input(np.array(resized))

In [None]:
import string

def clean_captions(captions_mapping):
    """Normalise every caption in place and wrap it in start/end markers.

    For each caption: lowercase it, remove ASCII punctuation, drop words of
    length 1, then wrap the result as ``'startseq <words> endseq'``.

    The function is safe to call more than once on the same mapping (e.g. when
    a notebook cell is re-run): a caption that is already wrapped in
    ``startseq ... endseq`` has the markers peeled off before being re-wrapped,
    so the markers never stack up.

    Args:
        captions_mapping: dict of image id -> list of caption strings. The
            lists are modified in place.

    Returns:
        None.
    """
    table = str.maketrans('', '', string.punctuation)
    for captions in captions_mapping.values():
        for i, caption in enumerate(captions):
            # split() without arguments also discards leading/trailing whitespace
            words = [word for word in caption.lower().translate(table).split() if len(word) > 1]
            # Already-cleaned caption: remove the existing markers first
            if len(words) >= 2 and words[0] == 'startseq' and words[-1] == 'endseq':
                words = words[1:-1]
            # Add start and end tokens
            captions[i] = 'startseq ' + ' '.join(words) + ' endseq'

# Normalise all captions: clean_captions rewrites the caption lists stored in
# captions_mapping in place and returns nothing.
clean_captions(captions_mapping)

In [None]:
# Build a flat list of all captions.
# A comprehension keeps its loop variables local, so the module-level
# `captions` (the raw captions text read from disk) is not overwritten and the
# earlier parsing cell can still be re-run.
all_captions = [
    caption
    for image_captions in captions_mapping.values()
    for caption in image_captions
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# +1 because Keras word indices start at 1; index 0 is reserved for padding
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary Size: {vocab_size}")

# Maximum length of a caption (in whitespace-separated words)
max_length = max(len(caption.split()) for caption in all_captions)
print(f"Maximum caption length: {max_length}")

Vocabulary Size: 1840
Maximum caption length: 23


In [None]:
def load_vit_model(model_name='google/vit-base-patch16-224-in21k'):
    """Load a pretrained ViT model.

    Args:
        model_name: Checkpoint identifier passed to ``ViTModel.from_pretrained``.
            Defaults to the checkpoint used throughout this notebook.

    Returns:
        The loaded ``ViTModel`` instance.
    """
    return ViTModel.from_pretrained(model_name)

vit_model = load_vit_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image
import torch
import numpy as np

# Load ViT feature extractor and model
# Both are loaded from the same checkpoint name. The second assignment rebinds
# the module-level `vit_model` that the earlier load_vit_model() cell created.
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

# Load and preprocess the image
def preprocess_image(img_path, extractor=None):
    """Open an image and turn it into model-ready inputs for the ViT model.

    Args:
        img_path: Path of the image file to open.
        extractor: Feature extractor to apply. When omitted, the module-level
            ``feature_extractor`` loaded in this cell is reused, instead of
            instantiating a new extractor from the hub on every call.

    Returns:
        The extractor's output for the RGB image, with PyTorch tensors
        (``return_tensors="pt"``).
    """
    if extractor is None:
        extractor = feature_extractor
    # convert() returns a fully loaded copy, so the file can be closed right away
    with Image.open(img_path) as img_file:
        img = img_file.convert("RGB")
    return extractor(images=img, return_tensors="pt")


# Smoke test: run one image through the extractor and the ViT model and
# inspect the shape of the raw output.

# Path to the image
image_path = '/content/drive/My Drive/Flickr8k/Flickr8k_Dataset/Images/973827791_467d83986e.jpg'  # Replace with an actual image path

# Preprocess the image
inputs = preprocess_image(image_path)

# Pass the image through the model
with torch.no_grad():  # Disable gradient calculation for efficiency
    outputs = vit_model(**inputs)

# Extract the feature vector from the last hidden state
# (this is the un-pooled output; the recorded run printed
# torch.Size([1, 197, 768]), i.e. one 768-d vector per token)
feature_vector = outputs.last_hidden_state

# Print the shape of the feature vector
print(f"Feature vector shape: {feature_vector.shape}")




Feature vector shape: torch.Size([1, 197, 768])


In [None]:
def create_sequences(tokenizer, max_length, captions_list, image_id, features):
    """Expand the captions of one image into next-word training samples.

    Every caption is encoded with ``tokenizer``; for each position ``i >= 1``
    one sample is produced: the first ``i`` tokens (padded to ``max_length``)
    as input and token ``i`` (one-hot encoded) as target, paired with the
    image feature vector.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Length every input sequence is padded to.
        captions_list: Captions belonging to the image.
        image_id: Identifier of the image (not used by the computation; kept
            for interface compatibility).
        features: Feature vector of the image, repeated for every sample.

    Returns:
        Tuple ``(X1, X2, y)`` of numpy arrays: image features, padded input
        sequences and one-hot targets.
    """
    # Derive the number of classes from the tokenizer that was passed in
    # rather than from a notebook-level global, so the function does not depend
    # on hidden state (+1 because index 0 is reserved for padding).
    num_classes = len(tokenizer.word_index) + 1

    X1, X2, y = [], [], []
    for caption in captions_list:
        seq = tokenizer.texts_to_sequences([caption])[0]
        for i in range(1, len(seq)):
            in_seq = pad_sequences([seq[:i]], maxlen=max_length)[0]  # Always pad to max_length
            out_seq = tf.keras.utils.to_categorical([seq[i]], num_classes=num_classes)[0]
            X1.append(features)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

In [None]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, concatenate
from tensorflow.keras.models import Model

def create_lstm_model(vocab_size, max_length, feature_dim=768):
    """Build the caption decoder: image features + partial caption -> next word.

    Args:
        vocab_size: Size of the vocabulary; also the width of the softmax output.
        max_length: Length of the (padded) input token sequences.
        feature_dim: Length of the image feature vector. Defaults to 768, the
            size used in this notebook, so existing calls are unaffected.

    Returns:
        An uncompiled Keras ``Model`` taking ``[image_features, token_sequence]``
        and returning a probability distribution over the vocabulary.
    """
    # Image feature input
    inputs1 = Input(shape=(feature_dim,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Sequence input; mask_zero=True makes downstream layers ignore padding (index 0)
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)

    # LSTM layer with use_cudnn=False to prevent cuDNN-related padding mask errors
    se3 = LSTM(256, use_cudnn=False)(se2)

    # Decoder (combine features)
    decoder1 = concatenate([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # Define the model
    return Model(inputs=[inputs1, inputs2], outputs=outputs)

# Build the LSTM model
# vocab_size and max_length come from the tokenizer cell above.
lstm_model = create_lstm_model(vocab_size, max_length)
lstm_model.summary()

In [None]:
# Targets are one-hot vectors over the vocabulary (built in create_sequences)
# and the model ends in a softmax, hence categorical cross-entropy.
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
import os
import numpy as np
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image
import torch
from torch.utils.data import DataLoader, Dataset

# Custom dataset for loading and preprocessing images
class ImageDataset(Dataset):
    """Dataset yielding ``(pixel_values, filename)`` for each file in a directory.

    Args:
        images_directory: Directory whose files are treated as images.
        feature_extractor: Callable applied to each RGB image with
            ``return_tensors="pt"``; its ``'pixel_values'`` entry is returned.
    """

    def __init__(self, images_directory, feature_extractor):
        # Directory listing order is arbitrary, so sort it to make the dataset
        # order reproducible between runs. Sub-directories are skipped because
        # they cannot be opened as images.
        with os.scandir(images_directory) as entries:
            self.img_paths = sorted(entry.path for entry in entries if entry.is_file())
        self.feature_extractor = feature_extractor

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        # convert() returns a fully loaded copy, so the file handle can be
        # released immediately instead of staying open per item.
        with Image.open(img_path) as img_file:
            img = img_file.convert("RGB")
        img_inputs = self.feature_extractor(images=img, return_tensors="pt")
        # Drop the leading batch axis added by the extractor; the DataLoader re-batches
        return img_inputs['pixel_values'].squeeze(0), os.path.basename(img_path)  # Return preprocessed image and filename

# Extract features in batches
def extract_vit_features_batch(vit_model, dataset, batch_size=16):
    """Run ``vit_model`` over ``dataset`` in batches and collect one vector per image.

    The model is switched to evaluation mode and moved to the GPU when one is
    available. For every batch the model's ``last_hidden_state`` is averaged
    over axis 1 (the whole token axis) to obtain a single vector per image.

    Args:
        vit_model: Model whose output exposes ``last_hidden_state``.
        dataset: Dataset yielding ``(pixel_values, filename)`` pairs.
        batch_size: Number of images per forward pass.

    Returns:
        dict mapping each filename to its averaged feature vector (numpy array).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vit_model.eval()  # inference behaviour (e.g. no dropout)
    vit_model.to(device)

    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    features = {}

    with torch.no_grad():
        for pixel_batch, name_batch in loader:
            hidden = vit_model(pixel_batch.to(device)).last_hidden_state
            pooled = hidden.mean(dim=1).cpu().numpy()
            # Row i of `pooled` belongs to the i-th filename of the batch
            features.update(zip(name_batch, pooled))

    return features

# Initialize feature extractor and model
# (rebinds the module-level names that earlier cells already assigned)
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

# Create dataset and extract features in batches
# `features` maps each image file name (basename, including its extension)
# to the feature vector computed for that image.
image_dataset = ImageDataset('/content/drive/My Drive/Flickr8k/Flickr8k_Dataset/Images', feature_extractor)
features = extract_vit_features_batch(vit_model, image_dataset, batch_size=16)

print(f"Extracted features for {len(features)} images")


Extracted features for 8111 images


In [None]:
# Prepare training data: one (image feature, caption prefix, next word) sample
# per caption position, accumulated over all images.
X1, X2, y = [], [], []
for img_id, captions_list in captions_mapping.items():
    # `features` is keyed by the full image filename, so img_id is used as-is
    if img_id not in features:
        print(f"Warning: No features found for image {img_id}")
        continue
    feature = features[img_id]
    xi1, xi2, yi = create_sequences(tokenizer, max_length, captions_list, img_id, feature)
    X1.extend(xi1)
    X2.extend(xi2)
    y.extend(yi)

# Convert the accumulated per-sample lists into arrays for Keras
X1, X2, y = np.array(X1), np.array(X2), np.array(y)
print(f"X1 shape: {X1.shape}, X2 shape: {X2.shape}, y shape: {y.shape}")


X1 shape: (14555, 768), X2 shape: (14555, 23), y shape: (14555, 1840)


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Save a checkpoint whenever the held-out loss improves.
# The training loss keeps decreasing even while the model overfits, so both
# callbacks watch `val_loss`, computed on the validation split requested in
# fit() below. The file name still records the epoch and training loss.
filepath = 'model-ep{epoch:03d}-loss{loss:.3f}.keras'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Stop once the validation loss has not improved for 3 epochs and roll the
# model back to the best weights seen, so `lstm_model` ends up at its best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fit the model with early stopping and checkpointing.
# validation_split holds out the last 10% of the samples (taken before
# shuffling) and never trains on them.
lstm_model.fit([X1, X2], y, epochs=150, batch_size=64, validation_split=0.1,
               callbacks=[checkpoint, early_stopping], verbose=1)


Epoch 1/150
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 5.9379
Epoch 1: loss improved from inf to 5.51398, saving model to model-ep001-loss5.514.keras
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - loss: 5.9361
Epoch 2/150
[1m226/228[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - loss: 4.8617
Epoch 2: loss improved from 5.51398 to 4.80381, saving model to model-ep002-loss4.804.keras
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 4.8610
Epoch 3/150
[1m223/228[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - loss: 4.4454
Epoch 3: loss improved from 4.80381 to 4.44538, saving model to model-ep003-loss4.445.keras
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 4.4454
Epoch 4/150
[1m225/228[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - loss: 4.1180
Epoch 4: loss improved from 4.44538 to 4.1284

<keras.src.callbacks.history.History at 0x798d7ff97910>

## **5. Evaluating the Model**

In [None]:
def generate_caption(model, tokenizer, photo, max_length):
    """Greedily decode a caption for one image.

    Starting from ``'startseq'``, the current text is encoded and padded, the
    model predicts the next word, and the most probable word is appended. The
    loop ends after ``max_length`` steps, when ``'endseq'`` is produced, or
    when the predicted index has no entry in ``tokenizer.index_word``.

    Args:
        model: Model with ``predict([photo, sequence])`` returning word scores.
        tokenizer: Fitted tokenizer (``texts_to_sequences``, ``index_word``).
        photo: Image feature input, already batched.
        max_length: Padded sequence length and maximum number of decoding steps.

    Returns:
        The generated text, including the leading ``'startseq'`` and, if it
        was produced, the trailing ``'endseq'``.
    """
    in_text = 'startseq'
    for _ in range(max_length):
        # Encode everything generated so far and pad it to the model's input length
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        # Greedy choice: index of the highest-scoring word
        scores = model.predict([photo, padded], verbose=0)
        word = tokenizer.index_word.get(np.argmax(scores))
        if word is None:
            break
        in_text = in_text + ' ' + word
        if word == 'endseq':
            break
    return in_text

In [None]:
# Load an image
# `features` is keyed by the full image file name (the same keys used by
# captions_mapping), so the key must be used unmodified: stripping the
# extension with .split('.')[0] makes the lookup fail with KeyError.
image_id = list(captions_mapping.keys())[10]
photo = features[image_id]
photo = np.expand_dims(photo, axis=0)  # add the batch axis expected by the model

# Generate caption
caption = generate_caption(lstm_model, tokenizer, photo, max_length)

import matplotlib.pyplot as plt

# Show the image; the context manager closes the file once it has been drawn
with Image.open('/content/drive/My Drive/Flickr8k/Flickr8k_Dataset/Images/' + image_id) as img:
    plt.imshow(img)
    plt.axis('off')
    plt.show()

print(f"Generated caption: {caption}")

In [None]:
# Load an image
# `features` is keyed by the full image file name (the same keys used by
# captions_mapping), so the key must be used unmodified: stripping the
# extension with .split('.')[0] makes the lookup fail with KeyError.
image_id = list(captions_mapping.keys())[1044]
photo = features[image_id]
photo = np.expand_dims(photo, axis=0)  # add the batch axis expected by the model

# Generate caption
caption = generate_caption(lstm_model, tokenizer, photo, max_length)

import matplotlib.pyplot as plt

# Show the image; the context manager closes the file once it has been drawn
with Image.open('/content/drive/My Drive/Flickr8k/Flickr8k_Dataset/Images/' + image_id) as img:
    plt.imshow(img)
    plt.axis('off')
    plt.show()

print(f"Generated caption: {caption}")

In [None]:
# Load an image
# `features` is keyed by the full image file name (the same keys used by
# captions_mapping), so the key must be used unmodified: stripping the
# extension with .split('.')[0] makes the lookup fail with KeyError.
image_id = list(captions_mapping.keys())[1190]
photo = features[image_id]
photo = np.expand_dims(photo, axis=0)  # add the batch axis expected by the model

# Generate caption
caption = generate_caption(lstm_model, tokenizer, photo, max_length)

import matplotlib.pyplot as plt

# Show the image; the context manager closes the file once it has been drawn
with Image.open('/content/drive/My Drive/Flickr8k/Flickr8k_Dataset/Images/' + image_id) as img:
    plt.imshow(img)
    plt.axis('off')
    plt.show()

print(f"Generated caption: {caption}")

In [None]:
# Load an image
# `features` is keyed by the full image file name (the same keys used by
# captions_mapping), so the key must be used unmodified: stripping the
# extension with .split('.')[0] makes the lookup fail with KeyError.
image_id = list(captions_mapping.keys())[1128]
photo = features[image_id]
photo = np.expand_dims(photo, axis=0)  # add the batch axis expected by the model

# Generate caption
caption = generate_caption(lstm_model, tokenizer, photo, max_length)

import matplotlib.pyplot as plt

# Show the image; the context manager closes the file once it has been drawn
with Image.open('/content/drive/My Drive/Flickr8k/Flickr8k_Dataset/Images/' + image_id) as img:
    plt.imshow(img)
    plt.axis('off')
    plt.show()

print(f"Generated caption: {caption}")

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def evaluate_model(model, captions_mapping, features, tokenizer, max_length, strip_markers=True):
    """Generate a caption per image and report corpus BLEU-1..4.

    Args:
        model: Caption model passed on to ``generate_caption``.
        captions_mapping: dict of image id -> list of reference captions.
        features: dict of image id -> feature vector; images without an entry
            are skipped with a warning.
        tokenizer: Fitted tokenizer passed on to ``generate_caption``.
        max_length: Maximum caption length passed on to ``generate_caption``.
        strip_markers: When True (default), the ``startseq``/``endseq`` marker
            tokens are removed from references and predictions before scoring.
            They occur in every reference and in nearly every prediction, so
            counting them as matching words inflates the scores. Pass False
            to score the raw token sequences including the markers.

    Returns:
        Tuple ``(bleu1, bleu2, bleu3, bleu4)``. The scores are also printed.
    """
    markers = {'startseq', 'endseq'} if strip_markers else set()

    def _tokens(text):
        # Whitespace tokenisation, minus the sequence markers when requested
        return [token for token in text.split() if token not in markers]

    actual, predicted = [], []

    # Loop through each image ID and its corresponding captions
    for img_id, captions_list in captions_mapping.items():
        if img_id not in features:
            print(f"Warning: No features found for image {img_id}")
            continue

        # Generate a caption for the image (features reshaped to a batch of one)
        feature = features[img_id]
        y_pred = generate_caption(model, tokenizer, feature.reshape(1, -1), max_length)

        actual.append([_tokens(caption) for caption in captions_list])
        predicted.append(_tokens(y_pred))

    # Uniform weights over the first n n-gram orders; 1/3 is used for BLEU-3
    # so that the weights sum to exactly 1.
    bleu_weights = (
        (1.0, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (1 / 3, 1 / 3, 1 / 3, 0),
        (0.25, 0.25, 0.25, 0.25),
    )
    scores = tuple(corpus_bleu(actual, predicted, weights=weights) for weights in bleu_weights)

    # Print all BLEU scores
    for order, score in enumerate(scores, start=1):
        print(f'BLEU-{order}: {score:.4f}')

    return scores

# Run the evaluation
# NOTE(review): captions_mapping is the same mapping the training arrays were
# built from, so these BLEU scores are measured on data the model was fitted
# on — consider evaluating on a held-out set of images instead.
evaluate_model(lstm_model, captions_mapping, features, tokenizer, max_length)


