<a href="https://colab.research.google.com/github/DAbhishek02/artificial-intelligence-project/blob/main/IMAGE_CAPTIONING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import tensorflow as tf
import cv2
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import re
from scipy.sparse import csr_matrix, vstack
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.resnet50 import preprocess_input

In [2]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Collect the image file names to work with. os.listdir returns entries in
# arbitrary order, so the listing is sorted first; otherwise the slice below
# could select a different subset of images on every run.
train_images_list = sorted(os.listdir('/content/drive/MyDrive/flickr30k_images/flickr30k_images'))
sample_size = 30  # number of images kept for this small-scale experiment
train_images_list = train_images_list[:sample_size]

size = (224, 224)  # target size handed to cv2.resize; matches the 224x224 model input
num_channels = 3
train_images = []

In [5]:
# Load each sampled image exactly once and resize it to the model input size.
# The array is built from a comprehension instead of appending to the
# module-level `train_images` list and then loading everything a second time:
# this halves the reads from Drive and keeps the cell safe to re-run (after the
# first run `train_images` is an ndarray, which has no `.append`).
images_dir = '/content/drive/MyDrive/flickr30k_images/flickr30k_images'
train_images = np.array([
    cv2.resize(plt.imread(os.path.join(images_dir, i)), size)
    for i in train_images_list
])
real_images = train_images.copy()  # independent copy of the resized images

# The captions file is pipe-delimited; its columns are renamed to fixed names.
# If the file sits next to the notebook, a relative path works as well:
# train_captions_df = pd.read_csv('results.csv', delimiter='|')
train_captions_df = pd.read_csv('/content/drive/MyDrive/flickr30k_images/results.csv', delimiter='|')
train_captions_df.columns = ['image_name', 'comment_number', 'comment']
print("First 5 rows of train_captions_df:")
print(train_captions_df.head())
print("\nNumber of rows in train_captions_df:", len(train_captions_df))
print("\nColumn names in train_captions_df:", train_captions_df.columns)

First 5 rows of train_captions_df:
       image_name comment_number  \
0  1000092795.jpg              0   
1  1000092795.jpg              1   
2  1000092795.jpg              2   
3  1000092795.jpg              3   
4  1000092795.jpg              4   

                                             comment  
0   Two young guys with shaggy hair look at their...  
1   Two young , White males are outside near many...  
2   Two men in green shirts are standing in a yard .  
3       A man in a blue shirt standing in a garden .  
4            Two friends enjoy time spent together .  

Number of rows in train_captions_df: 158915

Column names in train_captions_df: Index(['image_name', 'comment_number', 'comment'], dtype='object')


In [6]:
def images_map_caption(train_images_list, train_captions_df):
    """Return the first caption of every image name found in the captions table.

    Args:
        train_images_list: Iterable of image file names to look up.
        train_captions_df: DataFrame with at least the columns `image_name`
            and `comment`; several rows may share one image name.

    Returns:
        A list holding, for each name that occurs in `image_name`, the
        `comment` of its first row, in input order. Names without a match are
        skipped after a warning is printed, so the result can be shorter than
        the input.
    """
    # Print image names to check if they exist in the dataframe:
    print("Image names in train_images_list:")
    print(train_images_list)
    print("\nImage names in train_captions_df['image_name']:")
    print(train_captions_df['image_name'].unique())

    # Build the name -> first-caption lookup once. Scanning the whole column
    # twice for every image is needlessly slow on a large captions table.
    first_caption = (
        train_captions_df
        .drop_duplicates(subset='image_name')
        .set_index('image_name')['comment']
        .to_dict()
    )

    caption = []
    for i in train_images_list:
        if i in first_caption:
            caption.append(first_caption[i])
        else:
            print(f"Warning: Image {i} not found in captions dataframe.")
    return caption

# Sentence boundary markers; get_vocab wraps every caption in them.
start_tag, end_tag = '<s>', '<e>'

# First caption of each sampled image, collected into a NumPy array.
captions = np.array(images_map_caption(train_images_list, train_captions_df))

Image names in train_images_list:
[]

Image names in train_captions_df['image_name']:
['1000092795.jpg' '10002456.jpg' '1000268201.jpg' ... '997876722.jpg'
 '99804383.jpg' '998845445.jpg']


In [7]:
# Placeholder arrays with one None slot per sampled image; the slots are
# filled with decoded images by the loading loop.
real_images = np.array([None for _ in range(sample_size)])
train = np.array([None for _ in range(sample_size)])

In [8]:
# Read every sampled image (no resizing) into the placeholder arrays.
# The path is built with os.path.join from the absolute Drive folder: plain
# string concatenation dropped the '/' between folder and file name, and the
# '../content' prefix only resolved from one particular working directory.
j = 0
for i in train_images_list:
    image_path = os.path.join('/content/drive/MyDrive/flickr30k_images/flickr30k_images', i)
    # Decode the file once; `train` gets its own copy so the two arrays never alias.
    full_image = np.array(plt.imread(image_path))
    real_images[j] = full_image
    train[j] = full_image.copy()
    j += 1

In [9]:
def images_map_caption(train_images_list, train_captions):
    """Look up the first caption of each image name.

    Args:
        train_images_list: Iterable of image file names.
        train_captions: DataFrame with `image_name` and `comment` columns.

    Returns:
        List with the `comment` of the first row matching each name, in input
        order.

    Raises:
        IndexError: If a name has no matching row.
    """
    return [
        train_captions.loc[train_captions['image_name'] == name, 'comment'].iat[0]
        for name in train_images_list
    ]

In [10]:
# Read every sampled image (no resizing) into the placeholder arrays.
j = 0
for i in train_images_list:
    # Fix the path by using os.path.join
    image_path = os.path.join('/content/drive/MyDrive/flickr30k_images/flickr30k_images', i)
    # Decode the file once and hand `train` its own copy, instead of reading
    # the same image from Drive twice.
    full_image = np.array(plt.imread(image_path))
    real_images[j] = full_image
    train[j] = full_image.copy()
    j += 1

In [11]:
train = train_images.copy()  # rebind `train` to an independent copy of the resized image array `train_images`

In [13]:
def images_map_caption(train_images_list, train_captions_df):
    """Return the first caption of each image, matching names case-insensitively.

    Args:
        train_images_list: Iterable of image file names to look up.
        train_captions_df: DataFrame with string column `image_name` and a
            `comment` column; several rows may share one image name.

    Returns:
        A list with the `comment` of the first row whose lower-cased
        `image_name` equals the lower-cased file name, in input order. Names
        without a match are skipped after a warning is printed, so the result
        can be shorter than the input.
    """
    caption = []
    # Lower-case the name column once rather than once per image.
    lowered_names = train_captions_df['image_name'].str.lower()
    for i in train_images_list:
        # Check if there's a match for the current image name
        matching_captions = train_captions_df.loc[lowered_names == i.lower(), 'comment']
        if matching_captions.empty:
            print(f"Warning: Image {i} not found in captions dataframe.")
            continue
        # Previously the match was computed but never stored, so the function
        # always returned an empty list.
        caption.append(matching_captions.iat[0])
    return caption

# Collect the captions of the sampled images into an array and inspect it.
captions = np.array(images_map_caption(train_images_list, train_captions_df))
print("Shape of captions:", captions.shape)
print("First 5 captions:", captions[:5])

Shape of captions: (0,)
First 5 captions: []


In [14]:
# Show the first ten image_name values (handy for checking them against the
# listed file names).
print("\nFirst 10 entries in train_captions_df['image_name']:")
print(train_captions_df['image_name'].head(10))


First 10 entries in train_captions_df['image_name']:
0    1000092795.jpg
1    1000092795.jpg
2    1000092795.jpg
3    1000092795.jpg
4    1000092795.jpg
5      10002456.jpg
6      10002456.jpg
7      10002456.jpg
8      10002456.jpg
9      10002456.jpg
Name: image_name, dtype: object


In [15]:
def get_vocab(captions):
    """Wrap captions in start/end tags and derive the word vocabulary.

    Each caption has runs of spaces collapsed to a single space and is then
    surrounded by the module-level `start_tag` and `end_tag`, separated by a
    space. The vocabulary is the sorted set of whitespace-separated tokens of
    all tagged captions.

    Args:
        captions: Iterable of caption strings.

    Returns:
        Tuple `(vocab, word_to_index, index_to_word, processed_captions)`:
        the sorted token list, token -> position mapping, position -> token
        mapping, and the tagged captions in input order.
    """
    processed_captions = [
        start_tag + ' ' + re.sub(r' +', ' ', text) + ' ' + end_tag
        for text in captions
    ]

    unique_words = set()
    for sentence in processed_captions:
        unique_words.update(sentence.split())
    vocab = sorted(unique_words)

    word_to_index = {}
    index_to_word = {}
    for position, token in enumerate(vocab):
        word_to_index[token] = position
        index_to_word[position] = token

    return vocab, word_to_index, index_to_word, processed_captions

# Build the vocabulary only when at least one caption was collected.
if captions.size == 0:
    print("\nError: The 'captions' list is empty. Cannot create vocabulary.")
else:
    vocab, word_to_index, index_to_word, processed_captions = get_vocab(captions)
    vocab_size = len(vocab)
    # Longest tagged caption, measured in whitespace-separated tokens.
    max_caption_length = max(len(sentence.split()) for sentence in processed_captions)
    print("\nVocabulary size:", vocab_size)
    print("Max caption length:", max_caption_length)


Error: The 'captions' list is empty. Cannot create vocabulary.


In [21]:
# Image encoder: 224x224x3 input -> ImageNet-weighted ResNet50 without its
# classification top -> flattened feature map -> 256-unit ReLU projection.
image_input = Input(shape=(224, 224, 3))
# NOTE: despite its name, `resnet_model` holds the backbone's output tensor
# (the result of calling the ResNet50 model on `image_input`), not the model itself.
resnet_model = ResNet50(include_top=False, weights='imagenet')(image_input)
image_features = tf.keras.layers.Flatten()(resnet_model)
image_features = Dense(256, activation='relu')(image_features) # Reduce dimensionality
# Wrap input -> 256-d feature vector as a standalone Keras model.
image_model = Model(inputs=image_input, outputs=image_features)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [24]:
# Training hyper-parameters.
epochs = 10 # Increase this significantly for meaningful training
batch_size = 1  # presumably samples per training step -- confirm against the training call

In [28]:
def generate_caption(image, image_model, caption_model, word_to_index, index_to_word, max_length):
    """Greedily decode a caption for one image.

    The image is given a leading batch axis and passed to `image_model`. Then,
    for at most `max_length` steps, the known words of the text generated so
    far are mapped to indices, padded to `max_length`, and passed together with
    the image feature to `caption_model`; the arg-max of its output selects
    the next word.

    Args:
        image: Single image array (without batch axis).
        image_model: Object whose `predict` returns the image feature.
        caption_model: Object whose `predict` takes `[sequence, feature]`
            (input order is assumed by the code -- confirm against the model).
        word_to_index: Mapping from word to integer index.
        index_to_word: Mapping from integer index to word.
        max_length: Maximum number of decoding steps and the padding length.

    Returns:
        The generated text with the start and end tags removed and outer
        whitespace stripped. Decoding stops early when the predicted index is
        unknown or maps to the end tag.
    """
    batch = np.expand_dims(image, axis=0)
    photo_feature = image_model.predict(batch)

    in_text = start_tag
    for _ in range(max_length):
        # Words missing from the vocabulary are silently skipped.
        known_ids = []
        for token in in_text.split():
            if token in word_to_index:
                known_ids.append(word_to_index[token])
        padded = pad_sequences([known_ids], maxlen=max_length)

        scores = caption_model.predict([padded, photo_feature]) # Assuming caption model takes both
        best_index = np.argmax(scores)
        next_word = index_to_word.get(best_index)
        if next_word is None or next_word == end_tag:
            break
        in_text = in_text + ' ' + next_word

    without_start = in_text.replace(start_tag, '')
    return without_start.replace(end_tag, '').strip()

In [None]:
# Status message for the reader; this cell only prints and runs no training or generation.
print("Model building and preprocessing complete. Training and generation are commented out for now.")
print("Remember to uncomment and adjust training parameters for actual learning.")

Model building and preprocessing complete. Training and generation are commented out for now.
Remember to uncomment and adjust training parameters for actual learning.
