In [None]:
# Mount Google Drive at /content/gdrive so files stored on Drive can be read
# from this Colab runtime.
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import os
from PIL import Image
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string


In [None]:
# Folder on Drive that holds the Flickr30k download. The path is relative, so
# it only resolves when the working directory is the parent of the "gdrive"
# mount point (NOTE(review): presumably /content on Colab — confirm).
dataset_dir_img = "gdrive/MyDrive/flickr30k_images"

In [None]:
# Open every image file found below the dataset's image folder.
# NOTE(review): the objects returned by Image.open are kept in the list and are
# never closed here; on a large folder this may run out of file descriptors.
# No later cell in view reads `image_data` — confirm whether it is needed.
image_data = []

for root, dirs, files in os.walk(os.path.join(dataset_dir_img, "flickr30k_images")):
    # Keep only names carrying one of the accepted image extensions.
    image_data.extend(
        Image.open(os.path.join(root, file))
        for file in files
        if file.endswith((".jpg", ".jpeg", ".png"))
    )

In [None]:
# Caption file of the dataset: '|'-separated rows with the columns
# image_name, comment_number and comment.
dataset_dir_csv = "gdrive/MyDrive/flickr30k_images/results.csv"

In [None]:
# Parse the '|'-separated caption file by hand into a DataFrame of strings.
with open(dataset_dir_csv, 'r', encoding='utf-8') as file:
    # The header line supplies the column names. They are kept verbatim
    # (including leading spaces) because later cells index by these exact names.
    columns = file.readline().strip().split('|')

    # Split each record at most len(columns) - 1 times so that a '|' inside the
    # caption text stays in the last field instead of creating extra fields,
    # and skip blank lines so they do not turn into empty rows.
    data = [
        line.strip().split('|', len(columns) - 1)
        for line in file
        if line.strip()
    ]

# Build the frame once the file has been closed.
df = pd.DataFrame(data, columns=columns)

# Preview the first rows (columns: image_name, comment_number, comment).
print(df.head())

       image_name  comment_number  \
0  1000092795.jpg               0   
1  1000092795.jpg               1   
2  1000092795.jpg               2   
3  1000092795.jpg               3   
4  1000092795.jpg               4   

                                             comment  
0   Two young guys with shaggy hair look at their...  
1   Two young , White males are outside near many...  
2   Two men in green shirts are standing in a yard .  
3       A man in a blue shirt standing in a garden .  
4            Two friends enjoy time spent together .  


In [None]:
# Let pandas parse the same '|'-separated file; this rebinds `df`.
df = pd.read_csv(dataset_dir_csv, sep='|')

# Preview the top rows of the parsed frame.
print(df.head())

       image_name  comment_number  \
0  1000092795.jpg               0   
1  1000092795.jpg               1   
2  1000092795.jpg               2   
3  1000092795.jpg               3   
4  1000092795.jpg               4   

                                             comment  
0   Two young guys with shaggy hair look at their...  
1   Two young , White males are outside near many...  
2   Two men in green shirts are standing in a yard .  
3       A man in a blue shirt standing in a garden .  
4            Two friends enjoy time spent together .  


In [None]:
# Force the caption column to plain strings before captions are joined per
# image. Missing captions are replaced with '' first: astype(str) alone turns
# a NaN into the literal text 'nan', which would then be merged into the
# captions as if it were a word.
df[' comment'] = df[' comment'].fillna('').astype(str)


In [None]:
# Merge all captions of an image into one space-separated string, giving one
# row per image_name.
combined_captions_df = (
    df.groupby('image_name')[' comment']
    .apply(' '.join)
    .reset_index()
)


In [None]:
# List the columns of the merged per-image frame.
combined_captions_df.columns

Index(['image_name', ' comment'], dtype='object')

In [None]:
# Number of rows in the merged frame, i.e. number of distinct image names.
len(combined_captions_df)

31783

In [None]:
import os

# Index every image file below the dataset folder as (image ID, path) pairs,
# where the ID is the file name without its extension.
image_paths = [
    (os.path.splitext(file)[0], os.path.join(root, file))
    for root, dirs, files in os.walk(os.path.join(dataset_dir_img, "flickr30k_images"))
    for file in files
    if file.endswith((".jpg", ".jpeg", ".png"))
]

# Lookup table image ID -> file path (a repeated ID keeps the last path seen).
id_to_path = dict(image_paths)

# Resolve each caption row to its file path; rows whose image is not on disk
# (or whose stored path is empty) get None as a placeholder.
image_paths_matched = [
    id_to_path.get(os.path.splitext(image_name)[0]) or None
    for image_name in combined_captions_df['image_name']
]

# Store the resolved paths next to the captions.
combined_captions_df['image_path'] = image_paths_matched


In [None]:
# Peek at the first three resolved image paths.
combined_captions_df['image_path'].head(3)

0    gdrive/MyDrive/flickr30k_images/flickr30k_imag...
1    gdrive/MyDrive/flickr30k_images/flickr30k_imag...
2    gdrive/MyDrive/flickr30k_images/flickr30k_imag...
Name: image_path, dtype: object

In [None]:
!pip install nltk



In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Fetch the NLTK data packages 'punkt' and 'stopwords'; preprocess_text below
# relies on word_tokenize and on the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')

# Resources shared by all preprocess_text calls; filled on first use.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return (stop-word set, stemmer, punctuation characters), built only once."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
        _PREPROCESS_CACHE['punctuation'] = frozenset(string.punctuation)
    return (
        _PREPROCESS_CACHE['stop_words'],
        _PREPROCESS_CACHE['stemmer'],
        _PREPROCESS_CACHE['punctuation'],
    )


def preprocess_text(text):
    """Turn a caption string into a list of stemmed content words.

    Steps: lowercase, tokenize with word_tokenize, drop tokens made up only of
    punctuation characters, drop English stop words, Porter-stem what is left.

    Args:
        text: The caption to process. Anything that is not a str is treated
            as missing.

    Returns:
        list: The stemmed tokens in their original order; an empty list for
        non-string input.
    """
    if not isinstance(text, str):
        return []  # Non-string input (e.g. NaN) has no tokens

    # Built once and reused: reading the stop-word list and creating a stemmer
    # on every call is loop-invariant work when this runs over a whole column.
    stop_words, stemmer, punctuation = _preprocess_resources()

    tokens = word_tokenize(text.lower())

    # A token counts as punctuation when every character is a punctuation mark.
    # A substring test against string.punctuation would let multi-character
    # tokens such as "``", "''" or "..." through.
    kept = [
        token for token in tokens
        if not all(ch in punctuation for ch in token) and token not in stop_words
    ]

    return [stemmer.stem(token) for token in kept]

# Tokenize, filter and stem the merged captions of every image; the result is
# one list of tokens per row.
combined_captions_df['preprocessed_comment'] = combined_captions_df[' comment'].map(preprocess_text)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Show the token lists produced by preprocess_text.
combined_captions_df['preprocessed_comment']

0        [two, young, guy, shaggi, hair, look, hand, ha...
1        [sever, men, hard, hat, oper, giant, pulley, s...
2        [child, pink, dress, climb, set, stair, entri,...
3        [someon, blue, shirt, hat, stand, stair, lean,...
4        [two, men, one, gray, shirt, one, black, shirt...
                               ...                        
31778    [woman, write, pad, room, gold, decor, wall, w...
31779    [person, red, shirt, climb, rock, face, cover,...
31780    [two, male, construct, worker, work, street, o...
31781    [older, busker, glass, play, eastern, string, ...
31782    [man, short, hawaiian, shirt, lean, rail, pilo...
Name: preprocessed_comment, Length: 31783, dtype: object

In [None]:
# Work on an independent copy of the first 50 rows so later cells can modify
# the subset without touching the full frame.
df_temp = combined_captions_df.head(50).copy()


In [None]:
# Show the token lists of the 50-row subset (kept as lists, not joined back
# into strings).
df_temp['preprocessed_comment']

In [None]:
# NOTE(review): in the cells visible here `vocab_size` is first assigned further
# down, so this line appears to rely on leftover kernel state — confirm the
# intended cell order.
print(vocab_size)

10000


In [None]:
# Spot-check the tokenizer on the first five captions of the subset.
# NOTE(review): no earlier cell in view creates `tokenizer` — confirm cell order.
captions = df_temp['preprocessed_comment'].head(5)

# Encode each caption as a list of vocabulary indices.
tokenized_sequences = tokenizer.texts_to_sequences(captions)

# Show every caption next to its encoding, followed by an empty line.
for caption, sequence in zip(captions, tokenized_sequences):
    print("Original Caption:", caption)
    print("Tokenized Sequence:", sequence, end="\n\n")

Original Caption: ['two', 'young', 'guy', 'shaggi', 'hair', 'look', 'hand', 'hang', 'yard', 'two', 'young', 'white', 'male', 'outsid', 'near', 'mani', 'bush', 'two', 'men', 'green', 'shirt', 'stand', 'yard', 'man', 'blue', 'shirt', 'stand', 'garden', 'two', 'friend', 'enjoy', 'time', 'spent', 'togeth']
Tokenized Sequence: [12, 20, 217, 78, 205, 149, 12, 20, 13, 41, 12, 25, 46, 26, 173, 149, 3, 28, 26, 173, 208, 12, 321, 209, 322]

Original Caption: ['sever', 'men', 'hard', 'hat', 'oper', 'giant', 'pulley', 'system', 'worker', 'look', 'piec', 'equip', 'two', 'men', 'work', 'machin', 'wear', 'hard', 'hat', 'four', 'men', 'top', 'tall', 'structur', 'three', 'men', 'larg', 'rig']
Tokenized Sequence: [25, 63, 30, 324, 325, 326, 480, 205, 12, 25, 591, 63, 30, 36, 25, 58, 113, 84, 25, 331]

Original Caption: ['child', 'pink', 'dress', 'climb', 'set', 'stair', 'entri', 'way', 'littl', 'girl', 'pink', 'dress', 'go', 'wooden', 'cabin', 'littl', 'girl', 'climb', 'stair', 'playhous', 'littl', 'gir

In [None]:
# Display the current vocabulary size.
vocab_size

542

In [None]:
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences


#df_temp['preprocessed_comment'] = df_temp['preprocessed_comment'].apply(lambda x: ''.join(x))

# Load ResNet50 pre-trained on ImageNet without its classification head.
# pooling='avg' appends global average pooling, so every image yields a single
# 2048-value vector — the shape declared by the image input of the multimodal
# model — instead of a spatial feature map.
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
image_model = Model(inputs=base_model.input, outputs=base_model.output)

# Function to extract features from an image
def extract_image_features(image_path, model):
    """Run one image through `model` and return the predicted features.

    Args:
        image_path: Path of the image file to read.
        model: Object with a Keras-style ``predict`` method; it is called on a
            batch that holds the single preprocessed image.

    Returns:
        Whatever ``model.predict`` returns for that one-image batch.
    """
    # The context manager closes the file handle as soon as the pixels are read.
    with Image.open(image_path) as img:
        # Force three channels: grayscale, palette or RGBA files would otherwise
        # produce an array without the 3-channel layout ResNet50 expects.
        rgb = img.convert('RGB').resize((224, 224))  # ResNet50 input size
        img_array = np.array(rgb, dtype=np.float32)

    img_array = preprocess_input(img_array)  # ResNet50-specific preprocessing
    img_array = np.expand_dims(img_array, axis=0)  # Add batch dimension

    # verbose=0 stops Keras from printing a progress bar for every single image.
    return model.predict(img_array, verbose=0)

# Custom data generator

def custom_data_generator(data_keys, mapping, image_paths, tokenizer, max_length, vocab_size, batch_size):
    """Yield next-word training batches forever.

    For every image the caption is turned into word indices, and each prefix
    ``sequence[:i]`` becomes one sample whose target is ``sequence[i]``.
    Samples are collected until ``batch_size`` of them are available; samples
    left over at the end of a pass are carried into the next pass.

    Image features come from the module-level ``extract_image_features`` and
    ``image_model`` and are computed once per key, then reused.

    Args:
        data_keys: Iterable of keys, aligned with ``image_paths``.
        mapping: Dict from key to the caption as a list of tokens.
        image_paths: Iterable of image file paths; falsy entries are skipped.
        tokenizer: Fitted tokenizer exposing ``word_index``.
        max_length: Length every caption prefix is padded (post) to.
        vocab_size: Number of classes of the one-hot targets.
        batch_size: Number of samples per yielded batch; must be positive.

    Yields:
        ``((image_features, prefixes), targets)`` with shapes
        ``(batch_size, n_features)``, ``(batch_size, max_length)`` and
        ``(batch_size, 1, 1, vocab_size)``.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if a full pass over
            the data produces no sample at all.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    feature_cache = {}  # key -> flat feature vector, filled on first use
    batch_images, batch_prefixes, batch_targets = [], [], []

    while True:
        produced = 0
        for key, image_path in zip(data_keys, image_paths):
            if not image_path:
                continue  # no image file was found for this caption

            if key not in feature_cache:
                features = np.asarray(extract_image_features(image_path, image_model))
                if features.ndim == 4:
                    # Spatial feature map: average over height and width so one
                    # vector per image remains.
                    features = features.mean(axis=(1, 2))
                feature_cache[key] = features.reshape(-1)
            image_features = feature_cache[key]

            # Words unknown to the tokenizer (or outside the one-hot range) are
            # skipped instead of raising KeyError / producing invalid targets.
            word_ids = (tokenizer.word_index.get(word) for word in mapping[key])
            sequence = [idx for idx in word_ids if idx is not None and idx < vocab_size]

            for i in range(1, len(sequence)):
                prefix = pad_sequences([sequence[:i]], maxlen=max_length, padding='post')[0]
                target = to_categorical(sequence[i], num_classes=vocab_size)

                batch_images.append(image_features)
                batch_prefixes.append(prefix)
                # (1, 1, vocab_size) per sample mirrors the model's output shape.
                batch_targets.append(target.reshape(1, 1, vocab_size))
                produced += 1

                if len(batch_targets) == batch_size:
                    yield (np.stack(batch_images), np.stack(batch_prefixes)), np.stack(batch_targets)
                    batch_images, batch_prefixes, batch_targets = [], [], []

        if produced == 0:
            # Without this check the loop would spin forever without yielding.
            raise ValueError("custom_data_generator found no training samples")








# Keys and captions of the training subset: image file name -> token list.
data_keys = df_temp['image_name']
mapping = dict(zip(df_temp['image_name'], df_temp['preprocessed_comment']))

# Every caption of the subset as a list of tokens.
all_comments = df_temp['preprocessed_comment'].tolist()
# Join the tokens themselves rather than str() of each list: the printed form
# of a list glues brackets, quotes and commas to the words and inflates the
# number of "unique" words.
all_comments_text = ' '.join(word for comment in all_comments for word in comment)
unique_words = set(all_comments_text.split())
num_unique_words = len(unique_words)

# The tokenizer only keeps word indices below num_words and indices start at 1,
# so the limit is one above the word count to keep every word.
# Tokenizer is reached through `tf` because it is not imported by name.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_unique_words + 1)
tokenizer.fit_on_texts(all_comments)
# +1 accounts for index 0, which is reserved for padding.
vocab_size = len(tokenizer.word_index) + 1

# Training parameters.
# NOTE(review): max_length is a fixed 52; longer prefixes are truncated by
# pad_sequences — confirm this matches the longest caption in the subset.
max_length = 52
batch_size = 32

# Image paths aligned row by row with data_keys.
image_paths = df_temp['image_path'].tolist()

# Instantiate the custom data generator.
generator = custom_data_generator(data_keys, mapping, image_paths, tokenizer, max_length, vocab_size, batch_size)

# Define multimodal model
def create_multimodal_model(max_length, vocab_size):
    """Build the two-input next-word model (image vector + caption prefix).

    Args:
        max_length: Length of the padded caption prefix fed to the text input.
        vocab_size: Size of the embedding vocabulary and of the softmax output.

    Returns:
        An uncompiled Keras model taking ``[image_features, text]`` with shapes
        ``(batch, 2048)`` and ``(batch, max_length)`` and producing word
        probabilities of shape ``(batch, 1, 1, vocab_size)``.
    """
    layers = tf.keras.layers

    # Image branch: a 2048-value vector viewed as a 1x1 map with 2048 channels.
    image_in = layers.Input(shape=(2048,))
    image_branch = layers.Reshape((1, 1, 2048))(image_in)

    # Text branch: embed the word indices (index 0 is masked), apply dropout,
    # run self-attention over the prefix and average over the time axis.
    text_in = layers.Input(shape=(max_length,))
    embedded = layers.Embedding(vocab_size, 256, mask_zero=True)(text_in)
    embedded = layers.Dropout(0.4)(embedded)
    attended = layers.MultiHeadAttention(num_heads=4, key_dim=256, dropout=0.2)(embedded, embedded)
    pooled = layers.GlobalAveragePooling1D()(attended)
    text_branch = layers.Reshape((1, 1, 256))(pooled)

    # Fuse both branches along the channel axis and predict the next word.
    fused = layers.Concatenate()([image_branch, text_branch])
    hidden = layers.Dense(256, activation='relu')(fused)
    word_probs = layers.Dense(vocab_size, activation='softmax')(hidden)

    return tf.keras.models.Model(inputs=[image_in, text_in], outputs=word_probs)

# Create the multimodal model
multimodal_model = create_multimodal_model(max_length, vocab_size)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
multimodal_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# The generator never stops, so Keras needs an explicit step count to know
# where an epoch ends; with steps_per_epoch=None the first epoch would never
# finish. A caption of n tokens contributes n - 1 (prefix, next word) pairs.
num_training_pairs = sum(max(len(mapping[key]) - 1, 0) for key in data_keys)
steps_per_epoch = max(1, num_training_pairs // batch_size)

# Train the model using the generator
multimodal_model.fit(generator, epochs=20, steps_per_epoch=steps_per_epoch)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


NameError: name 'df_temp' is not defined

In [None]:
# Count the distinct tokens of the subset. The tokens are joined directly:
# joining str() of each list would leave brackets, quotes and commas attached
# to the words, so the same word would be counted several times.
all_comments = df_temp['preprocessed_comment'].tolist()
all_comments_text = ' '.join(word for comment in all_comments for word in comment)
unique_words = set(all_comments_text.split())
num_unique_words = len(unique_words)
print(num_unique_words)

594
