# Connecting google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Connecting kaggle

In [2]:
!pip install -q kaggle

In [3]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"arbazkhancs","key":"140450ae4ad01edb47d18c912d681ad3"}'}

In [4]:
# Create A Kaggle Folder
! mkdir ~/.kaggle

In [5]:
# Copy the kaggle.json file to the above folder
! cp kaggle.json ~/.kaggle/

In [6]:
# Permission for the json to act
! chmod 600 ~/.kaggle/kaggle.json

In [7]:
# to list all kaggle datasets
! kaggle datasets list

ref                                                          title                                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
-----------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
syedanwarafridi/vehicle-sales-data                           Vehicle Sales Data                                  19MB  2024-02-21 20:16:17           5928         93  1.0              
tarunrm09/climate-change-indicators                          Climate change Indicators                           34KB  2024-02-22 08:53:54           2777         67  1.0              
nelgiriyewithana/apple-quality                               Apple Quality                                      170KB  2024-01-11 14:31:07          24953        531  1.0              
devi5723/e-commerce-cosmetics-dataset                        E-commerce Cosmetic

# Downloading Dataset

In [8]:
! kaggle datasets download -d adityajn105/flickr8k

Downloading flickr8k.zip to /content
 99% 1.03G/1.04G [00:10<00:00, 138MB/s]
100% 1.04G/1.04G [00:10<00:00, 107MB/s]


In [None]:
# unzip the downloaded file
! unzip flickr8k.zip

# Importing neccessary librarys

In [42]:
from os import listdir
import numpy as np
from tqdm import tqdm
import pickle

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Load Descriptions

In [11]:
with open('captions.txt', 'r') as f:
  next(f) # continue the header
  captions_doc = f.read()

In [13]:
captions_doc.split('\n')[0]

'1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .'

# Mapping the image_id with it's Captions

In [21]:
def load_descption_to_dictionary(descriptions):
  map = {}
  for line in tqdm(descriptions.split("\n")):
    # description shpuld have atleast 2 words
    if len(line) < 2:
      continue

    # extract image_id, image descripton
    image_id, image_desc = line.split(",")[0], line.split(",")[1:]

    # extract image_id not it's extension
    image_id = image_id.split(".")[0]

    # Convert the caption list to a string by joining its elements
    caption = ' '.join(image_desc)

    # Create a list if the image ID is not already in the mapping dictionary
    if image_id not in map:
      map[image_id] = []

    # store the image_id and it's caption
    map[image_id].append(caption)

  return map

In [22]:
captions_dict = load_descption_to_dictionary(captions_doc)
captions_dict["1000268201_693b08cb0e"], len(captions_dict)

100%|██████████| 40456/40456 [00:00<00:00, 515547.61it/s]


(['A child in a pink dress is climbing up a set of stairs in an entry way .',
  'A girl going into a wooden building .',
  'A little girl climbing into a wooden playhouse .',
  'A little girl climbing the stairs to her playhouse .',
  'A little girl in a pink dress going into a wooden cabin .'],
 8091)

# Cleaning Captions

In [23]:
def clean_captions(captions_dict):
  for image, captions in tqdm(captions_dict.items()):
    for i in range(len(captions)):
      # Load captions one by one
      caption = captions[i]

      # Convert the caption to lowercase
      caption = caption.lower()

      # delete digits, special chars, etc.,
      caption = caption.replace('[^A-Za-z]', '')

      # delete additional spaces
      caption = caption.replace('\s+', ' ')

      # add start and end tags to the caption and remove word < length 2
      caption = '<startseq> ' + " ".join([word for word in caption.split() if len(word)>1]) + ' <endseq>'
      captions[i] = caption

  return

In [24]:
print("Before Cleaning")
captions_dict["1000268201_693b08cb0e"]

Before Cleaning


['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [25]:
clean_captions(captions_dict)
print("\nAfter Cleaning")
captions_dict["1000268201_693b08cb0e"]

100%|██████████| 8091/8091 [00:00<00:00, 33187.05it/s]


After Cleaning





['<startseq> child in pink dress is climbing up set of stairs in an entry way <endseq>',
 '<startseq> girl going into wooden building <endseq>',
 '<startseq> little girl climbing into wooden playhouse <endseq>',
 '<startseq> little girl climbing the stairs to her playhouse <endseq>',
 '<startseq> little girl in pink dress going into wooden cabin <endseq>']

# Creating list of captions

In [28]:
def create_caption_list(captions_dict):
  all_captions = []
  for imageId in tqdm(captions_dict):
    for caption in captions_dict[imageId]:
      all_captions.append(caption)
  return all_captions

In [29]:
all_captions = create_caption_list(captions_dict)
all_captions[:10], len(all_captions)

100%|██████████| 8091/8091 [00:00<00:00, 525668.60it/s]


(['<startseq> child in pink dress is climbing up set of stairs in an entry way <endseq>',
  '<startseq> girl going into wooden building <endseq>',
  '<startseq> little girl climbing into wooden playhouse <endseq>',
  '<startseq> little girl climbing the stairs to her playhouse <endseq>',
  '<startseq> little girl in pink dress going into wooden cabin <endseq>',
  '<startseq> black dog and spotted dog are fighting <endseq>',
  '<startseq> black dog and tri-colored dog playing with each other on the road <endseq>',
  '<startseq> black dog and white dog with brown spots are staring at each other in the street <endseq>',
  '<startseq> two dogs of different breeds looking at each other on the road <endseq>',
  '<startseq> two dogs on pavement moving toward each other <endseq>'],
 40455)

# Create Tokenizer

In [37]:
def create_tokenizer(all_captions):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(all_captions)
  # print(tokenizer.word_index)
  return tokenizer

In [38]:
tokenizer = create_tokenizer(all_captions)

In [39]:
vocab_size = len(tokenizer.word_index) + 1
max_len = max(len(caption.split())for caption in all_captions)

vocab_size, max_len

(8485, 35)

# Train Test Split

In [41]:
imageIds = list(captions_dict.keys())
split = int(len(imageIds) * 0.90)
train = imageIds[:split]
test = imageIds[split:]

# Save

In [45]:
with open("/content/drive/MyDrive/Colab Notebooks/Image-Captioning Project/Preprocessing/tokenizer.pkl", "wb") as f:
  pickle.dump(tokenizer, f)

with open("/content/drive/MyDrive/Colab Notebooks/Image-Captioning Project/Preprocessing/captions_dict.pkl", "wb") as f:
  pickle.dump(captions_dict, f)

with open("/content/drive/MyDrive/Colab Notebooks/Image-Captioning Project/Preprocessing/all_captions.pkl", "wb") as f:
  pickle.dump(all_captions, f)