# Part 3 - Image Caption Mapping And Preprocessing

## Mount Google Drive unto Colab Environment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pwd

/content


In [3]:
# Check if in Colab Env
try:
  from google.colab import drive
  IN_COLAB=True
except:
  IN_COLAB=False

if IN_COLAB:
  print("We're running Colab")

We're running Colab


In [4]:
# Change Mount Location
if IN_COLAB:
  # Mount the Google Drive at mount
  mount='/content/drive/'
  print("Colab: mounting Google drive on ", mount)

  drive.mount(mount)

  # Switch to the directory on the Google Drive that you want to use
  import os
  drive_root = mount + "/My Drive/Colab Notebooks/Capstone/Code"
  
  # Create drive_root if it doesn't exist
  create_drive_root = True
  if create_drive_root:
    print("\nColab: making sure ", drive_root, " exists.")
    os.makedirs(drive_root, exist_ok=True)
  
  # Change to the directory
  print("\nColab: Changing directory to ", drive_root)
  %cd $drive_root

Colab: mounting Google drive on  /content/drive/
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).

Colab: making sure  /content/drive//My Drive/Colab Notebooks/Capstone/Code  exists.

Colab: Changing directory to  /content/drive//My Drive/Colab Notebooks/Capstone/Code
/content/drive/My Drive/Colab Notebooks/Capstone/Code


In [5]:
# check allocated GPU
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 3715531752231825981
 xla_global_id: -1]

## Import Modules

In [26]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import Counter 

# to explore text in captions data
import nltk
nltk.download('stopwords')
import regex as re
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer

# to process the images
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# to process the text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# for modelling
from tensorflow.keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger

#utilities
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# set directories
BASE_DIR = mount + "/My Drive/Colab Notebooks/Capstone/Data/flickr8k"
WORKING_DIR = mount + "/My Drive/Colab Notebooks/Capstone/Code"

## Load Features From Pickle

In [28]:
# load features from pickle
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'rb') as f:
    features = pickle.load(f)

## Load Captions Data

In [29]:
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r') as f:
    next(f)
    captions_doc = f.read()

## Map Image to Captions

In [30]:
# create mapping of image to captions
mapping = {}
#process lines
for line in tqdm(captions_doc.split('\n')):
    #split the line by comma(,)
    tokens = line.split(',')
    if len(line) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    #remove extension from image ID
    image_id = image_id.split('.')[0]
    #convert caption list to string
    caption = ' '.join(caption)
    #create list if needed
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption)

  0%|          | 0/40456 [00:00<?, ?it/s]

In [47]:
# store mapping function in pickle
pickle.dump(mapping, open(os.path.join(WORKING_DIR, 'mapping.pkl'), 'wb'))

In [32]:
len(mapping)

8091

## Preprocess the Captions Data

In [33]:
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            #take one caption at a time
            caption = captions[i]
            #preprocessing steps
            #convert to lowercase
            caption = caption.lower()
            #replace all digits, special characters, etc.
            caption = caption.replace('[^A-Za-z]', '')
            #delete additional spaces
            caption = caption.replace('\s+', ' ')
            #add start and end tags to captions
            caption = 'startseq ' + ' '.join([word for word in caption.split() if len(word)>1]) + ' endseq'
            captions[i] = caption

In [34]:
# store cleaning function in pickle
pickle.dump(clean, open(os.path.join(WORKING_DIR, 'clean.pkl'), 'wb'))

In [35]:
# before preprocessing of text
mapping['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [36]:
#preprocess the text
clean(mapping)

In [37]:
#after preprocessing of text
mapping['1000268201_693b08cb0e']

['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq']

In [38]:
all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

In [39]:
len(all_captions)

40455

In [40]:
all_captions[:10]

['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq',
 'startseq black dog and spotted dog are fighting endseq',
 'startseq black dog and tri-colored dog playing with each other on the road endseq',
 'startseq black dog and white dog with brown spots are staring at each other in the street endseq',
 'startseq two dogs of different breeds looking at each other on the road endseq',
 'startseq two dogs on pavement moving toward each other endseq']

In [41]:
# tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1 #index up to the greatest token ID

In [42]:
#number of unique words
vocab_size

8485

In [43]:
# get maximum length of caption available
max_length = max(len(caption.split()) for caption in all_captions)
max_length

35

In [44]:
type(all_captions)

list

In [49]:
# store processed captions in pickle
pickle.dump(all_captions, open(os.path.join(WORKING_DIR, 'all_captions.pkl'), 'wb'))