# Main notebook for data processing in robo_romeo project

## Imports – these should cover the whole project.

In [1]:
import numpy as np
from PIL import Image
import os
import string
from pickle import dump
from pickle import load
import tensorflow as tf
from tensorflow.keras.applications.xception import Xception #to get pre-trained model Xception
from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer #for text tokenization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import add
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense#Keras to build our CNN and LSTM
from tensorflow.keras.layers import LSTM, Embedding, Dropout
from tqdm.notebook import tqdm #to check loop progress
# Register the pandas progress hooks on the class itself: `tqdm.tqdm_notebook`
# is deprecated, and instantiating tqdm() only to call .pandas() renders an
# empty "0it" progress bar.
tqdm.pandas()


2022-06-08 15:24:50.773066: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-08 15:24:50.773145: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm().pandas()


0it [00:00, ?it/s]

## Data cleaning

 - load_doc( filename ) – To load the document file and read the contents of the file into a string.

In [2]:
# Load the document file into memory
def load_doc(filename):
    """Read a whole text file into a single string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete contents of the file as one string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

In [3]:
# Caption file path, resolved relative to the current working directory
filename = '../raw_data/Flickr8k_text/Flickr8k.token.txt'
# Read the whole caption file into one string
doc = load_doc(filename)

 - load_descriptions(doc) – To create a description dictionary that maps each image ID to the list of all 5 of its captions.

In [4]:
# extract descriptions for images
def load_descriptions(doc):
    """Group caption strings by image id.

    Every usable line of ``doc`` is split on whitespace. The first token,
    truncated at its first '.', becomes the image id; the remaining tokens
    are re-joined with single spaces and stored as one caption.

    Args:
        doc: Caption text, one "<image token> <caption words...>" entry per
            line (presumably the Flickr8k token file format -- confirm).

    Returns:
        dict mapping image id -> list of caption strings, in the order the
        lines appear in ``doc``.
    """
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Skip near-empty lines. A whitespace-only line can have
        # len(line) >= 2 yet yield no tokens, which used to raise
        # IndexError on tokens[0] below.
        if len(line) < 2 or not tokens:
            continue
        # first token is the image id, the rest is the description
        image_id, image_desc = tokens[0], tokens[1:]
        # keep only the part of the id before the first '.'
        image_id = image_id.split('.')[0]
        # setdefault creates the caption list on first sight of an id
        mapping.setdefault(image_id, []).append(' '.join(image_desc))
    return mapping

In [5]:
# Parse the raw text into {image id: [captions]} and report how many
# distinct image ids were found
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

Loaded: 8092 


 - clean_descriptions( descriptions ) – to clean the data by taking all descriptions as input. This performs several types of cleaning: uppercase to lowercase conversion, punctuation removal, removal of single-character tokens, and removal of words containing numbers.



In [6]:
import string

In [7]:
def clean_descriptions(descriptions):
    """Normalise every caption in ``descriptions`` in place.

    Each caption is split on whitespace, lower-cased and stripped of
    ``string.punctuation`` characters; tokens that are then shorter than two
    characters or not purely alphabetic are dropped, and the survivors are
    re-joined with single spaces.

    Args:
        descriptions: dict mapping a key to a list of caption strings. The
            lists are modified in place.

    Returns:
        None.
    """
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            # lower-case, then strip punctuation, token by token
            words = (w.lower().translate(strip_punct) for w in caption.split())
            # drop one-letter leftovers (hanging 's'/'a') and anything
            # containing non-letters, e.g. digits
            kept = [w for w in words if len(w) > 1 and w.isalpha()]
            captions[idx] = ' '.join(kept)

In [8]:
# Clean the captions; this rewrites the lists inside `descriptions` in place
clean_descriptions(descriptions)

 - to_vocabulary( descriptions ) – to create a vocabulary from all the unique words extracted from the descriptions.



In [9]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    """Collect the set of unique words used across all captions.

    Args:
        descriptions: dict mapping a key to a list of caption strings.

    Returns:
        set of every whitespace-separated word found in any caption.
    """
    vocabulary = set()
    # Plain loops rather than a list comprehension run only for its side
    # effects (which also built a throwaway list of None values).
    for desc_list in descriptions.values():
        for desc in desc_list:
            vocabulary.update(desc.split())
    return vocabulary

In [10]:
# Build the set of unique words from the cleaned captions and report its size
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

Vocabulary Size: 8763


 - save_descriptions( descriptions, filename ) – This function is used to store all the preprocessed descriptions into a file.



In [11]:
# Show the working directory; the relative paths in this notebook resolve against it
!pwd

/home/agolovin/code/CMaxK/robo_romeo/notebooks


In [12]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
    """Write all captions to a text file, one "<key> <caption>" per line.

    Lines are joined with '\\n' and no trailing newline is written.

    Args:
        descriptions: dict mapping a key to a list of caption strings.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    lines = [key + ' ' + desc
             for key, desc_list in descriptions.items()
             for desc in desc_list]
    # The context manager closes the handle even when write() raises.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))
         

In [13]:
# Write the cleaned captions to descriptions.txt in the working directory
save_descriptions(descriptions, 'descriptions.txt')

In [14]:
# Show the working directory again (where descriptions.txt was just written)
!pwd

/home/agolovin/code/CMaxK/robo_romeo/notebooks


In [15]:
# End-to-end run of the cleaning pipeline in a single cell:
# load -> parse -> clean -> build vocabulary -> save.
# NOTE(review): this repeats the individual steps run in the cells above.
filename = '../raw_data/Flickr8k_text/Flickr8k.token.txt'
# load descriptions
doc = load_doc(filename)
# parse descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
# clean descriptions
clean_descriptions(descriptions)
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))
# save to file
save_descriptions(descriptions, 'descriptions.txt')


Loaded: 8092 
Vocabulary Size: 8763


In [16]:
# Fit a tokenizer on the caption texts. `descriptions` is a dict, and
# iterating a dict yields its keys, so passing it directly would fit the
# tokenizer on image ids instead of captions; flatten the caption lists first.
t = Tokenizer()
t.fit_on_texts([desc for desc_list in descriptions.values() for desc in desc_list])

In [99]:
def sequencing(file, train_images_file="../raw_data/Flickr8k_text/Flickr_8k.trainImages.txt"):
    """Build next-word training samples from a saved descriptions file.

    Every line of ``file`` is split on whitespace into an image id followed
    by caption words. The caption is wrapped in "startsequence" /
    "endsequence" markers and, for images listed in ``train_images_file``,
    expanded into one sample per position: the prefix seen so far and the
    word that follows it.

    Args:
        file: Path to a text file with one "<image id> <caption words...>"
            entry per line.
        train_images_file: Path to a text file listing one training image
            file name per line; a trailing ".jpg" is removed to obtain the
            image id. Defaults to the path that was previously hard-coded.

    Returns:
        Tuple ``(X1, X2, y, token_vocab)`` of equally long lists:
            X1: image id of each sample.
            X2: token-list prefix, starting with "startsequence".
            y: the word that follows the prefix.
            token_vocab: prefix plus target word, i.e. ``X2[k] + [y[k]]``.
    """
    # Ids of the training images. A set gives O(1) membership tests; the
    # previous list was scanned once for every generated sample.
    training_ids = set()
    with open(train_images_file) as handle:
        for line in handle:
            name = line.strip()
            if not name:
                continue
            # Strip the extension after trimming whitespace so a final line
            # without a trailing newline is handled like the others.
            if name.endswith(".jpg"):
                name = name[:-len(".jpg")]
            training_ids.add(name)

    X1, X2, y, token_vocab = [], [], [], []

    with open(file) as handle:
        for line in handle:
            tokens = line.split()
            # Blank lines and non-training images contribute no samples.
            if not tokens or tokens[0] not in training_ids:
                continue
            tokens.insert(1, "startsequence")
            tokens.append("endsequence")
            # tokens[0] is the id and tokens[1] the start marker, so targets
            # begin at index 2.
            for seq in range(2, len(tokens)):
                X1.append(tokens[0])
                X2.append(tokens[1:seq])
                token_vocab.append(tokens[1:seq + 1])
                y.append(tokens[seq])

    return (X1, X2, y, token_vocab)

In [100]:
# Build the sample lists from the cleaned captions saved earlier:
# image ids, caption prefixes, next-word targets, and prefix+target token lists
X1,X2,y,token_vocab = sequencing('descriptions.txt')

In [110]:
# Number of distinct target words in y
y_vocab = len(set(y))
y_vocab

7577

In [113]:
# Sanity check: y is still a plain Python list at this point
type(y)

list

In [114]:
#y = to_categorical(y, num_classes=y_vocab)

In [117]:
#Tokenizing X2
# Fit on token_vocab (each prefix together with its target word) so that
# words appearing in either X2 or y are known to the tokenizer.
t = Tokenizer()
t.fit_on_texts(token_vocab)
X2_tokenized = t.texts_to_sequences(X2)

#Tokenizing y
# y holds one word per sample, so each entry becomes a one-element list
y_tokenized = t.texts_to_sequences(y)

In [118]:
# Inspect the tokenized targets.
# NOTE(review): this displays the entire list; a slice would keep the output short.
y_tokenized

[[35],
 [2],
 [69],
 [130],
 [4],
 [124],
 [57],
 [458],
 [11],
 [606],
 [2],
 [25],
 [6688],
 [684],
 [12],
 [17],
 [321],
 [71],
 [236],
 [184],
 [12],
 [26],
 [17],
 [124],
 [71],
 [236],
 [3383],
 [12],
 [26],
 [17],
 [124],
 [3],
 [606],
 [24],
 [63],
 [3383],
 [12],
 [26],
 [17],
 [2],
 [69],
 [130],
 [321],
 [71],
 [236],
 [4468],
 [12],
 [13],
 [7],
 [5],
 [716],
 [7],
 [18],
 [340],
 [12],
 [13],
 [7],
 [5],
 [1115],
 [7],
 [39],
 [9],
 [145],
 [96],
 [8],
 [3],
 [206],
 [12],
 [13],
 [7],
 [5],
 [14],
 [7],
 [9],
 [21],
 [842],
 [18],
 [568],
 [29],
 [145],
 [96],
 [2],
 [3],
 [104],
 [12],
 [10],
 [28],
 [11],
 [680],
 [2119],
 [78],
 [29],
 [145],
 [96],
 [8],
 [3],
 [206],
 [12],
 [10],
 [28],
 [8],
 [882],
 [862],
 [295],
 [145],
 [96],
 [12],
 [26],
 [17],
 [199],
 [2],
 [603],
 [87],
 [2],
 [52],
 [11],
 [579],
 [1151],
 [9],
 [63],
 [228],
 [2],
 [1204],
 [12],
 [26],
 [17],
 [4],
 [43],
 [2],
 [52],
 [11],
 [51],
 [579],
 [1151],
 [12],
 [40],
 [17],
 [2],
 [3],
 [74]

In [None]:
# One-hot encode the target words. Tokenizer indices start at 1 (0 is
# reserved), and the tokenizer was also fitted on "startsequence", which never
# occurs in y, so the largest index can reach len(t.word_index). num_classes
# must exceed that index; len(set(y)) is too small and makes to_categorical
# fail with an out-of-bounds index.
# NOTE(review): the dense one-hot matrix has one row per sample and one column
# per class, which may be very large -- confirm it fits in memory.
y = to_categorical(y_tokenized, num_classes=len(t.word_index) + 1)

In [105]:
# Inspect the tokenized prefixes; each one starts with index 1 ("startsequence").
# NOTE(review): this displays the entire list; a slice would keep the output short.
X2_tokenized

[[1, 35],
 [1, 35, 2],
 [1, 35, 2, 69],
 [1, 35, 2, 69, 130],
 [1, 35, 2, 69, 130, 4],
 [1, 35, 2, 69, 130, 4, 124],
 [1, 35, 2, 69, 130, 4, 124, 57],
 [1, 35, 2, 69, 130, 4, 124, 57, 458],
 [1, 35, 2, 69, 130, 4, 124, 57, 458, 11],
 [1, 35, 2, 69, 130, 4, 124, 57, 458, 11, 606],
 [1, 35, 2, 69, 130, 4, 124, 57, 458, 11, 606, 2],
 [1, 35, 2, 69, 130, 4, 124, 57, 458, 11, 606, 2, 25],
 [1, 35, 2, 69, 130, 4, 124, 57, 458, 11, 606, 2, 25, 6688],
 [1, 35, 2, 69, 130, 4, 124, 57, 458, 11, 606, 2, 25, 6688, 684],
 [1, 35, 2, 69, 130, 4, 124, 57, 458, 11, 606, 2, 25, 6688, 684, 12],
 [1, 17],
 [1, 17, 321],
 [1, 17, 321, 71],
 [1, 17, 321, 71, 236],
 [1, 17, 321, 71, 236, 184],
 [1, 17, 321, 71, 236, 184, 12],
 [1, 26],
 [1, 26, 17],
 [1, 26, 17, 124],
 [1, 26, 17, 124, 71],
 [1, 26, 17, 124, 71, 236],
 [1, 26, 17, 124, 71, 236, 3383],
 [1, 26, 17, 124, 71, 236, 3383, 12],
 [1, 26],
 [1, 26, 17],
 [1, 26, 17, 124],
 [1, 26, 17, 124, 3],
 [1, 26, 17, 124, 3, 606],
 [1, 26, 17, 124, 3, 606, 24

In [67]:
#padding
# Append zeros after each sequence (padding='post', value=0); with no maxlen
# given, sequences are presumably padded to the longest one -- confirm
# against the pad_sequences documentation.
X2_pad = pad_sequences(X2_tokenized, dtype='int32', padding='post', value=0)

In [51]:
# Bundle image ids, padded prefixes and tokenized targets into one tuple for saving
f = X1,X2_pad,y_tokenized

In [52]:
#Saving Caption file
file = "../raw_data/captions/cap"
# The context manager closes the file even if dump() raises part-way through.
with open(file, 'wb') as outfile:
    dump(f, outfile)

In [53]:
# Path of the pickled caption data written above
file = "../raw_data/captions/cap"

In [54]:
# Read the pickled object back from `file`.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# this project wrote itself.
with open(file, 'rb') as handle:
    b = load(handle)