In [None]:
import gc
import os
import pdb
import pickle
import time

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from scipy.sparse import csr_matrix, lil_matrix

In [None]:
with open("../data/features/train_features.pkl", "rb") as handle:
    train_features = pickle.load(handle)

In [None]:
with open("../data/features/valid_features.pkl", "rb") as handle:
    valid_features = pickle.load(handle)

In [None]:
train_captions = pd.read_csv("../data/split_lists/train_ids.csv", dtype = str)
valid_captions = pd.read_csv("../data/split_lists/valid_ids.csv", dtype = str)

In [None]:
valid_captions.head()

In [None]:
print(valid_captions.shape[0] == len(valid_features))
print(train_captions.shape[0] == len(train_features))

In [None]:
train_captions.caption = "startseq " + train_captions.caption + " endseq"
valid_captions.caption = "startseq " + valid_captions.caption + " endseq"

In [None]:
train_captions.head()

In [None]:
tokenizer = Tokenizer()

In [None]:
all_captions = np.concatenate([train_captions.caption.values,valid_captions.caption.values])

In [None]:
tokenizer.fit_on_texts(all_captions.astype(str))

In [None]:
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

In [None]:
mkdir ../data/tokenizer

In [None]:
with open("../data/tokenizer/tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle)

In [None]:
def encode_and_pad(caption, sequence_length = 15):
    """Convert one caption string into a fixed-length array of token ids.

    The caption is encoded with the module-level ``tokenizer`` and then
    padded (at the end) or truncated (at the end) to ``sequence_length``.

    Parameters
    ----------
    caption : str
        Text to encode.
    sequence_length : int
        Length of the returned sequence.

    Returns
    -------
    The single padded row produced by ``pad_sequences``.
    """
    # texts_to_sequences works on a batch, so wrap the caption in a list.
    token_id_batch = tokenizer.texts_to_sequences([caption])
    fixed_length_batch = pad_sequences(
        token_id_batch,
        maxlen=sequence_length,
        padding="post",
        truncating="post",
    )
    # Unwrap the one-row batch.
    return fixed_length_batch[0]

In [None]:
def encode(caption):
    """Return the token ids for ``caption`` as produced by the module-level ``tokenizer``.

    No padding or truncation is applied; the result is the first (only) entry
    of ``tokenizer.texts_to_sequences([caption])``.
    """
    token_id_batch = tokenizer.texts_to_sequences([caption])
    return token_id_batch[0]

In [None]:
def consolidate_dataset(features_dict, captions_df, sequence_length = 15):
    """Expand (photo, caption) rows into next-token prediction samples.

    Every row of ``captions_df`` whose ``photo_id`` is a key of
    ``features_dict`` is tokenized once. Each proper prefix of the resulting
    token sequence becomes one sample: the photo's features, the prefix padded
    to ``sequence_length``, and the token that follows the prefix as target.

    Parameters
    ----------
    features_dict : mapping
        Maps photo_id to its stored features; element ``[0]`` of each value is
        used as the photo's feature entry.
    captions_df : pandas.DataFrame
        Needs ``photo_id`` and ``caption`` columns. Rows whose photo_id is not
        in ``features_dict`` are skipped.
    sequence_length : int
        Length every input prefix is padded / truncated to.

    Returns
    -------
    tuple ``(X_photos, X_captions, y)`` of three index-aligned lists:
    the feature entry, the padded prefix, and the target as a one-element
    list ``[token_id]`` (the same shape ``encode`` yields for a single word).
    """
    X_photos, X_captions, y = [], [], []
    processed = 0
    prevtime = time.time()
    # Iterate the rows directly. Looking the caption up again with a boolean
    # filter over the whole frame made the loop quadratic and always returned
    # the first caption when a photo_id occurred more than once.
    for photo_id, caption in zip(captions_df["photo_id"], captions_df["caption"]):
        if photo_id not in features_dict:
            continue
        # Progress report every 1000 processed captions.
        if processed % 1000 == 0:
            print ("reached %d in %f sec" % (processed, time.time() - prevtime))
            prevtime = time.time()
        processed += 1

        current_feature = features_dict[photo_id][0]
        # Tokenize the whole caption once and slice token ids, instead of
        # re-encoding whitespace-split words: a "word" that the tokenizer drops
        # or splits into several tokens would otherwise yield an empty or
        # multi-id target and make `y` ragged.
        token_ids = encode(str(caption))
        for i in range(1, len(token_ids)):
            # Same padding options as encode_and_pad.
            in_seq = pad_sequences([token_ids[:i]],
                                   maxlen = sequence_length,
                                   padding = "post",
                                   truncating = "post")[0]
            X_photos.append(current_feature)
            X_captions.append(in_seq)
            y.append([token_ids[i]])
    return(X_photos, X_captions, y)

In [None]:
X_valid_photos, X_valid_captions, y_valid = consolidate_dataset(valid_features, valid_captions, sequence_length=15)

In [None]:
X_train_photos, X_train_captions, y_train = consolidate_dataset(train_features, train_captions, sequence_length=15)

In [None]:
X_valid_photos = np.array(X_valid_photos, dtype = np.float32)

In [None]:
X_valid_captions = np.array(X_valid_captions, dtype = np.int16)

In [None]:
y_valid = np.array(y_valid, dtype = np.int16)

In [None]:
mkdir ../data/preprocessed

In [None]:
def save_npy(path, arr):
    """Write ``arr`` to ``path`` in NumPy ``.npy`` format.

    Parameters
    ----------
    path : str
        Destination file. The data is written to exactly this path, whether
        or not it ends in ``.npy``.
    arr : array_like
        Data to save.
    """
    with open(path, "wb") as handle:
        # Save through the open handle rather than the path. Given a string
        # filename, np.save appends ".npy" when it is missing, which used to
        # leave an empty file at `path` and the data in `path + ".npy"`.
        np.save(handle, arr)

In [None]:
save_npy("../data/preprocessed/X_valid_captions.npy", X_valid_captions)

In [None]:
X_train_photos = np.array(X_train_photos, dtype = np.float32)

In [None]:
X_train_captions= np.array(X_train_captions, dtype = np.int16)

In [None]:
y_train = np.array(y_train, np.int16)

In [None]:
save_npy("../data/preprocessed/X_train_captions.npy", X_train_captions)

In [None]:
embedding_model = KeyedVectors.load_word2vec_format('~/Desktop/embeddings/word2vec/GoogleNews-vectors-negative300.bin',
                                                   binary = True)

In [None]:
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    if word in embedding_model:
        embedding_matrix[i] = embedding_model[word]

In [None]:
embedding_model["this"]

In [None]:
mkdir ../data/embedding_matrix

In [None]:
save_npy("../data/embedding_matrix/embedding_matrix.npy", embedding_matrix)