In [1]:
# Path to the pretrained Word2Vec model (binary word2vec format).
# Can be downloaded at https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
w2v_vectors_path = '../Word2Vec/model/GoogleNews-vectors-negative300.bin'

# Path to a text file where each line contains one stopword to be removed
# from the captions before they are embedded.
stopwords_path = './Data/stopwords.txt'

# Path to the merged annotations, created by the Merge_Annotations notebook.
# The JSON must contain an "images" list whose entries have "file_name" and "captions".
annotations_merged_path = './Data/val_merged.json'

# Path to the SQLite database holding the image exemplar (feature) vectors.
db_path = '../Word2Vec/model/trainMediaDb.v1.sqlite'

# Name of the DB folder from which to take the images. Only rows whose folder
# path contains this substring are used; the path is lowercased before the
# comparison, so this value must be lowercase.
db_folder = "val2014"

# Output filename for the word embeddings array.
# NOTE: written with np.save, which appends ".npy" when the name does not end with it.
w2v_output_path = './Data/w2v_val2014.out'

# Output filename for the exemplar array (same ".npy" suffix note applies).
exemplar_output_path = './Data/exemplar_val2014.out'

In [2]:
import json

# Load the merged annotations (images together with their captions).
with open(annotations_merged_path, "r") as read_file:
    data = json.load(read_file)

all_images = data["images"]

# Map each image's file name to its index in `all_images`.
# Keys are lowercased because the database cell lowercases a file name before
# looking it up in this map; a key containing uppercase characters could
# otherwise never be matched. If two images share a name, the last one wins.
file_name_to_dict = {
    image["file_name"].lower(): index
    for index, image in enumerate(all_images)
}

In [3]:
# Fetch exemplar vectors from database and populate the image arrays
import sqlite3
import struct
import numpy as np

# The folder path is lowercased before matching, so lowercase the filter too.
folder_filter = db_folder.lower()

conn = sqlite3.connect(db_path)
try:
    cur = conn.cursor()
    cur.execute("SELECT Item_FileName as file, Folder_Path as path, ItemEngineExemplar_Exemplar as feature, ItemEngineExemplar_Id as id, ItemEngineExemplar_ItemId as itemId from ItemEngineExemplar, Folder, Item where ItemEngineExemplar_ItemId = Item_Id and Item_ParentFolderId = Folder_Id")

    print("Fetching all feature vectors from database")

    # Iterating the cursor yields one row at a time, like repeated fetchone()
    # calls, and stops by itself once the result set is exhausted.
    for file_name, folder_path, blob, _exemplar_id, _item_id in cur:
        # Keep only images stored under the requested folder.
        if folder_filter not in folder_path.lower():
            continue

        # Rows whose exemplar column is NULL carry no feature vector.
        if blob is None:
            continue

        # The blob is decoded as 1024 native-order 32-bit floats; struct.error
        # is raised if its size does not match. The vector gets its own name
        # so the annotations dict `data` is not overwritten.
        exemplar = np.array(struct.unpack('1024f', blob))
        file_name = file_name.lower()

        if file_name not in file_name_to_dict:
            # Image exists in the database but not in the annotations file.
            print("Error with " + file_name)
        else:
            all_images[file_name_to_dict[file_name]]["exemplar"] = exemplar
finally:
    # Release the database handle even if a row fails to decode.
    conn.close()

Fetching all feature vectors from database
Error with johnny-mnemonic-blu-raycompre.jpg


In [4]:
# Load NLP.
import gensim

# Only load the vectors when the kernel has no global named `model` yet, so
# re-running this cell does not read the vectors file again (presumably because
# loading it is slow).
# NOTE(review): an existing `model` is reused as-is, even if `w2v_vectors_path`
# has changed since it was loaded; restart the kernel or `del model` to reload.
if 'model' not in globals():
    model = gensim.models.KeyedVectors.load_word2vec_format(w2v_vectors_path, binary=True)

# Average the vectors of all words in a sentence.
def avg_sentence_vector(words, model, num_features):
    """Return the mean of the vectors of ``words``.

    Args:
        words: Iterable of tokens; each one is looked up as ``model[token]``,
            so every token must be present in ``model``.
        model: Mapping from a token to its vector.
        num_features: Length of the vectors (and of the returned array).

    Returns:
        Array with ``num_features`` entries holding the element-wise mean of
        the word vectors, or all zeros (float32) when ``words`` is empty.
    """
    total = np.zeros((num_features,), dtype="float32")
    count = 0

    # Vectors are summed one by one in input order; enumerate tracks how many
    # tokens have been consumed so far.
    for count, token in enumerate(words, start=1):
        total = total + model[token]

    # With no tokens there is nothing to divide: return the zero vector as-is.
    if count == 0:
        return total

    return total / count

def remove_stop_words(words, model, stopWords):
    """Keep only the words that have an embedding and are not stopwords.

    Args:
        words: Iterable of tokens.
        model: Word-vector container supporting ``word in model`` (for example
            a gensim ``KeyedVectors`` or a plain dict).
        stopWords: Collection of hashable words to drop.

    Returns:
        list: The surviving tokens in their original order (duplicates kept).
    """
    # Membership is tested on the model itself rather than on `model.vocab`:
    # the `vocab` attribute was removed in gensim 4.0, whereas `in` works on
    # KeyedVectors in both gensim 3.x and 4.x.
    #
    # A set gives constant-time lookups; a list of stopwords would be scanned
    # in full for every single word.
    if isinstance(stopWords, (set, frozenset)):
        stop_set = stopWords
    else:
        stop_set = set(stopWords)

    return [word for word in words if word in model and word not in stop_set]

# Read one stopword per line. Surrounding whitespace is stripped and blank
# lines are skipped: captions are tokenised with str.split(), so a stopword
# carrying padding (or the empty entry left by a trailing newline) could never
# match a token.
with open(stopwords_path) as f:
    stopWords = [line.strip() for line in f if line.strip()]



In [5]:
# Generate data to feed to DSSM: one (caption embedding, image exemplar) pair
# per caption of every image that has an exemplar.

x_w2v = []
x_exemplar = []

print("Generating word embeddings", end='')

# Print roughly one dot per percent of the images. max(1, ...) prevents a
# modulo by zero when there are fewer than 100 images.
progress_step = max(1, len(all_images) // 100)

for position, image in enumerate(all_images):
    if position % progress_step == 0:
        print(".", end='')

    # Images without an exemplar cannot form a pair, so they are skipped.
    if "exemplar" not in image:
        continue

    for caption in image["captions"]:
        # Keep the raw caption string and its token list under separate names.
        caption_words = remove_stop_words(caption.split(), model, stopWords)
        embedding = avg_sentence_vector(caption_words, model, num_features=300)

        # Row i of x_w2v and row i of x_exemplar describe the same pair.
        x_w2v.append(embedding.tolist())
        x_exemplar.append(image["exemplar"])

# Terminate the progress line so the next message starts on its own line.
print()

print("Converting to numpy arrays")
x_w2v = np.array(x_w2v).astype(np.float64)
x_exemplar = np.array(x_exemplar).astype(np.float64)

print("Storing numpy arrays to file")
# np.save appends ".npy" to file names that do not already end with it, so the
# files on disk are "<output path>.npy".
np.save(w2v_output_path, x_w2v)
np.save(exemplar_output_path, x_exemplar)
print("Done")


Generating word embeddings.....................................................................................................Converting to numpy arrays
Storing numpy arrays to file
Done
