In [1]:
import os, sys
import gensim
import logging
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt

In [2]:
# Root folder of every dataset used in this notebook. It is taken from the
# DATA_DIR environment variable; a missing variable raises KeyError here.
DATA_DIR = os.environ["DATA_DIR"]

# VECTORS_FILE: binary word2vec embeddings loaded by gensim below.
# WORDS_FILE:   question file passed to the model's accuracy evaluation.
VECTORS_FILE, WORDS_FILE = (
    os.path.join(DATA_DIR, filename)
    for filename in ("GoogleNews-vectors-negative300.bin", 'questions-words.txt')
)

In [3]:
# Configure logging first so that the loading and evaluation steps below
# report their progress at INFO level.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Folder (under DATA_DIR) reserved for distance results.
DISTANCES_FOLDER = os.path.join(DATA_DIR, 'distances')

# Load the pre-trained embeddings stored in binary word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(VECTORS_FILE, binary=True)

# Evaluate the embeddings on the question file; per-section scores are logged.
# NOTE(review): newer gensim releases appear to replace `accuracy` with
# `evaluate_word_analogies` — confirm against the installed gensim version.
model.accuracy(WORDS_FILE)

2018-07-27 21:32:41,062 : INFO : loading projection weights from /mnt/Dados/datasets/GoogleNews-vectors-negative300.bin
2018-07-27 21:33:26,973 : INFO : loaded (3000000, 300) matrix from /mnt/Dados/datasets/GoogleNews-vectors-negative300.bin
2018-07-27 21:33:26,989 : INFO : precomputing L2-norms of word weight vectors
2018-07-27 21:33:32,843 : INFO : capital-common-countries: 83.6% (423/506)
2018-07-27 21:33:37,332 : INFO : capital-world: 82.7% (1144/1383)
2018-07-27 21:33:37,757 : INFO : currency: 39.8% (51/128)
2018-07-27 21:33:46,232 : INFO : city-in-state: 74.6% (1739/2330)
2018-07-27 21:33:47,496 : INFO : family: 90.1% (308/342)
2018-07-27 21:33:50,188 : INFO : gram1-adjective-to-adverb: 32.3% (262/812)
2018-07-27 21:33:51,420 : INFO : gram2-opposite: 50.5% (192/380)
2018-07-27 21:33:55,663 : INFO : gram3-comparative: 91.9% (1224/1332)
2018-07-27 21:33:57,922 : INFO : gram4-superlative: 88.0% (618/702)
2018-07-27 21:34:00,715 : INFO : gram5-present-participle: 79.8% (694/870)
2018

In [61]:
labels = pd.read_csv(os.path.join(DATA_DIR, "label_all.csv"), names=['imagenet_img_id', 'label'])
nouns = pd.read_csv(os.path.join(DATA_DIR, "vqa_nouns.csv"), names=['vqa_img_id', 'imagenet_img_id', 'noun'])


def _embed_known_words(frame, column):
    """Fetch the embedding of every row of `frame` whose word is in `model`.

    The word is read from `column`, converted to `str` and lower-cased before
    the vocabulary lookup.

    Returns a pair `(kept_rows, vectors)`:
      * kept_rows -- `frame.values` restricted to the rows that have a vector;
      * vectors   -- list with one embedding per kept row, in the same order.

    Rows whose word is missing from the vocabulary are dropped from BOTH
    results, so position k of `kept_rows` always describes `vectors[k]`.
    """
    words = [str(word).lower() for word in frame[column]]
    in_vocab = np.array([word in model for word in words], dtype=bool)
    vectors = [model[word] for word, known in zip(words, in_vocab) if known]
    return frame.values[in_vocab], vectors


# The metadata arrays are filtered with the same mask as the vectors.
# Taking `nouns.values` / `labels.values` unfiltered would shift every row
# after the first out-of-vocabulary word and pair distances with the wrong
# noun/label.
nouns_arr, nouns_vectors = _embed_known_words(nouns, "noun")
labels_arr, labels_vectors = _embed_known_words(labels, "label")

# distances[i, j] is the Euclidean distance between the embedding of
# nouns_arr[i] and the embedding of labels_arr[j].
distances = euclidean_distances(nouns_vectors, labels_vectors)
tam_i, tam_j = distances.shape

# Guard the invariant that later cells rely on when indexing by (i, j).
assert (len(nouns_arr), len(labels_arr)) == (tam_i, tam_j), \
    "metadata rows are not aligned with the distance matrix"

# Free the large intermediates; only the arrays and the matrix are needed now.
del nouns
del labels
del nouns_vectors
del labels_vectors

distances.shape

(80828, 2352)

In [None]:
# Build the long-format table with one row per (noun, label) pair.
#
# The columns are assembled with vectorised NumPy calls instead of appending
# one Python list per pair: with the matrix shape shown above
# (80828 x 2352, about 190 million pairs) a nested Python loop is
# impractically slow and memory hungry.
#
# Row order is the same as the nested loop (noun index outer, label index
# inner):
#   * np.repeat -> each noun row is repeated tam_j times in a row;
#   * np.tile   -> the whole label sequence is repeated tam_i times;
#   * ravel()   -> the distance matrix is flattened row by row.
# Only the first tam_i / tam_j metadata rows are used, mirroring the index
# ranges of the distance matrix.
df = pd.DataFrame(
    {
        'vqa_image': np.repeat(nouns_arr[:tam_i, 0], tam_j),
        'question_id': np.repeat(nouns_arr[:tam_i, 1], tam_j),
        'noun': np.repeat(nouns_arr[:tam_i, 2], tam_j),
        'imagenet_img': np.tile(labels_arr[:tam_j, 0], tam_i),
        'label': np.tile(labels_arr[:tam_j, 1], tam_i),
        # Keeps the dtype of `distances`.
        'distance': distances.ravel(),
    },
    columns=['vqa_image', 'question_id', 'noun', 'imagenet_img', 'label', 'distance'],
).infer_objects()  # let pandas pick concrete dtypes for the object-typed columns
df.head(5)