# Learning concept representation with Word2Vec

The objective is to improve the DRMM neural architecture for a classic ad-hoc Information Retrieval problem,
using the Robust4 dataset and the concepts built with WordNet: http://wordnetweb.princeton.edu/perl/webwn


These two tutorials helped me achieve it.

https://github.com/llSourcell/word_vectors_game_of_thrones-LIVE/blob/master/Thrones2Vec.ipynb

https://rare-technologies.com/word2vec-tutorial/

In [1]:
from __future__ import absolute_import, division, print_function
import codecs 
import glob 
import logging 
import multiprocessing
import os 
import pprint
import re
import ast
import json
import operator
import collections


In [2]:

import nltk 
import gensim.models.word2vec as w2v 
import sklearn.manifold # dimensionality reduction for visualisation.
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns
from bs4 import BeautifulSoup
from gensim.parsing.preprocessing import preprocess_string,remove_stopwords,strip_numeric, strip_tags, strip_punctuation

In [3]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Loading data

In [4]:
def load_all_path_docs_robust4(folder="/local/karmim/Stage_M1_RI/data/annotated_collection_tagme_score/015"):
    """
        Return the path of every file found under ``folder``.

        The directory tree is walked recursively with ``os.walk``; each entry of
        the returned list is the containing directory joined with the file name.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(folder)
        for filename in filenames
    ]

In [5]:
all_file_name = load_all_path_docs_robust4()
print("There is",len(all_file_name),"files in the collection")

There is 2295 files in the collection


In [6]:
all_file_name[0]

'/local/karmim/Stage_M1_RI/data/annotated_collection_tagme_score/015/FR94/10/FR941003.2'

In [62]:
def load_all_query_annotated_robust4(file = '/local/karmim/Stage_M1_RI/data/topics-title.annotated.csv',pre_process=True,CUSTOM_FILTERS=[lambda x: x.lower(),remove_stopwords],delete_meaning=False):
    """
    Load the annotated queries stored in ``file``.

    Every non-blank line is split on whitespace. The first token is used as the
    query id, the remaining tokens are the query terms, and every token that
    contains the marker ``'$#'`` is considered a concept.

    Parameters
    ----------
    file : path of the annotated topics file (read as UTF-8, undecodable bytes
        are ignored).
    pre_process : when true, the query terms are passed through
        ``preprocess_string`` with ``CUSTOM_FILTERS`` and the concepts are
        lower-cased.
    CUSTOM_FILTERS : filters applied to the query terms when ``pre_process`` is
        true. The default list is never modified by this function.
    delete_meaning : when true, the last five characters of every concept token
        are dropped (presumably a sense/score suffix -- TODO confirm against the
        annotation format).

    Returns
    -------
    (query_an, concept) : two dicts keyed by query id; ``query_an`` holds the
        words and concepts of the query, ``concept`` only its concepts.
    """
    query_an = {} # Dict with words and concept for a query id
    concept = {} # Dict with only the concept for a query id
    lowercase_only = [lambda x: x.lower()]

    # The context manager guarantees the file is closed, even on error.
    with codecs.open(file,'r',encoding='utf-8',errors='ignore') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line: there is no query id to index.
                continue

            query_id = tokens[0]
            concepts = [w for w in tokens if '$#' in w]
            terms = tokens[1:]

            if delete_meaning:
                concepts = [w[:-5] for w in concepts]
                terms = [w[:-5] if '$#' in w else w for w in terms]

            if pre_process:
                # Each query is filtered exactly once, when its line is read.
                terms = preprocess_string(' '.join(terms), CUSTOM_FILTERS)
                concepts = preprocess_string(' '.join(concepts), lowercase_only)

            query_an[query_id] = terms
            concept[query_id] = concepts

    return query_an,concept

In [63]:
q,c = load_all_query_annotated_robust4(delete_meaning=True)

In [64]:
q['301']

['international', '$#!international', 'organized', 'crime', '$#!crime']

In [65]:
c['301']

['$#!international', '$#!crime']

In [66]:
q

{'301': ['international',
  '$#!international',
  'organized',
  'crime',
  '$#!crime'],
 '302': ['poliomyelitis',
  '$#!poliomyelitis',
  'post',
  '$#!post',
  'polio',
  '$#!poliomyelitis'],
 '303': ['hubble',
  'telescope',
  '$#!telescope',
  'achievements',
  '$#!accomplishment'],
 '304': ['endangered', 'species', '$#!species', 'mammals', '$#!mammal'],
 '305': ['dangerous', 'vehicles', '$#!vehicle'],
 '306': ['african', 'civilian', '$#!civilian', 'deaths', '$#!end'],
 '307': ['new', 'hydroelectric', 'projects', '$#!undertaking'],
 '308': ['implant', 'dentistry', '$#!dentistry'],
 '309': ['rap', '$#!rap', 'crime', '$#!crime'],
 '310': ['radio',
  '$#!radio',
  'waves',
  '$#!wave',
  'brain',
  '$#!mind',
  'cancer',
  '$#!cancer'],
 '311': ['industrial', 'espionage', '$#!espionage'],
 '312': ['hydroponics', '$#!hydroponics'],
 '313': ['magnetic',
  'levitation',
  '$#!levitation',
  'maglev',
  '$#!magnetic_levitation'],
 '314': ['marine', '$#!marine', 'vegetation', '$#!vegetatio

In [67]:
c

{'301': ['$#!international', '$#!crime'],
 '302': ['$#!poliomyelitis', '$#!post', '$#!poliomyelitis'],
 '303': ['$#!telescope', '$#!accomplishment'],
 '304': ['$#!species', '$#!mammal'],
 '305': ['$#!vehicle'],
 '306': ['$#!civilian', '$#!end'],
 '307': ['$#!undertaking'],
 '308': ['$#!dentistry'],
 '309': ['$#!rap', '$#!crime'],
 '310': ['$#!radio', '$#!wave', '$#!mind', '$#!cancer'],
 '311': ['$#!espionage'],
 '312': ['$#!hydroponics'],
 '313': ['$#!levitation', '$#!magnetic_levitation'],
 '314': ['$#!marine', '$#!vegetation'],
 '315': ['$#!highway', '$#!accident'],
 '316': ['$#!polygamy', '$#!polyandry', '$#!polygyny'],
 '317': ['$#!facsimile'],
 '318': ['$#!best', '$#!retirement', '$#!state'],
 '319': ['$#!fuel', '$#!source'],
 '320': ['$#!fiber', '$#!eye', '$#!cable'],
 '321': ['$#!womanhood', '$#!parliament'],
 '322': ['$#!international', '$#!artwork', '$#!crime'],
 '323': ['$#!plagiarism'],
 '324': ['$#!argentine', '$#!sexual_intercourse'],
 '325': ['$#!fad', '$#!life_style'],
 

In [13]:
def load_doc(file_doc,all_docs=None,all_concept=None,pre_process=True):
    """
        Parse one annotated collection file and index the documents it contains.

        The file is read as UTF-8 (undecodable bytes are ignored) and parsed with
        BeautifulSoup. For every ``<doc>`` element, the whitespace-separated
        tokens of its text are stored under the stripped ``<docno>`` text; the
        first token is skipped (presumably the document number itself -- TODO
        confirm). Tokens containing ``'$#'`` are also stored as concepts.

        Parameters
        ----------
        file_doc : path of the file to load.
        all_docs : dict updated in place with ``doc id -> list of tokens``.
            A new dict is created when omitted.
        all_concept : dict updated in place with ``doc id -> list of concepts``.
            A new dict is created when omitted.
        pre_process : flag meant to enable preprocessing; it is accepted for
            compatibility but not used by this function.

        Returns
        -------
        (all_docs, all_concept)
    """
    # Fresh containers per call: a mutable default would be shared between calls.
    if all_docs is None:
        all_docs = {}
    if all_concept is None:
        all_concept = {}

    with codecs.open(file_doc,'r',encoding='utf-8',errors='ignore') as f_:
        soup = BeautifulSoup(f_.read(),"html.parser")

    for d_ in soup.find_all('doc'):
        tokens = d_.text.split()[1:]
        doc_id = d_.docno.text.strip()
        all_docs[doc_id] = tokens
        # Plain substring test: also works when the document has no token left.
        all_concept[doc_id] = [w for w in tokens if '$#' in w]
    return all_docs,all_concept

In [14]:
a_doc,a_concept=load_doc('/local/karmim/Stage_M1_RI/data/annotated_collection_tagme_score/015/FR94/10/FR941003.2')

In [15]:
a_concept['FR941003-2-00002']

['$#!Dwelling',
 '$#!Guarantee',
 '$#!Education',
 '$#!Investment',
 '$#!Equal_opportunity',
 '$#!United_States_Agency_for_International_Development',
 '$#!Guarantee',
 '$#!Loan',
 '$#!Indonesia',
 '$#!Education',
 '$#!Loan',
 '$#!Deed',
 '$#!Infrastructure',
 '$#!Welfare',
 '$#!Indonesia',
 '$#!Indonesia',
 '$#!Loan',
 '$#!Loan',
 '$#!Debtor',
 '$#!Loan',
 '$#!Indonesia',
 '$#!Guarantee',
 '$#!Attention_deficit_hyperactivity_disorder',
 '$#!Budget',
 '$#!Timor',
 '$#!Jakarta',
 '$#!Indonesia',
 '$#!Communication',
 '$#!Telephone',
 '$#!Jakarta',
 '$#!Indonesia',
 '$#!Telephone',
 '$#!Fax',
 '$#!Telephone',
 '$#!Creditor',
 '$#!Debtor',
 '$#!Interest',
 '$#!Finance',
 '$#!Dwelling',
 '$#!Guarantee',
 '$#!Jakarta',
 '$#!United_States_Agency_for_International_Development',
 '$#!Medan',
 '$#!Jakarta',
 '$#!Indonesia',
 '$#!Fax',
 '$#!Telecommunication',
 '$#!Telephone',
 '$#!Natural_environment',
 '$#!Telecommunication',
 '$#!Telephone',
 '$#!Interest',
 '$#!Loan',
 '$#!Interest_rate',
 '

In [16]:
def load_all_doc(all_file,doc_json="/local/karmim/Stage_M1_RI/data/object_python/concept_part/anotated_doc.json",concept_doc_json="/local/karmim/Stage_M1_RI/data/object_python/concept_part/all_concept_doc.json"):
    """
    Load every annotated document, using two JSON files as a cache.

    When both ``doc_json`` and ``concept_doc_json`` exist they are simply read
    back. Otherwise every path in ``all_file`` is parsed with ``load_doc`` and
    the two resulting dicts are written to those files.

    Parameters
    ----------
    all_file : iterable of paths of the collection files to parse.
    doc_json : path of the JSON cache holding ``doc id -> tokens``.
    concept_doc_json : path of the JSON cache holding ``doc id -> concepts``.

    Returns
    -------
    (all_doc, all_concept)
    """
    all_doc = {}
    all_concept = {}

    if os.path.isfile(doc_json) and os.path.isfile(concept_doc_json):
        print("Chargement du fichier json : anotated_doc.json ...")
        with open(doc_json) as json_file:
            all_doc = json.load(json_file)
        print("Chargement du fichier json : all_concept_doc.json ...")
        with open(concept_doc_json) as json_file:
            all_concept = json.load(json_file)
        return all_doc,all_concept

    # At least one cache file is missing: rebuild both from the collection.
    for f in all_file:
        print("f -> ",f)
        load_doc(f,all_doc,all_concept)

    with open(doc_json,"w") as json_file:
        json_file.write(json.dumps(all_doc))
    print("document annoté sauvegardé...")

    # The concept dict goes to its own file; the document cache written just
    # above must not be overwritten.
    with open(concept_doc_json,"w") as json_file:
        json_file.write(json.dumps(all_concept))
    print(" concept des documents sauvegardé...")

    return all_doc,all_concept

def pre_process_doc(all_document=None,all_concept=None,CUSTOM_FILTERS=[lambda x: x.lower(),remove_stopwords],json_doc_preprocess="/local/karmim/Stage_M1_RI/data/object_python/concept_part/preprocess_doc.json",json_concept_preprocess="/local/karmim/Stage_M1_RI/data/object_python/concept_part/preprocess_concept.json"):
    """
    Preprocess the documents and their concepts, using two JSON files as a cache.

    When both cache files exist, their content is loaded and returned and the
    ``all_document`` / ``all_concept`` arguments are ignored. Otherwise every
    document is filtered with ``preprocess_string`` and ``CUSTOM_FILTERS``,
    every concept list is lower-cased, and both results are written to the
    cache files.

    Parameters
    ----------
    all_document : dict ``doc id -> list of tokens``; its values are replaced
        in place when the cache has to be built. Defaults to an empty dict.
    all_concept : dict ``doc id -> list of concepts``; it must hold an entry for
        every key of ``all_document``. Its values are replaced in place as well.
        Defaults to an empty dict.
    CUSTOM_FILTERS : filters applied to the document tokens. The default list is
        never modified by this function.
    json_doc_preprocess : path of the JSON cache of preprocessed documents.
    json_concept_preprocess : path of the JSON cache of preprocessed concepts.

    Returns
    -------
    (all_document, all_concept)
    """
    # Fresh containers per call: a mutable default would be shared between calls.
    if all_document is None:
        all_document = {}
    if all_concept is None:
        all_concept = {}

    if os.path.isfile(json_doc_preprocess) and os.path.isfile(json_concept_preprocess):
        print("Chargement du fichier json : preprocess_doc.json ...")
        with open(json_doc_preprocess) as json_file:
            all_document = json.load(json_file)
        print("Chargement du fichier json : preprocess_concept.json ...")
        with open(json_concept_preprocess) as json_file:
            all_concept = json.load(json_file)
        return all_document,all_concept

    lowercase_only = [lambda x : x.lower()] # We only need to lower-case the concepts
    for k in all_document:
        all_document[k] = preprocess_string(' '.join(all_document[k]),CUSTOM_FILTERS)
        all_concept[k] = preprocess_string(' '.join(all_concept[k]),lowercase_only)

    # Context managers make sure each cache file is flushed and closed.
    with open(json_doc_preprocess,"w") as json_file:
        json_file.write(json.dumps(all_document))
    print("document annoté preprocessé sauvegardé...")
    with open(json_concept_preprocess,"w") as json_file:
        json_file.write(json.dumps(all_concept))
    print("concept des documents preprocessé sauvegardé...")

    return all_document,all_concept
CUSTOM_FILTERS = [lambda x: x.lower(),remove_stopwords]   
#ad,ac = pre_process_doc()     

In [17]:
#ad,ac = load_all_doc(all_file_name)
#ad,ac = pre_process_doc(ad,ac)

Chargement du fichier json : anotated_doc.json ...
Chargement du fichier json : all_concept_doc.json ...
Chargement du fichier json : preprocess_doc.json ...
Chargement du fichier json : preprocess_concept.json ...


In [18]:
ad,ac = pre_process_doc() # Relies on the preprocessed JSON cache files: if they are not saved yet, run the previous cell (load_all_doc, then pre_process_doc(ad,ac)) first.

In [27]:
ad['LA052889-0021']

['new',
 'jazz',
 '$#!jazz',
 'oriented',
 'vocal',
 'groups',
 'rare',
 'ensemble',
 'members',
 'studied',
 'ithaca',
 'college',
 'welcomed',
 'step',
 'right',
 'direction',
 'darmon',
 'meader',
 'key',
 'figure',
 'singer',
 '$#!singing',
 'composer',
 '$#!composer',
 'arranger',
 '$#!arrangement',
 'tenor',
 'saxophonist',
 'peter',
 'eldridge',
 'kim',
 'nazarian',
 'sara',
 'krieger',
 'caprice',
 'fox',
 'writers',
 'contributing',
 'lyrics',
 '$#!lyrics',
 'music',
 '$#!music',
 'originals',
 'halfway',
 'profound',
 'trivial',
 'way',
 'blend',
 'splendid',
 'instrumental',
 'backing',
 '$#!backing_vocalist',
 'synthesizer',
 '$#!synthesizer',
 'sequencer',
 '$#!music_sequencer',
 'programming',
 '$#!programming_(music)',
 'aggressive',
 'times',
 'group',
 'little',
 'anxious',
 'commercial',
 'success',
 'caravan',
 '$#!caravan_(1936_song)',
 'interludes',
 '$#!break_(music)',
 'scatting',
 'polysyllabic',
 'vocalese',
 'inclusion',
 'printed',
 'lyrics',
 'invaluable',
 

In [20]:
ac['LA052889-0051']


['$#!spear',
 '$#!elder_(administrative_title)',
 '$#!war',
 '$#!risk',
 '$#!rite_of_passage',
 '$#!tribe',
 '$#!hospitality_industry',
 '$#!today_(u.s._tv_program)',
 '$#!people_(magazine)',
 '$#!calypso_(comics)',
 '$#!seattle',
 '$#!aircraft',
 '$#!ship',
 '$#!mediterranean_sea',
 '$#!seattle',
 '$#!ship',
 '$#!australia',
 '$#!military_base',
 '$#!rabaul',
 '$#!rabaul',
 '$#!airplane',
 '$#!planing_(boat)',
 '$#!crew',
 '$#!shipwreck',
 '$#!sea',
 '$#!underwater_diving',
 '$#!airplane',
 '$#!metre',
 '$#!calypso_music',
 '$#!decompression_(diving)',
 '$#!decade',
 '$#!history',
 '$#!airplane',
 '$#!fuselage',
 '$#!skeleton',
 '$#!shrine',
 '$#!propeller',
 '$#!spirit',
 '$#!vegetation',
 '$#!ocean_current',
 '$#!réunion',
 '$#!beauty',
 '$#!war',
 '$#!peace']

## Pre-processing

In [21]:

def count_concept(ac):
    """
    Flatten the per-document concept lists of ``ac`` and count each concept.

    Returns a numpy array with every concept occurrence, in iteration order,
    and a dict mapping each distinct concept to its number of occurrences.
    """
    flat = [concept for doc_id in ac for concept in ac[doc_id]]
    labels, occurrences = np.unique(flat, return_counts=True)
    return np.array(flat), dict(zip(labels, occurrences))
all_c,dico_concept = count_concept(ac)
print("There is",len(all_c),"concepts in the collection Robust4.")

There is 17008375 concepts in the collection Robust4.


In [22]:
print("There is",len(dico_concept),"unique concepts in the collection Robust4")

There is 242663 unique concepts in the collection Robust4


In [23]:
dico_concept

{'$#!"heroes"_(david_bowie_album)': 4,
 '$#!"heroes"_(david_bowie_song)': 47,
 '$#!"mad"_mike_whiddett': 1,
 '$#!"v"_device': 4,
 '$#!"weird_al"_yankovic': 10,
 '$#!"wild_bill"_hickok': 4,
 '$#!$_(film)': 3,
 '$#!&tv': 2,
 "$#!'abd_allah_ibn_rawahah": 2,
 "$#!'amr_ibn_al-'as": 203,
 "$#!'amran": 2,
 "$#!'amran_governorate": 2,
 "$#!'aoa": 5,
 "$#!'aparima": 1,
 "$#!'aql": 16,
 "$#!'asir_region": 13,
 "$#!'atika_wahbi_al-khazraji": 2,
 "$#!'ndrangheta": 47,
 "$#!'ote'a": 13,
 "$#!'s-hertogenbosch": 2,
 "$#!'umayri": 1,
 '$#!(do_the)_mashed_potatoes': 3,
 '$#!(e,e)-2,4-decadienal': 1,
 '$#!(hydroxyethyl)methacrylate': 1,
 "$#!(i_can't_get_no)_satisfaction": 79,
 '$#!(keep_feeling)_fascination': 1,
 '$#!(methyl-co(iii)_methylamine-specific_corrinoid_protein):coenzyme_m_methyltransferase': 1,
 '$#!(you_drive_me)_crazy': 2,
 '$#!(z)-4-amino-2-butenoic_acid': 1,
 '$#!(ε,_δ)-definition_of_limit': 19,
 "$#!+'justments": 1,
 '$#!-elect': 3,
 '$#!-ene': 1,
 '$#!-gram': 2,
 '$#!-gry': 1,
 '$#!-is

Now I want to see the proportion of concepts that occur fewer than 5 times.

In [24]:
# Number of distinct concepts that occur fewer than 5 times in the collection.
cpt = sum(1 for occurrences in dico_concept.values() if occurrences < 5)

# Reported as a percentage of all distinct concepts, rounded to 2 decimals.
print(round((cpt / len(dico_concept)) * 100, 2), "% of the concept appears less than 5 times.")


64.15 % of the concept appears less than 5 times.


In [25]:
def delete_low_frq(ad,ac,dico_concept,min_count=5):
    """
    Remove the low-frequency concepts from the documents and the concept lists.

    A concept is low-frequency when its count in ``dico_concept`` is strictly
    below ``min_count``. Every occurrence of such a concept is removed from each
    list of ``ad`` and of ``ac``.

    Parameters
    ----------
    ad : dict ``doc id -> list of tokens`` (words and concepts).
    ac : dict ``doc id -> list of concepts``.
    dico_concept : dict ``concept -> number of occurrences``.
    min_count : minimum number of occurrences a concept needs to be kept.

    Returns
    -------
    (ad, ac) : the same dicts, whose lists have been filtered in place.
    """
    # A set gives constant-time membership tests.
    rare = {concept for concept, count in dico_concept.items() if count < min_count}
    if not rare:
        return ad,ac

    # Each list is rebuilt instead of calling remove() while iterating over it,
    # which skips the element following every removal.
    for tokens in ad.values():
        tokens[:] = [m for m in tokens if m not in rare]
    for concepts in ac.values():
        concepts[:] = [m for m in concepts if m not in rare]
    return ad,ac

In most papers, a word should occur at least 5 times for a good representation of it to be learned. In this case, however, the majority of the concepts appear fewer than 5 times, so we will try different methods to address this problem.

## Learning part with W2V

### Try with low frequency concept anyway...

In this part we will try to learn the representations without removing the low-frequency concepts.

In [29]:
"""               PARAMETERS                  """

# Dimensionality of the resulting word vectors.
# More dimensions make training more computationally expensive; the original
# note also expected them to be more accurate and to generalize better.
num_features = 300
# Minimum word count threshold.
min_word_count = 1 # Usually 5, but here we try to learn a vector even for a word that occurs only once.

# Number of threads to run in parallel: one worker per CPU reported by
# multiprocessing (more workers, faster training).
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 30 # Some concepts are really long and we want to catch other concepts in the window,
                  # so we use a large context_size.

# Downsample setting for frequent words.
# NOTE(review): the original note says "0 - 1e-5 is good for this", yet the
# value used here is 1e-3 -- confirm which one is intended.
downsampling = 1e-3

# Seed for the random number generator, to make the results reproducible
# (useful for debugging).
# NOTE(review): training runs with several worker threads, which may prevent
# fully deterministic results even with a fixed seed -- confirm in the gensim docs.
seed = 1



In [30]:

# Create the (still untrained) Word2Vec model from the hyper-parameters of the
# PARAMETERS cell; the vocabulary is built and the training is run afterwards.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4) -- confirm the installed gensim version before upgrading.
allRobustConcept2v = w2v.Word2Vec(
    sg=1,  # training algorithm selector; 1 presumably selects skip-gram -- confirm in the gensim docs
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling,
)

In [31]:
allRobustConcept2v.build_vocab(list(ad.values())) # We need list of sentences (doc) 

In [33]:
print("Word2Vec vocabulary length:", len(allRobustConcept2v.wv.vocab))

Word2Vec vocabulary length: 792756


#### Training 

In [42]:
allRobustConcept2v.train(list(ad.values()),total_examples=allRobustConcept2v.corpus_count,epochs=7)

(989606702, 1001807443)

In [43]:
pathw2v = "/local/karmim/Stage_M1_RI/data/object_python/concept_part"
path_all_concept = os.path.join(pathw2v, "allRobustConcept2v.w2v")
allRobustConcept2v.save(path_all_concept)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [46]:
allRobustConcept2v.wv.most_similar('$#!risk', topn=10)

[('risks', 0.9223514795303345),
 ('risk', 0.8827913999557495),
 ('danger', 0.7749564051628113),
 ('dangers', 0.7668755650520325),
 ('dangerous', 0.7022541761398315),
 ('potentially', 0.6876526474952698),
 ('risky', 0.6718818545341492),
 ('exposure', 0.6668649911880493),
 ('exposed', 0.6455274820327759),
 ('greater', 0.6365047693252563)]

In [47]:
allRobustConcept2v.wv.most_similar('$#!international', topn=10) # Not really nice...

[('$#!andy_reed_(rugby_union)', 0.5546355247497559),
 ('intl', 0.554165244102478),
 ('$#!jimmy_baxter_(basketball)', 0.5525927543640137),
 ('mandain', 0.5486284494400024),
 ('$#!paessler_router_traffic_grapher', 0.548347532749176),
 ('dvlpt', 0.5477840900421143),
 ('$#!lionheart_(comics)', 0.5476211905479431),
 ('japn', 0.5429699420928955),
 ('transcomms', 0.5429620146751404),
 ('trininty', 0.5423702001571655)]

In [48]:
allRobustConcept2v.wv.most_similar('$#!beauty', topn=10)

[('beauty', 0.8481885194778442),
 ('beautiful', 0.6803887486457825),
 ('aesthetic', 0.6532540321350098),
 ('$#!milarepa', 0.608664870262146),
 ('$#!the_touch_(2002_film)', 0.6047132015228271),
 ('discursiveness', 0.6024684906005859),
 ('impressionnisme', 0.6017350554466248),
 ('mosaicists', 0.5994110703468323),
 ('sensual', 0.5985092520713806),
 ('urbanities', 0.5976210832595825)]

In [49]:
allRobustConcept2v.wv.most_similar('trafic', topn=10)

[('$#!renault_trafic', 0.8822504878044128),
 ('$#!xiaogan', 0.7465230226516724),
 ('jettas', 0.7232366800308228),
 ('$#!sipani', 0.7162378430366516),
 ('sipani', 0.7063448429107666),
 ('$#!société_des_usines_chausson', 0.6881568431854248),
 ('ciezarowych', 0.6837400197982788),
 ('batilly', 0.6835135221481323),
 ('heuliez', 0.6811609268188477),
 ('$#!toyota_land_cruiser', 0.67405104637146)]

In [50]:
allRobustConcept2v.wv.most_similar('france', topn=10)

[('$#!france', 0.9830839037895203),
 ('french', 0.7742928266525269),
 ('$#!france_national_football_team', 0.750748872756958),
 ('paris', 0.7189632654190063),
 ('italy', 0.6941613554954529),
 ('$#!paris', 0.6804600954055786),
 ('$#!french_poetry', 0.6797875165939331),
 ('britain', 0.6785295009613037),
 ('$#!germany', 0.6755388379096985),
 ('$#!italy', 0.6731325387954712)]

In [51]:
allRobustConcept2v.wv.most_similar('bush', topn=10)

[('$#!george_w._bush', 0.9117322564125061),
 ('$#!george_h._w._bush', 0.7876810431480408),
 ('administration', 0.7690497636795044),
 ('$#!bush_(band)', 0.7602911591529846),
 ('reagan', 0.7442588806152344),
 ('$#!president_of_the_united_states', 0.7436239719390869),
 ('fitzwater', 0.7412006855010986),
 ('congress', 0.7260851860046387),
 ('sununu', 0.6991183757781982),
 ('$#!united_states_congress', 0.6896796822547913)]

It seems to work pretty well, even with the low-frequency concepts! Next, I'll launch another training run restricted to the most frequent concepts.

In [52]:
dico_concept['$#!george_w._bush']

3872

In [71]:
# Measure the percentage of query tokens (words and concepts) that have no
# vector in the allRobustConcept2v model.
nb_word = 0
cpt = 0
for k in q :
    for w in q[k]:
        nb_word +=1
        # Membership is tested on the keyed vectors (`.wv`), like every other
        # lookup in this notebook; testing the model object itself is deprecated.
        if w not in allRobustConcept2v.wv :
            cpt+=1

print(round((cpt/nb_word)*100,2),"terms of the query are not in the model")

18.14 terms of the query are not in the model


  import sys


### Remove low frequency terms and concepts

In [72]:
"""
num_features = 300
min_word_count = 5 
num_workers = multiprocessing.cpu_count()
context_size = 30 
downsampling = 1e-3
seed = 1

allRobustConcept2v = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling,
    )
    
    allRobustConcept2v.build_vocab(list(ad.values())) # We need list of sentences (doc) 
    print("Building vocabulary, done.")
    allRobustConcept2v.train(list(ad.values()),total_examples=allRobustConcept2v.corpus_count,epochs=7)
    print("Training model, done.")
    allRobustConcept2v.save(path_all_concept)
"""



'allRobustConcept2v = w2v.Word2Vec(\n    sg=1,\n    seed=seed,\n    workers=num_workers,\n    size=num_features,\n    min_count=min_word_count,\n    window=context_size,\n    sample=downsampling,\n)'

### PPMI 

Now we'll use a metric called PPMI (positive pointwise mutual information) to address the problem of the low-frequency occurrences of some of the concepts in our dataset.

https://en.wikipedia.org/wiki/Pointwise_mutual_information

## Some visualisations 

In [73]:
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0) # We'll try to visualize our data in 2D with a T-SNE.

In [76]:
all_word_vectors_matrix = allRobustConcept2v.wv.vectors

In [None]:
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix) 

In [None]:
# One row per vocabulary word: the word and its 2D coordinates, looked up in the
# t-SNE output at the index stored in the word's vocabulary entry.
points = pd.DataFrame(
    [
        (
            word,
            all_word_vectors_matrix_2d[entry.index][0],
            all_word_vectors_matrix_2d[entry.index][1],
        )
        for word, entry in allRobustConcept2v.wv.vocab.items()
    ],
    columns=["word", "x", "y"],
)

