# Learning concept representation with Word2Vec

The objective is to improve the DRMM neural architecture on a classic ad-hoc Information Retrieval task by learning concept representations with Word2Vec.
We use the Robust4 dataset and the concepts built with WordNet: http://wordnetweb.princeton.edu/perl/webwn

In [41]:
from __future__ import absolute_import, division, print_function
import codecs 
import glob 
import logging 
import multiprocessing
import os 
import pprint
import re
import ast
import json
import operator
import collections


In [2]:
from bs4 import BeautifulSoup
import nltk 
import gensim.models.word2vec as w2v 
import sklearn.manifold # dimensionality reduction for visualisation.
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns

In [3]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
def load_all_path_docs_robust4(folder="/local/karmim/Stage_M1_RI/data/annotated_collection_tagme_score/015"):
    """
    Recursively list every file stored under ``folder``.

    Parameters
    ----------
    folder : str
        Root directory to walk.

    Returns
    -------
    list of str
        One path per file, built by joining the walked directory with the
        file name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(folder)
        for filename in filenames
    ]

In [5]:
# Collect the path of every file under the default annotated-collection folder,
# then display how many files were found.
all_file_name = load_all_path_docs_robust4()
len(all_file_name)

2295

In [6]:
all_file_name[0]

'/local/karmim/Stage_M1_RI/data/annotated_collection_tagme_score/015/FR94/10/FR941003.2'

In [7]:
def load_all_query_annotated_robust4(file = '/local/karmim/Stage_M1_RI/data/topics-title.annotated.csv'):
    """
    Load the annotated queries from a whitespace-separated text file.

    Every non-blank line is split on whitespace. The first token is used as
    the query id; tokens containing the marker ``'$#'`` are treated as
    concept annotations.

    Parameters
    ----------
    file : str
        Path of the annotated query file (read as UTF-8, undecodable bytes
        are ignored).

    Returns
    -------
    (dict, dict)
        ``query_an`` maps a query id to a NumPy array with all the tokens
        that follow the id (words and concepts); ``concept`` maps a query id
        to a NumPy array with only the tokens of the line containing ``'$#'``.
        If an id appears on several lines, the last line wins.
    """
    query_an = {} # Dict with words and concept for a query id
    concept = {} # Dict with only the concept for a query id
    # The context manager guarantees the file is closed, even on error.
    with codecs.open(file,'r',encoding='utf-8',errors='ignore') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line: there is no query id, and an empty NumPy array
                # is not a string array, so the string search would fail.
                continue
            arr = np.array(tokens)
            query_id = tokens[0]
            is_concept = np.char.find(arr, '$#') >= 0
            concept[query_id] = arr[is_concept]
            query_an[query_id] = arr[1:]
    return query_an,concept

In [8]:
# q: query id -> all tokens of the query; c: query id -> only the '$#' concept tokens.
q,c = load_all_query_annotated_robust4()

In [9]:
q['301']

array(['International', '$#!international.n.01', 'Organized', 'Crime',
       '$#!crime.n.02'], dtype='<U21')

In [10]:
c

{'301': array(['$#!international.n.01', '$#!crime.n.02'], dtype='<U21'),
 '302': array(['$#!poliomyelitis.n.01', '$#!post.n.09', '$#!poliomyelitis.n.01'],
       dtype='<U21'),
 '303': array(['$#!telescope.n.01', '$#!accomplishment.n.01'], dtype='<U22'),
 '304': array(['$#!species.n.02', '$#!mammal.n.01'], dtype='<U15'),
 '305': array(['$#!vehicle.n.03'], dtype='<U15'),
 '306': array(['$#!civilian.n.01', '$#!end.n.06'], dtype='<U16'),
 '307': array(['$#!undertaking.n.01'], dtype='<U19'),
 '308': array(['$#!dentistry.n.01'], dtype='<U17'),
 '309': array(['$#!rap.n.05', '$#!crime.n.02'], dtype='<U13'),
 '310': array(['$#!radio.n.03', '$#!wave.n.09', '$#!mind.n.01', '$#!cancer.n.03'],
       dtype='<U14'),
 '311': array(['$#!espionage.n.01'], dtype='<U17'),
 '312': array(['$#!hydroponics.n.01'], dtype='<U19'),
 '313': array(['$#!levitation.n.01', '$#!magnetic_levitation.n.01'], dtype='<U27'),
 '314': array(['$#!marine.n.02', '$#!vegetation.n.03'], dtype='<U18'),
 '315': array(['$#!highway

In [11]:
def load_doc(file_doc,all_docs=None,all_concept=None,pre_process=True):
    """
    Load one annotated collection file and index its documents by id.

    The file is parsed with BeautifulSoup; for every ``<doc>`` element the
    whitespace-separated tokens of its text are kept, except the first one
    (presumably the document number, which is read separately from the
    ``<docno>`` tag -- confirm against the collection format).

    Parameters
    ----------
    file_doc : str
        Path of the file to read (UTF-8, undecodable bytes are ignored).
    all_docs : dict, optional
        Dict to update in place with ``doc_id -> list of tokens``.
        A new dict is created when omitted.
    all_concept : dict, optional
        Dict to update in place with ``doc_id -> list of tokens containing
        '$#'``. A new dict is created when omitted.
    pre_process : bool
        Meant to toggle pre-processing; currently unused by this function.

    Returns
    -------
    (dict, dict)
        ``all_docs`` and ``all_concept`` after the update.
    """
    # Create the dicts per call: mutable default arguments are shared between
    # calls, which would silently accumulate documents from earlier files.
    if all_docs is None:
        all_docs = {}
    if all_concept is None:
        all_concept = {}

    with codecs.open(file_doc,'r',encoding='utf-8',errors='ignore') as f_:
        soup = BeautifulSoup(f_.read(),"html.parser")

    for d_ in soup.find_all('doc'):
        tokens = d_.text.split()[1:]
        doc_id = d_.docno.text.strip()
        all_docs[doc_id] = tokens
        # Plain substring test: also works for documents without any token,
        # where a NumPy string search on an empty array would fail.
        all_concept[doc_id] = [tok for tok in tokens if '$#' in tok]
    return all_docs,all_concept

In [12]:
# Try the loader on a single file of the collection.
a_doc,a_concept=load_doc('/local/karmim/Stage_M1_RI/data/annotated_collection_tagme_score/015/FR94/10/FR941003.2')

In [13]:
a_concept['FR941003-2-00002']

['$#!Dwelling',
 '$#!Guarantee',
 '$#!Education',
 '$#!Investment',
 '$#!Equal_opportunity',
 '$#!United_States_Agency_for_International_Development',
 '$#!Guarantee',
 '$#!Loan',
 '$#!Indonesia',
 '$#!Education',
 '$#!Loan',
 '$#!Deed',
 '$#!Infrastructure',
 '$#!Welfare',
 '$#!Indonesia',
 '$#!Indonesia',
 '$#!Loan',
 '$#!Loan',
 '$#!Debtor',
 '$#!Loan',
 '$#!Indonesia',
 '$#!Guarantee',
 '$#!Attention_deficit_hyperactivity_disorder',
 '$#!Budget',
 '$#!Timor',
 '$#!Jakarta',
 '$#!Indonesia',
 '$#!Communication',
 '$#!Telephone',
 '$#!Jakarta',
 '$#!Indonesia',
 '$#!Telephone',
 '$#!Fax',
 '$#!Telephone',
 '$#!Creditor',
 '$#!Debtor',
 '$#!Interest',
 '$#!Finance',
 '$#!Dwelling',
 '$#!Guarantee',
 '$#!Jakarta',
 '$#!United_States_Agency_for_International_Development',
 '$#!Medan',
 '$#!Jakarta',
 '$#!Indonesia',
 '$#!Fax',
 '$#!Telecommunication',
 '$#!Telephone',
 '$#!Natural_environment',
 '$#!Telecommunication',
 '$#!Telephone',
 '$#!Interest',
 '$#!Loan',
 '$#!Interest_rate',
 '

In [14]:
def load_all_doc(all_file,doc_json="/local/karmim/Stage_M1_RI/data/object_python/concept_part/anotated_doc.json",concept_doc_json="/local/karmim/Stage_M1_RI/data/object_python/concept_part/all_concept_doc.json"):
    """
    Load every annotated document, using two JSON files as a cache.

    When both ``doc_json`` and ``concept_doc_json`` exist they are read and
    returned directly. Otherwise every file of ``all_file`` is parsed with
    ``load_doc`` and the two resulting dicts are written to those JSON files.

    Parameters
    ----------
    all_file : iterable of str
        Paths of the collection files to parse on a cache miss.
    doc_json : str
        Path of the JSON cache holding ``doc_id -> tokens``.
    concept_doc_json : str
        Path of the JSON cache holding ``doc_id -> concept tokens``.

    Returns
    -------
    (dict, dict)
        ``all_doc`` and ``all_concept``.
    """
    all_doc = {}
    all_concept = {}

    if os.path.isfile(doc_json) and os.path.isfile(concept_doc_json):
        print("Chargement du fichier json : anotated_doc.json ...")
        with open(doc_json) as json_file:
            all_doc = json.load(json_file)
        print("Chargement du fichier json : all_concept_doc.json ...")
        with open(concept_doc_json) as json_file:
            all_concept = json.load(json_file)
        return all_doc,all_concept

    # Cache miss: parse the whole collection, accumulating into the two dicts.
    for f in all_file:
        print("f -> ",f)
        load_doc(f,all_doc,all_concept)

    with open(doc_json,"w") as json_file:
        json.dump(all_doc, json_file)
    print("document annoté sauvegardé...")

    # Write the concept dict (not the path string) to its own file, so the
    # documents cache written just above is not overwritten.
    with open(concept_doc_json,"w") as json_file:
        json.dump(all_concept, json_file)
    print(" concept des documents sauvegardé...")

    return all_doc,all_concept

In [15]:
# ad: doc id -> tokens; ac: doc id -> concept tokens (read from the JSON cache when both files exist).
ad,ac = load_all_doc(all_file_name)

Chargement du fichier json : anotated_doc.json ...
Chargement du fichier json : all_concept_doc.json ...


In [18]:
ad.keys()

dict_keys(['FR941003-2-00001', 'FR941003-2-00002', 'FR941003-2-00003', 'FR941003-2-00004', 'FR941003-2-00005', 'FR941003-2-00006', 'FR941003-2-00007', 'FR941003-2-00008', 'FR941003-2-00009', 'FR941003-2-00010', 'FR941003-2-00011', 'FR941003-2-00012', 'FR941003-2-00013', 'FR941003-2-00014', 'FR941003-2-00015', 'FR941003-2-00016', 'FR941003-2-00017', 'FR941003-2-00018', 'FR941003-2-00019', 'FR941003-2-00020', 'FR941003-2-00021', 'FR941003-2-00022', 'FR941003-2-00023', 'FR941003-2-00024', 'FR941003-2-00025', 'FR941003-2-00026', 'FR941003-2-00027', 'FR941003-2-00028', 'FR941003-2-00029', 'FR941003-2-00030', 'FR941003-2-00031', 'FR941003-2-00032', 'FR941003-2-00033', 'FR941003-2-00034', 'FR941003-2-00035', 'FR941003-2-00036', 'FR941003-2-00037', 'FR941003-2-00038', 'FR941003-2-00039', 'FR941003-2-00040', 'FR941003-2-00041', 'FR941003-2-00042', 'FR941003-2-00043', 'FR941003-2-00044', 'FR941003-2-00045', 'FR941003-2-00046', 'FR941003-2-00047', 'FR941003-2-00048', 'FR941003-2-00049', 'FR941003

In [31]:
#ad['FBIS4-54174']
ac['LA052889-0051']

['$#!Spear',
 '$#!Elder_(administrative_title)',
 '$#!War',
 '$#!Risk',
 '$#!Rite_of_passage',
 '$#!Tribe',
 '$#!Hospitality_industry',
 '$#!Today_(U.S._TV_program)',
 '$#!People_(magazine)',
 '$#!Calypso_(comics)',
 '$#!Seattle',
 '$#!Aircraft',
 '$#!Ship',
 '$#!Mediterranean_Sea',
 '$#!Seattle',
 '$#!Ship',
 '$#!Australia',
 '$#!Military_base',
 '$#!Rabaul',
 '$#!Rabaul',
 '$#!Airplane',
 '$#!Planing_(boat)',
 '$#!Crew',
 '$#!Shipwreck',
 '$#!Sea',
 '$#!Underwater_diving',
 '$#!Airplane',
 '$#!Metre',
 '$#!Calypso_music',
 '$#!Decompression_(diving)',
 '$#!Decade',
 '$#!History',
 '$#!Airplane',
 '$#!Fuselage',
 '$#!Skeleton',
 '$#!Shrine',
 '$#!Propeller',
 '$#!Spirit',
 '$#!Vegetation',
 '$#!Ocean_current',
 '$#!Réunion',
 '$#!Beauty',
 '$#!War',
 '$#!Peace']

In [35]:
def count_concept(ac):
    """
    Flatten the per-document concept lists into a single NumPy array.

    Parameters
    ----------
    ac : dict
        Maps a document id to an iterable of concept tokens.

    Returns
    -------
    numpy.ndarray
        Every concept occurrence of every document, in iteration order of
        ``ac`` (duplicates are kept, so the length is the total number of
        occurrences).
    """
    occurrences = [concept for doc_id in ac for concept in ac[doc_id]]
    return np.array(occurrences)
# all_c holds one entry per concept occurrence (duplicates included),
# so its length is the total number of occurrences, not of distinct concepts.
all_c = count_concept(ac)
print("There is",len(all_c),"concepts in the collection Robust4.")

There is 17008375 concepts in the collection Robust4.


In [45]:
# Frequency of every distinct concept in the collection.
# Use the explicitly imported `np` alias: the bare name `numpy` is never
# imported by this notebook and is only in scope through `%pylab inline`.
unique, counts = np.unique(all_c, return_counts=True)
# dico_concept: concept -> number of occurrences.
dico_concept = dict(zip(unique, counts))


In [50]:
print("There is",len(unique),"unique concepts in the collection Robust4")

There is 242761 unique concepts in the collection Robust4


242761