In [1]:
# ignore this, only useful when testing it on Colab

#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.de.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fr.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.en.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.it.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fi.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.pl.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.sl.vec

In [2]:
import wikipedia
import pandas as pd
import numpy as np
from sklearn import svm
import nltk, string, pickle
from urllib.request import urlopen
from bs4 import BeautifulSoup
from collections import Counter
from gensim.models import KeyedVectors
from sklearn.model_selection import KFold # import KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity as cs
from IPython.display import Markdown, display
def printmd(string):
    """Render a Markdown-formatted string as rich output in the notebook."""
    rendered = Markdown(string)
    display(rendered)

# characters removed from a text before it is tokenised: all ASCII punctuation,
# with the hyphen listed explicitly as well
exclude = set(string.punctuation) | {"-"}

# this function converts a string of text into a cross-lingual document embedding that captures its semantics
def text_embedding(text,model):
    """Average the word vectors of `text` into a single document vector.

    The text is lower-cased, stripped of every character in `exclude`,
    tokenised with nltk and reduced to purely alphabetic tokens. Each token
    is looked up in `model`; tokens that raise KeyError are ignored.

    Returns a list of floats (the per-dimension mean of the vectors found),
    or None when no token could be looked up.
    """
    cleaned = ''.join(filter(lambda ch: ch not in exclude, text.lower()))
    tokens = [tok for tok in nltk.word_tokenize(cleaned) if tok.isalpha()]

    vectors = []
    for tok in tokens:
        try:
            vectors.append(model[tok])
        except KeyError:
            # out-of-vocabulary token: it simply does not contribute to the mean
            pass

    if not vectors:
        return None

    # zip(*vectors) walks the vectors dimension by dimension
    return [float(sum(dim))/len(dim) for dim in zip(*vectors)]

In [3]:
# we load the dataset: the unpickled object is bound to `df`; the cells below call
# df.head() and df.iterrows() on it
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so only
# open a dataset.pickle that comes from a trusted source.

with open('dataset.pickle', 'rb') as f:
    df = pickle.load(f)    

In [4]:
# preview the first rows; the columns read further down are langMaterial, unitTitle,
# titleProper, scopeContent and filename
df.head()

Unnamed: 0,id,langMaterial,unitTitle,titleProper,scopeContent,topic,filename
0,C122304196,fr,Documents généraux.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Historique par monsieur Hervé Arnoux (1993). C...,[economics],economics.json
1,C122304197,fr,Réparations et représentations pour les automo...,"119 J - Arnoux, fabrique de tracteurs, Miramas...","Garage Arnoux : vue de la façade sud, le long ...",[economics],economics.json
2,C122304198,fr,Motoculteurs Arnoux.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Brevet d'invention pour un petit tracteur moto...,[economics],economics.json
3,C122304200,fr,Tracteurs Arnoux et leur outillage.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Tracteurs type VM 10 et VM 15 : prospectus et ...,[economics],economics.json
4,C122304201,fr,Documentation générale.,"119 J - Arnoux, fabrique de tracteurs, Miramas...","L'Officiel des marques, 1er trimestre 1957, 3e...",[economics],economics.json


In [5]:
# for each language under study, download its cross-lingual (MUSE) embeddings from https://github.com/facebookresearch/MUSE and place the .vec file in the word-embs/ folder

def _load_muse_vectors(lang_code):
    """Load the word2vec-format vectors of one language from the word-embs/ folder."""
    return KeyedVectors.load_word2vec_format('word-embs/wiki.multi.' + lang_code + '.vec')

# one embedding model per language, loaded in the same order as before
de_model = _load_muse_vectors('de')
fr_model = _load_muse_vectors('fr')
en_model = _load_muse_vectors('en')
it_model = _load_muse_vectors('it')
fi_model = _load_muse_vectors('fi')
pl_model = _load_muse_vectors('pl')
sl_model = _load_muse_vectors('sl')

In [6]:
# we just map the language with the word embeddings model

# raw language value found in the dataset -> two-letter language code
language_uniform = {
    "fr": "fr",
    "en": "en",
    "english": "en",
    "de": "de",
    "it": "it",
    "fi": "fi",
    "pl": "pl",
    "sl": "sl",
    "German": "de",
    "English": "en",
    "Finnish": "fi",
    "French": "fr",
    "Italian": "it",
    "Italiano": "it",
}

# two-letter language code -> loaded embedding model
_models_by_code = {
    "fr": fr_model,
    "en": en_model,
    "de": de_model,
    "it": it_model,
    "fi": fi_model,
    "pl": pl_model,
    "sl": sl_model,
}

# every raw language value (codes and spelled-out names alike) -> embedding model
model_dict = {raw: _models_by_code[code] for raw, code in language_uniform.items()}


In [7]:
# for each document we create a document embedding and collect its topic label

# Five parallel lists, one entry per document that could be embedded:
# embedding, topic label, title, normalised language code and concatenated text.
embs = []
labels = []
doc_names = []
selected_langs = []
texts = []

for index, row in df.iterrows():
    # Normalise the raw language value. Only an unknown language is a reason to skip
    # the row: .get() returns None for it. A missing column still raises KeyError
    # instead of being silently swallowed (which used to end in an unexplained "0 0").
    lang = language_uniform.get(row["langMaterial"])
    if lang is None or lang not in model_dict:
        continue

    # the topic label is the name of the source file without its extension
    label = row["filename"].replace(".json","")
    title = row["titleProper"]

    model = model_dict[lang]
    text = row["unitTitle"] +" "+ row["titleProper"]+" "+ row["scopeContent"]
    emb = text_embedding(text,model)

    # text_embedding returns None when no word of the text is in the vocabulary
    if emb is not None:
        embs.append(emb)
        labels.append(label)
        selected_langs.append(lang)
        doc_names.append(title)
        texts.append(text)

print (len(embs),len(labels))

122824 122824


In [8]:
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cossim(v1,v2):
    """Return the cosine similarity between two vectors as a single score.

    Both inputs are turned into one-row 2-D arrays, the shape the pairwise
    `cs` function works on; the only cell of the resulting matrix is returned.
    """
    row_a, row_b = (np.array(vec).reshape(1, -1) for vec in (v1, v2))
    similarity_matrix = cs(row_a, row_b)
    return similarity_matrix[0][0]

# module-level vectorizer used by rank_by_freq, which calls fit_transform on it
# (i.e. re-fits it) for every (query, doc) pair
tfidf_vectorizer=TfidfVectorizer()

def rank_by_freq(query,doc,allow_partial_match):
    """Score how strongly `doc` mentions `query`.

    query               -- the entity name to look for
    doc                 -- the document text
    allow_partial_match -- True: cosine similarity between the TF-IDF vectors of
                           query and doc (a doc sharing only some of the query
                           words still scores); False: number of case-insensitive
                           occurrences of the whole query string, divided by the
                           number of space-separated chunks of the doc

    Returns a number >= 0; 0 means "no match". An empty or whitespace-only query
    always scores 0.
    """
    # An empty query carries no information. Without this guard the exact mode would
    # give every document a positive score, because str.count("") matches at every
    # position of the string.
    if not query.strip():
        return 0.0

    if allow_partial_match:
        try:
            tfidf = tfidf_vectorizer.fit_transform([query,doc])
        except ValueError:
            # scikit-learn raises ValueError ("empty vocabulary") when neither text
            # yields a single token; treat that as "no match" rather than aborting
            # the whole ranking
            return 0.0
        # named `score` so the module-level `cs` alias is not shadowed
        score = cosine_similarity(tfidf)[0][1]
        return score
    else:
        n = doc.lower().count(query.lower())
        n = n/len(doc.split(" "))
        return n

def entity_processing(entity):
    """Clean an entity name so it can be used as a search query.

    Everything from the first "(" onwards is dropped, every ASCII punctuation
    character is deleted (not replaced by a space) and surrounding whitespace
    is trimmed.
    """
    head = entity.partition("(")[0]
    without_punctuation = "".join(ch for ch in head if ch not in string.punctuation)
    return without_punctuation.strip()


In [14]:
# parameters of the search run by the cells below

# the query: embedded with text_embedding for a "concept" search,
# looked up with wikipedia.search for an "entity" search
description = "Napoleone"
# language code of the query: selects the embedding model (model_dict) and the Wikipedia edition
lang = "it"
# the ranking is truncated to this many documents
how_many_results = 100

#work in progress
query_type = "entity" # either "concept" or "entity"

# only when searching for entities: forwarded to rank_by_freq, where True selects the
# TF-IDF cosine score and False the case-insensitive count of the whole query string
allow_partial_match = True

In [15]:
from urllib.parse import unquote,quote

# Start from an empty result: a query without any match then leaves the cells below in a
# well-defined state instead of failing on undefined names or silently reusing the
# ranking of a previous run.
ranking = []
rel_topics = set()
all_res_embs = []

if query_type == "concept":
    # Concept search: rank every document by the cosine similarity between its
    # embedding and the embedding of the query.
    model = model_dict[lang]
    emb = text_embedding(description,model)

    if emb is None:
        # text_embedding returns None when no word of the query is in the vocabulary
        print ("No word of",description,"is covered by the",lang,"embeddings.")
    else:
        ranking = [[doc_names[x],labels[x],texts[x],embs[x],cossim(embs[x],emb),selected_langs[x]] for x in range(len(embs))]

elif query_type == "entity":
    # Entity search: find the entity on Wikipedia, collect its name in the other
    # languages from the interlanguage links, then score each document against the
    # name in the document's own language.
    wikipedia.set_lang(lang)
    search_hits = wikipedia.search(description)

    if not search_hits:
        # Only "no search result" is reported this way; other errors are no longer
        # hidden behind a blanket `except IndexError`.
        print ("The entity",description,"is not currently present in our Knowledge Base.")
    else:
        res = search_hits[0]
        wiki_url = "https://"+lang+".wikipedia.org/wiki/"+quote(res.replace(" ","_"))

        # the context manager closes the HTTP response once the page has been read
        with urlopen(wiki_url) as resource:
            content = resource.read()

        # name the parser explicitly: without it BeautifulSoup picks whichever parser
        # happens to be installed and emits a warning
        soup = BeautifulSoup(content, "html.parser")

        # language code -> page title, taken from the last URL segment of each interlanguage link
        translations = {el.get('lang'): unquote(el.get('href')).split("/")[-1].replace("_"," ") for el in soup.select('li.interlanguage-link > a')}

        # in the query language, search for the text the user typed rather than the page title
        translations[lang] = description

        # keep only languages that occur in the collection; a set makes each membership
        # test O(1) instead of a scan over the per-document language list
        covered_langs = set(selected_langs)
        translations = {x:entity_processing(y) for x,y in translations.items() if x in covered_langs}

        print (translations)

        # Same iteration order as before (language by language, then document order),
        # so documents with equal scores keep their relative position after the sort.
        for query_lang, query in translations.items():
            for id_ in range(len(selected_langs)):
                if selected_langs[id_] != query_lang:
                    continue
                score = rank_by_freq(query,texts[id_],allow_partial_match)
                if score > 0.0:
                    ranking.append([doc_names[id_],labels[id_],texts[id_],embs[id_],score,selected_langs[id_]])

        print ("Documents mentioning the entity '",description,"' :", len(ranking),"among",len(labels),".")

# Post-processing shared by both query types. Each ranking entry is
# [title, topic label, text, embedding, score, language].
if ranking:
    ranking.sort(key=lambda x: x[4],reverse=True)

    ranking = ranking[:how_many_results]

    # topics of the retrieved documents; they select the taxonomies loaded in the next cell
    rel_topics = set([x[1] for x in ranking])

    # centroid (per-dimension mean) of the embeddings of the retrieved documents
    all_res_embs = [float(sum(col))/len(col) for col in zip(*[x[3] for x in ranking])]
    

{'de': 'Napoleon Bonaparte', 'en': 'Napoleon', 'fi': 'Napoleon I', 'fr': 'Napoléon Ier', 'pl': 'Napoleon Bonaparte', 'it': 'Napoleone'}
Documents mentioning the entity ' Napoleone ' : 59 among 122824 .


In [16]:
import os,pandas

# Entries are [word, language, embedding, topic], collected from the taxonomy files
# of every topic found among the retrieved documents.
all_topic_words = []

for topic in rel_topics:
    for file in os.listdir("taxonomies/"):
        if topic.lower() in file.lower():
            taxonomy_location = 'taxonomies/'+file
            # Dedicated name on purpose: binding the sheet to `df` would overwrite the
            # document dataset loaded at the top of the notebook and break a re-run of
            # the embedding cell.
            taxonomy_df = pd.read_excel(taxonomy_location)
            raw_words = taxonomy_df.iloc[:,0].tolist()
            langs = taxonomy_df.iloc[:,1].tolist()

            # Pair each term with the language of its own row *before* dropping empty
            # cells. Filtering the terms alone shifts them against the language column,
            # so every term after the first empty cell got the wrong language.
            entries = [(raw.split("/")[-1].replace("_"," "), word_lang) for raw, word_lang in zip(raw_words, langs) if isinstance(raw, str)]
            words = [word for word, _lang in entries]

            topic_words = [[word,word_lang,text_embedding(word,model_dict[word_lang]),topic] for word, word_lang in entries if word_lang in model_dict]

            # words in a covered language for which no embedding could be built
            miss_emb = len(topic_words)
            topic_words = [x for x in topic_words if x[2] is not None]
            miss_emb -= len(topic_words)
            print ("Taxonomy:",topic,"\n We currently do not cover the following langs:",set([x for x in langs if x not in model_dict]), "Number of excluded words:",miss_emb,"\n")
            all_topic_words += topic_words


# rank the taxonomy words by similarity to the centroid of the retrieved documents
topic_ranking = [[entry[0],cossim(entry[2],all_res_embs),entry[-1]] for entry in all_topic_words]

topic_ranking.sort(key=lambda x: x[1],reverse=True)

Taxonomy: economics 
 We currently do not cover the following langs: {'Spanish', 'Norwegian', 'Maltese'} Number of excluded words: 13 

Taxonomy: maps 
 We currently do not cover the following langs: {'Spanish', 'Norwegian', 'Maltese'} Number of excluded words: 3 

Taxonomy: germanDemocraticRepublic 
 We currently do not cover the following langs: {'Norwegian', 'Maltese'} Number of excluded words: 18 

Taxonomy: firstWorldWar 
 We currently do not cover the following langs: {'Spanish', 'Norwegian', 'Maltese'} Number of excluded words: 9 

Taxonomy: slavery 
 We currently do not cover the following langs: {nan, 'Norwegian', 'Portuguese', 'Spanish', 'Russian'} Number of excluded words: 48 

Taxonomy: notaries 
 We currently do not cover the following langs: {'Spanish', 'Norwegian', 'Maltese'} Number of excluded words: 4 



In [17]:
# length of the snippet shown for each document (the text is sliced, so this counts characters)
content_words = 100

printmd("**Most relevant topical words:**")
print ("- "+ "\n- ".join(["\n".join([x[0],"Topic: "+x[-1]]) for x in topic_ranking[:10]]))
print (" ")
printmd("**Most relevant documents:**")

# Each ranking entry is [title, topic label, text, embedding, score, language].
# The loop variables carry a doc_ prefix so that displaying the results does not
# overwrite the query language `lang` (or `topic` / `content`) used by the cells above.
for position, doc in enumerate(ranking, start=1):
    doc_title, doc_topic, doc_text, doc_emb, doc_score, doc_lang = doc
    print (str(position)+")",doc_title)
    print ("Snippet:",doc_text[:content_words])
    print ("topic:",doc_topic,"\nscore:",round(doc_score,3),"\nlang:",doc_lang,"\n")
   

**Most relevant topical words:**

- Comité National pour l'Histoire et la Mémoire de l'Esclavage
Topic: slavery
- Direction de la cartographie des Alpes 
Topic: maps
- François Tubeuf de Norfolk et de Petersburg
Topic: slavery
- Hague Convention Abolishing the Requirement of Legalisation for Foreign Public Documents
Topic: notaries
-  Charles Fournier de la Chapelle
Topic: slavery
- Duboys de La Vrillière
Topic: slavery
- Treaty on the Final Settlement with Respect to Germany
Topic: germanDemocraticRepublic
- Minutier central des notaires de Paris
Topic: notaries
- Retz et de Pradt (familles)
Topic: slavery
- Charles Eugène Gabriel de La Croix marquis de Castries 
Topic: slavery
 


**Most relevant documents:**

1) Kriegsgeschichtliche Forschungsanstalt des Heeres:F1536668
Snippet: Napoleon als Feldherr Kriegsgeschichtliche Forschungsanstalt des Heeres:F1536668 Enthält: Korrigiert
topic: firstWorldWar 
score: 0.127 
lang: de 

2) Série HH - Affaires économiques à Nice avant la Révolution:F1225336
Snippet: Interdiction faite par Charles Ier interdisant au gouverneur de concéder des sauf-conduits et représ
topic: economics 
score: 0.119 
lang: fr 

3) Plans isolés:F626904
Snippet: 90Fi 0037. Grasse .- Carrière du plateau Napoléon. 28/12/1958 Plans isolés:F626904 Plan, de situatio
topic: maps 
score: 0.105 
lang: fr 

4) Plans isolés:F626904
Snippet: 90Fi 0025. Grasse (Plateau Roquevignon) .- Projet d'un ensemble sportif "stade Napoléon". 01/05/1941
topic: maps 
score: 0.089 
lang: fr 

5) Plans isolés:F626904
Snippet: 90Fi 0026. Grasse (Plateau Roquevignon) .- Projet d'un ensemble sportif "Stade Napoléon" Plans isolé
topic: maps 
score: 0.089 
lang: fr 

6) Liste des tabellions et notaires de la

In [18]:
# length of the snippet shown for each document (the text is sliced, so this counts characters)
content_words = 100

printmd("**Most relevant topical words:**")
print ("- "+ "\n- ".join(["\n".join([x[0],"Topic: "+x[-1]]) for x in topic_ranking[:10]]))
print (" ")
printmd("**Most relevant documents:**")

# Each ranking entry is [title, topic label, text, embedding, score, language].
# The loop variables carry a doc_ prefix so that displaying the results does not
# overwrite the query language `lang` (or `topic` / `content`) used by the cells above.
for position, doc in enumerate(ranking, start=1):
    doc_title, doc_topic, doc_text, doc_emb, doc_score, doc_lang = doc
    print (str(position)+")",doc_title)
    print ("Snippet:",doc_text[:content_words])
    print ("topic:",doc_topic,"\nscore:",round(doc_score,3),"\nlang:",doc_lang,"\n")
   

**Most relevant topical words:**

- Comité National pour l'Histoire et la Mémoire de l'Esclavage
Topic: slavery
- Direction de la cartographie des Alpes 
Topic: maps
- François Tubeuf de Norfolk et de Petersburg
Topic: slavery
- Hague Convention Abolishing the Requirement of Legalisation for Foreign Public Documents
Topic: notaries
-  Charles Fournier de la Chapelle
Topic: slavery
- Duboys de La Vrillière
Topic: slavery
- Treaty on the Final Settlement with Respect to Germany
Topic: germanDemocraticRepublic
- Minutier central des notaires de Paris
Topic: notaries
- Retz et de Pradt (familles)
Topic: slavery
- Charles Eugène Gabriel de La Croix marquis de Castries 
Topic: slavery
 


**Most relevant documents:**

1) Kriegsgeschichtliche Forschungsanstalt des Heeres:F1536668
Snippet: Napoleon als Feldherr Kriegsgeschichtliche Forschungsanstalt des Heeres:F1536668 Enthält: Korrigiert
topic: firstWorldWar 
score: 0.127 
lang: de 

2) Série HH - Affaires économiques à Nice avant la Révolution:F1225336
Snippet: Interdiction faite par Charles Ier interdisant au gouverneur de concéder des sauf-conduits et représ
topic: economics 
score: 0.119 
lang: fr 

3) Plans isolés:F626904
Snippet: 90Fi 0037. Grasse .- Carrière du plateau Napoléon. 28/12/1958 Plans isolés:F626904 Plan, de situatio
topic: maps 
score: 0.105 
lang: fr 

4) Plans isolés:F626904
Snippet: 90Fi 0025. Grasse (Plateau Roquevignon) .- Projet d'un ensemble sportif "stade Napoléon". 01/05/1941
topic: maps 
score: 0.089 
lang: fr 

5) Plans isolés:F626904
Snippet: 90Fi 0026. Grasse (Plateau Roquevignon) .- Projet d'un ensemble sportif "Stade Napoléon" Plans isolé
topic: maps 
score: 0.089 
lang: fr 

6) Liste des tabellions et notaires de la