In [1]:
# ignore this, only useful when testing it on Colab

#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.de.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fr.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.en.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.it.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fi.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.pl.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.sl.vec

In [61]:
import pandas as pd
import numpy as np
from sklearn import svm
import nltk, string, pickle
from collections import Counter
from gensim.models import KeyedVectors
from sklearn.model_selection import KFold # import KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity as cs
from IPython.display import Markdown, display
def printmd(string):
    """Show ``string`` in the cell output rendered as Markdown rather than plain text."""
    rendered = Markdown(string)
    display(rendered)

# Characters stripped from a text before tokenisation: every ASCII punctuation
# mark. "-" is already part of string.punctuation; it is listed explicitly so
# the hyphen handling stays visible.
exclude = set(string.punctuation) | {"-"}

def text_embedding(text,model):
    """Turn a piece of text into a single document embedding.

    The text is lower-cased, stripped of every character found in ``exclude``,
    tokenised with ``nltk.word_tokenize`` and reduced to purely alphabetic
    tokens. Each token is looked up in ``model``; tokens that raise
    ``KeyError`` are skipped.

    Parameters:
        text: the string to embed.
        model: mapping-like object returning a vector for ``model[word]`` and
            raising ``KeyError`` for unknown words.

    Returns:
        A list of floats holding the per-dimension mean of the vectors that
        were found, or ``None`` when no token had a vector.
    """
    # NOTE(review): punctuation is deleted, not replaced by a space, so
    # "l'esclavage" becomes the single token "lesclavage" - confirm intended.
    cleaned = ''.join(ch for ch in text.lower() if ch not in exclude)
    tokens = [tok for tok in nltk.word_tokenize(cleaned) if tok.isalpha()]

    vectors = []
    for tok in tokens:
        try:
            vector = model[tok]
        except KeyError:
            # out-of-vocabulary token: it contributes nothing to the average
            continue
        vectors.append(vector)

    if not vectors:
        return None

    # zip(*vectors) walks the vectors dimension by dimension, so every tuple
    # holds exactly len(vectors) values
    n_vectors = len(vectors)
    return [float(sum(dimension))/n_vectors for dimension in zip(*vectors)]

In [3]:
# we load the dataset
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# 'dataset.pickle' must come from a trusted source.

with open('dataset.pickle', 'rb') as f:
    # the unpickled object is used as a pandas DataFrame by the cells below
    df = pickle.load(f)    

In [4]:
# quick look at the first rows to see which columns the dataset provides
df.head()

Unnamed: 0,id,langMaterial,unitTitle,titleProper,scopeContent,topic,filename
0,C122304196,fr,Documents généraux.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Historique par monsieur Hervé Arnoux (1993). C...,[economics],economics.json
1,C122304197,fr,Réparations et représentations pour les automo...,"119 J - Arnoux, fabrique de tracteurs, Miramas...","Garage Arnoux : vue de la façade sud, le long ...",[economics],economics.json
2,C122304198,fr,Motoculteurs Arnoux.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Brevet d'invention pour un petit tracteur moto...,[economics],economics.json
3,C122304200,fr,Tracteurs Arnoux et leur outillage.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Tracteurs type VM 10 et VM 15 : prospectus et ...,[economics],economics.json
4,C122304201,fr,Documentation générale.,"119 J - Arnoux, fabrique de tracteurs, Miramas...","L'Officiel des marques, 1er trimestre 1957, 3e...",[economics],economics.json


In [5]:
# For each language under study, download its cross-lingual (aligned) embeddings from https://github.com/facebookresearch/MUSE

# Load one set of aligned word vectors per language. The files are read one
# after the other, in the order of the names on the left-hand side.
(de_model, fr_model, en_model, it_model, fi_model, pl_model, sl_model) = map(
    KeyedVectors.load_word2vec_format,
    (
        'word-embs/wiki.multi.de.vec',
        'word-embs/wiki.multi.fr.vec',
        'word-embs/wiki.multi.en.vec',
        'word-embs/wiki.multi.it.vec',
        'word-embs/wiki.multi.fi.vec',
        'word-embs/wiki.multi.pl.vec',
        'word-embs/wiki.multi.sl.vec',
    ),
)

In [50]:
# Lookup table from a language tag to its word-embedding model. Short codes
# and spelled-out language names are both accepted as keys.
# NOTE(review): the spelled-out names cover only five of the seven loaded
# models (no Polish/Slovenian entry) - confirm whether that is intended.
model_dict = {
    "fr": fr_model,
    "en": en_model,
    "english": en_model,
    "de": de_model,
    "it": it_model,
    "fi": fi_model,
    "pl": pl_model,
    "sl": sl_model,
    "German": de_model,
    "English": en_model,
    "Finnish": fi_model,
    "French": fr_model,
    "Italian": it_model,
}


In [7]:
# For each document written in a language we have a model for, build a
# document embedding and collect it together with its topic label, language,
# title and raw text. The five lists are index-aligned: position i in each of
# them refers to the same document.

embs = []
labels = []
doc_names = []
selected_langs = []
texts = []

for _, row in df.iterrows():
    lang = row["langMaterial"]
    # the topic label is the source file name without its ".json" extension
    label = row["filename"].replace(".json","")
    title = row["titleProper"]

    # documents in a language without a loaded model cannot be embedded
    if lang not in model_dict:
        continue

    model = model_dict[lang]
    text = row["unitTitle"] +" "+ row["titleProper"]+" "+ row["scopeContent"]
    emb = text_embedding(text,model)

    # text_embedding returns None when no token is in the vocabulary; the
    # identity test stays valid even if embeddings later become numpy arrays,
    # where "!= None" would compare element-wise
    if emb is not None:
        embs.append(emb)
        labels.append(label)
        selected_langs.append(lang)
        doc_names.append(title)
        texts.append(text)
print (len(embs),len(labels))

122824 122824


In [8]:
def cossim(v1,v2):
    """Return the cosine similarity between the vectors ``v1`` and ``v2``.

    Each input is converted to an array and reshaped into a single-row 2-D
    matrix before being handed to ``cs``; the only entry of the resulting
    matrix is returned.
    """
    row_a, row_b = (np.array(vec).reshape(1, -1) for vec in (v1, v2))
    similarity_matrix = cs(row_a, row_b)
    return similarity_matrix[0][0]

In [72]:
# apply the classifier to a new description

description = "Napoleone e la Russia"
lang = "it"
how_many_results = 50

# Fail early with an explicit message: an unknown language or a description
# made only of out-of-vocabulary words would otherwise surface later as an
# obscure error in the similarity computation.
if lang not in model_dict:
    raise KeyError("No embedding model loaded for language %r" % lang)

model = model_dict[lang]
emb = text_embedding(description,model)

if emb is None:
    raise ValueError("No word of the description has an embedding for language %r" % lang)

In [73]:
# Score every document against the query with ONE vectorised call instead of
# one cosine_similarity call per document (which re-converted the query vector
# and paid the per-call overhead for each document in the collection).
query_row = np.array(emb).reshape(1, -1)
# cs returns an (n_documents, 1) matrix; column 0 holds the scores.
# The guard keeps the empty-collection case working as before.
scores = cs(np.array(embs), query_row)[:, 0] if embs else []

# each entry: [title, topic label, raw text, document embedding, similarity]
ranking = [[doc_names[x],labels[x],texts[x],embs[x],scores[x]] for x in range(len(embs))]

# highest similarity first, then keep only the best hits
ranking.sort(key=lambda x: x[4],reverse=True)

ranking = ranking[:how_many_results]

# topics covered by the retrieved documents
rel_topics = {x[1] for x in ranking}

# centroid (per-dimension mean) of the retrieved document embeddings
all_res_embs = [float(sum(col))/len(col) for col in zip(*[x[3] for x in ranking])]

In [59]:
import os,pandas

# Collect, for every topic found among the retrieved documents, the terms of
# the matching taxonomy spreadsheet(s) together with their embeddings, then
# rank all terms by similarity to the centroid of the retrieved documents.
#
# Fixes with respect to the previous version of this cell:
#  * each term stays paired with its own language: the rows are filtered as
#    (term, language) pairs, whereas before only the term list was filtered
#    and every later term was matched with the language of another row;
#  * the taxonomy sheet is stored in `taxonomy_df`, so the dataset frame `df`
#    is no longer overwritten;
#  * the query embedding is no longer recomputed for every file (it was
#    never used in this cell).

all_topic_words = []

for topic in rel_topics:
    for file in os.listdir("taxonomies/"):
        # a taxonomy file belongs to a topic when its name contains the topic
        if topic.lower() not in file.lower():
            continue

        taxonomy_location = 'taxonomies/'+file
        taxonomy_df = pd.read_excel(taxonomy_location)

        # first column: term (the last "/"-separated segment is used, with
        # "_" turned into spaces); second column: language of the term
        raw_words = taxonomy_df.iloc[:,0].tolist()
        langs = taxonomy_df.iloc[:,1].tolist()

        # empty cells are read as float NaN: drop those rows as a whole
        entries = [
            (raw_word.split("/")[-1].replace("_"," "), word_lang)
            for raw_word, word_lang in zip(raw_words, langs)
            if isinstance(raw_word, str)
        ]

        topic_words = [
            [word, word_lang, text_embedding(word, model_dict[word_lang])]
            for word, word_lang in entries
            if word_lang in model_dict
        ]

        # terms in a covered language for which no embedding could be built
        miss_emb = len(topic_words)
        topic_words = [x for x in topic_words if x[2] is not None]
        miss_emb -= len(topic_words)

        print ("Taxonomy:",topic,"\n We currently do not cover the following langs:",set([word_lang for word_lang in langs if word_lang not in model_dict]), "Number of excluded words:",miss_emb,"\n")
        all_topic_words += topic_words


# each entry: [term, similarity to the centroid of the retrieved documents]
topic_ranking = [[entry[0],cossim(entry[2],all_res_embs)] for entry in all_topic_words]

topic_ranking.sort(key=lambda x: x[1],reverse=True)

Taxonomy: notaries 
 We currently do not cover the following langs: {'Spanish', 'Maltese', 'Norwegian'} Number of excluded words: 4 

Taxonomy: slavery 
 We currently do not cover the following langs: {nan, 'Italiano', 'Norwegian', 'Portuguese', 'Spanish', 'Russian'} Number of excluded words: 44 

Taxonomy: germanDemocraticRepublic 
 We currently do not cover the following langs: {'Maltese', 'Norwegian'} Number of excluded words: 18 

Taxonomy: economics 
 We currently do not cover the following langs: {'Spanish', 'Maltese', 'Norwegian'} Number of excluded words: 13 

Taxonomy: maps 
 We currently do not cover the following langs: {'Spanish', 'Maltese', 'Norwegian'} Number of excluded words: 3 

Taxonomy: catholicism 
 We currently do not cover the following langs: {'Norwegian', 'Maltese', 'Portuguese', 'Spanish', 'Latin', 'Greek'} Number of excluded words: 1 

Taxonomy: firstWorldWar 
 We currently do not cover the following langs: {'Spanish', 'Maltese', 'Norwegian'} Number of exclude

In [71]:
# NOTE: despite its name, this limit is applied as a character count when the
# document text is sliced below
content_words = 100

printmd("**Most relevant topics:**")
top_topic_names = [entry[0] for entry in topic_ranking[:10]]
print ("- "+ "\n- ".join(top_topic_names))
print (" ")
printmd("**Most relevant documents:**")

# one numbered entry per retrieved document, best match first
for position, doc in enumerate(ranking, start=1):
    doc_id, topic, full_text = doc[0], doc[1], doc[2]
    content = full_text[:content_words]
    score = round(doc[-1],3)
    print (str(position)+")",doc_id,content)
    print ("topic:",topic,"\nscore:",score,"\n")
   

**Most relevant topics:**

- Comité National pour l'Histoire et la Mémoire de l'Esclavage
- Direction de la cartographie des Alpes 
- François Tubeuf de Norfolk et de Petersburg
- Treaty on the Final Settlement with Respect to Germany
- Retz et de Pradt (familles)
- London Society for Effecting the Abolition of the Slave Trade
- Hague Convention Abolishing the Requirement of Legalisation for Foreign Public Documents
- Treaty for the Suppression of the African Slave Trade
- Treaty for the Suppression of the African Slave Trade
- Society for Effecting the Abolition of the Slave Trade
 


**Most relevant documents:**

1) Erzberger, Matthias:F1538386 Politische Auseinandersetzungen innerhalb des Katholizismus während des Krieges Erzberger, Matthias:
topic: firstWorldWar 
score: 0.798 

2) Série D - Administration générale de la commune de Nice (première partie):F1584207 Pianta perimetrale della chiesa del Voto della città di Nizza e sue adiacenze coll'indicazione dei t
topic: maps 
score: 0.791 

3) Série O - Consiglio d'Ornato:F1512204 Prospetto delle due facciate della casa che il sig. Pietro Bouchon possiede nel territorio di questa
topic: maps 
score: 0.781 

4) Guide des sources de la traite négrière, de l'esclavage et de leurs abolitions:S13 Guide des sources de la traite négrière, de l'esclavage et de leurs abolitions Guide des sources de 
topic: slavery 
score: 0.779 

5) Intérieur ; Direction générale de la Police nationale ; Division Archives, documentation (1959-1989):F969150 religions , 1972-1989 Intérieur ; Direction générale de la Police nationale ; Division Archives, doc
topic: economi