In [None]:
# ignore this, only useful when testing it on Colab

#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.de.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fr.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.en.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.it.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fi.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.pl.vec
#!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.sl.vec

In [None]:
import pandas as pd
import numpy as np
from sklearn import svm
import nltk, string, pickle
from collections import Counter
from gensim.models import KeyedVectors
from sklearn.model_selection import KFold # import KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity as cs

# Characters removed from a text before tokenisation. The hyphen is already part of
# string.punctuation; it is listed explicitly only to make that intent visible.
exclude = set(string.punctuation) | {"-"}

def text_embedding(text,model):
    """Convert a string of text into a cross-lingual document embedding.

    The text is lower-cased, every character contained in ``exclude`` is
    removed, the remainder is tokenised with ``nltk.word_tokenize`` and only
    purely alphabetic tokens are kept. Each token is then looked up in
    ``model``; tokens for which the lookup raises ``KeyError`` are skipped.

    Parameters:
        text: the raw string to embed.
        model: mapping-like object indexed by token that yields a word vector.

    Returns:
        The component-wise average of the retrieved word vectors as a list of
        floats, or ``None`` if no token could be looked up.
    """
    cleaned = ''.join(filter(lambda ch: ch not in exclude, text.lower()))
    tokens = [tok for tok in nltk.word_tokenize(cleaned) if tok.isalpha()]

    vectors = []
    for tok in tokens:
        try:
            vectors.append(model[tok])
        except KeyError:
            # token has no vector in this model: ignore it
            pass

    # nothing could be embedded
    if not vectors:
        return None

    n_vectors = len(vectors)
    # zip(*vectors) walks the vectors dimension by dimension
    return [float(sum(component))/n_vectors for component in zip(*vectors)]

In [None]:
# Load the pickled dataset into `df`.
# NOTE: unpickling can execute arbitrary code, so only load a dataset.pickle you trust.

with open('dataset.pickle', mode='rb') as dataset_file:
    df = pickle.load(dataset_file)

In [None]:
# display the first rows of the loaded dataset as a quick sanity check
df.head()

In [None]:
# Every language under study needs its cross-lingual embeddings, which can be
# downloaded from https://github.com/facebookresearch/MUSE and are read here
# from the word-embs/ folder.

load_vectors = KeyedVectors.load_word2vec_format

de_model = load_vectors('word-embs/wiki.multi.de.vec')
fr_model = load_vectors('word-embs/wiki.multi.fr.vec')
en_model = load_vectors('word-embs/wiki.multi.en.vec')
it_model = load_vectors('word-embs/wiki.multi.it.vec')
fi_model = load_vectors('word-embs/wiki.multi.fi.vec')
pl_model = load_vectors('word-embs/wiki.multi.pl.vec')
sl_model = load_vectors('word-embs/wiki.multi.sl.vec')

In [None]:
# Lookup table from a language tag to the matching word-embedding model.
# Keys are the two-letter codes plus a few full English language names.

model_dict = {
    "fr": fr_model,
    "en": en_model,
    "de": de_model,
    "it": it_model,
    "fi": fi_model,
    "pl": pl_model,
    "sl": sl_model,
    "German": de_model,
    "English": en_model,
    "Finnish": fi_model,
    "French": fr_model,
}

In [None]:
# For each document we create a document embedding and collect its topic label.
# Documents whose language has no entry in model_dict, or for which no embedding
# could be computed, are skipped; the five lists below are always appended to
# together, so they stay index-aligned.

embs = []
labels = []
doc_names = []
selected_langs = []
texts = []

for index, row in df.iterrows():
    lang = row["langMaterial"]
    # the label is the source file name without its .json extension
    label = row["filename"].replace(".json","")
    title = row["titleProper"]

    if lang in model_dict:
        model = model_dict[lang]
        # NOTE(review): assumes these three columns always hold strings - confirm there are no missing values
        text = row["unitTitle"] +" "+ row["titleProper"]+" "+ row["scopeContent"]
        emb = text_embedding(text,model)
        # identity check against None: the idiomatic form, and it keeps working
        # even if the embedding is ever an array instead of a list
        if emb is not None:
            embs.append(emb)
            labels.append(label)
            selected_langs.append(lang)
            doc_names.append(title)
            texts.append(text)

# number of embedded documents and of labels (expected to match)
print (len(embs),len(labels))

In [None]:
def cossim(v1,v2):
    """Return the cosine similarity between two vectors.

    Both inputs are converted to numpy arrays and reshaped to a single row,
    because the pairwise similarity function `cs` works on 2-D inputs; the
    only cell of the resulting matrix is returned.
    """
    row_a, row_b = (np.array(vec).reshape(1, -1) for vec in (v1, v2))
    similarity_matrix = cs(row_a, row_b)
    return similarity_matrix[0][0]

In [None]:
# Embed a new free-text description so that it can be compared with the
# embeddings of the indexed documents.

description = "this is a text about the GDR and Berlin"
lang = "en"
how_many_results = 50

model = model_dict[lang]
emb = text_embedding(description,model)

# text_embedding returns None when none of the description's tokens is found in
# the model; stop here with a clear message instead of failing later in cossim.
if emb is None:
    raise ValueError("No word of the description was found in the '%s' embedding model" % lang)

In [None]:
# Score every indexed document against the query embedding, order the entries
# from most to least similar and keep the first `how_many_results`.
# Each entry is [name, label, text, embedding, similarity].
ranking = sorted(
    [
        [doc_name, doc_label, doc_text, doc_emb, cossim(doc_emb, emb)]
        for doc_name, doc_label, doc_text, doc_emb in zip(doc_names, labels, texts, embs)
    ],
    key=lambda entry: entry[4],
    reverse=True,
)[:how_many_results]

# distinct labels found among the retained documents
rel_topics = {entry[1] for entry in ranking}

# component-wise average of the retained documents' embeddings
all_res_embs = [
    float(sum(component))/len(component)
    for component in zip(*(entry[3] for entry in ranking))
]

In [None]:
import os,pandas

# Collect candidate topic words from the taxonomy spreadsheet(s) of every topic
# found among the retrieved documents, embed them, and rank them by similarity
# to the average embedding of the retrieved documents.

all_topic_words = []

for topic in rel_topics:
    for file in os.listdir("taxonomies/"):
        # a taxonomy file belongs to a topic when the topic name occurs in its file name
        if topic.lower() in file.lower():
            taxonomy_path = 'taxonomies/'+file
            # use a dedicated name so the dataset held in `df` is not overwritten
            taxonomy_df = pd.read_excel(taxonomy_path)

            # first column: the word, possibly given as a path/URI - keep the last
            # segment and turn underscores into spaces
            # NOTE(review): assumes every cell of the first column is a string - confirm
            words = [x.split("/")[-1].replace("_"," ") for x in taxonomy_df.iloc[:,0].tolist()]
            # second column: the language of each word
            langs = taxonomy_df.iloc[:,1].tolist()

            # embed only the words whose language has an embedding model
            topic_words = [
                [word, word_lang, text_embedding(word, model_dict[word_lang])]
                for word, word_lang in zip(words, langs)
                if word_lang in model_dict
            ]

            # count, then drop, the words for which no embedding could be computed
            miss_emb = len(topic_words)
            topic_words = [x for x in topic_words if x[2] is not None]
            miss_emb -= len(topic_words)
            print ("Missing Langs:",set([word_lang for word_lang in langs if word_lang not in model_dict]), "Missing Embs:",miss_emb)
            all_topic_words += topic_words


# Rank the words gathered from ALL matching taxonomy files, not only those of
# the last file processed.
topic_ranking = [[entry[0],cossim(entry[2],all_res_embs)] for entry in all_topic_words]

topic_ranking.sort(key=lambda x: x[1],reverse=True)

In [None]:
# Show the ten best-ranked topic words, then every retrieved document as
# [name, label] followed by its similarity rounded to three decimals.
top_topic_words = [entry[0] for entry in topic_ranking[:10]]
print("Most relevant topics:", ", ".join(top_topic_words))
print(" ")
for doc in ranking:
    name_and_label, similarity = doc[0:2], doc[-1]
    print(name_and_label, round(similarity, 3))
   