In [71]:
import pandas as pd
import numpy as np
from sklearn import svm
import nltk, string, pickle
from collections import Counter
from gensim.models import KeyedVectors
from sklearn.model_selection import KFold # import KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity as cs

# Characters deleted from a text before it is tokenised: every ASCII
# punctuation mark, with the hyphen listed explicitly as well.
exclude = {*string.punctuation, "-"}

# this function converts a string of text to a cross-lingual document embedding capturing its semantics
def text_embedding(text,model):
    """Average the word vectors of `text` into a single document vector.

    The text is lower-cased, every character contained in the module-level
    `exclude` set is deleted, the remainder is tokenised with
    `nltk.word_tokenize`, and only purely alphabetic tokens are kept.

    Parameters
    ----------
    text : str
        Raw text to embed.
    model : mapping-like
        Object indexable by token (`model[token]`) that raises `KeyError`
        for tokens it does not know; such tokens are skipped.

    Returns
    -------
    list of float or None
        Component-wise mean of the vectors that were found, or `None` when
        not a single token of `text` could be looked up in `model`.
    """
    cleaned = ''.join(filter(lambda ch: ch not in exclude, text.lower()))
    tokens = [tok for tok in nltk.word_tokenize(cleaned) if tok.isalpha()]

    vectors = []
    for tok in tokens:
        try:
            vectors.append(model[tok])
        except KeyError:
            # the model has no vector for this token
            pass

    if not vectors:
        return None

    # zip(*vectors) walks the collected vectors one dimension at a time
    return [float(sum(dim)) / len(dim) for dim in zip(*vectors)]

In [2]:
# Load the pickled dataset into `df`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling,
# so only use a dataset.pickle that comes from a trusted source.
with open('dataset.pickle', mode='rb') as dataset_file:
    df = pickle.load(dataset_file)

In [3]:
# Preview the first five rows to check the columns used further down.
df.head(n=5)

Unnamed: 0,id,langMaterial,unitTitle,titleProper,scopeContent,topic,filename
0,C122304196,fr,Documents généraux.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Historique par monsieur Hervé Arnoux (1993). C...,[economics],economics.json
1,C122304197,fr,Réparations et représentations pour les automo...,"119 J - Arnoux, fabrique de tracteurs, Miramas...","Garage Arnoux : vue de la façade sud, le long ...",[economics],economics.json
2,C122304198,fr,Motoculteurs Arnoux.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Brevet d'invention pour un petit tracteur moto...,[economics],economics.json
3,C122304200,fr,Tracteurs Arnoux et leur outillage.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Tracteurs type VM 10 et VM 15 : prospectus et ...,[economics],economics.json
4,C122304201,fr,Documentation générale.,"119 J - Arnoux, fabrique de tracteurs, Miramas...","L'Officiel des marques, 1er trimestre 1957, 3e...",[economics],economics.json


In [4]:
# For each language under study, download its cross-lingual embeddings from
# https://github.com/facebookresearch/MUSE and place the .vec file under
# word-embs/ before running this cell.
load_vectors = KeyedVectors.load_word2vec_format

de_model = load_vectors('word-embs/wiki.multi.de.vec')
fr_model = load_vectors('word-embs/wiki.multi.fr.vec')
en_model = load_vectors('word-embs/wiki.multi.en.vec')
it_model = load_vectors('word-embs/wiki.multi.it.vec')
fi_model = load_vectors('word-embs/wiki.multi.fi.vec')
pl_model = load_vectors('word-embs/wiki.multi.pl.vec')
sl_model = load_vectors('word-embs/wiki.multi.sl.vec')

In [48]:
# Map every language identifier to its word-embedding model.

# two-letter language codes
model_dict = {
    "fr": fr_model,
    "en": en_model,
    "de": de_model,
    "it": it_model,
    "fi": fi_model,
    "pl": pl_model,
    "sl": sl_model,
}

# the same models under the language's English name
model_dict.update({
    "German": de_model,
    "English": en_model,
    "Finnish": fi_model,
    "French": fr_model,
})

In [6]:
# For each document written in a language we have a model for, build a
# document embedding and collect its topic label, language, title and text.
# The five lists below are always appended to together, so position i in
# each of them refers to the same document.

embs = []
labels = []
doc_names = []
selected_langs = []
texts = []

for index, row in df.iterrows():
    lang = row["langMaterial"]
    # the topic label is the name of the source file without its extension
    label = row["filename"].replace(".json","")
    title = row["titleProper"]

    if lang in model_dict:
        model = model_dict[lang]
        text = row["unitTitle"] +" "+ row["titleProper"]+" "+ row["scopeContent"]
        emb = text_embedding(text,model)
        # text_embedding returns None when no word of the text is in the
        # model; test identity, because `!= None` would turn into an
        # element-wise comparison if an array were ever returned
        if emb is not None:
            embs.append(emb)
            labels.append(label)
            selected_langs.append(lang)
            doc_names.append(title)
            texts.append(text)
print (len(embs),len(labels))

122824 122824


In [7]:
def cossim(v1,v2):
    """Return the cosine similarity between two vectors.

    Both arguments are converted to NumPy arrays and reshaped to a single
    row, because `cs` compares matrices row against row; the only entry of
    the resulting 1x1 matrix is returned.
    """
    row_a, row_b = (np.array(vec).reshape(1, -1) for vec in (v1, v2))
    similarity_matrix = cs(row_a, row_b)
    return similarity_matrix[0][0]

In [13]:
# Apply the classifier to a new description: embed the description with the
# model of its language so that it can be compared with the documents.

description = "this is a text about the GDR and Berlin"
lang = "en"
how_many_results = 50

model = model_dict[lang]
emb = text_embedding(description,model)

# text_embedding returns None when no word of the description is known to
# the model; stop here with a clear message instead of failing later inside
# the similarity computation
if emb is None:
    raise ValueError("no word of the description has an embedding for language %r" % lang)

In [40]:
# Score every stored document against the query embedding `emb`; each row
# is [title, topic label, text, embedding, cosine similarity].
scored_docs = [
    [name, topic_label, doc_text, doc_emb, cossim(doc_emb, emb)]
    for name, topic_label, doc_text, doc_emb in zip(doc_names, labels, texts, embs)
]

# most similar first, truncated to the requested number of results
ranking = sorted(scored_docs, key=lambda entry: entry[4], reverse=True)[:how_many_results]

# topic labels that occur among the retained documents
rel_topics = {entry[1] for entry in ranking}

# component-wise mean of the retained documents' embeddings
hit_vectors = [entry[3] for entry in ranking]
all_res_embs = [float(sum(dim)) / len(dim) for dim in zip(*hit_vectors)]

{'germanDemocraticRepublic'}
[-0.012079556865407579, -0.010485933730658467, -0.014580406899242053, 0.031358638020258615, -0.029932400377748348, 0.010504350412787609, -0.011128717253720161, -0.04463270201177341, 0.007910225738675933, 0.04641123058832468, 0.019241045910076658, 0.0027314927562094252, -0.0027289968233613206, 0.001794673255026209, 0.00761892810634334, -0.05021791710987082, -0.024420454189086272, -0.011123543782360974, 0.016019292179520993, 0.049480504048255686, -0.015610096776421185, 0.04155833052725311, -0.05802365635689005, -0.027019738614002472, -0.026283239246494592, -0.01017227928151301, 0.008088999880262751, -0.005203467523201399, 0.007093956546378419, 0.017059169028154127, -0.03473544589487915, 0.06447181237441654, -0.058598261331002266, 0.04361457112771845, 0.01740793189406, -0.039732568453874094, -0.017632184570687895, -0.015770965631290827, 0.0234168842573575, -0.020006101962598312, 0.024343995485479487, -0.0162207009670641, -0.004553796852408873, -0.0010764973474

In [82]:
import os,pandas

# Collect the taxonomy terms of every topic found among the top-ranked
# documents, then rank those terms by cosine similarity to the mean
# embedding of the top-ranked documents (all_res_embs).

all_topic_words = []

# list the directory once instead of once per topic; sorted (like the topics
# below) so that the processing order is the same on every run
taxonomy_files = sorted(os.listdir("Taxonomies/"))

for topic in sorted(rel_topics):
    for file in taxonomy_files:
        # a taxonomy file belongs to a topic when its name contains the
        # topic label (case-insensitive)
        if topic.lower() not in file.lower():
            continue

        # own name for the spreadsheet, so the dataset frame `df` loaded
        # at the top of the notebook is not overwritten
        taxonomy_df = pd.read_excel('Taxonomies/'+file)

        # first column: keep what follows the last "/" and turn underscores
        # into spaces (presumably URI-style identifiers; verify against the
        # spreadsheets)
        words = [x.split("/")[-1].replace("_"," ") for x in taxonomy_df.iloc[:,0].tolist()]
        # second column: language of each term, used as key into model_dict
        langs = taxonomy_df.iloc[:,1].tolist()

        # [term, language, embedding] for every term whose language has a model
        topic_words = [[word, word_lang, text_embedding(word, model_dict[word_lang])]
                       for word, word_lang in zip(words, langs) if word_lang in model_dict]

        # count the terms for which no embedding could be computed
        miss_emb = len(topic_words)
        topic_words = [x for x in topic_words if x[2] is not None]
        miss_emb -= len(topic_words)
        print ("Missing Langs:",set([term_lang for term_lang in langs if term_lang not in model_dict]), "Missing Embs:",miss_emb)
        all_topic_words += topic_words


# rank the terms of ALL matched taxonomy files, not only those of the last
# file that happened to be read
topic_ranking = [[entry[0],cossim(entry[2],all_res_embs)] for entry in all_topic_words]

topic_ranking.sort(key=lambda x: x[1],reverse=True)

Missing Langs: {'Maltese', 'Norwegian'} Missing Embs: 18


In [85]:
# Report the ten best-matching taxonomy terms, then every retained document
# as [title, topic label] followed by its similarity rounded to 3 decimals.
top_topic_names = [entry[0] for entry in topic_ranking[:10]]
print ("Most relevant topics:", ", ".join(top_topic_names))
print (" ")
for doc in ranking:
    title_and_label = doc[:2]
    similarity = doc[-1]
    print (title_and_label, round(similarity, 3))
   

Most relevant topics: Treaty on the Final Settlement with Respect to Germany, Bundesbeauftragter für die Stasi-Unterlagen, Die Wende, Democratic Women's Federation of Germany, Allemagne de l'Est, State Secretariat for Church Affairs, Communist Party of Germany (KPD), Two Plus Four Agreement, Ministerium für Staatssicherheit, Ernst Thälmann Pioneer Organisation
 
['MfS–Wachregiment „Feliks E. Dzierzynski“ (WR Berlin) – Tonaufnahmen [= WR Berlin-Tb]:F952135', 'germanDemocraticRepublic'] 0.868
['Verzeichnis der Filme und Videos des Ministeriums für Staatssicherheit der DDR:F921180', 'germanDemocraticRepublic'] 0.866
['Verzeichnis der Filme und Videos des Ministeriums für Staatssicherheit der DDR:F921180', 'germanDemocraticRepublic'] 0.866
['SED-Kreisleitung im Ministerium für Staatssicherheit [= SED-KL]:F921175', 'germanDemocraticRepublic'] 0.865
['Verzeichnis der Filme und Videos des Ministeriums für Staatssicherheit der DDR:F921180', 'germanDemocraticRepublic'] 0.863
['MfS–Hauptabteilun