In [1]:
import nltk, string

# Punctuation characters. Not consulted by text_embedding below (str.isalpha()
# already rejects every punctuation token); kept because other cells may
# reference the name.
exclude = set(string.punctuation)


def text_embedding(text, model, min_tokens=2):
    """Return the mean word vector of ``text`` under ``model``, or None.

    The text is lowercased, tokenized with ``nltk.word_tokenize`` and reduced
    to purely alphabetic tokens. Each remaining token is looked up in
    ``model``; tokens that raise ``KeyError`` are ignored.

    Parameters
    ----------
    text : str
        The document to embed.
    model : mapping
        Anything supporting ``model[word]`` that returns a sequence of numbers
        and raises ``KeyError`` for unknown words.
    min_tokens : int, optional
        Minimum number of in-vocabulary tokens required to produce an
        embedding. The default of 2 keeps the historical behaviour, which
        discards documents with a single known word; pass 1 to keep them.

    Returns
    -------
    list of float or None
        Component-wise average of the token vectors, or None when fewer than
        ``min_tokens`` tokens were found in ``model``.
    """
    # Lowercasing is only correct if the embedding vocabulary is lowercased too.
    tokens = nltk.word_tokenize(text.lower())

    vectors = []
    for token in tokens:
        # isalpha() drops punctuation, numbers and mixed tokens in one test.
        if not token.isalpha():
            continue
        try:
            vectors.append(model[token])
        except KeyError:
            # Out-of-vocabulary word: it simply does not contribute.
            continue

    # Never average an empty list, whatever threshold the caller passes.
    if len(vectors) < max(min_tokens, 1):
        return None

    return [float(sum(col)) / len(col) for col in zip(*vectors)]

In [2]:
import pickle
import pandas as pd

# Language codes as stored in the dataset -> two-letter codes
# (the same codes that key model_dict further down).
langs_dict = {"ger":"de","fre":"fr","it":"it","en":"en"}

# NOTE: unpickling can execute arbitrary code; only load a trusted file.
with open('selected_dataset.pickle', 'rb') as dataset_file:
    df = pickle.load(dataset_file)

# Values that are not keys of langs_dict come out of the mapping as NaN.
normalised_langs = df['langMaterial'].map(langs_dict)
df['langMaterial'] = normalised_langs


In [3]:
# Preview the first rows to check the columns and the remapped langMaterial values.
df.head()

Unnamed: 0,id,langMaterial,unitTitle,titleProper,scopeContent,topic,filename
0,C122304196,fr,Documents généraux.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Historique par monsieur Hervé Arnoux (1993). C...,[economics],economics.json
1,C122304197,fr,Réparations et représentations pour les automo...,"119 J - Arnoux, fabrique de tracteurs, Miramas...","Garage Arnoux : vue de la façade sud, le long ...",[economics],economics.json
2,C122304198,fr,Motoculteurs Arnoux.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Brevet d'invention pour un petit tracteur moto...,[economics],economics.json
3,C122304200,fr,Tracteurs Arnoux et leur outillage.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Tracteurs type VM 10 et VM 15 : prospectus et ...,[economics],economics.json
4,C122304201,fr,Documentation générale.,"119 J - Arnoux, fabrique de tracteurs, Miramas...","L'Officiel des marques, 1er trimestre 1957, 3e...",[economics],economics.json


In [4]:
from gensim.models import KeyedVectors
# Location pattern shared by the four embedding files; only the language code
# differs. Edit this one constant to point the notebook at another directory.
EMB_PATH_TEMPLATE = '/Users/fnanni/Resources/word-embs/wiki.multi.{lang}.vec'


def load_lang_vectors(lang):
    """Load the word2vec-format embeddings for a two-letter language code."""
    return KeyedVectors.load_word2vec_format(EMB_PATH_TEMPLATE.format(lang=lang))


# NOTE(review): the "multi" in the file names suggests the four models share
# one vector space, which the later cross-language averaging relies on — confirm.
de_model = load_lang_vectors("de")
fr_model = load_lang_vectors("fr")
en_model = load_lang_vectors("en")
it_model = load_lang_vectors("it")

In [5]:
# Two-letter language code -> embedding model loaded for that language.
model_dict = {
    "fr": fr_model,
    "en": en_model,
    "de": de_model,
    "it": it_model,
}

In [6]:
from langdetect import detect

# Codes that count as "already known". df['langMaterial'] holds the mapped
# two-letter values, so membership has to be tested against the values of
# langs_dict; testing against its keys ("ger", "fre", ...) would send every
# German and French row through language detection again.
supported_langs = set(langs_dict.values())

# One entry per row of df, in row order; None marks a row whose language
# could not be determined.
resolved_langs = []

for index, row in df.iterrows():
    lang = row["langMaterial"]
    if lang not in supported_langs:
        try:
            lang = detect(row["unitTitle"] +" "+ row["titleProper"]+" "+ row["scopeContent"])
        except Exception as e:
            # Best effort: report the failure and leave the row without a language.
            print (e)
            lang = None
    resolved_langs.append(lang)

# Assigning to the `row` yielded by iterrows() is not guaranteed to reach the
# frame (pandas may hand back a copy), so write the whole column back at once.
df["langMaterial"] = resolved_langs

# Languages of the rows that could be resolved, for the frequency table below.
langs = [lang for lang in resolved_langs if lang is not None]

from collections import Counter

Counter(langs).most_common()

[('de', 55078), ('fr', 10578), ('en', 17), ('it', 4)]

In [8]:
# filename label -> list of document embeddings collected for that label.
embs = {}

# Rows skipped because no embedding model exists for their language
# (missing value or a code that is not a key of model_dict).
rows_without_model = 0

for index, row in df.iterrows():
    lang = row["langMaterial"]
    # .get() instead of [] so an unknown language skips the row rather than
    # aborting the whole loop with a KeyError.
    model = model_dict.get(lang)
    if model is None:
        rows_without_model += 1
        continue
    text = row["unitTitle"] +" "+ row["titleProper"]+" "+ row["scopeContent"]
    label = row["filename"]
    emb = text_embedding(text,model)
    # Identity test: `!= None` would compare element-wise if emb were ever an array.
    if emb is not None:
        embs.setdefault(label, []).append(emb)



ValueError: too many values to unpack (expected 2)

In [9]:
# label -> component-wise mean of all document embeddings stored under it.
avg_embs = {}
for topic, e in embs.items():
    # Every column produced by zip(*e) has one entry per document in e.
    doc_count = len(e)
    avg = [float(sum(dimension)) / doc_count for dimension in zip(*e)]
    avg_embs[topic] = avg

In [44]:
# word -> {topic: similarity score returned by similar_by_vector}.
# A topic keeps 0.0 for a word that is absent from its neighbour list.
words_dict = {}

import numpy as np
for topic,e in avg_embs.items():
    e = np.array(e)
    # Nearest English-vocabulary neighbours of the topic's average embedding.
    words = en_model.similar_by_vector(e,topn=10000)
    for word in words:
        token, similarity = word[0], word[1]
        if token not in words_dict:
            # First sighting of this word: start every topic at 0.0.
            words_dict[token] = dict.fromkeys(avg_embs, 0.0)
        words_dict[token][topic] = similarity


In [49]:
# topic -> list of [word, score] pairs, where score is the word's similarity
# to this topic minus its mean similarity to all the other topics.
ranking = {t:[] for t in avg_embs.keys()}

for topic,e in avg_embs.items():
    e = np.array(e)
    words = en_model.similar_by_vector(e,topn=10000)
    for word in words:
        # NOTE(review): with a single topic this list is empty and the
        # division below raises ZeroDivisionError.
        other_scores = [y for x,y in words_dict[word[0]].items() if x!=topic]
        mean = sum(other_scores)/len(other_scores)
        # A mean of exactly 0 means no other topic lists the word; skip it.
        if mean == 0:
            continue
        score = word[1] - mean
        ranking[topic].append([word[0],score])

In [50]:
# Show, per topic, the 50 highest-scoring words. The sort is in place, so the
# lists inside `ranking` stay ordered afterwards.
for topic, rank in ranking.items():
    print (topic)
    rank.sort(key=lambda pair: pair[1], reverse=True)
    top_entries = rank[:50]
    for el in top_entries:
        print (el)
    print (" ")

economics.json
['saclay', 0.38051149249076843]
['manufacturing', 0.3737958297133446]
['paristech', 0.36965280026197433]
['consultancy', 0.368753120303154]
['engineering,', 0.3673355355858803]
['smes', 0.36631694436073303]
['technology', 0.3641170561313629]
['warehousing', 0.362179696559906]
['industrie', 0.36028093844652176]
['sustainable', 0.3575575649738312]
['tourism', 0.3573366850614548]
['sustainability', 0.35684599727392197]
['manufactures', 0.3566678911447525]
['optimisation', 0.3560803756117821]
['#national', 0.35547294467687607]
['ingénieur', 0.35538846254348755]
['supérieur', 0.35503873229026794]
['housebuilding', 0.35458170622587204]
['postsecondary', 0.3543931543827057]
['musicaindustrial', 0.3541184663772583]
['ict', 0.3528296649456024]
['biomedical', 0.3521682322025299]
['multidisciplinary', 0.35155318677425385]
['geomatics', 0.3515200987458229]
['biostatistics', 0.3514222130179405]
['industrialize', 0.3511848971247673]
['aeronautic', 0.35018324851989746]
['supérieure', 0