In [10]:
import nltk, string

exclude = set(string.punctuation)

# input should be a string
def text_embedding(text, model):
    """Return the average word vector of ``text``, or ``None`` if no token is known.

    The text is lowercased, tokenized with ``nltk.word_tokenize`` and reduced
    to purely alphabetic tokens. Each remaining token is looked up in
    ``model`` and the vectors that were found are averaged component-wise.

    Parameters
    ----------
    text : str
        Raw document text.
    model : mapping-like
        Object supporting ``model[word]`` that raises ``KeyError`` for unknown
        words.

    Returns
    -------
    list of float or None
        One average per vector dimension, or ``None`` when no token of
        ``text`` could be looked up in ``model``.
    """
    # Lowercasing presumes the model vocabulary is lowercased too -- TODO confirm
    # for the embeddings actually passed in.
    tokens = nltk.word_tokenize(text.lower())

    # str.isalpha() is False for punctuation-only tokens, so it also covers the
    # punctuation filter and no module-level exclusion set is required.
    words = [token for token in tokens if token.isalpha()]

    vectors = []
    for word in words:
        try:
            vectors.append(model[word])
        except KeyError:
            # Word without an embedding: leave it out of the average.
            continue

    # A single known word is still a valid average; only give up when nothing
    # at all was found (the previous `> 1` test dropped one-word documents).
    if not vectors:
        return None

    return [float(sum(column)) / len(column) for column in zip(*vectors)]

In [1]:
import pickle
import pandas as pd

# Translation from the language codes found in the dataset to the two-letter
# codes used in the rest of the notebook.
langs_dict = {
    "ger": "de",
    "fre": "fr",
    "it": "it",
    "en": "en",
}

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('selected_dataset.pickle', 'rb') as dataset_file:
    df = pickle.load(dataset_file)

# Codes that are not keys of langs_dict come out of the mapping as NaN.
df['langMaterial'] = df['langMaterial'].map(langs_dict)


In [2]:
# Show the first five records as a quick check of the columns and language codes.
df.head(n=5)

Unnamed: 0,id,langMaterial,unitTitle,titleProper,scopeContent,topic,filename
0,C122304196,fr,Documents généraux.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Historique par monsieur Hervé Arnoux (1993). C...,[economics],economics.json
1,C122304197,fr,Réparations et représentations pour les automo...,"119 J - Arnoux, fabrique de tracteurs, Miramas...","Garage Arnoux : vue de la façade sud, le long ...",[economics],economics.json
2,C122304198,fr,Motoculteurs Arnoux.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Brevet d'invention pour un petit tracteur moto...,[economics],economics.json
3,C122304200,fr,Tracteurs Arnoux et leur outillage.,"119 J - Arnoux, fabrique de tracteurs, Miramas...",Tracteurs type VM 10 et VM 15 : prospectus et ...,[economics],economics.json
4,C122304201,fr,Documentation générale.,"119 J - Arnoux, fabrique de tracteurs, Miramas...","L'Officiel des marques, 1er trimestre 1957, 3e...",[economics],economics.json


In [3]:
from gensim.models import KeyedVectors
# Directory holding the embedding files; the one place to edit when the
# notebook is run on another machine (it used to be repeated in every path).
EMB_DIR = '/Users/fnanni/Resources/word-embs'


def _load_embeddings(lang_code):
    """Load ``wiki.multi.<lang_code>.vec`` from EMB_DIR as gensim KeyedVectors."""
    vec_path = '{}/wiki.multi.{}.vec'.format(EMB_DIR, lang_code)
    return KeyedVectors.load_word2vec_format(vec_path)


# One model per language, loaded in the same order as before.
de_model = _load_embeddings('de')
fr_model = _load_embeddings('fr')
en_model = _load_embeddings('en')
it_model = _load_embeddings('it')

In [13]:
# Embedding model to use for each two-letter language code.
model_dict = {
    "fr": fr_model,
    "en": en_model,
    "de": de_model,
    "it": it_model,
}

In [11]:
from collections import Counter

from langdetect import detect

# df['langMaterial'] was already mapped through langs_dict, so it holds the
# dictionary's *values* ("de", "fr", ...) or NaN. Membership therefore has to be
# tested against the values; testing against the keys re-detected every
# "de"/"fr" row.
supported_langs = set(langs_dict.values())

langs = []           # languages of the rows that are known or successfully detected
resolved_langs = []  # one entry per row, written back to df after the loop

for index, row in df.iterrows():
    lang = row["langMaterial"]
    if lang not in supported_langs:
        try:
            lang = detect(row["unitTitle"] +" "+ row["titleProper"]+" "+ row["scopeContent"])
        except Exception as e:
            # Best effort: report the failure and keep the row's current value.
            print (e)
            resolved_langs.append(row["langMaterial"])
            continue
    resolved_langs.append(lang)
    langs.append(lang)

# iterrows() hands out copies of the rows, so assigning to `row` never reaches
# df; store the resolved languages on the frame in a single assignment instead.
df["langMaterial"] = resolved_langs

Counter(langs).most_common()

[('de', 55078), ('fr', 10582), ('en', 11), ('it', 6)]

In [None]:
# prepare doc embedding function
# match correct embedding for each language

In [None]:
embs = []
labels = []
skipped = 0  # rows whose language has no entry in model_dict

for index, row in df.iterrows():
    # .get() instead of [] so that a missing (NaN) or unsupported language
    # skips the row rather than aborting the whole loop with a KeyError.
    model = model_dict.get(row["langMaterial"])
    if model is None:
        skipped += 1
        continue

    text = row["unitTitle"] +" "+ row["titleProper"]+" "+ row["scopeContent"]
    emb = text_embedding(text,model)

    # text_embedding returns None when no word of the text has a vector;
    # compare by identity, which stays correct even for array-like results.
    if emb is not None:
        embs.append(emb)
        labels.append(row["filename"])

print (len(embs),len(labels))
if skipped:
    print ("rows skipped (no embedding model for language):", skipped)