In [1]:
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix

In [15]:
# Load the two raw inputs: a comma-separated file read as (id, text) rows and
# a space-separated file whose first two columns are read as (label, text_id).
data = pd.read_csv(
    'eurlex_data.txt',
    names=['id', 'text'],
    header=None,
    sep=',',
    encoding='utf-8',
)
labels = pd.read_csv(
    'eurlex_labels.txt',
    names=['label', 'text_id'],
    usecols=[0, 1],
    header=None,
    sep=' ',
)

In [3]:
# Tokenize every document: drop the first two characters and the last
# character of the raw text, then split what remains on single spaces.
tokenized_texts = [raw_text[2:-1].split(" ") for raw_text in data['text']]

In [4]:
# Utility lookups: give every distinct label and every distinct text_id
# found in `labels` a consecutive integer index starting at 0.
labels_unique = labels['label'].unique()
ids_unique = labels['text_id'].unique()

labels_dict = {
    label: position
    for label, position in zip(labels_unique, np.arange(labels_unique.size))
}
ids_dict = {
    text_id: position
    for text_id, position in zip(ids_unique, np.arange(ids_unique.size))
}

In [5]:
# Coordinates for the sparse label matrix: one (i, j) pair per row of
# `labels`, where i is the index of the text_id and j the index of the label.
i_indices = [ids_dict[text_id] for text_id in labels['text_id']]
j_indices = [labels_dict[label] for label in labels['label']]

In [6]:
# Build the (documents x labels) indicator matrix in COO format, storing a 1
# at every (text, label) coordinate collected above.
# NOTE(review): row indices come from `ids_dict`, which numbers text_ids in
# the order they appear in `labels`, while the row count comes from
# `len(data)`. Presumably this assumes both files list the same documents in
# the same order -- confirm, otherwise rows will not line up with `data`.
labels_matrix = coo_matrix(
    (np.ones(len(labels)), (i_indices, j_indices)),
    shape=(len(data), labels_unique.size),
    dtype=np.uint8,
)

In [7]:
# Create and train a Word2Vec model on the tokenized corpus
# (100-dimensional vectors, 4 worker threads).
# NOTE(review): `size` is the keyword used by gensim releases before 4.0
# (later renamed `vector_size`) -- confirm the gensim version in use.
model = gensim.models.Word2Vec(
    tokenized_texts,
    size=100,
    workers=4,
)

In [8]:
# Collect the trained vectors into a plain dict: word -> embedding vector.
# NOTE(review): `index2word` is the pre-4.0 gensim attribute name -- confirm
# the gensim version in use.
w2v = {
    word: vector
    for word, vector in zip(model.wv.index2word, model.wv.vectors)
}

In [9]:
class MeanEmbeddingVectorizer(object):
    """Embed a tokenized text as the plain average of its word vectors.

    Parameters
    ----------
    word2vec : dict
        Mapping from word to its embedding vector. The mapping is only read,
        never modified.

    Attributes
    ----------
    dim : int
        Length of the embedding vectors (0 when ``word2vec`` is empty).
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # If a text is empty we return a vector of zeros with the same
        # dimensionality as all the other vectors. Peek at one vector of the
        # mapping that was passed in; do not pop it, because the dict is
        # shared with every other vectorizer built from it.
        first_vector = next(iter(word2vec.values()), None)
        self.dim = 0 if first_vector is None else len(first_vector)

    def fit(self, X, y=None):
        """No-op fit; present so the object can be used as a pipeline step.

        Returns
        -------
        self
        """
        return self

    def transform(self, X):
        """Return one averaged embedding per tokenized text in ``X``.

        Words missing from ``word2vec`` are skipped. A text with no known
        words (including an empty text) maps to a zero vector of length
        ``self.dim`` so that every row has the same shape.

        Parameters
        ----------
        X : iterable of iterables of str
            Tokenized texts.

        Returns
        -------
        numpy.ndarray
            One row per text.
        """
        zero_vector = np.zeros(self.dim)
        return np.array([
            np.mean(
                [self.word2vec[w] for w in words if w in self.word2vec]
                or [zero_vector],
                axis=0,
            )
            for words in X
        ])

In [10]:
class TfidfEmbeddingVectorizer(object):
    """Embed a tokenized text as the mean of its IDF-weighted word vectors.

    Parameters
    ----------
    word2vec : dict
        Mapping from word to its embedding vector. The mapping is only read,
        never modified.

    Attributes
    ----------
    word2weight : defaultdict or None
        Word -> IDF weight, populated by :meth:`fit`; ``None`` before fitting.
    dim : int
        Length of the embedding vectors (0 when ``word2vec`` is empty).
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        # Peek at one vector of the mapping that was passed in to learn the
        # dimensionality; do not pop it, because the dict is shared with
        # every other vectorizer built from it.
        first_vector = next(iter(word2vec.values()), None)
        self.dim = 0 if first_vector is None else len(first_vector)

    def fit(self, X, y=None):
        """Learn one IDF weight per word from the tokenized texts ``X``.

        Returns
        -------
        self
        """
        # The texts are already tokenized, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return one IDF-weighted averaged embedding per text in ``X``.

        Words missing from ``word2vec`` are skipped. A text with no known
        words (including an empty text) maps to a zero vector of length
        ``self.dim``.

        Parameters
        ----------
        X : iterable of iterables of str
            Tokenized texts.

        Returns
        -------
        numpy.ndarray
            One row per text.
        """
        zero_vector = np.zeros(self.dim)
        return np.array([
            np.mean(
                [self.word2vec[w] * self.word2weight[w]
                 for w in words if w in self.word2vec]
                or [zero_vector],
                axis=0,
            )
            for words in X
        ])

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import *

# Two classification pipelines that differ in how a text is embedded:
# plain mean of word vectors vs. IDF-weighted mean.
# NOTE(review): the step is labelled "extra trees" and the variables are
# prefixed `etree_`, but the estimator is a RandomForestClassifier; also only
# the first pipeline sets n_jobs=-1. Confirm both are intentional.
etree_w2v = Pipeline(steps=[
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("extra trees", RandomForestClassifier(n_jobs=-1, n_estimators=200)),
])
etree_w2v_tfidf = Pipeline(steps=[
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("extra trees", RandomForestClassifier(n_estimators=200)),
])

In [12]:
# Embed every document as the mean of its word vectors.
mwe = MeanEmbeddingVectorizer(w2v)
train_ = mwe.transform(tokenized_texts)

# Hold out 33% of the documents for testing. The seed is fixed so that
# re-running the notebook from a fresh kernel reproduces the same split.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    train_,
    labels_matrix,
    test_size=0.33,
    random_state=SPLIT_SEED,
)