In [4]:
import os
import pickle as pkl
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

from utils import _tokenizer

np.random.seed = 42

warnings.filterwarnings("ignore")

In [5]:
# Загружаем данные
data = pd.read_csv(os.path.join('data', 'fully_prepared_data.csv'))

# Шафлим датафрейм
data = data.sample(frac=1).reset_index(drop=True)

# Разбиваем датафрейм на трейн и тест с отношением 0.9/0.1
train_index = np.random.rand(len(data)) < 0.9
train_data = data[train_index].reset_index(drop=True)
test_data = data[~train_index].reset_index(drop=True)

In [6]:
# Инициализируем и обучаем векторайзер
vectorizer = TfidfVectorizer(tokenizer=_tokenizer)
vectorizer.fit(data.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function _tokenizer at 0x7fcaf34d02f0>, use_idf=True,
        vocabulary=None)

In [7]:
# Инициализируем X и Y
X_train, X_test = vectorizer.transform(train_data.text), vectorizer.transform(test_data.text)
X_train, X_test = pd.DataFrame(X_train.toarray()), pd.DataFrame(X_test.toarray())
Y_category_train, Y_category_test = train_data.category, test_data.category
Y_executor_train, Y_executor_test = train_data.executor, test_data.executor
Y_theme_train, Y_theme_test = train_data.theme, test_data.theme

In [8]:
# Инициализируем объекты XGBClassifier
clf_category = LinearSVC()
clf_executor = LinearSVC()
clf_theme = LinearSVC()

In [9]:
# Учим модели первого уровня
clf_category.fit(X_train, Y_category_train)
clf_executor.fit(X_train, Y_executor_train)
clf_theme.fit(X_train, Y_theme_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [10]:
# Определяем точность моделей первого уровня
clf_category_accuracy = clf_category.score(X_test, Y_category_test)
clf_executor_accuracy = clf_executor.score(X_test, Y_executor_test)
clf_theme_accuracy = clf_theme.score(X_test, Y_theme_test)

print('Prediction accuracy of lvl1 models: category = {0}, executor = {1}, theme = {2}\n'.format(clf_category_accuracy,
                                                                                                 clf_executor_accuracy,
                                                                                                 clf_theme_accuracy))

Prediction accuracy of lvl1 models: category = 0.6781609195402298, executor = 0.6149425287356322, theme = 0.5344827586206896



In [47]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import matplotlib.pyplot as plt

In [54]:
train_tagged = train_data.apply(
    lambda r: TaggedDocument(words=(r['text']), tags=[r.category]), axis=1)
test_tagged = test_data.apply(
    lambda r: TaggedDocument(words=(r['text']), tags=[r.category]), axis=1)
data_tagged = data.apply(
    lambda r: TaggedDocument(words=(r['text']), tags=[r.category]), axis=1)

In [55]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [57]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(data_tagged.values)])

100%|██████████| 1786/1786 [00:00<00:00, 621301.07it/s]


In [58]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(data_tagged.values)]), total_examples=len(data_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 1786/1786 [00:00<00:00, 614925.87it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3004824.29it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3213653.77it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3374336.46it/s]
100%|██████████| 1786/1786 [00:00<00:00, 2106588.00it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3536839.92it/s]
100%|██████████| 1786/1786 [00:00<00:00, 1735239.04it/s]
100%|██████████| 1786/1786 [00:00<00:00, 2869026.02it/s]
100%|██████████| 1786/1786 [00:00<00:00, 2187161.15it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3502116.38it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3401919.59it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3476114.59it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3622353.45it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3408110.53it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3728734.17it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3610133.47it/s]
100%|██████████| 1786/1786 [00:00<00:00, 3560374.02it/s]
100%|██████████| 1786/1786 [00:0

CPU times: user 19.9 s, sys: 226 ms, total: 20.2 s
Wall time: 6.34 s


In [59]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector([doc.words], steps=20)) for doc in sents])
    return targets, regressors

In [60]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

logreg = LinearSVC()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.26436781609195403
Testing F1 score: 0.11055381400208987
