In [155]:
import pandas as pd
# Cap the displayed width of DataFrame cell contents at 50 characters.
pd.set_option("display.max_colwidth", 50)
import numpy as np
import re

In [289]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [262]:
import nltk
# Fetch the NLTK resources this notebook reads. `wordnet` backs
# WordNetLemmatizer; `stopwords` backs the `sw.words('russian')` /
# `sw.words('english')` calls further down, which raise LookupError on a
# machine where that corpus has never been downloaded.
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords as sw
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import RussianStemmer
from nltk.stem import WordNetLemmatizer

import pymorphy2

# Extra lemmas filtered out by `preprocess_list` on top of NLTK's Russian
# stop-word list.
# NOTE(review): 'во-сколько' contains a hyphen, while the `\w+` tokenizer never
# yields hyphenated tokens, so this entry may never match — confirm.
custom_stopwords = [
    # filler words, greetings, pronouns and particles
    'сколько', 'во-сколько', 'здравствовать', 'здрасте', 'вообще', 'это',
    'ещё', 'значит', 'значить', 'этмый', 'либо', 'хотя', 'таки', 'кроме',
    'просто', 'её', 'сей', 'оно', 'ничто', 'го', 'ой', 'сегодня', 'спасибо',
    # city names
    'зеленоград', 'москва', 'пермь',
    # month names
    'январь', 'февраль', 'март', 'апрель', 'май', 'июнь',
    'июль', 'август', 'сентябрь', 'октябрь', 'ноябрь', 'декабрь',
]

[nltk_data] Downloading package wordnet to /home/bobkovs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [263]:
# --- objects used by preprocess_word / preprocess_list -----------------------
tokenizer = RegexpTokenizer(r'\w+')   # a token is any run of word characters
morph = pymorphy2.MorphAnalyzer()     # supplies the normal form of each token
stopwords = sw.words('russian')

# --- additional resources -----------------------------------------------------
# Not referenced by the preprocessing functions defined in this notebook section.
en_stopwords = sw.words('english')
stemmer = RussianStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
def _tokenizer(s):
    return s.split()

In [264]:
def preprocess_word(word):
    """Return the lower-cased normal form of ``word``.

    Only the first parse returned by the module-level ``morph`` analyzer is
    used.
    """
    first_parse = morph.parse(word)[0]
    lemma = first_parse.normal_form
    return lemma.lower()

def preprocess_list(list_):
    """Lemmatize and filter a list of raw strings.

    Every element of ``list_`` is split with the module-level ``tokenizer``.
    A token is discarded when it contains a digit or a Latin letter, or when
    its lemma (as returned by ``preprocess_word``) is in ``stopwords`` or
    ``custom_stopwords``. The surviving lemmas of one element are joined with
    single spaces.

    Parameters
    ----------
    list_ : iterable of str
        Raw text fragments.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input element, in input order;
        elements that end up with no lemmas are omitted.
    """
    res_list = []
    for fragment in list_:
        lemmas = []
        for word in tokenizer.tokenize(fragment):
            # Cheap character checks run first so that tokens rejected here
            # never reach the comparatively expensive morphological parse.
            if any(char.isdigit() for char in word):
                continue
            if re.search(r'[a-zA-Z]', word):
                continue
            # Lemmatize once and reuse the result for both stop-list checks
            # and for the output (previously each token was parsed three
            # times, and only one of the two checks used the lower-cased form).
            lemma = preprocess_word(word)
            if lemma in stopwords or lemma in custom_stopwords:
                continue
            lemmas.append(lemma)
        joined = ' '.join(lemmas)
        if joined != '':
            res_list.append(joined)
    return res_list

In [265]:
# Load the raw dataset; later cells read its `category`, `executor`, `theme`
# and `text` columns.
data = pd.read_csv('data.csv')

In [266]:
# Keep only complete rows, discard the rows of the 'Тестовая категория'
# category, then renumber the index from 0 so positional loops line up with it.
data = data.dropna()
data = data[data.category != 'Тестовая категория']
data = data.reset_index(drop=True)

In [267]:
# Normalise the three label columns: trim surrounding whitespace and replace
# every space with an underscore.
# The whole column is assigned at once. The earlier element-wise form
# (`data.category[i] = ...`) is chained assignment: it can trigger
# SettingWithCopyWarning and leaves the DataFrame unchanged when pandas
# Copy-on-Write is active.
for column in ('category', 'executor', 'theme'):
    # ' ' is a plain single-character pattern, so the result is the same
    # whether pandas interprets it as a regex or as a literal.
    data[column] = data[column].str.strip().str.replace(' ', '_')

In [268]:
# Replace each text with its space-joined, lemmatized and stop-word-filtered
# form. The text is first split on whitespace and the pieces are handed to
# `preprocess_list`, exactly as before.
# The column is assigned as a whole: the earlier `data.text[i] = ...` form is
# chained assignment, which can trigger SettingWithCopyWarning and has no
# effect when pandas Copy-on-Write is active.
data['text'] = data['text'].map(
    lambda raw_text: ' '.join(preprocess_list(raw_text.split()))
)

In [269]:
# Persist the preprocessed frame (without the index column) so the modelling
# cells can start from this file.
data.to_csv('preproc_data.csv', index=False)

In [277]:
# Reload the preprocessed data.
# keep_default_na=False keeps empty fields as '' instead of NaN: a text that
# lost all of its tokens during preprocessing is written as an empty field, and
# reading it back as NaN would make CountVectorizer reject the document.
data = pd.read_csv('preproc_data.csv', keep_default_na=False)

In [278]:
# One independent LabelEncoder per label column.
cat_encoder = LabelEncoder()
exec_encoder = LabelEncoder()
theme_encoder = LabelEncoder()

In [279]:
# Replace each string label column with the integer codes produced by its own
# encoder.
data['category'] = cat_encoder.fit_transform(data['category'])
data['executor'] = exec_encoder.fit_transform(data['executor'])
data['theme'] = theme_encoder.fit_transform(data['theme'])

In [282]:
# Features are the preprocessed texts; the target is the encoded theme.
X, y = data['text'], data['theme']

In [291]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [284]:
# Bag-of-words counts. Tokens come from `_tokenizer` (plain whitespace split),
# since the texts were already tokenized and lemmatized during preprocessing.
vectorizer = CountVectorizer(tokenizer=_tokenizer)

In [285]:
# Learn the vocabulary from the training texts only, so that nothing derived
# from the held-out test rows enters the feature extraction step.
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function _tokenizer at 0x7f014dfdf1e0>, vocabulary=None)

In [292]:
# Turn both text splits into count matrices with the fitted vocabulary.
# The names are rebound in place, so this cell expects the raw text splits and
# must follow a fresh run of the train/test split cell.
X_train, X_test = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [293]:
# Gradient-boosted tree classifier with the library's default hyper-parameters.
clf = XGBClassifier()

In [295]:
# Train on the vectorized training texts and their encoded theme labels.
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [297]:
# Evaluate on the held-out split. NOTE(review): `score` is presumably mean
# accuracy, per the scikit-learn classifier convention — confirm for the
# installed xgboost version.
clf.score(X_test, y_test)

0.48044692737430167