In [5]:
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv('queries_base_result.csv')

In [8]:
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Natasha',
               'Natasha + new rule', 'Deepmipt'], axis=1)

In [9]:
df = df.rename(columns={'Без удаления NE': 'Текст с препроцессингом'})

In [10]:
df.head()

Unnamed: 0,Текст вопроса,Номер связки\n,Тематика,Текст с препроцессингом
0,с уважением Вероника Игоревна Ильич\n\nПосле ...,308.0,"ЗАКРЫТИЕ ГРАНИЦ, ОТКРЫТИЕ ГРАНИЦ РОССИИ И АВИА...",уважение вероника игорь илья август год москва...
1,"Здравствуйте! Проинформируйте, пожалуйста, нуж...",324.0,ОРГАНИЗАЦИИ ОТДЫХА ДЕТЕЙ И ИХ ЗДОРОВЛЕНИЯ,здравствовать проинформировать пожалуйста нужн...
2,"--\nДобрый день!\n Меня, Сидельникова Андрея...",57.0,БОЛЬНИЧНЫЙ ЛИСТ,добрый день сидельников андрей олег 30071989гр...
3,Добрый день.\nВ Кемеровской области согласно п...,45.0,"ШТРАФЫ, НОРМАТИВНЫЕ АКТЫ И РЕКОМЕНДАЦИИ",добрый день кемеровский область согласно поста...
4,"Здравствуйте, в моем городе Кострома введено о...",3.0,"ШТРАФЫ, НОРМАТИВНЫЕ АКТЫ И РЕКОМЕНДАЦИИ",здравствовать мыть город кострома ввести огран...


In [11]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)

In [15]:
segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

def ner(text: str) -> int:
    text = str(text)
    doc = Doc(text)

    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    ner = [word.text for word in doc.spans]
    #  если в тексте нет именованных сущностей, то возвращает 0
    if len(ner) == 0:
        return 0
    else:
        return 1

In [16]:
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas('PROGRESS>>>')

In [17]:
# добавляем признак -- наличие (1) или отсутствие (0) именованных сущностей в тексте
df['Наличие именованных сущностей'] = df['Текст вопроса'].progress_apply(ner)

HBox(children=(FloatProgress(value=0.0, max=2295.0), HTML(value='')))




In [18]:
def special_words(text):
    if 'коронавирус' or 'ковид' in text.split():
        return 1
    else:
        return 0

In [19]:
# добавляем признак -- наличие (1) или отсутствие (0) в тексте слова 'коронавирус'
df['Наличие специальных слов'] = df['Текст с препроцессингом'].progress_apply(special_words)

HBox(children=(FloatProgress(value=0.0, max=2295.0), HTML(value='')))




In [20]:
# добавляем признак -- количество слов в тексте
df['Количество слов в запросе'] = [len(str(text).split()) for text in df['Текст с препроцессингом']]

In [21]:
df = df[~df.isin([np.nan, np.inf, -np.inf]).any(1)]

In [22]:
df.head()

Unnamed: 0,Текст вопроса,Номер связки\n,Тематика,Текст с препроцессингом,Наличие именованных сущностей,Наличие специальных слов,Количество слов в запросе
0,с уважением Вероника Игоревна Ильич\n\nПосле ...,308.0,"ЗАКРЫТИЕ ГРАНИЦ, ОТКРЫТИЕ ГРАНИЦ РОССИИ И АВИА...",уважение вероника игорь илья август год москва...,1,1,22
1,"Здравствуйте! Проинформируйте, пожалуйста, нуж...",324.0,ОРГАНИЗАЦИИ ОТДЫХА ДЕТЕЙ И ИХ ЗДОРОВЛЕНИЯ,здравствовать проинформировать пожалуйста нужн...,0,1,9
2,"--\nДобрый день!\n Меня, Сидельникова Андрея...",57.0,БОЛЬНИЧНЫЙ ЛИСТ,добрый день сидельников андрей олег 30071989гр...,1,1,62
3,Добрый день.\nВ Кемеровской области согласно п...,45.0,"ШТРАФЫ, НОРМАТИВНЫЕ АКТЫ И РЕКОМЕНДАЦИИ",добрый день кемеровский область согласно поста...,1,1,33
4,"Здравствуйте, в моем городе Кострома введено о...",3.0,"ШТРАФЫ, НОРМАТИВНЫЕ АКТЫ И РЕКОМЕНДАЦИИ",здравствовать мыть город кострома ввести огран...,1,1,39


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()

In [24]:
x = vec.fit_transform(df['Текст с препроцессингом'])

In [25]:
df1 = pd.DataFrame(x.toarray(), columns=vec.get_feature_names())
df1.head()

Unnamed: 0,00569гра,01082020г,010920г,0109семья,0207092020г,03082020г,03082020положительный,03карантин,05041969г,05071983гра,05092020выставка,06012007захаров,06082020г,07082020г,07082020гести,07082020должный,0709говорятчто,07862638611тест,079,08я,09082020г,1000ный,100819г,10082020г,100820г,10е,10и,10ий,10й,10ть,10чело,10ый,11041984гра,1105тг,11081986гра,11082020г,11год,11й,11переходить,12082020г,...,юридический,юрий,юрк,юров,юрчук,юсахалинск,явиться,явка,явлеиться,явление,являться,явно,явный,яворац,ягановый,язык,языкеполучить,яким,якимов,якобы,яковлев,якутия,якутск,якушев,ялта,ян,янао,январь,яндекспочта,янина,янкова,япония,ярасовый,ярмарка,ярославль,ярый,ясно,ясность,ясный,яхромский
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
res = pd.concat([df, df1], axis=1)
res.head()

Unnamed: 0,Текст вопроса,Номер связки\n,Тематика,Текст с препроцессингом,Наличие именованных сущностей,Наличие специальных слов,Количество слов в запросе,00569гра,01082020г,010920г,0109семья,0207092020г,03082020г,03082020положительный,03карантин,05041969г,05071983гра,05092020выставка,06012007захаров,06082020г,07082020г,07082020гести,07082020должный,0709говорятчто,07862638611тест,079,08я,09082020г,1000ный,100819г,10082020г,100820г,10е,10и,10ий,10й,10ть,10чело,10ый,11041984гра,...,юридический,юрий,юрк,юров,юрчук,юсахалинск,явиться,явка,явлеиться,явление,являться,явно,явный,яворац,ягановый,язык,языкеполучить,яким,якимов,якобы,яковлев,якутия,якутск,якушев,ялта,ян,янао,январь,яндекспочта,янина,янкова,япония,ярасовый,ярмарка,ярославль,ярый,ясно,ясность,ясный,яхромский
0,с уважением Вероника Игоревна Ильич\n\nПосле ...,308.0,"ЗАКРЫТИЕ ГРАНИЦ, ОТКРЫТИЕ ГРАНИЦ РОССИИ И АВИА...",уважение вероника игорь илья август год москва...,1.0,1.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Здравствуйте! Проинформируйте, пожалуйста, нуж...",324.0,ОРГАНИЗАЦИИ ОТДЫХА ДЕТЕЙ И ИХ ЗДОРОВЛЕНИЯ,здравствовать проинформировать пожалуйста нужн...,0.0,1.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"--\nДобрый день!\n Меня, Сидельникова Андрея...",57.0,БОЛЬНИЧНЫЙ ЛИСТ,добрый день сидельников андрей олег 30071989гр...,1.0,1.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Добрый день.\nВ Кемеровской области согласно п...,45.0,"ШТРАФЫ, НОРМАТИВНЫЕ АКТЫ И РЕКОМЕНДАЦИИ",добрый день кемеровский область согласно поста...,1.0,1.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Здравствуйте, в моем городе Кострома введено о...",3.0,"ШТРАФЫ, НОРМАТИВНЫЕ АКТЫ И РЕКОМЕНДАЦИИ",здравствовать мыть город кострома ввести огран...,1.0,1.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
res = res[~res.isin([np.nan, np.inf, -np.inf]).any(1)]

In [28]:
target = res['Номер связки\n']

In [29]:
# создаем датасет с тфидф и дополнительными признаками
df_features = res.drop(['Текст вопроса', 'Номер связки\n', 'Тематика ', 'Текст с препроцессингом'], axis=1)

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_features, target, 
                                                    test_size=0.3, random_state=42)

# Классификация

Будем предсказывать номер связки (номер тематики запроса)

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
clf = LogisticRegression()
clf.fit(X_train, pd.to_numeric(y_train))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
predicted = clf.predict(X_test)

In [34]:
true = pd.to_numeric(y_test)

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
acc = accuracy_score(true, predicted)
print('acc={0:1.4f}'.format(acc))

acc=0.4167


Попробуем добавить в TfidfVectorizer параметры и не использовать дополнительные признаки (наличие и отсутствие именных сущностей и др.)

In [37]:
def get_X(text):
    data = vec.fit_transform(text)
    return pd.DataFrame(data.todense(), columns=vec.get_feature_names(), index=df.index)

In [38]:
vec = TfidfVectorizer(ngram_range=(1,3), max_df=0.5, max_features=100)
new_df = get_X(df['Текст с препроцессингом'])
new_df.head(5)

Unnamed: 0,абхазия,август,анализ,больничный,ваш,весь,взять,возвращение,вопрос,врач,год,госуслуга,гражданин,граница,дать,действие,делать,добрый день,документ,должный,дом,здравствовать,иметь,инфекция,информация,карантин,ковид,контакт,коронавирус,коронавирусный,коронавирусный инфекция,короновирус,который,лицо,мазка,медицинский,место,москва,мочь,находиться,...,просить,работа,работать,ребёнок,режим,результат,результат тест,роспотребнадзор,российский,российский федерация,россия,сайт,самоизоляция,сдавать,сдавать тест,сдать,сдать тест,сдача,сделать,сегодня,симптом,ситуация,сказать,случай,сообщить,спасибо,справка,срок,температура,территория,тест,тест ковид,тест коронавирус,тестирование,течение,турция,уважение,федерация,человек,это
0,0.0,0.323874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.341535,0.0,0.0,...,0.0,0.0,0.0,0.0,0.417174,0.0,0.0,0.0,0.0,0.0,0.352877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.559779,0.0,0.0,0.0
1,0.0,0.541691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.477908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.403578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.253768,0.0,0.228304,0.0,0.0,0.21655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132416,0.0,0.214322,0.0,0.0,0.27349,0.0,0.0,0.461467,0.0,0.0,0.199294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235263,...,0.234972,0.245933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266459,0.218615,0.284056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192145
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.481651,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.346975,0.0,0.0,0.0,0.39103,0.267386,0.0,0.0,0.0,0.0,0.0,0.0,0.312925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.294065,0.0,...,0.0,0.386155,0.0,0.0,0.0,0.0,0.0,0.353576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3017


In [39]:
X_train, X_test, y_train, y_test = train_test_split(new_df, df['Номер связки\n'], 
                                                    test_size=0.3, random_state=42)

In [40]:
clf.fit(X_train, pd.to_numeric(y_train))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [41]:
predicted = clf.predict(X_test)
true = pd.to_numeric(y_test)
acc = accuracy_score(true, predicted)
print('acc={0:1.4f}'.format(acc))

acc=0.6122


Попробуем другой классификатор

In [42]:
from sklearn.tree import DecisionTreeClassifier

In [43]:
clf_2 = DecisionTreeClassifier()

In [None]:
#new_df['Наличие именованных сущностей'] = df['Наличие именованных сущностей']
#new_df['Наличие специальных слов'] = df['Наличие специальных слов']
#new_df['Количество слов в запросе'] = df['Количество слов в запросе']

In [44]:
X_train, X_test, y_train, y_test = train_test_split(df_features, target, 
                                                    test_size=0.3, random_state=42)

In [45]:
clf_2.fit(X_train, pd.to_numeric(y_train))

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [46]:
predicted = clf_2.predict(X_test)
true = pd.to_numeric(y_test)
acc = accuracy_score(true, predicted)
print('acc={0:1.4f}'.format(acc))

acc=0.2997


И его же, но без использования дополнительных признаков

In [47]:
X_train, X_test, y_train, y_test = train_test_split(new_df, df['Номер связки\n'], 
                                                    test_size=0.3, random_state=42)

In [48]:
clf_2.fit(X_train, pd.to_numeric(y_train))

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [49]:
predicted = clf_2.predict(X_test)
true = pd.to_numeric(y_test)
acc = accuracy_score(true, predicted)
print('acc={0:1.4f}'.format(acc))

acc=0.5058


Видно, что самое хорошее качество получается на классификации с логистической регрессией без использования каких-либо признаков, кроме слов из словаря. Попробуем использовать CountVectorizer вместо TfidfVectorizer 

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

In [51]:
vec = CountVectorizer()

In [52]:
new_df = get_X(df['Текст с препроцессингом'])
new_df.head(5)

Unnamed: 0,00569гра,01082020г,010920г,0109семья,0207092020г,03082020г,03082020положительный,03карантин,05041969г,05071983гра,05092020выставка,06012007захаров,06082020г,07082020г,07082020гести,07082020должный,0709говорятчто,07862638611тест,079,08я,09082020г,1000ный,100819г,10082020г,100820г,10е,10и,10ий,10й,10ть,10чело,10ый,11041984гра,1105тг,11081986гра,11082020г,11год,11й,11переходить,12082020г,...,юридический,юрий,юрк,юров,юрчук,юсахалинск,явиться,явка,явлеиться,явление,являться,явно,явный,яворац,ягановый,язык,языкеполучить,яким,якимов,якобы,яковлев,якутия,якутск,якушев,ялта,ян,янао,январь,яндекспочта,янина,янкова,япония,ярасовый,ярмарка,ярославль,ярый,ясно,ясность,ясный,яхромский
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [53]:
X_train, X_test, y_train, y_test = train_test_split(new_df, df['Номер связки\n'], 
                                                    test_size=0.3, random_state=42)

In [54]:
clf.fit(X_train, pd.to_numeric(y_train))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [55]:
predicted = clf.predict(X_test)
true = pd.to_numeric(y_test)
acc = accuracy_score(true, predicted)
print('acc={0:1.4f}'.format(acc))

acc=0.6749
