In [1]:
import numpy as np
import pandas as pd

import re
from functools import lru_cache
from pymorphy3 import MorphAnalyzer

from nltk.corpus import stopwords

from tqdm.notebook import tqdm

from sklearn import model_selection, metrics

In [2]:
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Seed for the stochastic steps of the notebook (e.g. the train/test split).
RANDOM_STATE = 42

# Raw dataset; later cells read its `text` and `intent` columns.
data = pd.read_csv('../datasets/intent_dataset.csv')

In [4]:
# Intent label -> integer class id used as the classification target.
class_map = dict(open=0, write=1, close=2, delete=3, mute=4)

# Runs of lowercase Cyrillic / Latin letters (plus ё/Ё); any other character
# splits tokens. Callers lowercase the text before matching.
regex = re.compile("[а-яa-zёЁ]+")

# Single shared analyzer instance, used as the default by lemmatize_word.
m = MorphAnalyzer()

In [5]:
# Encode the string intent labels as integer class ids.
# The dtype guard keeps this cell idempotent: running it a second time would
# otherwise map the already-encoded integers through class_map and turn the
# whole column into NaN.
if not pd.api.types.is_numeric_dtype(data['intent']):
    intent_codes = data['intent'].map(class_map)
    # Series.map yields NaN for labels missing from class_map; fail loudly
    # here instead of letting NaN targets reach the split / model.
    unmapped = data.loc[intent_codes.isna(), 'intent'].unique()
    if len(unmapped):
        raise ValueError(f"Intent labels missing from class_map: {list(unmapped)}")
    data['intent'] = intent_codes.astype('int64')

In [6]:
def words_only(text, regex=regex):
    """Lowercase ``text`` and return every substring matched by ``regex``.

    Args:
        text: Raw utterance. Non-string values (e.g. NaN/None coming from a
            pandas column) are treated as empty input.
        regex: Compiled pattern defining what counts as a token.

    Returns:
        List of matched tokens in order of appearance; ``[]`` for non-string input.
    """
    # Explicit type check instead of a bare ``except:``: missing values still
    # yield [], but genuine errors (bad pattern object, KeyboardInterrupt, ...)
    # are no longer silently swallowed.
    if not isinstance(text, str):
        return []
    return regex.findall(text.lower())

In [7]:
# NOTE(review): maxsize=128 keeps only the 128 most recently used entries;
# a larger cache may help if the vocabulary is much bigger — confirm by profiling.
@lru_cache(maxsize=128)
def lemmatize_word(token, pymorphy=m):
    """Return the normal form of the first parse ``pymorphy`` gives for ``token``.

    Results are memoised by ``lru_cache``, so a repeated token does not hit
    the analyzer again while its entry is still cached.
    """
    parses = pymorphy.parse(token)
    return parses[0].normal_form

def lemmatize_text(text):
    """Apply ``lemmatize_word`` to each item of ``text`` and return the results as a list.

    ``text`` is iterated item by item; ``clean_text`` passes it a list of tokens.
    """
    return list(map(lemmatize_word, text))


# Russian stop-word list from the NLTK corpus; default filter for remove_stopwords.
mystopwords = stopwords.words('russian')


def remove_stopwords(lemmas, stopwords=mystopwords, min_length=4):
    """Drop stop words and short tokens from ``lemmas``.

    Args:
        lemmas: Iterable of lemma strings.
        stopwords: Container of words to discard (anything supporting ``in``).
        min_length: Minimum number of characters a lemma needs to be kept.
            The default of 4 keeps the previous hard-coded ``len(w) > 3`` rule.

    Returns:
        List of the lemmas that passed both filters, in their original order.
    """
    return [w for w in lemmas if w not in stopwords and len(w) >= min_length]

def clean_text(text):
    """Turn a raw utterance into a space-separated string of filtered lemmas.

    Pipeline: tokenise (``words_only``) -> lemmatise (``lemmatize_text``)
    -> drop stop words / short tokens (``remove_stopwords``) -> join with spaces.
    """
    kept = remove_stopwords(lemmatize_text(words_only(text)))
    return ' '.join(kept)

In [8]:
# Hold out 10% of the rows for testing; stratifying on the intent column keeps
# the class proportions the same in both parts.
train_df, test_df, y_train, y_test = model_selection.train_test_split(
    data.drop(columns='intent'),
    data['intent'],
    test_size=0.1,
    random_state=RANDOM_STATE,
    stratify=data['intent'],
)

In [9]:
%%time
# Run the same cleaning pipeline over the raw text of both splits and store
# the result next to it in a new `lemmas` column.
train_df['lemmas'] = train_df['text'].apply(clean_text)
test_df['lemmas'] = test_df['text'].apply(clean_text)

CPU times: user 822 ms, sys: 0 ns, total: 822 ms
Wall time: 825 ms


In [10]:
%%time
# TF-IDF over uni- and bi-grams of the lemmatised text; the vocabulary is
# learned from the training split only.
vec = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vec.fit_transform(train_df['lemmas'])

# Use the notebook-wide seed rather than a second hard-coded 42, so changing
# RANDOM_STATE affects the split and the model together.
clf = LogisticRegression(random_state=RANDOM_STATE)
clf.fit(tfidf, y_train)

# The test split is only transformed with the already-fitted vectorizer.
pred = clf.predict(vec.transform(test_df['lemmas']))
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but this order stays correct if the metric is swapped.
metrics.accuracy_score(y_test, pred)

CPU times: user 2.68 s, sys: 4.75 s, total: 7.43 s
Wall time: 580 ms


0.98046875

In [11]:
%%time
# Inference only (vectorise + predict), timed separately from model fitting.
pred = clf.predict(vec.transform(test_df['lemmas']))
# Arguments in scikit-learn's (y_true, y_pred) order; the accuracy value is the same.
metrics.accuracy_score(y_test, pred)

CPU times: user 4.02 ms, sys: 21 µs, total: 4.04 ms
Wall time: 3.35 ms


0.98046875

In [34]:
# Per-class precision / recall / F1 on the held-out split. The report is a
# multi-line string, so it is printed rather than shown as a repr.
print(metrics.classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        55
           1       0.96      0.96      0.96        51
           2       0.98      1.00      0.99        51
           3       0.98      0.98      0.98        53
           4       0.98      0.96      0.97        46

    accuracy                           0.98       256
   macro avg       0.98      0.98      0.98       256
weighted avg       0.98      0.98      0.98       256



In [32]:
%%time
# Time the cleaning pipeline on a single utterance. The sample is seeded so
# the same row (and output) is shown on every Restart & Run All.
train_df['text'].sample(n=1, random_state=RANDOM_STATE).map(clean_text)

CPU times: user 1.6 ms, sys: 0 ns, total: 1.6 ms
Wall time: 1.67 ms


407    открыть страница доставка
Name: text, dtype: object