In [35]:
import numpy as np
import pandas as pd

# Display configuration: show every column, keep wide frames on one line, and
# allow long text cells (up to 800 characters) to be printed in full.
# Full option names are used throughout; abbreviated names rely on pandas'
# partial matching and stop working if another option with a similar name appears.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', 800)

import re
from functools import lru_cache
from pymorphy3 import MorphAnalyzer

from nltk.corpus import stopwords

from tqdm.notebook import tqdm

from sklearn import model_selection, metrics

In [39]:
#1
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [71]:
#2
from gensim.models import Word2Vec, FastText

In [41]:
# Seed shared by every call in this notebook that takes a random_state.
RANDOM_STATE = 42

# Raw intent dataset, read from a path relative to the working directory.
data = pd.read_csv(filepath_or_buffer='../datasets/intent_dataset.csv')

In [42]:
# Token pattern: runs of Cyrillic or Latin letters (callers lowercase the text first).
regex = re.compile("[а-яa-zёЁ]+")
# Single shared morphological analyzer used for lemmatization.
m = MorphAnalyzer()

# Intent label -> integer class id; ids are assigned in the order listed.
class_map = {
    label: class_id
    for class_id, label in enumerate(('open', 'write', 'close', 'delete', 'mute'))
}

In [43]:
# Encode string intents as integer class ids.
# Guarded so that re-running the cell is a no-op: pushing already-encoded ids
# through class_map again would turn every label into NaN.
if not data['intent'].isin(list(class_map.values())).all():
    data['intent'] = data['intent'].map(class_map)

In [44]:
def words_only(text, regex=regex):
    """Return every match of ``regex`` in the lowercased ``text``.

    Args:
        text: raw string to tokenize.
        regex: compiled pattern whose matches are the tokens.

    Returns:
        List of matched substrings, or an empty list when ``text`` cannot be
        lowercased or searched (for example a NaN read from a CSV).
    """
    try:
        return regex.findall(text.lower())
    except (AttributeError, TypeError):
        # Best-effort on non-string input, but no longer swallows
        # KeyboardInterrupt / SystemExit the way a bare ``except`` did.
        return []

In [45]:
# The cache has to hold the working vocabulary to be useful; with only 128
# slots, lemmas are evicted and re-parsed as soon as the corpus has more distinct tokens.
@lru_cache(maxsize=100_000)
def lemmatize_word(token, pymorphy=m):
    """Return the normal form of ``token`` taken from the analyzer's first parse.

    Args:
        token: a single word.
        pymorphy: analyzer object exposing ``parse(token)``.

    Returns:
        ``normal_form`` of the first parse result. Results are memoized per
        ``(token, pymorphy)`` pair.
    """
    return pymorphy.parse(token)[0].normal_form

def lemmatize_text(text):
    """Lemmatize each token of ``text`` (an iterable of words); returns a list."""
    return list(map(lemmatize_word, text))


mystopwords = stopwords.words('russian')


def remove_stopwords(lemmas, stopwords = mystopwords):
    """Drop stop words and short lemmas.

    Args:
        lemmas: iterable of lemma strings.
        stopwords: collection of words to remove.

    Returns:
        List of the lemmas that are longer than 3 characters and are not in
        ``stopwords``, in their original order.
    """
    # A list/tuple would be scanned linearly for every lemma; hash it once.
    # Other container types (sets, dicts, ...) are used as given.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return [w for w in lemmas if len(w) > 3 and w not in stopwords]

def clean_text(text):
    """Tokenize, lemmatize and filter ``text``; return the result as one space-joined string."""
    kept = remove_stopwords(lemmatize_text(words_only(text)))
    return ' '.join(kept)

## Logistic regression + TF-IDF + basic preprocessing

In [60]:
# Hold out 10% of the rows for testing, keeping the intent proportions equal
# in both parts (stratified split with a fixed seed).
train_df, test_df, y_train, y_test = model_selection.train_test_split(
    data.drop(columns='intent'),
    data['intent'],
    test_size=0.1,
    stratify=data['intent'],
    random_state=RANDOM_STATE,
)

In [61]:
%%time
# Store the cleaned, lemmatized text of both splits in a new 'lemmas' column.
for split_df in (train_df, test_df):
    split_df['lemmas'] = split_df['text'].map(clean_text)

CPU times: user 839 ms, sys: 1.88 ms, total: 841 ms
Wall time: 840 ms


In [69]:
%%time
# Uni- and bigram TF-IDF features; the vocabulary is fitted on the training
# split only and merely applied to the test split.
vec = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vec.fit_transform(train_df['lemmas'])

clf = LogisticRegression(random_state=RANDOM_STATE)
clf.fit(tfidf, y_train)

pred = clf.predict(vec.transform(test_df['lemmas']))
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value
# is unchanged, but the order now matches the classification_report call.
metrics.accuracy_score(y_test, pred)

CPU times: user 1.92 s, sys: 3.11 s, total: 5.03 s
Wall time: 347 ms


0.98046875

In [11]:
%%time
# Inference only (vectorize + predict), timed separately from training.
pred = clf.predict(vec.transform(test_df['lemmas']))
# Arguments in the documented (y_true, y_pred) order; the score is the same.
metrics.accuracy_score(y_test, pred)

CPU times: user 4.02 ms, sys: 21 µs, total: 4.04 ms
Wall time: 3.35 ms


0.98046875

In [34]:
# Per-class precision / recall / F1 for the most recent predictions.
print(metrics.classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        55
           1       0.96      0.96      0.96        51
           2       0.98      1.00      0.99        51
           3       0.98      0.98      0.98        53
           4       0.98      0.96      0.97        46

    accuracy                           0.98       256
   macro avg       0.98      0.98      0.98       256
weighted avg       0.98      0.98      0.98       256



In [32]:
%%time
# Time the preprocessing of one randomly drawn training utterance.
(
    train_df['text']
    .sample(n=1)
    .map(clean_text)
)

CPU times: user 1.6 ms, sys: 0 ns, total: 1.6 ms
Wall time: 1.67 ms


407    открыть страница доставка
Name: text, dtype: object

## Logistic regression + Word2Vec / FastText embeddings

In [108]:
# Train 50-d Word2Vec vectors on the training lemmas and keep only the
# resulting KeyedVectors (`.wv`), not the trainable model.
# The seed ties the run to RANDOM_STATE like the rest of the notebook.
# NOTE: per the gensim docs, fully deterministic training additionally needs
# workers=1 and a fixed PYTHONHASHSEED.
word2vec_model = Word2Vec(
    sentences=list(train_df.lemmas.str.split()),
    vector_size=50,
    window=5,
    workers=4,
    min_count=0,
    seed=RANDOM_STATE,
).wv

In [109]:
# Train 50-d FastText vectors on the training lemmas and keep only the
# resulting KeyedVectors (`.wv`), not the trainable model.
# The seed ties the run to RANDOM_STATE like the rest of the notebook.
# NOTE: per the gensim docs, fully deterministic training additionally needs
# workers=1 and a fixed PYTHONHASHSEED.
fasttext_model = FastText(
    sentences=list(train_df.lemmas.str.split()),
    vector_size=50,
    window=5,
    workers=4,
    min_count=0,
    seed=RANDOM_STATE,
).wv

In [110]:
def get_embeddings(df, model, embed_size=None):
    """Average the word vectors of every document in ``df.lemmas``.

    Each whitespace-separated token is looked up with ``model[word]``. No
    vocabulary check is made, so any error the model raises for an unknown
    word propagates to the caller.

    Args:
        df: frame with a string column ``lemmas``.
        model: object supporting ``model[word] -> vector``.
        embed_size: dimensionality of the vectors. ``None`` (the default)
            reads it from ``model.vector_size`` and falls back to 300, the
            former hard-coded default, when the model has no such attribute.

    Returns:
        ``np.ndarray`` with one averaged vector per document; a document with
        no tokens yields an all-zero vector.
    """
    if embed_size is None:
        # A fixed default of 300 breaks on any model of another size
        # (shape mismatch in the in-place addition below).
        embed_size = getattr(model, 'vector_size', 300)

    doc_vectors = []

    for doc in tqdm(df.lemmas.str.split()):
        res = np.zeros(embed_size)
        for word in doc:
            res += model[word]
        if doc:
            # Every token contributed, so the count is simply len(doc).
            res /= len(doc)

        doc_vectors.append(res)

    return np.array(doc_vectors)

In [111]:
def get_embeddings_w2v(df, model, embed_size=None):
    """Average the in-vocabulary word vectors of every document in ``df.lemmas``.

    Tokens for which ``word in model`` is false are skipped and do not count
    towards the average.

    Args:
        df: frame with a string column ``lemmas``.
        model: object supporting ``word in model`` and ``model[word] -> vector``.
        embed_size: dimensionality of the vectors. ``None`` (the default)
            reads it from ``model.vector_size`` and falls back to 300, the
            former hard-coded default, when the model has no such attribute.

    Returns:
        ``np.ndarray`` with one averaged vector per document; a document with
        no known tokens yields an all-zero vector.
    """
    if embed_size is None:
        # A fixed default of 300 breaks on any model of another size
        # (shape mismatch in the in-place addition below).
        embed_size = getattr(model, 'vector_size', 300)

    doc_vectors = []

    for doc in tqdm(df.lemmas.str.split()):
        res = np.zeros(embed_size)
        cnt = 0
        for word in doc:
            if word in model:
                res += model[word]
                cnt += 1
        if cnt != 0:
            res /= cnt
        doc_vectors.append(res)

    return np.array(doc_vectors)

In [112]:
%%time
# Word2Vec: one averaged 50-d vector per document, train split first, then test.
train_w2v, test_w2v = (
    get_embeddings_w2v(split_df, word2vec_model, embed_size=50)
    for split_df in (train_df, test_df)
)

  0%|          | 0/2301 [00:00<?, ?it/s]

  0%|          | 0/256 [00:00<?, ?it/s]

CPU times: user 123 ms, sys: 1.82 ms, total: 124 ms
Wall time: 120 ms


In [113]:
# NOTE: this refits the existing `clf` object on the Word2Vec features,
# replacing whatever it was fitted on before.
clf.fit(pd.DataFrame(train_w2v), y_train)

pred = clf.predict(pd.DataFrame(test_w2v))
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the
# reported value does not change.
metrics.accuracy_score(y_test, pred)

0.6640625

In [114]:
%%time
# FastText: one averaged 50-d vector per document, train split first, then test.
train_fasttext, test_fasttext = (
    get_embeddings(split_df, fasttext_model, embed_size=50)
    for split_df in (train_df, test_df)
)

  0%|          | 0/2301 [00:00<?, ?it/s]

  0%|          | 0/256 [00:00<?, ?it/s]

CPU times: user 96.6 ms, sys: 12.3 ms, total: 109 ms
Wall time: 109 ms


In [115]:
# NOTE: this refits the existing `clf` object on the FastText features,
# replacing whatever it was fitted on before.
clf.fit(pd.DataFrame(train_fasttext), y_train)

pred = clf.predict(pd.DataFrame(test_fasttext))
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the
# reported value does not change.
metrics.accuracy_score(y_test, pred)

0.34375