In [26]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 800)

import re
from functools import lru_cache
from pymorphy3 import MorphAnalyzer

from nltk.corpus import stopwords

from tqdm.notebook import tqdm

from sklearn import model_selection, metrics

In [27]:
#1
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [28]:
#2
from gensim.models import Word2Vec, FastText

In [29]:
data = pd.read_csv('../datasets/intent_dataset.csv')

RANDOM_STATE = 42

In [30]:
m = MorphAnalyzer()
regex = re.compile("[а-яёЁ]+")

class_map = {
    'open': 0,
    'write': 1,
    'close': 2,
    'delete': 3,
    'mute': 4
}

In [31]:
data['intent'] = data['intent'].map(class_map)

In [32]:
def words_only(text, regex=regex):
    try:
        return regex.findall(text.lower())
    except:
        return []

In [33]:
@lru_cache(maxsize=128)
def lemmatize_word(token, pymorphy=m):
    return pymorphy.parse(token)[0].normal_form

def lemmatize_text(text):
    return [lemmatize_word(w) for w in text]


mystopwords = stopwords.words('russian') 
def remove_stopwords(lemmas, stopwords = mystopwords):
    return [w for w in lemmas if not w in stopwords and len(w) > 3]

def clean_text(text):
    tokens = words_only(text)
    lemmas = lemmatize_text(tokens)
    
    return ' '.join(remove_stopwords(lemmas))

## logisticreg + tfidf + base preprocessing

In [13]:
train_df, test_df, y_train, y_test = model_selection.train_test_split(data.drop('intent', axis=1), data['intent'], 
                                                                      test_size=0.1,
                                                                      random_state=RANDOM_STATE, 
                                                                      stratify=data['intent'])

In [14]:
%%time
train_df['lemmas'] = train_df['text'].map(clean_text)
test_df['lemmas'] = test_df['text'].map(clean_text)

CPU times: user 803 ms, sys: 297 µs, total: 804 ms
Wall time: 805 ms


In [15]:
%%time
vec = TfidfVectorizer(ngram_range=(1, 3))
tfidf = vec.fit_transform(train_df['lemmas'])

clf = LogisticRegression(random_state=RANDOM_STATE, max_iter=10000)
clf.fit(tfidf, y_train)

pred = clf.predict(vec.transform(test_df['lemmas']))
metrics.accuracy_score(pred, y_test)

CPU times: user 2.79 s, sys: 5.39 s, total: 8.18 s
Wall time: 613 ms


0.984375

In [16]:
%%time
pred = clf.predict(vec.transform(test_df['lemmas']))
metrics.accuracy_score(pred, y_test)

CPU times: user 654 µs, sys: 3.83 ms, total: 4.49 ms
Wall time: 3.86 ms


0.984375

In [279]:
test_df['lemmas']

191                 открыть сообщение друг который жить страна
2424                       отключить уведомление выходной день
160                                  запустить новый настройка
243                                                  запустить
1667                         удалить сообщение отметить важный
                                 ...                          
2545    хотеть получать уведомление активный приложение минута
1563                               закрыть подменить настройка
2055                           убрать свой сообщение последний
1077                                   закрыть диалоговый окно
1658          удалить сообщение который отправить определённый
Name: lemmas, Length: 256, dtype: object

In [211]:
print(metrics.classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        55
           1       0.98      0.96      0.97        51
           2       0.98      1.00      0.99        51
           3       0.98      1.00      0.99        53
           4       0.98      0.96      0.97        46

    accuracy                           0.98       256
   macro avg       0.98      0.98      0.98       256
weighted avg       0.98      0.98      0.98       256



In [32]:
%%time
train_df['text'].sample().map(clean_text)

CPU times: user 1.6 ms, sys: 0 ns, total: 1.6 ms
Wall time: 1.67 ms


407    открыть страница доставка
Name: text, dtype: object

## logisticreg + Word2Vec(FastText)

In [108]:
word2vec_model = Word2Vec(sentences=list(train_df.lemmas.str.split()), 
                          vector_size=50, window=5, workers=4, min_count=0).wv

In [109]:
fasttext_model = FastText(sentences=list(train_df.lemmas.str.split()),
                         vector_size=50, window=5, workers=4, min_count=0).wv

In [110]:
def get_embeddings(df, model, embed_size=300):
    doc_vectors = []
    
    for doc in tqdm(df.lemmas.str.split()):
        res = np.zeros(embed_size)
        cnt = 0
        for word in doc:
            res += model[word]
            cnt += 1
        if cnt != 0:
            res /= cnt
        
        doc_vectors.append(res)
    
    return np.array(doc_vectors)

In [111]:
def get_embeddings_w2v(df, model, embed_size=300):
    doc_vectors = []
    
    for doc in tqdm(df.lemmas.str.split()):
        res = np.zeros(embed_size)
        cnt = 0
        for word in doc:
            if model.__contains__(word):
                res += model[word]
                cnt += 1
        if cnt != 0:
            res /= cnt
        doc_vectors.append(res)
        
    return np.array(doc_vectors)

In [112]:
%%time
#word2vec
train_w2v = get_embeddings_w2v(train_df, word2vec_model, 50)
test_w2v = get_embeddings_w2v(test_df, word2vec_model, 50)

  0%|          | 0/2301 [00:00<?, ?it/s]

  0%|          | 0/256 [00:00<?, ?it/s]

CPU times: user 123 ms, sys: 1.82 ms, total: 124 ms
Wall time: 120 ms


In [113]:
clf.fit(pd.DataFrame(train_w2v), y_train)

pred = clf.predict(pd.DataFrame(test_w2v))
metrics.accuracy_score(pred, y_test)

0.6640625

In [114]:
%%time
#fasttext
train_fasttext = get_embeddings(train_df, fasttext_model, 50)
test_fasttext = get_embeddings(test_df, fasttext_model, 50)

  0%|          | 0/2301 [00:00<?, ?it/s]

  0%|          | 0/256 [00:00<?, ?it/s]

CPU times: user 96.6 ms, sys: 12.3 ms, total: 109 ms
Wall time: 109 ms


In [115]:
clf.fit(pd.DataFrame(train_fasttext), y_train)

pred = clf.predict(pd.DataFrame(test_fasttext))
metrics.accuracy_score(pred, y_test)

0.34375

## logisticreg + Word2Vec(FastText) + TF-IDF(CountVec) weights

In [188]:
def w2v_tfidf(df, model, embed_size=50):
    doc_vectors = []
    
    for doc in tqdm(df.lemmas.str.split()):
        res = np.zeros(embed_size)
        cnt = 0
        for word in doc:
            if model.__contains__(word):
                res += tf_idf_voc[word] * model[word]
                cnt += 1
        if cnt != 0:
            res /= cnt
        doc_vectors.append(res)
    
    return doc_vectors

In [198]:
vec = TfidfVectorizer(ngram_range=(1, 1))
tfidf = vec.fit_transform(train_df['lemmas'])

tf_idf_voc = {word: vec.idf_[i] for word, i in vec.vocabulary_.items()}

In [199]:
%%time
#fasttext
train_w2v = w2v_tfidf(train_df, word2vec_model, 50)
test_w2v = w2v_tfidf(test_df, word2vec_model, 50)

  0%|          | 0/2301 [00:00<?, ?it/s]

  0%|          | 0/256 [00:00<?, ?it/s]

CPU times: user 82 ms, sys: 5.79 ms, total: 87.8 ms
Wall time: 85.6 ms


In [200]:
clf.fit(pd.DataFrame(train_w2v), y_train)

pred = clf.predict(pd.DataFrame(test_w2v))
metrics.accuracy_score(pred, y_test)

0.80078125

In [196]:
#тестить предобученные не буду\ нет смысла.

## conv

In [13]:
import torchtext
import torch
from torchtext.vocab import vocab
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
from torch import nn

from collections import OrderedDict, Counter

In [59]:
max_len = 30
embedding_dim = 50
num_epochs = 15
batch_size = 16
lr = 1e-4

In [35]:
counter = Counter()

for line in data.itertuples():
    tokens = clean_text(line.text).split()
    sup_counter = Counter(tokens)
    counter.update(sup_counter)
    1
specials = ['<pad>', '<unk>']
for special in specials:
    counter[special] = 0
    
ordered_dict = OrderedDict(sorted(counter.items(), key=lambda x: x[1], reverse=True))

In [36]:
vocabular = vocab(ordered_dict, min_freq=1, specials=['<pad>', '<unk>'])
vocabular.set_default_index(vocabular['<unk>'])

In [37]:
vocabular.lookup_indices(['<pad>', '<unk>']), len(vocabular)

([0, 1], 1032)

In [63]:
#mapping
sent = list(data.text.map(clean_text).str.split())
fasttext_model = FastText(sentences=sent,
                         vector_size=50, window=5, workers=4, min_count=0).wv

In [39]:
class CustomDataset(Dataset):
    
    def __init__(self, df, vocabular, max_len):
        super().__init__()
        
        self.df = df
        self.vocabular = vocabular
        self.max_len = max_len
        
        self.texts = []
        self.tokens = []
        self.labels = []
        
        for line in df.itertuples():
            self.texts.append(line.text)
            toks = clean_text(line.text).split()[:max_len]
            self.tokens.append([self.vocabular[t] for t in toks])
            self.labels.append(line.intent)
            
    def __getitem__(self, idx):
        res = {}
        
        res['text'] = self.texts[idx]
        res['labels'] = torch.as_tensor(self.labels[idx], dtype=torch.int64)
        res['tokens'] = torch.as_tensor(self.tokens[idx], dtype=torch.int64)
        res['tokens_len'] = torch.as_tensor(len(self.tokens[idx]), dtype=torch.int64)
        
        return res
    
    def __len__(self):
        return len(self.texts)

In [40]:
def collate_fn(batch, padding_value=0, batch_first=True):
    
    labels, tokens, tokens_lens = [], [], []
    for b in batch:
        labels.append(b['labels'])
        tokens.append(b['tokens'])
        tokens_lens.append(b['tokens_len'])
    
    res = {}
    res['tokens'] = torch.nn.utils.rnn.pad_sequence(tokens, batch_first=batch_first, padding_value=padding_value)
    res['tokens_lens'] = torch.stack(tokens_lens)
    res['labels'] = torch.stack(labels)
    return res

In [41]:
class ConvNet(nn.Module):
    def __init__(self, embed_size, hidden_size, num_classes=5):
        super().__init__()
        self.embeddings = nn.Embedding(len(vocabular), embedding_dim=embed_size)
        self.cnn = nn.Sequential(
            nn.Conv1d(embed_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
        )
        self.cl = nn.Sequential(
            nn.Linear(hidden_size, 2 * hidden_size),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(2 * hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        x = self.embeddings(x)  # (batch_size, seq_len, embed_dim)
        x = x.permute(0, 2, 1)  # (batch_size, embed_dim, seq_len)
        x = self.cnn(x)
        prediction = self.cl(x)
        return prediction

In [78]:
def train_val(model, trainloader, validloader, num_epochs, optimizer, loss_func, scheduler,
              num_freeze_iter=100, max_grad_norm=1):
    
    train_losses, valid_losses = [], []
    valid_score = []
    
    last_loss = 100
    best_val_loss = 100
    patience = 6
    trigger_times = 0
    
    num_iter = 0
    
    freeze_embedding(model)
    
    for epoch in tqdm(range(num_epochs)):
        
        torch.cuda.empty_cache()
        model.train()
        running_loss = 0

        training_bar = tqdm(trainloader, unit='batch')

        for batch in training_bar:
            
            if num_iter > num_freeze_iter:
                freeze_embeddings(model, True)
            
            training_bar.set_description(f'Epoch {epoch + 1}, train stage')
            x_batch, y_batch = batch['tokens'].to(device), batch['labels'].to(device)
            y_pred = model(x_batch)
            
            loss = loss_func(y_pred, y_batch)
            running_loss += loss.item()
            
            optimizer.zero_grad()
            loss.backward()
            
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            
            optimizer.step()
            num_iter += 1

            training_bar.set_postfix(loss=loss.item())
        
        train_losses.append(running_loss / len(trainloader))
        
        model.eval()
        running_loss = 0
        
        validation_bar = tqdm(validloader, unit='batch')
        
        with torch.no_grad():
            correct = 0
            num_obj = 0
            
            for batch in validation_bar:
                validation_bar.set_description(f'Epoch {epoch + 1}, validation stage')
                
                x_batch, y_batch = batch['tokens'].to(device), batch['labels'].to(device)
                y_pred = model(x_batch)
                
                loss = loss_func(y_pred, y_batch)
                running_loss += loss.item()
                
                correct += (y_batch == y_pred.argmax(-1)).float().sum()
                num_obj += len(y_batch)
                
                validation_bar.set_postfix(loss=loss.item())
            
        scheduler.step()
        
        valid_losses.append(running_loss / len(validloader))
        valid_score.append(correct / num_obj)
        
        ###Early stopping
        if valid_losses[-1] > last_loss:
            trigger_times += 1
            if trigger_times >= patience:
                print('Early stopping!\n')
                return model, train_losses, valid_losses, valid_score
        else:
            trigger_times = 0       
        last_loss = valid_losses[-1]
        
        print(f'Epoch {epoch + 1}... training loss: {train_losses[-1]}, validation loss: {valid_losses[-1]},\
              validation_score: {valid_score[-1]}')
        
    return model, train_losses, valid_losses, valid_score

In [50]:
train_df, test_df = model_selection.train_test_split(data, test_size=0.1, 
                                                     random_state=RANDOM_STATE, 
                                                     stratify=data['intent'])

In [51]:
train_df, val_df = model_selection.train_test_split(train_df, test_size=0.1,
                                                   random_state=RANDOM_STATE,
                                                   stratify=train_df['intent'])

In [52]:
train_df = CustomDataset(train_df, vocabular, max_len)
val_df = CustomDataset(val_df, vocabular, max_len)
test_df = CustomDataset(test_df, vocabular, max_len)

In [53]:
trainloader = DataLoader(train_df, batch_size=batch_size, shuffle=True, num_workers=4, collate_fn=collate_fn) 
validloader = DataLoader(val_df, batch_size=batch_size, shuffle=False, num_workers=4, collate_fn=collate_fn)
testloader = DataLoader(test_df, batch_size=batch_size, shuffle=False, num_workers=4, collate_fn=collate_fn)

In [79]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = ConvNet(fasttext_model.vector_size, 50).to(device)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
exp_lr_scheduler = lr_scheduler.CyclicLR(optimizer, 1e-4, 1e-3, cycle_momentum=False)

In [80]:
with torch.no_grad():
    for word, idx in vocabular.get_stoi().items():
        model.embeddings.weight[idx] = torch.from_numpy(fasttext_model.get_vector(word))

In [83]:
def freeze_embeddings(model, req_grad=False):
    embeddings = model.embeddings
    for param in embeddings.parameters():
        param.requires_grad = req_grad

In [84]:
model, train_losses, valid_losses, valid_score = train_val(model, trainloader, validloader, num_epochs,
                                                          optimizer, loss_fn, exp_lr_scheduler)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 1... training loss: 1.4639740861379182, validation loss: 1.3807529926300048,              validation_score: 0.3982684016227722


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 2... training loss: 1.3081502648500296, validation loss: 1.1729651967684427,              validation_score: 0.4502164423465729


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 3... training loss: 1.085155734190574, validation loss: 0.9765977462132772,              validation_score: 0.5281385183334351


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 4... training loss: 0.8910312056541443, validation loss: 0.7228906929492951,              validation_score: 0.6320346593856812


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 5... training loss: 0.681978969849073, validation loss: 0.5207610348860423,              validation_score: 0.7316017150878906


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 6... training loss: 0.4703687499348934, validation loss: 0.34068605800469715,              validation_score: 0.7489177584648132


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 7... training loss: 0.28890938248771886, validation loss: 0.22594904924432438,              validation_score: 0.7575757503509521


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 8... training loss: 0.1803579911016501, validation loss: 0.1561252428839604,              validation_score: 0.7532467246055603


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 9... training loss: 0.11973123482356851, validation loss: 0.12179198122272888,              validation_score: 0.761904776096344


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 10... training loss: 0.08758716990168278, validation loss: 0.07025497717161973,              validation_score: 0.7662337422370911


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 11... training loss: 0.0633208802399727, validation loss: 0.1780216848788162,              validation_score: 0.7575757503509521


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 12... training loss: 0.0593154930554402, validation loss: 0.08551458166524147,              validation_score: 0.7575757503509521


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 13... training loss: 0.04259786212328786, validation loss: 0.10138404192402958,              validation_score: 0.7705627679824829


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 14... training loss: 0.030895390369942807, validation loss: 0.14761297792720143,              validation_score: 0.7705627679824829


  0%|          | 0/130 [00:00<?, ?batch/s]

  0%|          | 0/15 [00:00<?, ?batch/s]

Epoch 15... training loss: 0.04239993996729586, validation loss: 0.14710899038376132,              validation_score: 0.7662337422370911


In [None]:
#result: winner - simple logreg with tf-idf!!!! )))))))))))))))))))))))))))))))))