# Рекуррентные нейронные сети: RNN, LSTM, GRU

In [1]:
!pip install stop_words
!pip install pymorphy2

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


## Импорты

In [2]:
import re
import gensim
import numpy as np
import pandas as pd
from tqdm import tqdm
from string import punctuation
from functools import lru_cache

from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer

import keras
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import Dense, Dropout, Activation, Input, Embedding
from keras.layers import Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking
from keras.callbacks import EarlyStopping

## Настройки

In [3]:
# Display the GPU devices TensorFlow can see; an empty list means no GPU was detected.
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# Directory prefix for train.csv / test.csv / val.csv (concatenated as a plain
# string, so the trailing slash is required).
DATA_PATH = './data/'
# Maximum number of training epochs passed to model.fit.
EPOCHS = 10
# Batch size used for both model.fit and model.evaluate.
BATCH_SIZE = 256

# Registers DataFrame/Series.progress_apply, used by the preprocessing cell.
tqdm.pandas()

## Загрузка данных

In [5]:
# Read the three pre-split CSV files located under DATA_PATH.
df_train = pd.read_csv(f"{DATA_PATH}train.csv")
df_test = pd.read_csv(f"{DATA_PATH}test.csv")
df_val = pd.read_csv(f"{DATA_PATH}val.csv")

In [6]:
# Preview the first rows of the raw training frame.
df_train.head()

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса н...,1


## Предобработка текста

In [7]:
# Shared resources for text cleaning.
morpher = MorphAnalyzer()
exclude = set(punctuation)
# Keep the negation words out of the stop list so the stop-word filter
# does not drop them.
stop_words = set(get_stop_words('ru')).difference({'не', 'ни', 'нет'})

@lru_cache(maxsize=None)
def lemmatize(word):
    """Return the normal form of ``word`` taken from the analyzer's first parse.

    Results are memoized without a size limit, so each distinct word is
    analyzed only once.
    """
    first_parse = morpher.parse(word)[0]
    return first_parse.normal_form

def preprocess_text(text):
    """Normalize a raw tweet into a space-separated string of lemmas.

    Steps, in order:
      1. strip ``@mentions`` and the standalone ``RT`` retweet marker;
      2. lowercase the text;
      3. glue the negation words "не" / "ни" / "нет" onto the following word
         as the prefix "не";
      4. replace every character in ``exclude`` with a space;
      5. drop stop words and tokens shorter than two characters;
      6. lemmatize the remaining tokens.

    Args:
        text: Raw text; any value is coerced with ``str``.

    Returns:
        The processed tokens joined by single spaces (may be empty).
    """
    text = str(text)
    # Strip mentions / RT before lowercasing: the pattern matches uppercase "RT" only.
    text = re.sub(r'@\S+|\sRT\s', ' ', ' '+text+' ').strip()
    # str.lower() returns a new string; the result has to be assigned, otherwise
    # capitalized stop words slip past the stop-word filter below.
    text = text.lower()
    # \b restricts the match to whole words, so that e.g. "они хотят" is not
    # rewritten into "онехотят" by the "ни " inside "они ".
    text = re.sub(r'\b(не|ни|нет)\s+', 'не', text)

    for char in exclude:
        text = text.replace(char, ' ')

    words = [
        lemmatize(word)
        for word in text.split()
        if word not in stop_words and len(word) >= 2
    ]
    return ' '.join(words)

In [8]:
# Clean the text column of every split in place: train, then val, then test.
for split_frame in (df_train, df_val, df_test):
    split_frame['text'] = split_frame['text'].progress_apply(preprocess_text)

df_train.head()

100%|██████████| 181467/181467 [00:17<00:00, 10391.30it/s]
  6%|▌         | 1265/22683 [00:00<00:01, 14055.76it/s]


KeyboardInterrupt: 

In [None]:
# Pull the text column of each split out as a NumPy array.
text_corpus_train, text_corpus_valid, text_corpus_test = (
    frame['text'].values for frame in (df_train, df_val, df_test)
)

In [None]:
# Word-level tokenizer fitted on the training corpus only; case is preserved
# and tokens are split on single spaces.
tokenizer = Tokenizer(num_words=None,
                      filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                      lower = False,
                      split = ' ')

tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

# +1 because index 0 is not assigned to any word (it is used for padding).
word_count = len(tokenizer.index_word) + 1
# Take the padding length from the token sequences that are actually padded,
# not from a separate whitespace split of the raw text: the tokenizer applies
# its own filters, so the two counts are not guaranteed to agree.
training_length = max(len(sequence) for sequence in sequences_train)

In [None]:
# Targets for the train and validation splits.
y_train, y_val = df_train['class'].values, df_val['class'].values

# Pad both splits to the same fixed length so they share one input shape.
X_train, X_valid = (
    pad_sequences(sequences, maxlen=training_length)
    for sequences in (sequences_train, sequences_val)
)

## На вебинаре мы говорили, что долгое время CNN- и RNN-архитектуры были конкурирующими. Выяснить, какая архитектура больше подходит для задачи сентимент-анализа на данных с вебинара

In [None]:
def make_model(*layers):
    """Build and compile a binary text classifier around the given layers.

    Architecture: trainable 128-dimensional ``Embedding`` (index 0 masked as
    padding) -> ``layers`` in the given order -> ``Dense(8, relu)`` ->
    ``Dropout(0.5)`` -> ``Dense(1, sigmoid)``. The model is compiled with the
    Adam optimizer, binary cross-entropy loss and the accuracy metric.

    Reads the module-level ``word_count`` and ``training_length``.

    Args:
        *layers: Keras layers inserted between the embedding and the dense
            head, added in the order given.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # mask_zero=True already produces the padding mask for downstream layers.
    # A separate Masking(mask_value=0.0) layer must NOT follow: it recomputes
    # the mask from the embedded vectors (which are non-zero even for the
    # padding index) and thereby replaces the padding mask with an all-True one.
    model.add(Embedding(input_dim=word_count,
                        input_length=training_length,
                        output_dim=128,
                        trainable=True,
                        mask_zero=True))

    for layer in layers:
        model.add(layer)

    model.add(Dense(8, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

def train_model(model):
    """Fit ``model`` on the training data and report its held-out score.

    Trains on the module-level ``X_train`` / ``y_train`` with 10% of the
    training data split off for validation, stopping early on ``val_loss``.
    Afterwards evaluates on ``X_valid`` / ``y_val`` and prints loss and
    accuracy.

    Args:
        model: A compiled Keras model, e.g. one returned by ``make_model``.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    # With the default patience, training stops right after the first epoch in
    # which val_loss got worse; restore_best_weights rolls the model back to
    # the best epoch so the evaluation below does not use the degraded weights.
    early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True)
    history = model.fit(X_train,
                        y_train,
                        batch_size=BATCH_SIZE,
                        epochs=EPOCHS,
                        verbose=1,
                        validation_split=0.1,
                        callbacks=[early_stopping])

    score = model.evaluate(X_valid, y_val, batch_size=BATCH_SIZE, verbose=1)
    print('\n')
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    return history

## 1. построить свёрточные архитектуры

In [None]:
# Convolutional baseline: one Conv1D block followed by global max pooling.
model = make_model(
    Conv1D(filters=128, kernel_size=3),
    Activation("relu"),
    GlobalMaxPool1D(),
)

train_model(model)

## 2. построить различные архитектуры с RNN

In [None]:
# Recurrent baseline: a single SimpleRNN layer with 64 units.
model = make_model(SimpleRNN(units=64))
train_model(model)

In [None]:
# Recurrent variant: a single GRU layer with 64 units.
model = make_model(GRU(units=64))
train_model(model)

In [None]:
# Recurrent variant: a single LSTM layer with 64 units.
model = make_model(LSTM(units=64))
train_model(model)

## 3. построить совместные архитектуры CNN -> RNN и RNN -> CNN

In [None]:
# CNN -> RNN: convolutional features are fed into a SimpleRNN.
model = make_model(
    Conv1D(filters=128, kernel_size=3),
    Activation("relu"),
    SimpleRNN(units=64),
)
train_model(model)

In [None]:
# RNN -> CNN: the SimpleRNN emits its full output sequence, which is then
# convolved and max-pooled over time.
model = make_model(
    SimpleRNN(units=64, return_sequences=True),
    Conv1D(filters=64, kernel_size=3),
    GlobalMaxPool1D(),
)
train_model(model)

## 4. сделать выводы, что получилось лучше

Лучший результат на данном датасете показали свёрточная сеть и архитектура CNN -> RNN. Рекуррентные сети показали себя незначительно хуже.