На вебинаре мы говорили что долгое время CNN и RNN архитектуры были конурируещими выяснить какая архитектура больше подходит для задачи сантимент анализа на данных с вебинара 

In [1]:
import numpy as np
import pandas as pd
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from string import punctuation

import re

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dropout, Dense, Flatten, LSTM, GRU, SimpleRNN, Conv1D, MaxPooling1D, Masking
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
train_df = pd.read_csv('../lesson_8/lection8_materials_actual/data/train.csv')
valid_df = pd.read_csv('../lesson_8/lection8_materials_actual/data/val.csv')
test_df = pd.read_csv('../lesson_8/lection8_materials_actual/data/test.csv')

In [3]:
train_df.head()

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса н...,1


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181467 entries, 0 to 181466
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      181467 non-null  int64 
 1   text    181467 non-null  object
 2   class   181467 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.2+ MB


In [5]:
train_df['class'].value_counts()

1    92063
0    89404
Name: class, dtype: int64

In [6]:
valid_df['class'].value_counts()

1    11449
0    11234
Name: class, dtype: int64

In [7]:
stopwords_ru = set(get_stop_words('ru'))
exclude = set(punctuation)
morpher = MorphAnalyzer()

In [8]:
def text_preprocess(text):
    text = str(text)
    text = ''.join(c for c in text if c not in exclude)
    text = text.lower()
    text = re.sub("\sне", "не", text)
    text = [morpher.parse(word)[0].normal_form for word in text.split() if word not in stopwords_ru]
    text = ' '.join(text)
    
    return text

In [9]:
train_df['text'] = train_df['text'].apply(text_preprocess)
valid_df['text'] = valid_df['text'].apply(text_preprocess)
test_df['text'] = test_df['text'].apply(text_preprocess)

In [10]:
train_corpus = train_df['text'].values
valid_corpus = valid_df['text'].values
test_corpus = test_df['text'].values

In [11]:
tokenizer = Tokenizer(num_words=None, 
                      filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n', 
                      lower=False, 
                      split=' ')

In [12]:
tokenizer.fit_on_texts(train_corpus)

In [13]:
sequences_train = tokenizer.texts_to_sequences(train_corpus)
sequences_valid = tokenizer.texts_to_sequences(valid_corpus)
sequences_test = tokenizer.texts_to_sequences(test_corpus)

In [14]:
word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in train_corpus])

In [15]:
X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_valid, maxlen=training_length)

In [16]:
y_train = train_df['class'].values
y_valid = valid_df['class'].values

In [17]:
y_train = to_categorical(y_train)
y_valid = to_categorical(y_valid)

### Свёрточная архитектура 

In [18]:
conv_model = Sequential()
conv_model.add(Embedding(input_dim=word_count, 
                         input_length=training_length, 
                         output_dim=30, 
                         trainable=True, 
                         mask_zero=True))
conv_model.add(Conv1D(16, 16, activation='relu'))
conv_model.add(MaxPooling1D())
conv_model.add(Conv1D(4, 4, activation='relu'))
conv_model.add(MaxPooling1D())
conv_model.add(Flatten())
conv_model.add(Dense(4, activation='relu'))
conv_model.add(Dense(2, activation='softmax'))
conv_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 27, 30)            7743240   
_________________________________________________________________
conv1d (Conv1D)              (None, 12, 16)            7696      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 6, 16)             0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 3, 4)              260       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1, 4)              0         
_________________________________________________________________
flatten (Flatten)            (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 4)                 2

In [19]:
conv_model.compile(loss='binary_crossentropy', 
                   optimizer='adam', 
                   metrics=['accuracy'])

In [20]:
epochs = 10
batch_size = 512

early_stopping=EarlyStopping(monitor='val_loss')

conv_history = conv_model.fit(X_train, y_train, 
                              batch_size=batch_size, 
                              epochs=epochs, 
                              verbose=1, 
                              validation_split=0.1,
                              callbacks=[early_stopping])

Epoch 1/10


In [21]:
conv_model_metrics = conv_model.evaluate(X_valid, y_valid)
metrics_table = {'conv_model': conv_model_metrics}



### Архитектура с RNN

In [22]:
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=word_count, 
                         input_length=training_length, 
                         output_dim=30, 
                         trainable=True, 
                         mask_zero=True))
rnn_model.add(Masking(mask_value=0.0))
rnn_model.add(SimpleRNN(64))
rnn_model.add(Dense(64, activation='relu'))
rnn_model.add(Dropout(0.5))
rnn_model.add(Dense(32, activation='relu'))
rnn_model.add(Dense(16, activation='relu'))
rnn_model.add(Dense(8, activation='relu'))
rnn_model.add(Dense(2, activation='softmax'))
rnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 27, 30)            7743240   
_________________________________________________________________
masking (Masking)            (None, 27, 30)            0         
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 64)                6080      
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 16)               

In [23]:
rnn_model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])

In [24]:
rnn_model_history = rnn_model.fit(X_train, y_train,
                                  batch_size=batch_size,
                                  epochs=epochs,
                                  verbose=1,
                                  validation_split=0.1,
                                  callbacks=[early_stopping])

Epoch 1/10


In [25]:
rnn_model_metrics = rnn_model.evaluate(X_valid, y_valid)
metrics_table['rnn_model'] = rnn_model_metrics



### Архитектура с LSTM

In [26]:
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=word_count, 
                         input_length=training_length, 
                         output_dim=30, 
                         trainable=True, 
                         mask_zero=True))
lstm_model.add(Masking(mask_value=0.0))
lstm_model.add(LSTM(64))
lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(32, activation='relu'))
lstm_model.add(Dense(16, activation='relu'))
lstm_model.add(Dense(8, activation='relu'))
lstm_model.add(Dense(2, activation='softmax'))
lstm_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 27, 30)            7743240   
_________________________________________________________________
masking_1 (Masking)          (None, 27, 30)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24320     
_________________________________________________________________
dense_7 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_9 (Dense)              (None, 16)               

In [27]:
lstm_model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])

In [28]:
lstm_model_history = lstm_model.fit(X_train, y_train,
                                  batch_size=batch_size,
                                  epochs=epochs,
                                  verbose=1,
                                  validation_split=0.1,
                                  callbacks=[early_stopping])

Epoch 1/10


In [29]:
lstm_model_metrics = lstm_model.evaluate(X_valid, y_valid)
metrics_table['lstm_model'] = lstm_model_metrics



### Архитектура с GRU

In [30]:
gru_model = Sequential()
gru_model.add(Embedding(input_dim=word_count, 
                         input_length=training_length, 
                         output_dim=30, 
                         trainable=True, 
                         mask_zero=True))
gru_model.add(Masking(mask_value=0.0))
gru_model.add(GRU(64))
gru_model.add(Dense(64, activation='relu'))
gru_model.add(Dense(32, activation='relu'))
gru_model.add(Dense(16, activation='relu'))
gru_model.add(Dense(8, activation='relu'))
gru_model.add(Dense(2, activation='softmax'))
gru_model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 27, 30)            7743240   
_________________________________________________________________
masking_2 (Masking)          (None, 27, 30)            0         
_________________________________________________________________
gru (GRU)                    (None, 64)                18432     
_________________________________________________________________
dense_12 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_13 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_14 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_15 (Dense)             (None, 8)                

In [31]:
gru_model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])

In [32]:
gru_model_history = gru_model.fit(X_train, y_train,
                                  batch_size=batch_size,
                                  epochs=epochs,
                                  verbose=1,
                                  validation_split=0.1,
                                  callbacks=[early_stopping])

Epoch 1/10


In [33]:
gru_model_metrics = gru_model.evaluate(X_valid, y_valid)
metrics_table['gru_model'] = gru_model_metrics



### Совместная архитектура CNN -> RNN и (RNN -> CNN)

In [34]:
conv_lstm_model = Sequential()
conv_lstm_model.add(Embedding(input_dim=word_count, 
                         input_length=training_length, 
                         output_dim=30, 
                         trainable=True, 
                         mask_zero=True))
conv_lstm_model.add(Conv1D(16, 16, activation='relu'))
conv_lstm_model.add(Conv1D(4, 4, activation='relu'))
conv_lstm_model.add(MaxPooling1D())
conv_lstm_model.add(Masking(mask_value=0.0))
conv_lstm_model.add(LSTM(32))
conv_lstm_model.add(Dense(32, activation='relu'))
conv_lstm_model.add(Dense(16, activation='relu'))
conv_lstm_model.add(Dense(8, activation='relu'))
conv_lstm_model.add(Dense(2, activation='softmax'))
conv_lstm_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 27, 30)            7743240   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 12, 16)            7696      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 9, 4)              260       
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 4, 4)              0         
_________________________________________________________________
masking_3 (Masking)          (None, 4, 4)              0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                4736      
_________________________________________________________________
dense_17 (Dense)             (None, 32)               

In [35]:
conv_lstm_model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])

In [36]:
conv_lstm_model_history = conv_lstm_model.fit(X_train, y_train,
                                              batch_size=batch_size,
                                              epochs=epochs,
                                              verbose=1,
                                              validation_split=0.1,
                                              callbacks=[early_stopping])

Epoch 1/10


In [37]:
conv_lstm_model_metrics = conv_lstm_model.evaluate(X_valid, y_valid)
metrics_table['conv_lstm_model'] = conv_lstm_model_metrics



### Вывод

In [38]:
metrics_table

{'conv_model': [0.6413739919662476, 0.6456817984580994],
 'rnn_model': [0.5134350061416626, 0.7372481822967529],
 'lstm_model': [0.5109298825263977, 0.7445223331451416],
 'gru_model': [0.517645001411438, 0.7401137351989746],
 'conv_lstm_model': [0.5999479293823242, 0.6806418895721436]}

In [39]:
metrics_table = pd.DataFrame(metrics_table, index=['loss', 'accuracy'])

In [40]:
metrics_table

Unnamed: 0,conv_model,rnn_model,lstm_model,gru_model,conv_lstm_model
loss,0.641374,0.513435,0.51093,0.517645,0.599948
accuracy,0.645682,0.737248,0.744522,0.740114,0.680642


In [49]:
metrics_table.sort_values(by='accuracy', axis=1, ascending=False)

Unnamed: 0,lstm_model,gru_model,rnn_model,conv_lstm_model,conv_model
loss,0.51093,0.517645,0.513435,0.599948,0.641374
accuracy,0.744522,0.740114,0.737248,0.680642,0.645682


#### В целом модель на основе LSTM показала лучший результат. Совместная модель CNN и RNN показала результат чуть лучше чем обычная свертоная модель