Провести сравнение RNN, LSTM, GRU на датасете отзывов (из предыдущих занятий/материалов)

In [1]:
import pandas as pd
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

In [3]:
df = pd.read_excel('C:/Users/Aleks/Enter_NLP/Lesson_5/отзывы за лето.xls')
df.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [4]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df['text'] = df['Content'].apply(preprocess_text)

In [5]:
df.head()

Unnamed: 0,Rating,Content,Date,text
0,5,It just works!,2017-08-14,it just works
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14,целое удобноной приложениеиз минус хотеть боль...
2,5,Отлично все,2017-08-14,отлично
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14,зависать 1 работа антивирус ранее пользоваться...
4,5,"Очень удобно, работает быстро.",2017-08-14,удобно работать быстро


In [6]:
df['target'] = df['Rating'] > 3
df['target'] = df['target'].astype(int)

In [7]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.3,  random_state=42)
df_train, df_val  = train_test_split(df_train, test_size=0.3, random_state=42)

In [8]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping  

In [9]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values
text_corpus_test = df_test['text'].values

In [19]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_val = pad_sequences(sequences_val, maxlen=training_length)
X_test = pad_sequences(sequences_test, maxlen=training_length)

In [20]:
y_train = df_train['target'].values
y_val = df_val['target'].values
y_test = df_test['target'].values

# RNN

In [17]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
#model.add(Masking(mask_value=0.0))
model.add(SimpleRNN(128, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
#model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

early_stopping=EarlyStopping(monitor='val_loss',
                             patience=5,
                            mode="min",
                            restore_best_weights=True)  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=20,
                    verbose=1,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 113, 30)           250230    
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 128)               20352     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 278,903
Trainable params: 278,903
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


In [23]:
score = model.evaluate(X_test, y_test, batch_size=20, verbose=1)
print('\n')
print('Test loss:', score[0])
print('Test auc:', score[1])



Test loss: 0.2671741247177124
Test auc: 0.8952888250350952


# LSTM

In [24]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
#model.add(Masking(mask_value=0.0))
model.add(LSTM(128, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
#model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

early_stopping=EarlyStopping(monitor='val_loss',
                             patience=5,
                            mode="min",
                            restore_best_weights=True)  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=20,
                    verbose=1,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping])

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 113, 30)           250230    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               81408     
_________________________________________________________________
dense_8 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 65        
Total params: 339,959
Trainable params: 339,959
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


In [25]:
score = model.evaluate(X_test, y_test, batch_size=20, verbose=1)
print('\n')
print('Test loss:', score[0])
print('Test auc:', score[1])



Test loss: 0.2929645776748657
Test auc: 0.8898031711578369


# GRU

In [26]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
#model.add(Masking(mask_value=0.0))
model.add(GRU(128, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
#model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

early_stopping=EarlyStopping(monitor='val_loss',
                             patience=5,
                            mode="min",
                            restore_best_weights=True)  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=20,
                    verbose=1,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping])

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 113, 30)           250230    
_________________________________________________________________
gru (GRU)                    (None, 128)               61440     
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
Total params: 319,991
Trainable params: 319,991
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


In [27]:
score = model.evaluate(X_test, y_test, batch_size=20, verbose=1)
print('\n')
print('Test loss:', score[0])
print('Test auc:', score[1])



Test loss: 0.2893984317779541
Test auc: 0.8864149451255798


RNN
Test loss: 0.2671741247177124
Test auc: 0.8952888250350952
LSTM
Test loss: 0.2929645776748657
Test auc: 0.8898031711578369
RGU
Test loss: 0.2893984317779541
Test auc: 0.8864149451255798

RNN лучше. Но возможно так получилось из за того, что у RN меньше параметров и у нас мало данных.
плюс RNN не имел проблем на GPU