### Урок 6. Рекуррентные нейронные сети. LSTM. GRU.#

### -- Автор: Шенк Евгений Станиславович

#### Провести сравнение RNN, LSTM, GRU на датасете отзывов (из предыдущих занятий/материалов)

Результат:  
LSTM и GRU отработали одинаково хорошо по точности и по времени обучения (результат практически одинаковый).  
SimpleRNN обучался в 20 раз дольше и точность оказалась значительно ниже.  

In [1]:
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf
import pandas as pd
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, MaxPool1D, GlobalMaxPool1D, GlobalAveragePooling1D, Bidirectional, SimpleRNN, LSTM, GRU, Masking
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping  

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [2]:
dataset_dir = '..\data\\aclImdb'

#### Удаляем лишнее

In [3]:
train_dir = os.path.join(dataset_dir, 'train')
test_dir = os.path.join(dataset_dir, 'test')
remove_dir = os.path.join(train_dir, 'unsup')
#shutil.rmtree(remove_dir)

In [4]:
train_dir

'..\\data\\aclImdb\\train'

In [5]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir, 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='training', 
    seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [6]:
print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

Label 0 corresponds to neg
Label 1 corresponds to pos


In [7]:
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir, 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='validation', 
    seed=seed)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [8]:
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_dir, 
    batch_size=batch_size)

Found 25000 files belonging to 2 classes.


#### Обработка данных

In [9]:
max_features = 10000
sequence_length = 200

In [10]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [11]:
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [12]:
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [13]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [14]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [15]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

### Models

In [16]:
epochs = 20
batch_size = 64
seed = 42
embedding_dim = 64

#### RNN

In [17]:
model_rnn = Sequential()

model_rnn.add(
    Embedding(input_dim=max_features + 1,
              output_dim=embedding_dim,
              trainable=True,
              mask_zero=True))
model_rnn.add(Masking(mask_value=0.0))
model_rnn.add(Bidirectional(SimpleRNN(8)))
model_rnn.add(Dropout(0.5))
model_rnn.add(Dense(64, activation='relu'))
model_rnn.add(Dropout(0.5))
model_rnn.add(Dense(1))

model_rnn.compile(
    optimizer='adam', loss=losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])

In [18]:
early_stopping=EarlyStopping(monitor='val_loss')  

history_rnn = model_rnn.fit(train_ds,
                            validation_data=val_ds,
                            batch_size=batch_size,
                            epochs=epochs,
                            verbose=1,
                            callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


In [19]:
loss_rnn, accuracy_rnn = model_rnn.evaluate(test_ds)

print("Loss: ", loss_rnn)
print("Accuracy: ", accuracy_rnn)

Loss:  0.5754722952842712
Accuracy:  0.7222800254821777


#### LSTM

In [20]:
model_lstm = Sequential()

model_lstm.add(
    Embedding(input_dim=max_features + 1,
              output_dim=embedding_dim,
              trainable=True,
              mask_zero=True))
model_lstm.add(Masking(mask_value=0.0))
model_lstm.add(Bidirectional(LSTM(8)))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(64, activation='relu'))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(1))

model_lstm.compile(
    optimizer='adam', loss=losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')

In [21]:
history_lstm = model_lstm.fit(train_ds,
                            validation_data=val_ds,
                            batch_size=batch_size,
                            epochs=epochs,
                            verbose=1,
                            callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20


In [22]:
loss_lstm, accuracy_lstm = model_lstm.evaluate(test_ds)

print("Loss: ", loss_lstm)
print("Accuracy: ", accuracy_lstm)

Loss:  0.41392016410827637
Accuracy:  0.83024001121521


#### GRU

In [23]:
model_gru = Sequential()

model_gru.add(
    Embedding(input_dim=max_features + 1,
              output_dim=embedding_dim,
              trainable=True,
              mask_zero=True))
model_gru.add(Masking(mask_value=0.0))
model_gru.add(Bidirectional(GRU(8)))
model_gru.add(Dropout(0.5))
model_gru.add(Dense(64, activation='relu'))
model_gru.add(Dropout(0.5))
model_gru.add(Dense(1))

model_gru.compile(
    optimizer='adam', loss=losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')

In [24]:
history_gru = model_gru.fit(train_ds,
                            validation_data=val_ds,
                            batch_size=batch_size,
                            epochs=epochs,
                            verbose=1,
                            callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


In [25]:
loss_gru, accuracy_gru = model_gru.evaluate(test_ds)

print("Loss: ", loss_gru)
print("Accuracy: ", accuracy_gru)

Loss:  0.45755547285079956
Accuracy:  0.8292800188064575
