In [None]:
import time

import pandas as pd
import numpy as np

import nltk
nltk.download('gutenberg')

import tensorflow as tf
keras = tf.keras

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tqdm import tqdm

import matplotlib.pyplot as plt
plt.style.use('ggplot')

# NLP Concepts #6 - RNNs in practice

## Define helpers

In [None]:
def get_slices(text, slice_len=100):
    """Split ``text`` into consecutive chunks of at most ``slice_len`` words.

    Words are obtained by splitting on a single space character, and each
    chunk is re-joined with a single space.

    Parameters
    ----------
    text : str
        Text to slice; tokens are expected to be separated by single spaces.
    slice_len : int, default 100
        Maximum number of space-separated tokens per slice. Must be >= 1.

    Returns
    -------
    list of str
        The slices in order of appearance. Every slice except possibly the
        last one holds exactly ``slice_len`` tokens.

    Raises
    ------
    ValueError
        If ``slice_len`` is smaller than 1.
    """
    # A zero length used to surface as ZeroDivisionError and a negative one
    # silently produced an empty result; reject both explicitly.
    if slice_len < 1:
        raise ValueError(f'slice_len must be a positive integer, got {slice_len!r}')

    text_split = text.split(' ')

    # Stepping over the start offsets visits exactly the non-empty chunks,
    # so no chunk counting and no emptiness check are needed.
    return [
        ' '.join(text_split[start:start + slice_len])
        for start in range(0, len(text_split), slice_len)
    ]

## Get and prepare data

In [None]:
# List every text shipped with the NLTK Gutenberg corpus next to its word count
gutenberg = nltk.corpus.gutenberg
for file_id in gutenberg.fileids():
    print(file_id, len(gutenberg.words(file_id)))

### Join and check lengths

In [None]:
# Shakespeare's "Macbeth"
# Load the play as a sequence of word tokens from the NLTK Gutenberg corpus
shkspr = nltk.corpus.gutenberg.words('shakespeare-macbeth.txt')
# Single space-separated string: the format get_slices() splits on
shkspr_join = ' '.join(shkspr)

# Number of tokens in the play (displayed as the cell output)
len(shkspr)

In [None]:
# Carroll's "Alice's adventures (...)"
# Truncate to as many tokens as "Macbeth" so both authors contribute equally
# sized corpora (the limit was previously the hard-coded literal 23140).
carroll = nltk.corpus.gutenberg.words('carroll-alice.txt')[:len(shkspr)]
# Single space-separated string: the format get_slices() splits on
carroll_join = ' '.join(carroll)

# Number of tokens kept (displayed as the cell output)
len(carroll)

### Get slices and generate labels

In [None]:
# Cut both joined texts into chunks of 250 words each
shkspr_slices, carroll_slices = (
    get_slices(joined_text, 250)
    for joined_text in (shkspr_join, carroll_join)
)

In [None]:
# Number of slices per author (Shakespeare, Carroll)
tuple(map(len, (shkspr_slices, carroll_slices)))

In [None]:
# Create X: Shakespeare slices first, Carroll slices second
X = shkspr_slices + carroll_slices

# Create y: 0 = Shakespeare, 1 = Carroll.
# Label counts are derived from the slice lists (instead of a hard-coded 93)
# so that X and y stay aligned if the corpora or the slice length change.
y = np.array([0] * len(shkspr_slices) + [1] * len(carroll_slices))

### Train test split

In [None]:
# Train test split
# - random_state pins the shuffle so "Restart & Run All" reproduces the same split
# - stratify keeps the author ratio identical in the train and test parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, stratify=y, random_state=42
)

### Tokenize texts

In [None]:
# Initialize a tokenizer
VOCAB_SIZE = 20000

# oov_token is the *token* that stands in for out-of-vocabulary words, so it
# should be a string like every other key of tokenizer.word_index; the
# previous integer 1 put a non-string key into the vocabulary.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=VOCAB_SIZE,
    lower=True,
    oov_token='<OOV>'
)

In [None]:
# Fit the tokenizer on the training texts only (the test texts are not passed here)
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert both text splits into sequences of integer token ids
X_train_tok, X_test_tok = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_test)
)

In [None]:
# Plot seq lens
seq_lens_train = [len(seq) for seq in X_train_tok]
seq_lens_test = [len(seq) for seq in X_test_tok]

# Overlaid, normalised histograms so the differently sized splits are comparable
plt.hist(seq_lens_train, density=True, alpha=.7, label='Train')
plt.hist(seq_lens_test, density=True, alpha=.7, label='Test')
# Label the figure so it can be read on its own
plt.xlabel('Sequence length (tokens)')
plt.ylabel('Density')
plt.title('Tokenized sequence lengths')
plt.legend()
plt.show()

In [None]:
# Longest training text, measured in space-separated words of the raw text
# (not in tokens of the tokenized sequences)
MAXLEN = max(len(text.split(' ')) for text in X_train)

In [None]:
# Bring every sequence to length MAXLEN; padding is appended at the end ('post')
X_train_tok_pad, X_test_tok_pad = (
    pad_sequences(sequences, maxlen=MAXLEN, padding='post')
    for sequences in (X_train_tok, X_test_tok)
)

## Classification example

In [None]:
def train_and_evaluate(model, X_train, y_train, X_val, y_val, epochs=30, lr=1e-4, verbose=2):
    """Compile, train, plot and score a binary classifier.

    The model is compiled with binary cross-entropy, the Adam optimizer and an
    accuracy metric, then fitted with early stopping. Afterwards the loss and
    accuracy curves are plotted and the validation accuracy and the wall-clock
    training time are printed. Nothing is returned.

    Parameters
    ----------
    model : Keras model
        Uncompiled (or to-be-recompiled) model; it is modified in place.
    X_train, y_train
        Training inputs and binary labels.
    X_val, y_val
        Data used for validation during fitting and for the final evaluation.
    epochs : int, default 30
        Upper bound on the number of training epochs.
    lr : float, default 1e-4
        Learning rate handed to Adam.
    verbose : int, default 2
        Verbosity passed to ``model.fit``.
    """
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(lr),
        metrics=['accuracy'],
    )

    # Stop after 10 epochs without improvement and roll back to the best weights
    stopper = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

    # Wall-clock time of the fit call only
    t0 = time.time()
    fit_result = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        callbacks=[stopper],
        epochs=epochs,
        verbose=verbose,
    )
    training_time = time.time() - t0

    # Learning curves: loss on the left, accuracy on the right
    curves = fit_result.history
    plt.figure(figsize=(10, 4))
    for position, metric, title in ((121, 'loss', 'Loss'), (122, 'accuracy', 'Accuracy')):
        plt.subplot(position)
        plt.plot(curves[metric], label='train')
        plt.plot(curves[f'val_{metric}'], label='val')
        plt.legend()
        plt.title(title)
    plt.show()

    # Final score on the validation data (the loss value is not reported)
    _, acc = model.evaluate(X_val, y_val, verbose=0)

    print(f'Val. accuracy: {acc}')
    print(f'Training time: {training_time:.02f} seconds')

### Build a simple model

In [None]:
# Single-layer LSTM classifier, assembled layer by layer
model = tf.keras.Sequential()

# Token ids -> 100-dimensional vectors; mask_zero=True marks id 0 as padding
model.add(tf.keras.layers.Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=100,
    mask_zero=True,
    input_length=MAXLEN,
))

# One recurrent layer that returns only its final output
model.add(tf.keras.layers.LSTM(64))

# Single sigmoid unit for the binary author label
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Show the layer overview of the simple LSTM model
model.summary()

In [None]:
# Train the simple model; the test split doubles as the validation data here
train_and_evaluate(
    model,
    X_train_tok_pad, y_train,
    X_test_tok_pad, y_test,
    epochs=30,
    verbose=0,
)

### Build a deeper model

In [None]:
# Two stacked LSTM layers, assembled layer by layer
model_2 = tf.keras.Sequential()

# Token ids -> 100-dimensional vectors; mask_zero=True marks id 0 as padding
model_2.add(tf.keras.layers.Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=100,
    mask_zero=True,
    input_length=MAXLEN,
))

# The first LSTM emits its output at every step so the second LSTM can consume a sequence
model_2.add(tf.keras.layers.LSTM(64, return_sequences=True))
model_2.add(tf.keras.layers.LSTM(128))

# Single sigmoid unit for the binary author label
model_2.add(tf.keras.layers.Dense(1, activation='sigmoid'))


In [None]:
# Show the layer overview of the stacked LSTM model
model_2.summary()

In [None]:
# Train the stacked model; the test split doubles as the validation data here
train_and_evaluate(
    model_2,
    X_train_tok_pad, y_train,
    X_test_tok_pad, y_test,
    epochs=30,
    verbose=0,
)

### Build a bi-directional model

In [None]:
# Single bi-directional LSTM layer, assembled layer by layer
model_3 = tf.keras.Sequential()

# Token ids -> 100-dimensional vectors; mask_zero=True marks id 0 as padding
model_3.add(tf.keras.layers.Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=100,
    mask_zero=True,
    input_length=MAXLEN,
))

# LSTM(64) wrapped so the sequence is processed in both directions
model_3.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# Single sigmoid unit for the binary author label
model_3.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Show the layer overview of the bi-directional model
model_3.summary()

In [None]:
# Train the bi-directional model; the test split doubles as the validation data here
train_and_evaluate(
    model_3,
    X_train_tok_pad, y_train,
    X_test_tok_pad, y_test,
    epochs=30,
    verbose=0,
)

### Build a deep bi-directional model

In [None]:
# Two stacked bi-directional LSTM layers, assembled layer by layer
model_4 = tf.keras.Sequential()

# Token ids -> 100-dimensional vectors; mask_zero=True marks id 0 as padding
model_4.add(tf.keras.layers.Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=100,
    mask_zero=True,
    input_length=MAXLEN,
))

# The first bi-directional layer emits its output at every step for the second one
model_4.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model_4.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)))

# Single sigmoid unit for the binary author label
model_4.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Show the layer overview of the deep bi-directional model
model_4.summary()

In [None]:
# Train the deep bi-directional model; the test split doubles as the validation data here
train_and_evaluate(
    model_4,
    X_train_tok_pad, y_train,
    X_test_tok_pad, y_test,
    epochs=30,
    verbose=0,
)

## Long distance dependencies - `SimpleRNN()` vs `LSTM()`

#### Experiment:

We will put a keyword (***Fußbodenheizung*** - a German word for (under)floor heating) at the beginning of a random sequence. We'll manipulate the sequence length and check how it affects the performance of `SimpleRNN` and `LSTM` in a classification task.

<br>

<img src="https://www.heizsparer.de/wp-content/uploads/images/estrich-fussbodenheizung-wolfilser-adobestock.jpg" alt="Drawing" style="width: 400px;"/>



In [None]:
# Keyword named in the experiment description.
# NOTE(review): build_dataset() marks positive sequences with the number -1
# and does not read this constant.
KEYWORD = 'fußbodenheizung'
# Sequence lengths the SimpleRNN / LSTM comparison is run for
LENGTHS = [10, 30, 50, 200]
# Re-assignment with the same value that the tokenizer cell already set
VOCAB_SIZE = 20000

In [None]:
# Check whether the keyword occurs in the Carroll text.
# Uses the KEYWORD constant and the already joined text instead of repeating
# the literal and re-joining the token list.
KEYWORD in carroll_join.lower()

In [None]:
def build_dataset(length, n_examples):
    """Generate a synthetic binary-classification set of numeric sequences.

    Every example is a sequence of ``length`` values drawn from the grid
    ``0.00, 0.01, ..., 0.99``. The label of each example is drawn at random
    from ``{0, 1}``; for label 1 the first value of the sequence is replaced
    by the marker ``-1``, for label 0 the sequence contains grid values only.
    All draws use NumPy's global random state.

    Parameters
    ----------
    length : int
        Number of time steps per sequence. Must be >= 1.
    n_examples : int
        Number of sequences to generate. Must be >= 0.

    Returns
    -------
    X : numpy.ndarray of float, shape (n_examples, length, 1)
        The sequences with a trailing feature axis of size 1.
    y : numpy.ndarray of int, shape (n_examples,)
        The labels.

    Raises
    ------
    ValueError
        If ``length`` is smaller than 1 or ``n_examples`` is negative.
    """
    if length < 1:
        raise ValueError(f'length must be at least 1, got {length!r}')
    if n_examples < 0:
        raise ValueError(f'n_examples must be non-negative, got {n_examples!r}')

    # The value grid is loop-invariant, so build it once
    values = np.arange(0, 1, .01)

    X = []
    y = []

    for _ in range(n_examples):
        class_ = np.random.choice([0, 1])

        if class_ == 1:
            # Marker first, then length - 1 random grid values
            row = np.concatenate(([-1.], np.random.choice(values, length - 1)))
        else:
            row = np.random.choice(values, length)

        X.append(row)
        y.append(class_)

    # An explicit reshape (rather than indexing with np.newaxis) also yields
    # the right shape for n_examples == 0, which used to raise IndexError.
    return (
        np.array(X, dtype=float).reshape(n_examples, length, 1),
        np.array(y, dtype=int),
    )

In [None]:
def build_model(rnn_type, len_):
    """Build an uncompiled two-layer recurrent binary classifier.

    The network stacks two recurrent layers of the requested kind (64 units
    returning the full sequence, then 128 units), followed by a 32-unit tanh
    dense layer, dropout of 0.2 and a single sigmoid output unit.

    Parameters
    ----------
    rnn_type : {'rnn', 'lstm'}
        ``'rnn'`` selects ``SimpleRNN`` layers, ``'lstm'`` selects ``LSTM``.
    len_ : int
        Number of time steps; the model input shape is ``(len_, 1)``.

    Returns
    -------
    tf.keras.Sequential
        The assembled model.

    Raises
    ------
    ValueError
        If ``rnn_type`` is not one of the supported names.
    """
    # Validate first: an unknown name used to fall through both branches and
    # fail later with an UnboundLocalError on `rnn_layer`.
    if rnn_type not in ('rnn', 'lstm'):
        raise ValueError(f"rnn_type must be 'rnn' or 'lstm', got {rnn_type!r}")

    if rnn_type == 'rnn':
        rnn_layer = tf.keras.layers.SimpleRNN
    else:
        rnn_layer = tf.keras.layers.LSTM

    model = tf.keras.Sequential([

        rnn_layer(64, input_shape=(len_, 1), return_sequences=True),
        rnn_layer(128),

        tf.keras.layers.Dense(32, activation='tanh'),
        tf.keras.layers.Dropout(.2),

        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    return model

In [None]:
# Compare SimpleRNN and LSTM on synthetic sequences of increasing length
for len_ in LENGTHS:

    # Fresh synthetic dataset of 200 sequences for this length
    print(f'Buidling dataset of length {len_}')
    X, y = build_dataset(len_, 200)

    # Hold out 20% of the examples
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

    # Both networks are created before either one is trained
    rnn_model, lstm_model = (build_model(kind, len_) for kind in ('rnn', 'lstm'))

    # Train and evaluate: SimpleRNN first, then LSTM, on the same split
    for title, network in (('RNN', rnn_model), ('LSTM', lstm_model)):
        print(f'\n{title} for {len_}')
        train_and_evaluate(network, X_train, y_train, X_test, y_test, verbose=0, epochs=30)