# Train using Word Embedding, Dense Layer, and Keras
Reference: https://www.kaggle.com/nzw0301/simple-keras-fasttext-val-loss-0-31/notebook  

In [1]:
import pandas as pd
import numpy as np
from multiprocessing import cpu_count

import keras

import spacy

!python -m spacy download en_core_web_sm
!python -m spacy validate

Using TensorFlow backend.


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/home/ec2-user/anaconda3/lib/python3.7/site-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                            
package   en-core-web-sm   en_core_web_sm   [38;5;2m2.1.0[0m   [38;5;2m✔[0m



# Read Data Set

In [2]:
filename_dataset = './datasets/train.csv'
df = pd.read_csv(filename_dataset)

df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
y = df['author'].map({'EAP':0, 'HPL':1, 'MWS':2})
y.head()

0    0
1    1
2    0
3    2
4    1
Name: author, dtype: int64

In [4]:
y = keras.utils.to_categorical(y)

In [5]:
y.shape

(19579, 3)

In [6]:
y[:5]

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]], dtype=float32)

# Preprocessing
### Low-frequency words
In my experience, fastText is very fast, but I need to delete rare words to avoid overfitting.

##### NOTE: Some keywords are rare words, such like Cthulhu in Cthulhu Mythos of Howard Phillips Lovecraft. But these are useful for this task.

### Removing Stopwords
Nothing. To identify author from a sentence, some stopwords play an important role because one has specific usages of them.

### Stemming and Lowercase
Nothing. This reason is the same for stopwords removing. And I guess some stemming rules provided by libraries is bad for this task because all author is the older author.

### Cutting long sentence
Too long documents are cut.

### Punctuation
Because I guess each author has unique punctuations's usage in the novel, I separate them from words.

e.g. Don't worry -> Don ' t worry

In [7]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'tagger'])
nlp.pipeline

[]

# Define Function to Insert White Spaces around Punctuations

In [8]:
def brief_cleaning(raw_docs):
    """
    Normalize the docs and insert white spaces around each puncutation
    """
    punctuations = set(',.:;\"?!')
    
    for doc in raw_docs:
        doc = doc.lower()
        doc = doc.replace('\'', ' \' ')
        
        intersections = set(doc).intersection(punctuations)
        
        if not intersections:
            yield doc
        else:
            # insert white spaces around punctuations
            for punct in intersections:
                doc = doc.replace(punct, ' {} '.format(punct))
                
            yield doc
            

def preprocess(doc_spacy):
    """
    Lemmatize
    
    Args:
        doc_spacy (spacy.tokens.doc.Doc):
    """
    
    return [token.lemma_ for token in doc_spacy if not token.is_space]

In [9]:
text = df['text'][0]
print(text)

This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.


In [10]:
text.lower().replace('\'', ' \' ')

punctuations = set(',.:;\"?!')
intersections = set(text).intersection(punctuations)

print(intersections)

if not intersections:
    print('do nothing')
else:
    for punct in intersections:
        text = text.replace(punct, ' {} '.format(punct))
    
print(text)

{';', '.', ','}
This process ,  however ,  afforded me no means of ascertaining the dimensions of my dungeon ;  as I might make its circuit ,  and return to the point whence I set out ,  without being aware of the fact ;  so perfectly uniform seemed the wall . 


In [11]:
print([token.lemma_ for token in nlp(text) if not token.is_space])

['This', 'process', ',', 'however', ',', 'afford', 'me', 'no', 'mean', 'of', 'ascertain', 'the', 'dimension', 'of', 'my', 'dungeon', ';', 'a', 'I', 'may', 'make', 'its', 'circuit', ',', 'and', 'return', 'to', 'the', 'point', 'whence', 'I', 'set', 'out', ',', 'without', 'be', 'aware', 'of', 'the', 'fact', ';', 'so', 'perfectly', 'uniform', 'seem', 'the', 'wall', '.']


# Start Preprocessing

In [12]:
%time docs = [preprocess(doc) for doc in nlp.pipe(brief_cleaning(df['text']), batch_size=100, n_threads=cpu_count())]

CPU times: user 8.45 s, sys: 52 ms, total: 8.51 s
Wall time: 8.54 s


In [13]:
len(docs)

19579

In [14]:
print(docs[0])

['this', 'process', ',', 'however', ',', 'afford', 'me', 'no', 'mean', 'of', 'ascertain', 'the', 'dimension', 'of', 'my', 'dungeon', ';', 'a', 'i', 'may', 'make', 'its', 'circuit', ',', 'and', 'return', 'to', 'the', 'point', 'whence', 'i', 'set', 'out', ',', 'without', 'be', 'aware', 'of', 'the', 'fact', ';', 'so', 'perfectly', 'uniform', 'seem', 'the', 'wall', '.']


In [15]:
np.max(docs)

['\ufeff1', 'of', 'all', 'i', 'dismember', 'the', 'corpse', '.']

# Create Dictionary

In [16]:
tokenizer = keras.preprocessing.text.Tokenizer(filters='')

In [17]:
# tokenizer.fit_on_texts([token for doc in docs[:2] for token in doc])
tokenizer.fit_on_texts(docs)

In [18]:
len(tokenizer.word_counts)

18048

In [19]:
print(docs[0])

['this', 'process', ',', 'however', ',', 'afford', 'me', 'no', 'mean', 'of', 'ascertain', 'the', 'dimension', 'of', 'my', 'dungeon', ';', 'a', 'i', 'may', 'make', 'its', 'circuit', ',', 'and', 'return', 'to', 'the', 'point', 'whence', 'i', 'set', 'out', ',', 'without', 'be', 'aware', 'of', 'the', 'fact', ';', 'so', 'perfectly', 'uniform', 'seem', 'the', 'wall', '.']


In [20]:
print(tokenizer.texts_to_sequences(docs[0:2]))

[[23, 2200, 1, 151, 1, 646, 27, 38, 170, 3, 1508, 2, 1851, 3, 13, 3312, 14, 7, 9, 58, 64, 49, 3699, 1, 5, 175, 8, 2, 188, 1930, 9, 299, 86, 1, 145, 6, 1009, 3, 2, 303, 14, 40, 1509, 4228, 94, 2, 232, 4], [16, 107, 134, 592, 8, 27, 12, 2, 3700, 58, 6, 7, 449, 889, 4]]


In [21]:
tokenizer.index_word[23]

'this'

In [22]:
tokenizer.index_word[2200]

'process'

# Encode the Train Dataset

In [23]:
docs_encoded = tokenizer.texts_to_sequences(docs)

In [24]:
type(docs_encoded)

list

In [25]:
len(docs_encoded)

19579

In [26]:
seq_length = 256
docs_encoded = keras.preprocessing.sequence.pad_sequences(sequences=docs_encoded, 
                                                          maxlen=seq_length,
                                                          dtype='int32',
                                                         padding='post',
                                                         value=0)

In [27]:
type(docs_encoded)

numpy.ndarray

In [28]:
docs_encoded.shape

(19579, 256)

In [29]:
docs_encoded[:2, :30]

array([[  23, 2200,    1,  151,    1,  646,   27,   38,  170,    3, 1508,
           2, 1851,    3,   13, 3312,   14,    7,    9,   58,   64,   49,
        3699,    1,    5,  175,    8,    2,  188, 1930],
       [  16,  107,  134,  592,    8,   27,   12,    2, 3700,   58,    6,
           7,  449,  889,    4,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]], dtype=int32)

# Define DNN Model

In [30]:
def create_model(input_dim, vocab_size, embedding_size=128, output_dim=3, lr=1e-3):
    
    # Input shape does not include batch_size: (None, seq_length)
    inputs = keras.layers.Input(input_dim[1:])
    
    x_embed = keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size)(inputs)
    
    x_avg = keras.layers.GlobalAveragePooling1D()(x_embed)
    
    x_fc1 = keras.layers.Dense(output_dim)(x_avg)
    
    outputs = keras.layers.Activation('softmax')(x_fc1)
    
    model = keras.models.Model(inputs=inputs, outputs=outputs)
    
    optimizer = keras.optimizers.Adam(lr=lr)
    
    model.compile(loss='categorical_crossentropy',
                 optimizer=optimizer,
                 metrics=['accuracy'])
    return model


# Vocab size includes zero-padding
vocab_size = len(tokenizer.word_counts) + 1 
input_dim = docs_encoded.shape

model = create_model(input_dim=input_dim, vocab_size=vocab_size, embedding_size=128)

model.summary()

W0716 01:54:06.121537 140095028594368 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0716 01:54:06.138678 140095028594368 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0716 01:54:06.144138 140095028594368 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0716 01:54:06.178609 140095028594368 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 256)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 256, 128)          2310272   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 387       
_________________________________________________________________
activation_1 (Activation)    (None, 3)                 0         
Total params: 2,310,659
Trainable params: 2,310,659
Non-trainable params: 0
_________________________________________________________________


In [31]:
epochs=100
%time model.fit(docs_encoded, y, batch_size=128, epochs=epochs, callbacks=[keras.callbacks.EarlyStopping(patience=2, monitor='loss')])

W0716 01:54:06.286807 140095028594368 deprecation.py:323] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0716 01:54:06.350580 140095028594368 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
CPU times: user 13min 37s, sys: 41.6 s, total: 14min 19s
Wall time: 8min 6s


<keras.callbacks.History at 0x7f6a16480748>

# Define RNN Model

In [32]:
def get_model(input_dim, vocab_size, embed_size=64, hidden_size=32, output_dim=3, lr=1e-3):

    # input_dim = (None, seq_length, feature_size)
    inputs = keras.layers.Input(input_dim[1:])
    
    x_embed = keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size)(inputs)
    
    x_rnn = keras.layers.GRU(units=hidden_size, return_sequences=False)(x_embed)
    
    x_fc1 = keras.layers.Dense(output_dim)(x_rnn)
    
    outputs = keras.layers.Activation('softmax')(x_fc1)
    
    model = keras.models.Model(inputs=inputs, outputs=outputs)
    
    optimizer = keras.optimizers.Adam(lr=lr)
    
    model.compile(loss='categorical_crossentropy',
                 optimizer=optimizer,
                 metrics=['accuracy'])
    return model


model = get_model(input_dim=input_dim, vocab_size=vocab_size)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 256)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 256, 64)           1155136   
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                9312      
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 99        
_________________________________________________________________
activation_2 (Activation)    (None, 3)                 0         
Total params: 1,164,547
Trainable params: 1,164,547
Non-trainable params: 0
_________________________________________________________________


In [33]:
epochs = 5

%time model.fit(docs_encoded, y, batch_size=128, epochs=epochs, callbacks=[keras.callbacks.EarlyStopping(patience=2, monitor='loss')])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
CPU times: user 3min 2s, sys: 5.68 s, total: 3min 8s
Wall time: 1min 40s


<keras.callbacks.History at 0x7f6a167112b0>

# Define Bidirectional RNN Model

In [116]:
# Vocab size includes zero-padding
vocab_size = len(tokenizer.word_counts) + 1 
# docs_encoded = np.expand_dims(docs_encoded, axis=len(docs_encoded)-1)
input_dim = docs_encoded.shape

print(input_dim)

(19579, 256)


In [133]:
def get_model(input_dim, vocab_size, embed_size=64, hidden_size=32, output_dim=3, lr=1e-3):

    # input_dim = (None, seq_length, feature_size)
    inputs = keras.layers.Input(input_dim[1:])
    
    x_embed = keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size)(inputs)
    
    x_rnn = keras.layers.Bidirectional(keras.layers.GRU(units=hidden_size, return_sequences=False))(x_embed)
    
    x_fc1 = keras.layers.Dense(output_dim)(x_rnn)
    
    outputs = keras.layers.Activation('softmax')(x_fc1)
    
    model = keras.models.Model(inputs=inputs, outputs=outputs)
    
    optimizer = keras.optimizers.Adam(lr=lr)
    
    model.compile(loss='categorical_crossentropy',
                 optimizer=optimizer,
                 metrics=['accuracy'])
    return model


model = get_model(input_dim=input_dim, vocab_size=vocab_size)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_37 (InputLayer)        (None, 256)               0         
_________________________________________________________________
embedding_35 (Embedding)     (None, 256, 64)           1155136   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                18624     
_________________________________________________________________
dense_22 (Dense)             (None, 3)                 195       
_________________________________________________________________
activation_22 (Activation)   (None, 3)                 0         
Total params: 1,173,955
Trainable params: 1,173,955
Non-trainable params: 0
_________________________________________________________________


In [134]:
epochs = 20

%time model.fit(docs_encoded, y, batch_size=128, epochs=epochs, callbacks=[keras.callbacks.EarlyStopping(patience=2, monitor='loss')])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 35min 28s, sys: 15.5 s, total: 35min 43s
Wall time: 18min 20s


<keras.callbacks.History at 0x7f9f187f5198>