In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K

from tensorflow.keras import regularizers
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Concatenate, BatchNormalization, Average, Convolution2D, MaxPooling2D, Activation, GlobalMaxPooling2D, AveragePooling2D, GlobalAveragePooling2D, Embedding, LSTM, GRU, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model, Sequential
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.preprocessing.sequence import TimeseriesGenerator, pad_sequences, skipgrams
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras import initializers

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import numpy as np
import matplotlib.pyplot as plt

from dz07lib import create_or_check_path, detect_encoding, read_and_modify_vocab, reverse_dict
import threading
import os
import pickle
import random
import time

print(tf.__version__)

1.14.0


In [2]:
if tf.__version__.startswith("1."):
    val_acc_name = "val_acc"
    acc_name = "acc"
else:
    tf.compat.v1.disable_eager_execution()
    val_acc_name = "val_accuracy"
    acc_name = "accuracy"

In [3]:
# total_text_filepath = "data/total_max_frei.txt"
total_text_filepath = "data/raw_eng/train.50k"
print(os.path.isfile(total_text_filepath))

True


In [4]:
with open(total_text_filepath, "rt") as file:
    lines = file.read().split("\n")
lines = [line for line in lines if len(line) > 0]

In [5]:
VOCAB_SIZE_LIMIT = 10000
SEQUENCE_LENGTH = 50

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)

vocab_size = len(tokenizer.word_index) + 1
print("Vocab size before:", vocab_size)
if vocab_size > VOCAB_SIZE_LIMIT:
    vocab_size = VOCAB_SIZE_LIMIT
print("Vocab size after:", vocab_size)

tokenizer.num_words = vocab_size

Vocab size before: 50596
Vocab size after: 10000


In [7]:
sequences = np.array(pad_sequences(tokenizer.texts_to_sequences(lines), maxlen=SEQUENCE_LENGTH+1))

In [8]:
np.max(sequences)

9999

In [9]:
X, y = sequences[:,:-1], sequences[:,-1]

In [10]:
y = to_categorical(y, num_classes=vocab_size)

In [11]:
X.shape

(49998, 50)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=3458273)

In [13]:
seq_length = X.shape[1]
seq_length

50

In [14]:
try:
    del model
except:
    pass
K.clear_session()

model = Sequential()
model.add(Embedding(vocab_size, 60, input_length=seq_length))
model.add(LSTM(200, return_sequences=True, activation='relu', recurrent_activation='relu', dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(200, return_sequences=True, activation='relu', recurrent_activation='relu', dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(200, return_sequences=True, activation='relu', recurrent_activation='relu', dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(200, activation='relu', recurrent_activation='relu', dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.25))
model.add(Dense(200, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 60)            600000    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 200)           208800    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 200)           320800    
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 200)           320800    
_________________________________________________________________
lstm_3 (LSTM)                (None, 200)               320800    
__________________

In [15]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# save the tokenizer
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [17]:
checkpoint = ModelCheckpoint('lm_for_tokenizer.hdf5',
                            monitor=val_acc_name,
                            save_best_only=True,
                            period=1,
                            verbose=1)
early_stopping_callback = EarlyStopping(monitor=val_acc_name, 
                                        min_delta=1e-9, 
                                        patience=50, 
                                        verbose=1, 
                                        mode='auto', 
                                        baseline=None, 
                                        restore_best_weights=True)
learning_rate_reduction = ReduceLROnPlateau(monitor=val_acc_name,
                                           patience=10,
                                           verbose=1,
                                           factor=0.75,
                                           min_lr=0.00000001)



In [18]:
# fit model
model.fit(X_train, y_train, 
          batch_size=128, 
          epochs=9999999999, 
          callbacks=[checkpoint, learning_rate_reduction, early_stopping_callback],
         validation_data=(X_test, y_test))

Train on 39998 samples, validate on 10000 samples
Epoch 1/9999999999
Epoch 00001: val_acc improved from -inf to 0.03480, saving model to lm_for_tokenizer.hdf5
Epoch 2/9999999999
Epoch 00002: val_acc did not improve from 0.03480
Epoch 3/9999999999
Epoch 00003: val_acc did not improve from 0.03480
Epoch 4/9999999999
Epoch 00004: val_acc did not improve from 0.03480
Epoch 5/9999999999
Epoch 00005: val_acc did not improve from 0.03480
Epoch 6/9999999999
Epoch 00006: val_acc did not improve from 0.03480
Epoch 7/9999999999
Epoch 00007: val_acc did not improve from 0.03480
Epoch 8/9999999999
Epoch 00008: val_acc did not improve from 0.03480
Epoch 9/9999999999
Epoch 00009: val_acc did not improve from 0.03480
Epoch 10/9999999999
Epoch 00010: val_acc did not improve from 0.03480
Epoch 11/9999999999
Epoch 00011: val_acc did not improve from 0.03480

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0007500000356230885.
Epoch 12/9999999999
Epoch 00012: val_acc did not improve from 0.0348

Epoch 00029: val_acc did not improve from 0.03480
Epoch 30/9999999999
Epoch 00030: val_acc did not improve from 0.03480
Epoch 31/9999999999
Epoch 00031: val_acc did not improve from 0.03480

Epoch 00031: ReduceLROnPlateau reducing learning rate to 0.0004218749818392098.
Epoch 32/9999999999
Epoch 00032: val_acc did not improve from 0.03480
Epoch 33/9999999999
Epoch 00033: val_acc did not improve from 0.03480
Epoch 34/9999999999
Epoch 00034: val_acc did not improve from 0.03480
Epoch 35/9999999999
Epoch 00035: val_acc did not improve from 0.03480
Epoch 36/9999999999
Epoch 00036: val_acc did not improve from 0.03480
Epoch 37/9999999999
Epoch 00037: val_acc did not improve from 0.03480
Epoch 38/9999999999
Epoch 00038: val_acc did not improve from 0.03480
Epoch 39/9999999999
Epoch 00039: val_acc did not improve from 0.03480
Epoch 40/9999999999
Epoch 00040: val_acc did not improve from 0.03480
Epoch 41/9999999999
Epoch 00041: val_acc did not improve from 0.03480

Epoch 00041: ReduceLROnPlate

<tensorflow.python.keras.callbacks.History at 0x7f5351c0cd68>

## Generate

In [19]:
# select a seed text
seed_text = ""
while len(seed_text) < 150:
    random.seed(time.time())
    seed_text = lines[random.randint(0,len(lines))]
print(seed_text + '\n')

a.p. a man who committed murder at age eleven pleaded guilty to drug possession monday less than two years after he was released from juvenile detention for the killing



In [20]:
encoded = tokenizer.texts_to_sequences([seed_text])
encoded = pad_sequences(encoded, maxlen=SEQUENCE_LENGTH)
encoded.shape

(1, 50)

In [21]:
# predict probabilities for each word
yhat = model.predict_classes(encoded, verbose=1)



In [22]:
out_word = ''
for word, index in tokenizer.word_index.items():
    if index == yhat:
        out_word = word
        break

In [23]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
#         print(in_text)
        result.append(out_word)
    return ' '.join(result)

In [24]:
print("Seed string:\n", seed_text + '\n')

Seed string:
 a.p. a man who committed murder at age eleven pleaded guilty to drug possession monday less than two years after he was released from juvenile detention for the killing



In [25]:
print("Generated text:")
generated_text = generate_seq(model, tokenizer, seq_length, seed_text, 100)
print(generated_text)

Generated text:
said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said said
