In [1]:
import gc
import tensorflow as tf
import numpy as np
import PyPDF3
import string

In [2]:
from collections import namedtuple
import tensorflow.keras.layers as layers
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
words = ['I have never been there before', 'I am going there']

In [4]:
tok = Tokenizer(num_words = 30, oov_token = '<OOV>')

In [5]:
dir(tok)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_keras_api_names',
 '_keras_api_names_v1',
 'char_level',
 'document_count',
 'filters',
 'fit_on_sequences',
 'fit_on_texts',
 'get_config',
 'index_docs',
 'index_word',
 'lower',
 'num_words',
 'oov_token',
 'sequences_to_matrix',
 'sequences_to_texts',
 'sequences_to_texts_generator',
 'split',
 'texts_to_matrix',
 'texts_to_sequences',
 'texts_to_sequences_generator',
 'to_json',
 'word_counts',
 'word_docs',
 'word_index']

In [6]:
tok.fit_on_texts(words)

In [7]:
w = tok.texts_to_sequences(words)

In [8]:
s = pad_sequences(w, maxlen = 15)

In [9]:
s

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 5, 6, 3, 7],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 8, 9, 3]])

In [10]:
embedder = Embedding(20, output_dim=3, input_length = 15)

In [11]:
r = embedder(s)

In [12]:
r.shape

TensorShape([2, 15, 3])

In [13]:
def get_text(fpath):
    """
    Load text from file into string.
    
    Parameters
    ----------
    fpath (str):
        Path to file. Must have either a .txt or .pdf extension.
    """
    if fpath.endswith('.pdf'):
        with open(fpath, "rb") as f:
            pdf = PyPDF3.PdfFileReader(f)
            text = str()
            for page_num in range(pdf.numPages):
                page = pdf.getPage(page_num)
                text = text + ' ' + page.extractText()
    else:
        with open(fpath, 'r') as f:
            text = f.read()
    return text

In [14]:
def encode_text(text, extend = True, unique_chars = None):
    """
    Tokenize text string
    
    Parameters
    ----------
    text (str):
        String of tet to tokenize.
    extend (bool):
        Add on punctuations to unique character set.
    unique_chars (list):
        Any set of unique characters already in memory.
    """
    result_tuple = namedtuple('results', ['encoded_text', 'unique_char', 'int2char', 'char2int'])
    
    if unique_chars is None:
        unique_chars = list(set(text).union(set(string.punctuation)))
    if extend:
        unique_chars.extend(list(string.punctuation))
        
    char2int = {char : unique_chars.index(char) for char in unique_chars}
    int2char = {v : k for (k, v) in char2int.items()}
    
    encoded_text = np.array(list(map(lambda x: char2int[x], list(text))))
    
    return result_tuple(encoded_text, unique_chars, int2char, char2int)

In [15]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [16]:
def batch_sequence(arr, batch_size, n_labels, seq_length):
    numel_seq = batch_size * seq_length
    num_batches = arr.size // numel_seq
    
    arr = arr[: num_batches * numel_seq].reshape(batch_size, -1)
    #print(arr.shape)
    
    batched_data = [(tf.convert_to_tensor(arr[:, n : n + seq_length]),
                     tf.convert_to_tensor(one_hot_encode(arr[:, n + 1 : n + 1 + seq_length], n_labels)))
                    for n in range(0, arr.shape[1], seq_length)]
    
    ### Finalize final array size
    batched_data[-1] = (batched_data[-1][0],
                        tf.concat([batched_data[-1][1],
                                   tf.expand_dims(one_hot_encode(batched_data[0][0][:, 0].numpy(), n_labels), axis = 1)],
                                  axis = 1))
    
    ###batched_arr = [arr[n : n + numel_seq].reshape(batch_size, seq_length) for n in range(num_batches)]
    return iter(batched_data), num_batches

In [17]:
def batch_sequence(arr, batch_size, n_labels, seq_length):
    numel_seq = batch_size * seq_length
    num_batches = arr.size // numel_seq
    
    arr = arr[: num_batches * numel_seq].reshape(batch_size, -1)
    #print(arr.shape)
    
    batched_data = [(tf.convert_to_tensor(arr[:, n : n + seq_length]),
                     tf.convert_to_tensor(arr[:, n + 1 : n + 1 + seq_length]))
                    for n in range(0, arr.shape[1], seq_length)]
    
    ### Finalize final array size
    batched_data[-1] = (batched_data[-1][0],
                        tf.concat([batched_data[-1][1],
                                   tf.expand_dims(batched_data[0][0][:, 0].numpy(), axis = 1)],
                                  axis = 1))
    
    ###batched_arr = [arr[n : n + numel_seq].reshape(batch_size, seq_length) for n in range(num_batches)]
    return iter(batched_data), num_batches

In [18]:
class CharRNN(tf.keras.Model):
    def __init__(self, vocab_size = 100, output_dim = 32, input_length = 100,
                 bidirectional = False, num_lstm = 2):
        super(CharRNN, self).__init__()
        
        self.vocab_size = vocab_size
        self.output_dim = output_dim
        self.input_length = input_length
        
        self.num_lstm = num_lstm
        self.bidirectional = bidirectional
        
        self.embedder = layers.Embedding(input_dim = self.vocab_size, output_dim = self.output_dim,
                                         input_length = self.input_length)
        lstm_list = list()
        if self.num_lstm > 1:
            for num in range(self.num_lstm):
                lstm_list.append(layers.LSTM(self.output_dim, return_sequences = True))
                if num == self.num_lstm-1:
                    lstm_list.append(layers.LSTM(self.output_dim, return_sequences = True, dropout = 0.25))
        else:
            lstm_list.append(layers.LSTM(self.output_dim, dropout = 0.2))
            
        if self.bidirectional:
            self.lstm = layers.Bidirectional(*lstm_list)
        else:
            self.lstm = tf.keras.models.Sequential(lstm_list)
            
        self.fc1 = layers.Dense(units = 2*self.vocab_size, activation = 'relu')
        self.fc2 = layers.Dense(units = self.vocab_size, activation = None)
        
    def call(self, x):
        y = self.embedder(x)
        y = self.lstm(y)
        y = self.fc1(y)
        
        return self.fc2(y)

In [19]:
help(tf.keras.models.Model)

Help on class Model in module keras.engine.training:

class Model(keras.engine.base_layer.Layer, keras.utils.version_utils.ModelVersionSelector)
 |  Model(*args, **kwargs)
 |  
 |  `Model` groups layers into an object with training and inference features.
 |  
 |  Args:
 |      inputs: The input(s) of the model: a `keras.Input` object or list of
 |          `keras.Input` objects.
 |      outputs: The output(s) of the model. See Functional API example below.
 |      name: String, the name of the model.
 |  
 |  There are two ways to instantiate a `Model`:
 |  
 |  1 - With the "Functional API", where you start from `Input`,
 |  you chain layer calls to specify the model's forward pass,
 |  and finally you create your model from inputs and outputs:
 |  
 |  ```python
 |  import tensorflow as tf
 |  
 |  inputs = tf.keras.Input(shape=(3,))
 |  x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs)
 |  outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x)
 |  model = tf.kera

In [20]:
model = CharRNN()

In [21]:
train_data = get_text('anna.txt')
val_data = get_text('The-Prince.pdf')

In [22]:
encoded_val, unique_chars, _, _ = encode_text(val_data)
encoded_train, unique_chars, int2char, char2int = encode_text(train_data, unique_chars = unique_chars, extend = False)

In [23]:
len(set(unique_chars))

96

In [24]:
set(train_data).difference(set(val_data))

{'!', '_', '`'}

In [25]:
set(val_data).difference(set(train_data))

{'#', '+', '[', ']'}

In [26]:
len(set(train_data))

83

In [27]:
len(set(val_data))

84

In [28]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [29]:
int2char.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95])

In [30]:
model = CharRNN(vocab_size = 96, output_dim = 16, input_length = 16)

In [31]:
dir(model)

['_SCALAR_UPRANKING_ON',
 '_TF_MODULE_IGNORED_PROPERTIES',
 '__call__',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_activity_regularizer',
 '_add_trackable',
 '_add_trackable_child',
 '_add_variable_with_custom_getter',
 '_assert_compile_was_called',
 '_assert_weights_created',
 '_auto_track_sub_layers',
 '_autocast',
 '_autographed_call',
 '_base_model_initialized',
 '_build_input_shape',
 '_call_accepts_kwargs',
 '_call_arg_was_passed',
 '_call_fn_arg_defaults',
 '_call_fn_arg_positions',
 '_call_fn_args',
 '_call_full_argspec',
 '_callable_losses',
 '_cast_single_input',
 '_check_call_args',
 '_ch

In [32]:
batch_size = 32
sequence_len = 16
epochs = 20

In [33]:
gc.collect()

12069

In [34]:
### Batchify data
train_data, _ = batch_sequence(encoded_train, n_labels = 96, batch_size = batch_size, seq_length = sequence_len)
val_data = batch_sequence(encoded_val, n_labels = 96, batch_size = batch_size, seq_length = sequence_len)[0]

In [35]:
x = next(val_data)

In [36]:
x[0].shape

TensorShape([32, 16])

In [37]:
x[1].shape

TensorShape([32, 16])

In [38]:
gc.collect()

0

In [39]:
_

3877

In [40]:
### Batchify data
train_data, _ = batch_sequence(encoded_train, n_labels = 96, batch_size = batch_size, seq_length = sequence_len)
val_data = batch_sequence(encoded_val, n_labels = 96, batch_size = batch_size, seq_length = sequence_len)[0]

In [42]:
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-4, beta_1 = 0.9, beta_2 = 0.99),
              loss = tf.keras.losses.SparseCategoricalCrossentropy(), )

model.fit(tf.data.Dataset.from_generator(train_data), epochs = epochs,
          validation_data = tf.data.Dataset.from_generator(val_data))

TypeError: `generator` must be a Python callable.

In [None]:
dir(tf.data.Dataset)