In [1]:
# Show the interpreter version this notebook was executed with (rendered as the cell output).
import sys
sys.version

'3.7.5 (default, Nov  7 2019, 10:50:52) \n[GCC 8.3.0]'

In [2]:
import re
import time
from typing import List, Union

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds
from unidecode import unidecode

In [107]:
# Report the TensorFlow build, whether eager execution is on, and whether a GPU is visible.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
# tf.config.list_physical_devices is the stable spelling of the former
# tf.config.experimental.list_physical_devices alias; an empty list means no GPU.
print("GPU is", "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.3.0
Eager mode:  True
GPU is NOT AVAILABLE


In [3]:
# Load every split of the IMDB reviews dataset together with its DatasetInfo,
# then print the label, feature and split metadata for reference.
imdb_reviews, info = tfds.load('imdb_reviews', with_info=True)
# ds = ds.shuffle(1024).batch(32).prefetch(tf.data.experimental.AUTOTUNE)

print('Label info:')
print(info.features["label"].num_classes)
print(info.features["label"].names)
print(info.features["label"].int2str(0))  # Human-readable name for a class id (0 -> 'neg')
print(info.features["label"].str2int('pos'))  # Inverse lookup: class name -> class id

print()
print('Feature info:')
print(info.features.shape)
print(info.features.dtype)
print(info.features['text'].shape)
print(info.features['text'].dtype)

print()
print('Datasplit info:')
print(info.splits.keys())
print(info.splits['train'].num_examples)
print(info.splits['train'].filenames)
print(info.splits['train'].num_shards)

Label info:
2
['neg', 'pos']
neg
1

Feature info:
{'text': (), 'label': ()}
{'text': tf.string, 'label': tf.int64}
()
<dtype: 'string'>

Datasplit info:
dict_keys(['test', 'train', 'unsupervised'])
25000
['imdb_reviews-train.tfrecord-00000-of-00001']
1


In [4]:
# Percentage of the official 'train' split used for training; the remainder
# becomes the validation set. The official 'test' split is loaded untouched.
SPLIT_PERCENT = 80

# as_supervised=True makes each element a (text, label) tuple instead of a dict.
imdb_train = tfds.load('imdb_reviews', split=f'train[:{SPLIT_PERCENT}%]', as_supervised=True)
imdb_val = tfds.load('imdb_reviews', split=f'train[{SPLIT_PERCENT}%:]', as_supervised=True)
imdb_test = tfds.load('imdb_reviews', split='test', as_supervised=True)

# Sanity check: number of examples in each split.
print(len(imdb_train))
print(len(imdb_val))
print(len(imdb_test))

20000
5000
25000


In [5]:
# Peek at the first two training examples; each element is a (text, label) tuple of tensors.
for elm in imdb_train.take(2):
    print(elm)
# NOTE(review): the disabled lines below index the element like a dict, which does not
# match the tuple elements produced by as_supervised=True.
#     text, label = elm['text'], elm['label']
#     print('Text:', text)
#     print('Label:', label)
#     print()

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on

In [82]:
# Print one shuffled validation example (review text tensor, then label tensor).
# The buffer size (25000) is larger than the validation split, so the whole split is shuffled.
# NOTE(review): no seed is passed to shuffle, so the example shown presumably changes
# between kernel runs - confirm whether a fixed seed is wanted here.
for text, label in imdb_val.shuffle(25000, reshuffle_each_iteration=False).take(1):
    print(text)
    print(label)

tf.Tensor(b'Surreal film noir released soon after the "real," genre-defining classics "The Maltese Falcon," "Double Indemnity" and "The Postman Always Rings Twice." Welles films shouldn\'t be evaluated against others. He was playing by different rules. In fact, he was playing. This starts where other femme fatale films leave off, so the vaguely logical (but interesting) whodunit is embellished with a display of Wellesian scenes (typical rapid-fire style), dialog (lots of "hard-boiled" philosophy), and unusual acting (good Hayworth presumably intentionally one-dimensional). To Welles "genre" may have meant "formula" but he seemed to like using "mysteries" as backgrounds for his "entertainments."', shape=(), dtype=string)
tf.Tensor(1, shape=(), dtype=int64)


In [83]:
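# Complete punctuation from string.punctuation: !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~
# Tokenizer.PUNCTUATIONS below leaves out four of these: ',', '.' and ':' are split
# by REAL_SEPARATOR_REGEX only when they end a word, and '-' is never split.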

class Tokenizer:
    """Rule-based word tokenizer with a frequency-capped vocabulary.

    Text is split on whitespace after punctuation has been spaced out, and
    every distinct token is assigned an integer id. Four special tokens
    (pad, unk, bos, eos) are registered first, so with the default
    construction they receive ids 0, 1, 2 and 3 in that order.

    Args:
        max_vocab: Maximum number of regular (non-special) words kept after
            ``train_tokenizer``.
        lower: Lower-case the text before tokenizing.
        normalize: Transliterate the text to ASCII with ``unidecode``.
        remove_non_ascii: Delete characters matched by ``NON_ASCII_REGEX``.
            Mutually exclusive with ``normalize``.
        pad_token, unk_token, bos_token, eos_token: Spellings of the special
            tokens.

    Raises:
        ValueError: If both ``normalize`` and ``remove_non_ascii`` are set.
    """

    # Every character outside the ASCII range, except the en dash (U+2013), which is kept.
    NON_ASCII_REGEX = re.compile(r"[^\x00-\x7F\u2013]")
    # Characters that always become a token of their own.
    PUNCTUATIONS = '!"#$%&\'()*+/;<=>@?[\\]^_`{|}~'
    # re.escape is required here: unescaped, the "\]" pair inside the character class is
    # read as an escaped "]" and the backslash itself is never matched.
    PUNCTUATIONS_REGEX = re.compile(r"([%s])" % re.escape(PUNCTUATIONS))
    # '.', ',' or ':' followed by a non-alphanumeric character or by the end of the string,
    # i.e. a separator that closes a word rather than one inside "3.14" or "10:30".
    REAL_SEPARATOR_REGEX = re.compile(r"(([\.,:][^a-zA-Z0-9])|([\.,:]$))")

    def __init__(self, max_vocab=50000, lower=False, normalize=False, remove_non_ascii=False,
                 pad_token='<PAD>', unk_token='<UNK>', bos_token='<BOS>', eos_token='<EOS>'):
        if normalize and remove_non_ascii:
            raise ValueError('You can only choose between normalize/remove_non_ascii!')
        self.max_vocab = max_vocab
        self.lower = lower
        self.normalize = normalize
        self.remove_non_ascii = remove_non_ascii
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 0

        self.pad_token = pad_token
        self.unk_token = unk_token
        self.bos_token = bos_token
        self.eos_token = eos_token
        # Registered first so the special tokens occupy the lowest ids.
        self.add_words([pad_token, unk_token, bos_token, eos_token])

    def _special_tokens(self):
        """Return the four reserved tokens in registration order."""
        return (self.pad_token, self.unk_token, self.bos_token, self.eos_token)

    def train_tokenizer(self, text: Union[str, List[str]]):
        """Build the vocabulary from one string or a list/tuple of strings.

        After counting, ``word2count`` is re-ordered from most to least
        frequent. If more than ``max_vocab`` regular words were seen, the
        least frequent ones are removed and the ids are renumbered. Special
        tokens never count towards ``max_vocab`` and are never removed.

        Raises:
            TypeError: If ``text`` is neither a ``str`` nor a list/tuple.
        """
        if isinstance(text, str):
            sentences = [text]
        elif isinstance(text, (list, tuple)):
            sentences = text
        else:
            raise TypeError(f'text must be a str or a list of str, got {type(text).__name__}')

        for sentence in sentences:
            self.add_words(self.tokenize(sentence))

        # sorted() is stable, so words with equal counts keep their first-seen order.
        self.word2count = dict(sorted(self.word2count.items(), key=lambda item: item[1], reverse=True))

        # Rank only regular words; the special tokens carry an artificial count of 1 and
        # would otherwise take up slots in the max_vocab budget.
        special_tokens = set(self._special_tokens())
        ranked_words = [word for word in self.word2count if word not in special_tokens]
        if len(ranked_words) > self.max_vocab:
            print(f'Least frequent words will be removed until n_vocab = {self.max_vocab} (excluding special tokens)')
            self.remove_words(ranked_words[self.max_vocab:], restructure_index=True)

    def encode(self, texts: List[str]):
        """Tokenize each text and map its tokens to ids; unknown tokens map to the unk id."""
        unk_index = self.word2index[self.unk_token]
        # dict.get with a default (rather than `or`) keeps id 0 from being mistaken for "missing".
        return [
            [self.word2index.get(token, unk_index) for token in self.tokenize(text)]
            for text in texts
        ]

    def decode(self, list_of_tokens: List[List[int]], to_string=False):
        """Map id sequences back to tokens.

        Returns a list of token lists, or a list of space-joined strings when
        ``to_string`` is true. Raises ``KeyError`` for an id not in the vocabulary.
        """
        output = []
        for tokens in list_of_tokens:
            words = [self.index2word[token] for token in tokens]
            output.append(' '.join(words) if to_string else words)
        return output

    def tokenize(self, s):
        """Split a string into a list of tokens, applying the configured clean-up first."""
        if self.lower:
            s = s.lower()
        if self.normalize:
            s = self._unicode_to_ascii(s)
        if self.remove_non_ascii:
            s = self._remove_non_ascii(s)
        # Surround punctuation with spaces so that split() isolates each mark.
        s = self.PUNCTUATIONS_REGEX.sub(r" \1 ", s)
        # Detach a word-final '.', ',' or ':' from the word that precedes it.
        s = self.REAL_SEPARATOR_REGEX.sub(r" \1", s)
        return s.split()

    def add_words(self, list_of_words):
        """Add (or count another occurrence of) every word in ``list_of_words``."""
        for word in list_of_words:
            self.add_word(word)

    def add_word(self, word):
        """Register a new word under the next free id, or increment its count if known."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def remove_words(self, list_of_words, restructure_index=False):
        """Remove several words, optionally renumbering the ids once at the end."""
        for word in list_of_words:
            self.remove_word(word)
        if restructure_index:
            self._restructure_index()

    def remove_word(self, word, restructure_index=False):
        """Remove one word from the vocabulary.

        Special tokens are silently left in place. Without
        ``restructure_index`` the removed word's id is left unused.

        Raises:
            ValueError: If ``word`` is not in the vocabulary.
        """
        if word not in self.word2index:
            raise ValueError(f'{word} does not exist in the dictionary.')
        if word not in self._special_tokens():
            del self.index2word[self.word2index[word]]
            del self.word2index[word]
            del self.word2count[word]
            self.n_words -= 1
        if restructure_index:
            self._restructure_index()

    def _unicode_to_ascii(self, s):
        """Transliterate ``s`` to ASCII using ``unidecode``."""
        return unidecode(s)

    def _remove_non_ascii(self, s):
        """Delete every character matched by ``NON_ASCII_REGEX`` from ``s``."""
        return self.NON_ASCII_REGEX.sub(r"", s)

    def _restructure_index(self):
        """Renumber the ids to 0..n_words-1, preserving the insertion order of ``word2index``."""
        self.index2word = {}
        i = 0
        # Iterate over a snapshot of the keys; only the values are reassigned.
        for word in list(self.word2index):
            self.index2word[i] = word
            self.word2index[word] = i
            i += 1
        assert i == self.n_words

In [81]:
tokenizer = Tokenizer(200000)

# The tokenizer works on Python str, so decode every training review out of its tensor first.
sentences = [text_tensor.numpy().decode('utf-8') for text_tensor, _ in imdb_train]
# perf_counter is a monotonic clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
tokenizer.train_tokenizer(sentences)
print('Execution time:', time.perf_counter() - start_time)

print(f'n_vocab: {tokenizer.n_words}')

# Round-trip the first training review: tokens -> ids -> tokens.
for text_tensor, _ in imdb_train.take(1):
    review = text_tensor.numpy().decode('utf-8')
    print(tokenizer.tokenize(review))
    encoded = tokenizer.encode([review])
    print(encoded)
    decoded = tokenizer.decode(encoded)
    print(decoded)

Execution time: 3.882234811782837
n_vocab: 110396
['This', 'was', 'an', 'absolutely', 'terrible', 'movie', '.', 'Don', "'", 't', 'be', 'lured', 'in', 'by', 'Christopher', 'Walken', 'or', 'Michael', 'Ironside', '.', 'Both', 'are', 'great', 'actors', ',', 'but', 'this', 'must', 'simply', 'be', 'their', 'worst', 'role', 'in', 'history', '.', 'Even', 'their', 'great', 'acting', 'could', 'not', 'redeem', 'this', 'movie', "'", 's', 'ridiculous', 'storyline', '.', 'This', 'movie', 'is', 'an', 'early', 'nineties', 'US', 'propaganda', 'piece', '.', 'The', 'most', 'pathetic', 'scenes', 'were', 'those', 'when', 'the', 'Columbian', 'rebels', 'were', 'making', 'their', 'cases', 'for', 'revolutions', '.', 'Maria', 'Conchita', 'Alonso', 'appeared', 'phony', ',', 'and', 'her', 'pseudo-love', 'affair', 'with', 'Walken', 'was', 'nothing', 'but', 'a', 'pathetic', 'emotional', 'plug', 'in', 'a', 'movie', 'that', 'was', 'devoid', 'of', 'any', 'real', 'meaning', '.', 'I', 'am', 'disappointed', 'that', 'ther

In [162]:
# for el in imdb_train:
#     print(el)
# NOTE(review): bare expression with no effect other than displaying the function's repr
# as the cell output; looks like leftover API exploration.
tf.tuple

<function tensorflow.python.ops.control_flow_ops.tuple_v2(tensors, control_inputs=None, name=None)>

In [179]:
def preprocess_input(text, label):
    """Pass one (text, label) example through unchanged.

    Placeholder for the per-example preprocessing step used with
    ``Dataset.map``; it currently returns its inputs as a 2-tuple.
    """
    # Disabled draft of the intended encoding step, kept for reference:
    #     _text = pad_sequences(tokenizer.encode([text.numpy().decode('utf-8')]), padding='post')[0]
    example = (text, label)
    return example

# imdb_train.take(1).apply(lambda x: tokenizer.encode(x.numpy().decode('utf-8')))
# Run the first five training examples through preprocess_input and print each result.
for el in imdb_train.take(5).map(preprocess_input):
    print(el)

Tensor("add:0", shape=(), dtype=string)
(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.1">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I have been known to fall asleep during films, but this is usually due to a combination of things including, re

[[11, 13, 32, 425, 380, 17], [18, 11, 213, 328]]
[[ 11  13  32 425 380  17]
 [ 18  11 213 328   0   0]]


# NOTE(review): bare reference with no visible `import keras_preprocessing` in this notebook;
# presumably a reminder to compare against the Keras tokenizer - confirm or remove.
keras_preprocessing.text.Tokenizer