# Development for Serving Model

In [2]:
import re
from tensorflow.python.keras.preprocessing import sequence
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack

In [7]:
# Load the msgpack-serialized vocabulary mapping from disk.
# file_io.read_file_to_string returns the raw bytes (binary_mode=True), which
# are wrapped in BytesIO so msgpack.unpack can read them as a stream;
# raw=False makes msgpack decode keys/values to str instead of bytes.
# NOTE(review): the variable is called `id2word_dict`, but the file is named
# "...word2id.bin" and the mapping is passed to _encode_text as the
# word -> index encoding dict — confirm the direction and consider renaming.
f = BytesIO(file_io.read_file_to_string('Wikimedia-Toxicity-Personal-Attacks/output/Gao-PA-word2id.bin', binary_mode=True))
id2word_dict = msgpack.unpack(f, raw=False)

In [4]:
# Raw sample comment used to exercise _encode_text. It still contains the
# NEWLINE_TOKEN placeholders, backticks and quotes that the cleaning step
# removes. Presumably taken from the Wikimedia personal-attacks dataset
# referenced by the vocabulary path — confirm the source.
text_comment = "`-NEWLINE_TOKENThis is not ``creative``.  Those are the dictionary definitions of the terms ``insurance`` and ``ensurance`` as properly applied to ``destruction``.  If you don't understand that, fine, legitimate criticism, I'll write up ``three man cell`` and ``bounty hunter`` and then it will be easy to understand why ``ensured`` and ``insured`` are different - and why both differ from ``assured``.NEWLINE_TOKENNEWLINE_TOKENThe sentence you quote is absolutely neutral.  You just aren't familiar with the underlying theory of strike-back (e.g. submarines as employed in nuclear warfare) guiding the insurance, nor likely the three man cell structure that kept the IRA from being broken by the British.  If that's my fault, fine, I can fix that to explain.  But ther'es nothing ``personal`` or ``creative`` about it.NEWLINE_TOKENNEWLINE_TOKENI'm tired of arguing with you.  Re: the other article, ``multi-party`` turns up plenty, and there is more use of ``mutually`` than ``mutual``.  If I were to apply your standard I'd be moving ``Mutual Assured Destruction`` to ``talk`` for not appealing to a Reagan voter's biases about its effectiveness, and for dropping the ``ly``.NEWLINE_TOKENNEWLINE_TOKENThere is a double standard in your edits.  If it comes from some US history book, like ``peace movement`` or 'M.A.D.' as defined in 1950, you like it, even if the definition is totally useless in 2002 and only of historical interest. "

In [13]:
def _encode_text(text, encoding_dict):

    # Clean text.

    # Remove unwanted tokens.
    text = re.sub('NEWLINE_TOKEN', ' ', text)
    text = re.sub('TAB_TOKEN', ' ', text)

    # Force lowercase.
    text = text.lower()

    # Remove single and double backticks.
    text = re.sub("`", '', text)

    # Remove single quotes.
    text = re.sub("'", '', text)

    # Replace multiple periods in sequence with one period.
    text = re.sub("\.{2,}", '.', text)

    # Replace everything except words, '.', '|', '?', and '!' with space.
    text = re.sub('[^\w_|\.|\?|!]+', ' ', text)

    # Replace periods with ' . '.
    text = re.sub('\.', ' . ', text)

    # Replace '?' with ' ? '.
    text = re.sub('\?', ' ? ', text)

    # Replace '!' with ' ! '.
    text = re.sub('!', ' ! ', text)

    # Tokenize by splitting on whitespace.
    # No leading or trailing whitespace is kept.
    # Consecutive spaces are treated as a single space.
    text = text.split()

    # Split into sentences, store tokens, get max sentence length.
    tokens = []
    maxsentlen = 0
    sentences = []
    sentence = []
    for t in text:
        # Use '.', '!', '?' as markers of end of sentence.
        if t not in ['.', '!', '?']:
            # Not at end of a sentence.
            sentence.append(t)
        else:
            # At end of a sentence.
            sentence.append(t)

            # Add sentence to sentences.
            sentences.append(sentence)

            # Track longest sentence.
            if len(sentence) > maxsentlen:
                maxsentlen = len(sentence)

            # Reset sentence list.
            sentence = []

    # If sentence has word, add to list of sentences.
    if len(sentence) > 0:
        sentences.append(sentence)

    # Add split sentences to tokens.
    tokens.append(sentences)

    # Index for unknown words.
    unk = len(encoding_dict) - 1

    # Convert words to word indices.
    for idx, doc in enumerate(tokens):
        # Build list of indicies representing the words of each sentence,
        # if word is not a key in word2id mapping, use unk.
        encoded_text = []
        for sent in doc:
            encoded_text.append(
                [encoding_dict[word] if word in encoding_dict else unk for word in sent])
    
    # Flatten list of encoded words.
    encoded_text = [item for sublist in encoded_text for item in sublist]

    return encoded_text

In [12]:
# Encode the sample comment with the loaded vocabulary mapping; as the last
# expression in the cell, the resulting list of word indices is echoed below.
_encode_text(text_comment, id2word_dict)

[14,
 9,
 16,
 3450,
 1,
 139,
 21,
 2,
 2165,
 2892,
 5,
 2,
 730,
 5838,
 4,
 36992,
 20,
 1216,
 1939,
 3,
 3670,
 1,
 26,
 7,
 55,
 252,
 11,
 724,
 1362,
 786,
 237,
 376,
 79,
 415,
 406,
 2347,
 4,
 23324,
 6344,
 4,
 87,
 13,
 46,
 19,
 1159,
 3,
 252,
 78,
 21947,
 4,
 16460,
 21,
 272,
 4,
 78,
 226,
 5657,
 36,
 6684,
 1,
 2,
 521,
 7,
 644,
 9,
 976,
 487,
 1,
 7,
 53,
 749,
 1546,
 24,
 2,
 6033,
 701,
 5,
 2972,
 159,
 334,
 1,
 705,
 1,
 24998,
 20,
 4923,
 12,
 3201,
 5393,
 15850,
 2,
 5838,
 584,
 769,
 2,
 415,
 406,
 2347,
 2042,
 11,
 1339,
 2,
 4196,
 36,
 93,
 2141,
 33,
 2,
 574,
 1,
 26,
 178,
 30,
 2655,
 724,
 8,
 43,
 923,
 11,
 3,
 490,
 1,
 29,
 424,
 224,
 189,
 28,
 3450,
 39,
 13,
 1,
 77,
 2381,
 5,
 1986,
 24,
 7,
 1,
 389,
 2,
 66,
 27,
 3786,
 557,
 3463,
 79,
 1996,
 4,
 44,
 9,
 64,
 98,
 5,
 7517,
 97,
 4829,
 1,
 26,
 8,
 83,
 3,
 1281,
 23,
 846,
 333,
 19,
 1541,
 4829,
 6684,
 3670,
 3,
 67,
 15,
 16,
 5434,
 3,
 6,
 7773,
 5797,
 5295,
 39,
