In [None]:
import string


def get_vocabulary_and_dictionaries():
    """Build the character vocabulary and its id lookup tables.

    The vocabulary is every character of ``string.printable`` except tab,
    carriage return, vertical tab and form feed, plus a fixed set of extra
    symbols, sorted by code point.

    Returns:
        A 4-tuple ``(vocabulary, char_to_id, id_to_char, vocabulary_size)``:
        the sorted list of characters, a char -> id dict (ids start at 1 and
        the empty string is mapped to id 0), the inverse id -> char dict, and
        the number of entries in ``char_to_id`` (vocabulary plus the empty
        string).
    """
    excluded_whitespace = ('\t', '\r', '\x0b', '\x0c')
    vocabulary = [symbol for symbol in string.printable if symbol not in excluded_whitespace]
    vocabulary += ['✅', '🏆', '📈', '📉', '🎥', '💰', '📸', '…']
    vocabulary.sort()

    # Real characters are numbered from 1; id 0 belongs to the empty string.
    char_to_id = {symbol: index for index, symbol in enumerate(vocabulary, start=1)}
    char_to_id[''] = 0
    id_to_char = {index: symbol for symbol, index in char_to_id.items()}

    return vocabulary, char_to_id, id_to_char, len(char_to_id)

In [None]:
from tensorflow import gfile


def read_file(file_name):
    """Read `file_name` through TensorFlow's GFile and return its lines as a list.

    The file is opened in "r" mode and the result is whatever the handle's
    ``readlines()`` returns.
    """
    with gfile.GFile(file_name, "r") as handle:
        lines = handle.readlines()
    return lines

In [None]:
def clean_tweet(original_tweet, vocabulary):
    """Normalise a raw tweet and drop every character outside `vocabulary`.

    Surrounding whitespace is stripped, typographic quotes and dashes are
    replaced by their plain ASCII counterparts, and any remaining character
    that is not contained in `vocabulary` is removed.

    Args:
        original_tweet: The raw tweet text.
        vocabulary: Container of allowed characters (tested with ``in``).

    Returns:
        The cleaned tweet as a string.
    """
    # (typographic character, ASCII replacement), applied in this order.
    substitutions = (
        ('“', '"'),
        ('”', '"'),
        ('’', '\''),
        ('‘', '\''),
        ('—', '-'),
        ('–', '-'),
    )
    tweet = original_tweet.strip()
    for fancy, plain in substitutions:
        tweet = tweet.replace(fancy, plain)
    return ''.join(char for char in tweet if char in vocabulary)

In [None]:
# Vocabulary
# Character table and id lookups used by the cells below.
vocabulary, char_to_id, id_to_char, vocabulary_size = get_vocabulary_and_dictionaries()
print('Vocabulary size: ', vocabulary_size)


# Input tweets
# The raw lines are kept in `original_tweets`; `tweets` holds the cleaned
# version of each line, in the same order.
original_tweets = read_file('data/tweets.txt')
tweets = [clean_tweet(raw_tweet, vocabulary) for raw_tweet in original_tweets]
print('#Tweets: ', len(tweets))

In [None]:
# Missing characters (not in vocabulary)
# Count, over the raw (uncleaned) tweets, how often each character without a
# vocabulary entry occurs, then print the (char, count) pairs, most frequent first.
missing_chars = {}
for original_tweet in original_tweets:
    for char in original_tweet:
        if char in vocabulary:
            continue
        missing_chars[char] = missing_chars.get(char, 0) + 1
print(sorted(missing_chars.items(), key=lambda entry: entry[1], reverse=True))

In [None]:
# Model

from __future__ import print_function
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation
from keras.optimizers import RMSprop
import numpy as np


# Windowing: each training sample is `maxlen` consecutive characters, and
# consecutive windows start `step_size` characters apart.
maxlen = 25
step_size = 3

# Network/optimizer settings used when the model is built.
lstm_size = 96
learning_rate = 0.003

# Slide a window over every tweet, pairing each window with the character that
# immediately follows it. Tweets with `maxlen` or fewer characters yield nothing.
windows = [
    (tweet[start:start + maxlen], tweet[start + maxlen])
    for tweet in tweets
    for start in range(0, len(tweet) - maxlen, step_size)
]
sentences = [window for window, _ in windows]
next_chars = [following for _, following in windows]
print('#Sentences:', len(sentences))

print('Vectorization...')
# One-hot encode the samples:
#   x[i, t, c] is set when character id c is at position t of sentence i,
#   y[i, c]    is set when character id c follows sentence i.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where referencing it raises AttributeError.
x = np.zeros((len(sentences), maxlen, vocabulary_size), dtype=bool)
y = np.zeros((len(sentences), vocabulary_size), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_to_id[char]] = 1
    y[i, char_to_id[next_chars[i]]] = 1


# Build the LSTM model
print('Build LSTM model...')
# Three stacked LSTM layers of `lstm_size` units each. The first two are built
# with return_sequences=True so the following LSTM is fed a sequence; the last
# one feeds a Dense layer with one output per vocabulary id, followed by a
# softmax activation.
model = Sequential([
    LSTM(lstm_size, input_shape=(maxlen, vocabulary_size), return_sequences=True),
    LSTM(lstm_size, return_sequences=True),
    LSTM(lstm_size),
    Dense(vocabulary_size),
    Activation('softmax'),
])

# RMSprop is recommended for RNNs
optimizer = RMSprop(lr=learning_rate)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
)


print('Done!')

In [None]:
# Training

import random
import sys

from keras.callbacks import LambdaCallback, TensorBoard


# Training hyperparameters.
# NOTE(review): `dropout` is not referenced by any other code visible in this
# notebook (the LSTM layers are created without it) — confirm whether it should
# be wired into the model or removed.
dropout = 0.3
batch_size = 128  # passed to model.fit and to the TensorBoard callback
num_epochs = 25  # passed to model.fit as `epochs`
validation_split = 0.05  # passed to model.fit as `validation_split`


# helper function to sample an index from a probability array
def sample(input_predictions, temperature=1.0):
    """Draw one index from a probability vector reshaped by `temperature`.

    Each probability is raised to the power ``1 / temperature`` (computed in
    log space) and the result is renormalised; a single index is then drawn
    from that distribution with ``np.random.multinomial``. Temperatures below
    1 sharpen the distribution, temperatures above 1 flatten it.

    Args:
        input_predictions: 1-D array-like of probabilities.
        temperature: Divisor applied to the log-probabilities.

    Returns:
        The sampled index (result of ``np.argmax`` on the one-hot draw).
    """
    predictions = np.asarray(input_predictions).astype('float64')
    # log(0) is -inf, which is exactly what is wanted here (that entry ends up
    # with zero weight), so the divide-by-zero warning is silenced for the log only.
    with np.errstate(divide='ignore'):
        log_predictions = np.log(predictions)
    scaled_logits = log_predictions / temperature
    # Shift so the largest logit is 0 before exponentiating. The normalised
    # result is mathematically unchanged, but at low temperatures this stops
    # every term from underflowing to 0, which would make the division below
    # 0/0 and hand NaN probabilities to the sampler.
    exp_predictions = np.exp(scaled_logits - np.max(scaled_logits))
    normalized_predictions = exp_predictions / np.sum(exp_predictions)
    probabilities = np.random.multinomial(1, normalized_predictions, 1)
    return np.argmax(probabilities)


def on_epoch_end(epoch, logs):
    """Epoch-end hook: print text generated by the current model.

    A seed of `maxlen` characters is cut from a randomly chosen tweet and
    extended by 100 sampled characters, once for each diversity (sampling
    temperature) value. The same seed is reused for every diversity.

    Args:
        epoch: Epoch number supplied by the callback; only printed.
        logs: Second callback argument; unused here.
    """
    print()
    print('----- Generating tweet after Epoch: %d' % epoch)

    # Only tweets with more than `maxlen` characters can supply a seed window.
    # Picking a shorter one would give random.randint an empty range and abort
    # training with a ValueError.
    seed_candidates = [tweet for tweet in tweets if len(tweet) > maxlen]
    if not seed_candidates:
        print('----- No tweet is longer than %d characters; skipping generation' % maxlen)
        return
    random_tweet = random.choice(seed_candidates)
    start_index = random.randint(0, len(random_tweet) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = random_tweet[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for _ in range(100):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, maxlen, vocabulary_size))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_to_id[char]] = 1.0

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = id_to_char[next_index]

            generated += next_char
            # Keep the window at exactly `maxlen` characters. Id 0 maps to the
            # empty string; if it is sampled the window must stay as it is
            # rather than shrink by one character.
            sentence = (sentence + next_char)[-maxlen:]

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


# Callbacks
# Prints generated sample text each time an epoch finishes.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
# TensorBoard logging; no log_dir is given, so the callback's default applies.
tensorboard_callback = TensorBoard(write_grads=True, batch_size=batch_size)


# Train!
# The last `validation_split` fraction handling and batching are left to Keras;
# both callbacks run during training.
model.fit(
    x,
    y,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=validation_split,
    callbacks=[print_callback, tensorboard_callback],
)

In [None]:
# Save model
# Writes the model object to 'model/model.h5'.
# NOTE(review): presumably the 'model/' directory has to exist before this
# cell runs — confirm.
model.save('model/model.h5')