Trains a RNN to imitate trumps tweets.
Based on [Trung Trans's excellent tutorial](https://chunml.github.io/ChunML.github.io/project/Creating-Text-Generator-Using-Recurrent-Neural-Network/).

In [1]:
import pickle
import numpy as np

import keras
from keras.models import Sequential
from keras import layers

Using TensorFlow backend.


In [40]:
# Hyperparameters
TWEET_LEN = 140
MAX_LEN = 200

HIDDEN_DIM = 1024
DEPTH = 3
FIRST_DROPOUT = 0.3
LATER_DROPOUT = 0.5

BATCH_SIZE = 32
EPOCHS = 8

# Other Constants
DATASET_FILE = "trump_tweets.pickle"
MODEL_FILE = "Models/TrumpTweetRegularized({}-{}).h5".format(DEPTH, HIDDEN_DIM)

# Construct dictionaries to convert from tokens to strings and back
np.random.seed = 42
char_list = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ()#@,.:;-$?!/'\"\n")
num2char = dict(enumerate(char_list, 1))
num2char[0] = "<PAD>"
char2num = dict(zip(num2char.values(), num2char.keys()))
VOCAB_SIZE = len(char_list) + 1
ONEHOT_ARRAY = np.eye(VOCAB_SIZE)

n_epoch = 0

In [3]:
# Load dataset
tweets = pickle.load(open(DATASET_FILE, "rb"))
print("Pickle loaded:", len(tweets), "Tweets")

Pickle loaded: 31227 Tweets


In [4]:
# Reshape Dataset
# Convers string array to char/token arrays
X = list(list(char2num[c] for c in t if c in char2num) for t in tweets)
# Pad tweets to 140 chars
X = np.array(list(tweet + (TWEET_LEN - len(tweet))*[0] for tweet in X if len(tweet) < TWEET_LEN))
# Get onehot-encodings
X = ONEHOT_ARRAY[X]
# Create X -> y pairs to train the network on
y = X[:, 1:, :]
X = X[:, :-1, :]

In [91]:
def generate_tweets(num_tweets):
    # Construct one-hot character array and set initial characters
    ix = np.zeros((num_tweets, 1, VOCAB_SIZE))
    for a in ix:
        a[0, np.random.randint(VOCAB_SIZE)] = 1
    while True:
        # Get the character-probabilities from the model
        iy = model.predict(ix)[:, ix.shape[1]-1, :]
        # Select the next characters based on the output of the model
        c = np.array([np.random.choice(np.arange(VOCAB_SIZE), p=ps) for ps in iy])
        #c = np.array([c = np.argmax(ps)) for ps in iy])   
        # Break if all tweets are over (if we encounter the <PAD>-character) or have reached MAX_LEN 
        if np.all(c==0) or ix.shape[1] >= MAX_LEN:
            break
        # Add new characters to the character array
        nx = ONEHOT_ARRAY[c].reshape(num_tweets, 1, VOCAB_SIZE)
        ix = np.concatenate((ix, nx), axis=1)
    # Convert the one-hot character array to a list of strings
    tweets = ["".join(num2char[n] for n in np.argmax(tweet, axis=1) if n != 0) for tweet in ix]
    return tweets

def beam_search_tweet(b=10, pad_factor=None, init_sequence=None):
    qualities = np.ones(b)
    # Random initial value for hypotheses
    if init_sequence is None:
        hypotheses = np.zeros((b, 1, VOCAB_SIZE))
        hypotheses[:, 0, np.random.randint(VOCAB_SIZE)] = 1
    else:
        hypotheses = np.zeros((b, len(init_sequence), VOCAB_SIZE))
        seq = np.array(list(ONEHOT_ARRAY[char2num[c]] for c in init_sequence))
        for i in range(len(hypotheses)):
            hypotheses[i] = seq
    
    for idx in range(TWEET_LEN-hypotheses.shape[1]):
        hy = model.predict(hypotheses)[:, idx, :]
        possibilities = []
        # Generate value (quality*possibility) with new hypotheses for every new possibility - hypothese combination combination
        for q, quality in enumerate(qualities):
            for p, possibility in enumerate(hy[q]):
                # Account for padding
                if p == 0 and pad_factor is not None:
                    possibility = pad_factor
                possibilities.append([q, p, quality*possibility])
        # Sort by quality
        possibilities.sort(key=lambda p: p[2], reverse=True)
        # Select b best new hypotheses
        chosen = possibilities[:b]
        # Set new qualities
        qualities = np.array(chosen)[:, 2]
        # Construct new hypotheses array
        hypotheses = np.array(list(
                np.concatenate((
                        hypotheses[choice[0]],
                        ONEHOT_ARRAY[choice[1]].reshape(1, -1)))
                for choice in chosen))
    best_hypothese = hypotheses[np.argmax(qualities)]
    tweet = "".join(num2char[n] for n in np.argmax(best_hypothese, axis=1) if n != 0)
    return tweet    
            
        
beam_search_tweet(b=100)

'So many people are all talk and no action. It is a great champion. It is a great champion. It is a great champion.'

In [65]:
# Model
model = Sequential()
model.add(layers.LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True, dropout=FIRST_DROPOUT))
for i in range(DEPTH - 1):
    model.add(layers.LSTM(HIDDEN_DIM, return_sequences=True, dropout=LATER_DROPOUT))
model.add(layers.TimeDistributed(layers.Dense(VOCAB_SIZE)))
model.add(layers.Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, None, 1024)        4526080   
_________________________________________________________________
lstm_5 (LSTM)                (None, None, 1024)        8392704   
_________________________________________________________________
lstm_6 (LSTM)                (None, None, 1024)        8392704   
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 80)          82000     
_________________________________________________________________
activation_2 (Activation)    (None, None, 80)          0         
Total params: 21,393,488
Trainable params: 21,393,488
Non-trainable params: 0
_________________________________________________________________


In [69]:
# Optional: Restore model from checkpoint
model = keras.models.load_model(MODEL_FILE)
print("Restored model")

Restored model


In [8]:
print("Commencing training")
while True:
    hist = model.fit(X, y, validation_split=0.1, batch_size=BATCH_SIZE, epochs=1)
    n_epoch += 1
    model.save(MODEL_FILE, overwrite=True)
    print(generate_tweets(2))
    print("Completed {}. epoch".format(n_epoch))
    print("_"*120)

Commencing training
Train on 26255 samples, validate on 2918 samples
Epoch 1/1
60 stories over the world to see the failing @nytimes is so instincts about me in the polls - they ar
Completed 1. epoch
________________________________________________________________________________________________________________________
Train on 26255 samples, validate on 2918 samples
Epoch 1/1
 6048/26255 [=====>........................] - ETA: 478s - loss: 0.6600 - acc: 0.8058

KeyboardInterrupt: 

In [11]:
beam_search_tweet(b=4)

ValueError: Error when checking : expected lstm_4_input to have 3 dimensions, but got array with shape ()