Trains an RNN to imitate Trump's tweets.
Based on [Trung Tran's excellent tutorial](https://chunml.github.io/ChunML.github.io/project/Creating-Text-Generator-Using-Recurrent-Neural-Network/).

In [1]:
import pickle
import numpy as np

import keras
from keras.models import Sequential
from keras import layers

Using TensorFlow backend.


In [2]:
# Hyperparameters
TWEET_LEN = 140  # training tweets are padded to this many characters
MAX_LEN = 200    # upper bound on the length of tweets sampled by generate_tweets

HIDDEN_DIM = 1024     # units of every LSTM layer
DEPTH = 5             # number of stacked LSTM layers
FIRST_DROPOUT = 0.3   # `dropout` argument of the first LSTM layer
LATER_DROPOUT = 0.5   # `dropout` argument of all further LSTM layers

BATCH_SIZE = 32

# Other Constants
DATASET_FILE = "trump_tweets.pickle"
# The placeholders are filled with depth, hidden size, epoch number and validation loss
MODEL_FILE = "Models/TrumpTweetRegularized({}-{}-{}-{}).h5"

# Seed NumPy's global random number generator.
# This has to be a call: assigning to `np.random.seed` would replace the
# seeding function with an integer and leave the generator unseeded.
SEED = 42
np.random.seed(SEED)

# Construct dictionaries to convert from tokens to strings and back
char_list = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ()#@,.:-$?!/'\"\n")
# Token 0 is reserved for padding, real characters are numbered from 1
num2char = dict(enumerate(char_list, 1))
num2char[0] = "<PAD>"
char2num = {char: num for num, char in num2char.items()}
VOCAB_SIZE = len(char_list) + 1
# Row i is the one-hot encoding of token i
ONEHOT_ARRAY = np.eye(VOCAB_SIZE)

# Number of training epochs completed so far
n_epoch = 0

In [3]:
# Load dataset
# NOTE: unpickling can execute arbitrary code - only load a dataset file you trust.
# The context manager closes the file handle as soon as the data is read.
with open(DATASET_FILE, "rb") as dataset_file:
    tweets = pickle.load(dataset_file)
print("Pickle loaded:", len(tweets), "Tweets")

Pickle loaded: 31227 Tweets


In [4]:
# Reshape Dataset
# Turn every tweet into a list of token ids; characters without a token are dropped
token_ids = [[char2num[char] for char in text if char in char2num] for text in tweets]
# Keep only tweets shorter than TWEET_LEN and right-pad them with the <PAD> token (0)
padded_ids = np.array([ids + [0] * (TWEET_LEN - len(ids)) for ids in token_ids if len(ids) < TWEET_LEN])
# Replace every token id by its one-hot row
onehot = ONEHOT_ARRAY[padded_ids]
# Build the training pairs: the target at each position is the following character
y = onehot[:, 1:, :]
X = onehot[:, :-1, :]

In [61]:
def generate_tweets(num_tweets):
    """Sample `num_tweets` tweets character by character from the global `model`.

    Every tweet starts with one random non-padding character. At each step the
    next character of every tweet is drawn from the probabilities the model
    predicts for the last position. A tweet is over as soon as it draws the
    <PAD> token (0); from then on it only receives padding, so nothing sampled
    after its end can leak into the returned text.

    Args:
        num_tweets: Number of tweets to sample in parallel.

    Returns:
        A list of `num_tweets` strings, each at most MAX_LEN characters long.
        An empty list if `num_tweets` is not positive.
    """
    if num_tweets <= 0:
        return []
    # One-hot character array of shape (num_tweets, current length, VOCAB_SIZE).
    # The initial character is drawn from 1..VOCAB_SIZE-1 so it is never <PAD>.
    ix = np.zeros((num_tweets, 1, VOCAB_SIZE))
    first_chars = np.random.randint(1, VOCAB_SIZE, size=num_tweets)
    ix[np.arange(num_tweets), 0, first_chars] = 1
    token_ids = np.arange(VOCAB_SIZE)
    # finished[i] is True once tweet i has drawn <PAD>
    finished = np.zeros(num_tweets, dtype=bool)
    while ix.shape[1] < MAX_LEN:
        # Get the character-probabilities for the last position from the model
        iy = model.predict(ix)[:, -1, :]
        # Select the next characters based on the output of the model
        c = np.array([np.random.choice(token_ids, p=ps) for ps in iy])
        # Tweets that are already over can only be padded further
        c[finished] = 0
        finished |= (c == 0)
        # Stop once every tweet is over
        if finished.all():
            break
        # Add new characters to the character array
        nx = ONEHOT_ARRAY[c].reshape(num_tweets, 1, VOCAB_SIZE)
        ix = np.concatenate((ix, nx), axis=1)
    # Convert the one-hot character array to a list of strings, skipping padding
    return ["".join(num2char[n] for n in np.argmax(tweet, axis=1) if n != 0) for tweet in ix]

def beam_search_tweet(b=10, softmax_select=True, pad_factor=None, init_sequence=None, output_all=False):
    """Generate tweets from the global `model` with a beam search over `b` hypotheses.

    All hypotheses start from the same prefix and are extended by one character
    per step until they are TWEET_LEN characters long. The quality of a
    hypothesis is the product of the probabilities of the characters appended
    to it. The <PAD> token (0) marks the end of a tweet, so the returned text
    of a hypothesis stops at its first <PAD>.

    Args:
        b: Number of hypotheses kept after every step (at least 1).
        softmax_select: If True, the `b` surviving candidates are drawn at
            random without replacement, with probability proportional to
            their quality. If False, the `b` candidates of highest quality
            are kept.
        pad_factor: If not None, this value is used in place of the model's
            probability for the <PAD> token at every step.
        init_sequence: Optional prefix every hypothesis starts with.
            Characters without a token are dropped, as in the training data.
            With None, or a prefix without any known character, the search
            starts from one random non-padding character.
        output_all: If True, return the text of all `b` final hypotheses.

    Returns:
        A list of `b` strings if `output_all` is True, otherwise the string of
        the hypothesis with the highest quality.

    Raises:
        ValueError: If `b` is smaller than 1.
    """
    if b < 1:
        raise ValueError("b must be at least 1")

    def decode(hypothesis):
        # Everything from the first <PAD> on lies behind the end of the tweet
        tokens = np.argmax(hypothesis, axis=1)
        pad_positions = np.flatnonzero(tokens == 0)
        if pad_positions.size:
            tokens = tokens[:pad_positions[0]]
        return "".join(num2char[n] for n in tokens)

    init_tokens = [char2num[c] for c in (init_sequence or "") if c in char2num]
    if init_tokens:
        # Every hypothesis starts as a copy of the one-hot encoded prefix
        hypotheses = np.tile(ONEHOT_ARRAY[init_tokens], (b, 1, 1))
    else:
        # Random initial value for hypotheses (never <PAD>), shared by all of them
        hypotheses = np.zeros((b, 1, VOCAB_SIZE))
        hypotheses[:, 0, np.random.randint(1, VOCAB_SIZE)] = 1
    qualities = np.ones(b)

    for idx in range(hypotheses.shape[1], TWEET_LEN):
        # Probabilities for the next character of every hypothesis; float64 so
        # that pad_factor and the products below are not rounded to float32
        hy = model.predict(hypotheses)[:, idx-1, :].astype(np.float64)
        # Account for padding
        if pad_factor is not None:
            hy[:, 0] = pad_factor
        n_tokens = hy.shape[1]
        # Quality of every (hypothesis, next character) combination, flattened
        # so that entry h * n_tokens + t extends hypothesis h with token t
        scores = (qualities[:, np.newaxis] * hy).ravel()
        if softmax_select:
            probs = scores / np.sum(scores)
            chosen = np.random.choice(scores.shape[0], replace=False, size=b, p=probs)
        else:
            # Select b best new hypotheses; the stable sort keeps ties in their original order
            chosen = np.argsort(-scores, kind="mergesort")[:b]
        parents, tokens = np.divmod(chosen, n_tokens)
        # Set new qualities
        qualities = scores[chosen]
        # Construct new hypotheses array: each parent extended by its chosen token
        hypotheses = np.concatenate(
            (hypotheses[parents], ONEHOT_ARRAY[tokens][:, np.newaxis, :]), axis=1)
    if output_all:
        return [decode(h) for h in hypotheses]
    return decode(hypotheses[np.argmax(qualities)])

In [6]:
# Model
# DEPTH stacked LSTM layers that all return the full sequence, followed by a
# dense layer applied to every time step and a softmax over the vocabulary.
# Only the first LSTM layer declares the input shape and uses FIRST_DROPOUT.
network_layers = [layers.LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True, dropout=FIRST_DROPOUT)]
network_layers += [layers.LSTM(HIDDEN_DIM, return_sequences=True, dropout=LATER_DROPOUT) for _ in range(DEPTH - 1)]
network_layers.append(layers.TimeDistributed(layers.Dense(VOCAB_SIZE)))
network_layers.append(layers.Activation('softmax'))

model = Sequential()
for network_layer in network_layers:
    model.add(network_layer)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, None, 1024)        4517888   
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 1024)        8392704   
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 1024)        8392704   
_________________________________________________________________
lstm_4 (LSTM)                (None, None, 1024)        8392704   
_________________________________________________________________
lstm_5 (LSTM)                (None, None, 1024)        8392704   
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 78)          79950     
_________________________________________________________________
activation_1 (Activation)    (None, None, 78)          0         
Total para

In [6]:
# Optional: Restore model from checkpoint
# Running this cell replaces `model` with the network stored in the checkpoint.
# NOTE(review): the file name appears to encode depth 4, while DEPTH is 5 - confirm
# that this is the intended checkpoint.
checkpoint_path = "Models/TrumpTweetRegularized(4-1024-198-[0.9015714815410788]).h5"
model = keras.models.load_model(checkpoint_path)
print("Restored model")

Restored model


In [7]:
# Best VAL_ACC: 0.7506, VAL_LOSS: 0.8928
print("Commencing training")
# Train one epoch at a time without an end condition; interrupt the kernel to stop.
# After every epoch a checkpoint is written and two sample tweets are printed.
while True:
    hist = model.fit(X, y, validation_split=0.05, batch_size=BATCH_SIZE, epochs=1)
    n_epoch += 1
    # The history holds one list entry per epoch. Use the scalar of the epoch that
    # just ran, so the file name contains a number instead of the repr of a list.
    val_loss = hist.history["val_loss"][-1]
    model.save(MODEL_FILE.format(DEPTH, HIDDEN_DIM, n_epoch, val_loss), overwrite=True)
    print(generate_tweets(2))
    print("Completed {}. epoch".format(n_epoch))
    print("_"*120)

Commencing training
Train on 28330 samples, validate on 1492 samples
Epoch 1/1
['Uos Wol omfges crasging,  Feat want Lhacanrs.  rel2', '?zz to mivl theyer 6Bett! Ameeturm chelungss ir osvorg of gabors.']
Completed 1. epoch
________________________________________________________________________________________________________________________
Train on 28330 samples, validate on 1492 samples
Epoch 1/1
['/yhe canf deal we can open a meat even country. Sant of years on outraniig that hellowe thky are your fice!', '$Y santris 111 is "']
Completed 2. epoch
________________________________________________________________________________________________________________________
Train on 28330 samples, validate on 1492 samples
Epoch 1/1
['You make my vorfetitorily @Lacys and healthcare.  True,', 'Xast stop an amazing, Nouglas Coold Aashington as the Leagership @TrumpDatingTwill Vartellaur: Enjoy!']
Completed 3. epoch
_______________________________________________________________________________

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "F:\Programme\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-e2de0602ecd1>", line 4, in <module>
    hist = model.fit(X, y, validation_split=0.05, batch_size=BATCH_SIZE, epochs=1)
  File "F:\Programme\Anaconda\lib\site-packages\keras\models.py", line 863, in fit
    initial_epoch=initial_epoch)
  File "F:\Programme\Anaconda\lib\site-packages\keras\engine\training.py", line 1430, in fit
    initial_epoch=initial_epoch)
  File "F:\Programme\Anaconda\lib\site-packages\keras\engine\training.py", line 1079, in _fit_loop
    outs = f(ins_batch)
  File "F:\Programme\Anaconda\lib\site-packages\keras\backend\tensorflow_backend.py", line 2268, in __call__
    **self.session_kwargs)
  File "F:\Programme\Anaconda\lib\site-packages\tensorflow\python\client\session.py", line 767, in run
    run_metadata_ptr)
  File "F:\Programme\Anaconda\lib\s

KeyboardInterrupt: 

In [75]:
# Collect unique beam-searched tweets without an end condition; interrupt the kernel to stop.
# Continue the existing collection if there is one, otherwise start with an empty
# list so the cell also runs on a fresh kernel.
generated_tweets = globals().get("generated_tweets", [])
# Set of everything collected so far, for fast duplicate checks
known_tweets = set(generated_tweets)
while True:
    new_tweet = beam_search_tweet(b=5)
    # Skip duplicates entirely: nothing is printed or saved for them
    if new_tweet in known_tweets:
        continue
    known_tweets.add(new_tweet)
    generated_tweets.append(new_tweet)
    print(new_tweet)
    print("="*120)
    # Save the collection after every 10th new tweet
    if len(generated_tweets) % 10 == 0:
        print(len(generated_tweets), "Tweets generated")
        print("_"*120)
        with open("BeamsearchedTweets.pickle", "wb") as tweet_file:
            pickle.dump(generated_tweets, tweet_file)

50 days until I have always been interested in their foreign policy in the debate. We are working out to serve the world, but it is not in t
via @BreitbartNews by @melaniebatley: Donald Trump to speak at the White House today 
810 Tweets generated
________________________________________________________________________________________________________________________
90 stories above the world Trade Center in Washington, D.C. is the best viewers of the ObamaCare website 
zise words from @TrumpTowerNY in New York City, @TrumpTowerNY is one of the greatest opportunities 
You have to learn the presidential election that Bernie Sanders has been a foreign policy in the world. They would have been doing in the wo
80 days until after the element of people in the United States, used to be president, we will MAKE AMERICA GREAT AGAIN!
elections are being recognized and foreign policy is going to be stronger than ever before.
Entrepreneurs: Success is not fatal: it is the best thing you can about 

KeyboardInterrupt: 

In [64]:
# Run one beam search with 500 hypotheses and show the text of every final hypothesis
beam_search_tweet(b=500, output_all=True)

['President Obama is going to MAKE AMERICA GREAT AGAIN! ',
 'President Obama is going to MAKE AMERICA GREAT AGAIN!',
 'President Obama is going to MAKE AMERICA GREAT AGAIN! #CelebApprentice',
 'President Obama is going to MAKE AMERICA GREAT AGAIN! #AmericaFirst #Trump2016 ',
 'President Obama is going to MAKE AMERICA GREAT AGAIN! MAKE AMERICA GREAT AGAIN!',
 'President Obama is going to MAKE AMERICA GREAT AGAIN! #Trump2016 ',
 'President Obama is going to MAKE AMERICA GREAT AGAIN! Watch here:  ',
 'President Obama is going to MAKE AMERICA GREAT AGAIN!  ',
 'President Obama is going to MAKE AMERICA GREAT AGAIN!\n#TrumpPence16 ',
 'President Obama is going to MAKE AMERICA GREAT AGAIN! #Trump2016  ',
 'President Obama is going to MAKE AMERICA GREAT AGAIN ',
 'President Obama is going to MAKE AMERICA GREAT AGAIN! #Trump2016 #MakeAmericaGreatAgain!',
 'President Obama is going to MAKE AMERICA GREAT AGAIN! Lets MAKE AMERICA GREAT AGAIN!',
 'President Obama is going to MAKE AMERICA GREAT AGAI

In [83]:
# Remove duplicates from generated_tweets (the first occurrence is kept, order is
# preserved), report how many were dropped and save the cleaned list.
seen = []
seen_lookup = set()  # same content as `seen`, for constant-time membership tests
duplic = 0
for t in generated_tweets:
    if t in seen_lookup:
        duplic += 1
    else:
        seen_lookup.add(t)
        seen.append(t)
print(duplic)
generated_tweets = seen
# The context manager makes sure the file is flushed and closed
with open("BeamsearchedTweets.pickle", "wb") as tweet_file:
    pickle.dump(generated_tweets, tweet_file)

0
