In [1]:
from hyperdash import monitor_cell

import pickle
import collections
import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras import layers

Using TensorFlow backend.


In [10]:
# Hyperparameters
TWEET_LEN = 140   # tweets are right-padded to this many tokens before training
MAX_LEN = 200     # hard cap on the sequence length produced by generate_tweets
#VOC_SIZE = 50000
#EMBEDDING_DIM = 100

HIDDEN_DIM = 1024      # units per LSTM layer
DEPTH = 3              # number of stacked LSTM layers
FIRST_DROPOUT = 0.3    # input dropout of the first LSTM layer
LATER_DROPOUT = 0.5    # input dropout of every following LSTM layer

BATCH_SIZE = 32
EPOCHS = 8

# Other Constants
DATASET_FILE = "trump_tweets.pickle"
# NOTE(review): checkpoints are written below "Models/" while the restore cell reads
# from "models/" - on a case-sensitive filesystem these are different directories.
MODEL_FILE = "Models/TrumpTweetRegularized({}-{}).h5".format(DEPTH, HIDDEN_DIM)

# Seed NumPy's global RNG. This must be a *call*: assigning `np.random.seed = 42`
# replaces the seeding function with an int and leaves the generator unseeded.
np.random.seed(42)

# Character-level vocabulary. Index 0 is reserved for the padding token, real
# characters are numbered from 1 in the order they appear in char_list.
char_list = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ()#@,.:;-$?!/'\"\n")
num2char = dict(enumerate(char_list, 1))
num2char[0] = "<PAD>"
char2num = {char: num for num, char in num2char.items()}
VOCAB_SIZE = len(char_list) + 1   # +1 for the <PAD> token

# Number of epochs trained so far; incremented by the training loop
n_epoch = 0

In [3]:
# Load the pickled tweets. The context manager closes the file handle
# deterministically instead of leaving that to the garbage collector.
# NOTE: pickle.load can execute arbitrary code - only load dataset files you trust.
with open(DATASET_FILE, "rb") as dataset_fh:
    tweets = pickle.load(dataset_fh)
print("Pickle loaded:", len(tweets), "Tweets")

Pickle loaded: 31227 Tweets


In [4]:
# Reshape Dataset like a boss!
# Convert each tweet into a list of token ids; characters that are not part of
# the vocabulary are dropped
X = [[char2num[c] for c in t if c in char2num] for t in tweets]
# Keep only tweets shorter than TWEET_LEN and right-pad them with 0 (<PAD>) so
# that every row has exactly TWEET_LEN tokens and ends in at least one <PAD>
X = np.array([tweet + [0] * (TWEET_LEN - len(tweet)) for tweet in X if len(tweet) < TWEET_LEN])
# One-hot encode by indexing the rows of an identity matrix
X = np.eye(VOCAB_SIZE)[X]
# Next-character prediction pairs: the input drops the last time step, the
# target drops the first one (the right-hand side is evaluated before X is rebound)
X, y = X[:, :-1, :], X[:, 1:, :]

In [11]:
# Helper Methods
def generate_text(length):
    """Greedily generate text character by character with the global ``model``.

    A random non-padding character seeds the sequence. In each of ``length``
    steps the model is run on everything generated so far and the most likely
    next token (argmax) is fed back in as the next input.

    Args:
        length: Number of prediction steps to run.

    Returns:
        str: The seed character followed by every predicted non-<PAD>
        character, i.e. at most ``length + 1`` characters.
    """
    # Draw the seed from 1..VOCAB_SIZE-1. Index 0 is the <PAD> token; sampling it
    # would put the literal string "<PAD>" at the start of the generated text.
    ix = [np.random.randint(1, VOCAB_SIZE)]
    y_char = [num2char[ix[-1]]]
    x_onehot = np.zeros((1, length, VOCAB_SIZE))
    for i in range(length):
        # One-hot encode the newest token at position i ...
        x_onehot[0, i, ix[-1]] = 1
        # ... and re-run the model on the prefix; ix holds the argmax token for
        # every time step, of which only the last one is new.
        ix = np.argmax(model.predict(x_onehot[:, :i+1, :])[0], 1)
        if ix[-1] != 0:
            # <PAD> predictions are still fed back as input but never printed
            y_char.append(num2char[ix[-1]])
    return "".join(y_char)

def generate_tweets(num_tweets):
    """Sample ``num_tweets`` tweets in parallel from the global ``model``.

    Every tweet starts from a random non-padding character. At each step the
    next character of every tweet is sampled from the model's predicted
    distribution for the last time step. A tweet ends as soon as it samples
    <PAD> (index 0); generation stops when all tweets have ended or the
    sequences reach ``MAX_LEN`` tokens.

    Args:
        num_tweets: Number of tweets to generate.

    Returns:
        list[str]: The generated tweets; empty if ``num_tweets`` is not positive.
    """
    if num_tweets <= 0:
        return []

    # Seed every tweet with a random real character (never <PAD>, index 0)
    seeds = np.random.randint(1, VOCAB_SIZE, size=num_tweets)
    ix = np.eye(VOCAB_SIZE)[seeds].reshape(num_tweets, 1, VOCAB_SIZE)
    # finished[k] becomes True once tweet k has sampled <PAD>
    finished = np.zeros(num_tweets, dtype=bool)

    while ix.shape[1] < MAX_LEN:
        # Distribution over the next character, taken from the last time step.
        # Cast to float64 and renormalise: np.random.choice rejects probability
        # vectors whose sum deviates from 1 by more than its tolerance.
        iy = model.predict(ix)[:, -1, :].astype(np.float64)
        iy /= iy.sum(axis=1, keepdims=True)
        c = np.array([np.random.choice(VOCAB_SIZE, p=ps) for ps in iy])
        # Tweets that already ended only receive padding from here on, so
        # nothing is appended after their end
        c[finished] = 0
        finished |= (c == 0)
        if finished.all():
            break
        nx = np.eye(VOCAB_SIZE)[c].reshape(num_tweets, 1, VOCAB_SIZE)
        ix = np.concatenate((ix, nx), axis=1)

    # Decode the one-hot rows back to characters, skipping padding
    return ["".join(num2char[n] for n in np.argmax(tweet, axis=1) if n != 0) for tweet in ix]

In [6]:
# Model: DEPTH stacked LSTM layers followed by a per-time-step softmax over the
# vocabulary. The time dimension is left open (None) so that sequences of any
# length can be fed in.
model = Sequential()
model.add(
    layers.LSTM(
        HIDDEN_DIM,
        dropout=FIRST_DROPOUT,
        return_sequences=True,
        input_shape=(None, VOCAB_SIZE),
    )
)
# The remaining DEPTH - 1 recurrent layers use the heavier dropout rate
for i in range(1, DEPTH):
    model.add(layers.LSTM(HIDDEN_DIM, dropout=LATER_DROPOUT, return_sequences=True))
# Project every time step onto the vocabulary and normalise to probabilities
model.add(layers.TimeDistributed(layers.Dense(VOCAB_SIZE)))
model.add(layers.Activation('softmax'))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, None, 1024)        4526080   
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 1024)        8392704   
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 1024)        8392704   
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 80)          82000     
_________________________________________________________________
activation_1 (Activation)    (None, None, 80)          0         
Total params: 21,393,488
Trainable params: 21,393,488
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Optional: Restore model from checkpoint
# Replaces the freshly built model with a previously saved one, so running this
# cell continues from that checkpoint instead of starting from scratch.
model = keras.models.load_model(filepath="models/goodTrump(3-1024).h5")
print("Restored model")

Restored model


In [8]:

# Min Valid Loss = 0.967 512-0.3-0-0
# Trains one epoch at a time until the kernel is interrupted. After every epoch
# the model is checkpointed to MODEL_FILE and a greedy text sample is printed.
print("Commencing training")
while True:
    hist = model.fit(
        X,
        y,
        batch_size=BATCH_SIZE,
        epochs=1,
        validation_split=0.1,
    )
    n_epoch = n_epoch + 1
    model.save(MODEL_FILE, overwrite=True)
    print(generate_text(100))
    print("Completed {}. epoch".format(n_epoch))
    print("_" * 120)

Commencing training
Train on 26255 samples, validate on 2918 samples
Epoch 1/1
60 stories over the world to see the failing @nytimes is so instincts about me in the polls - they ar
Completed 1. epoch
________________________________________________________________________________________________________________________
Train on 26255 samples, validate on 2918 samples
Epoch 1/1
 6048/26255 [=====>........................] - ETA: 478s - loss: 0.6600 - acc: 0.8058

KeyboardInterrupt: 

In [None]:
# Print one greedily generated sample (140 prediction steps)
print(generate_text(length=140))

In [14]:
# Sample a batch of 100 tweets and print them as a list
print(generate_tweets(num_tweets=100))

['$5.9This mogger is very support the menswear best your intast success.', '1 thee people finally have the right things - this Aran problem still have no product everyone. Why can richal happiness, but they are no longer situations!', 'First of Dasahan Mocker is a total fool-basch--which on top of Christmas pretty who released $6,440 people to TPP uslimited ', 'Will be thinking of @Macys -- Egg!  They dont we are fighting to listen to DC for Christmas! I will stop you stop!', 'Obama did not have a personal horrendoused after 9 DAME  AMERICAN GOOST BORES WILL CHINGE PLANITY STANTEDS illegal immigration!', 'Amazing vaccinations last night, why are they now say 36.6 AM ISTALICE FOR TRUMP!', 'small john. Massive case by modern suited nite guys that is arresady to mention. Stay a much time he would looser bow. ', 'via @nypost by @JoeNBC: Trump says Judgthe 2 Norman rooms lives @MikePenaeytD! ???? ', "n@Scottish11  if We don't get played great on thete are, get a real cause for ", '@jvhitewi