**RNN to generate tweets using character-level generation.**

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import sys
import pickle
import csv
import os
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, RNN, Softmax, Flatten, Dropout, Input
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

# Switch TensorFlow to eager mode; later cells call `.numpy()` on tensors
# while iterating datasets and sampling, which relies on eager execution.
tf.enable_eager_execution()

In [2]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    Args:
        preds: 1-D array-like of non-negative probabilities. They are
            re-normalised here, so they need not sum to exactly 1.
        temperature: Strictly positive scaling factor. Values below 1 sharpen
            the distribution; values above 1 flatten it.

    Returns:
        The sampled index (a NumPy integer, as returned by ``np.argmax``).

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be > 0, got %r' % (temperature,))

    preds = np.asarray(preds).astype('float64')
    # log(0) is a legitimate -inf here (zero-probability entries stay at
    # zero after exp), so silence the divide-by-zero warning.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Subtracting the max is a no-op after normalisation, but it prevents
    # every term from underflowing to 0 (which would give 0/0 = NaN) when
    # the temperature is very small.
    exp_preds = np.exp(scaled - np.max(scaled))
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [3]:
# The code in this cell is copied from a Keras example file available on GitHub.
# Reference: https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py

def on_epoch_end(epoch, _):
    """Append generated sample text to ``twitter_epoch_test.log``.

    Intended to be invoked at the end of each epoch. For each of four
    diversity (temperature) values it seeds generation with a random
    ``maxlen``-character slice of the corpus and writes 400 generated
    characters to the log file.

    Relies on module-level names defined in other cells: ``tweet_txt``,
    ``maxlen``, ``chars``, ``char_to_index``, ``index_to_char``, ``model``
    and ``sample``.

    NOTE(review): the input is one-hot encoded with shape
    (1, maxlen, len(chars)), whereas the model built later in this notebook
    starts with an Embedding layer. This callback is also not passed to the
    visible ``model.fit`` call -- confirm which model it is meant for.

    Args:
        epoch: Epoch index, written into the log header.
        _: Unused second callback argument.
    """
    # The context manager closes the log even if prediction or sampling raises.
    with open('twitter_epoch_test.log', 'a') as f:
        # The same seed position is reused for every diversity value.
        start_index = random.randint(0, len(tweet_txt) - maxlen - 1)
        f.write('\n')
        f.write('----- Generating text after Epoch: %d\n' % epoch)
        for diversity in [0.2, 0.5, 1.0, 1.2]:
            print()
            print('---- Generating text to file: twitter_epoch_test.log ----')
            print('---- with diversity: %f\n' % diversity)
            f.write('----- diversity: %f\n' % diversity)

            sentence = tweet_txt[start_index: start_index + maxlen]
            f.write('----- Generating with seed: "' + sentence + '"\n')
            f.write(sentence)

            for _step in range(400):
                # One-hot encode the current window of characters.
                x_pred = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, char_to_index[char]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = index_to_char[next_index]

                # Slide the window forward by one character.
                sentence = sentence[1:] + next_char

                f.write(next_char)
                # Flush per character so the log can be followed live.
                f.flush()
            f.write('\n\n')
    

In [25]:
# Load the tweet CSV. The path is relative, so it depends on where the
# notebook is run from.
data = pd.read_csv("../Load_Tweets/data/tweet_data.csv") # this will break if this file is moved!
# Preview the first rows.
data.head()

Unnamed: 0,ID,RETWEET,TEXT
0,786204978629185536,False,PAY TO PLAY POLITICS. #CrookedHillary [URL]
1,786201435486781440,False,Very little pick-up by the dishonest media of ...
2,786189446274248704,False,Crooked Hillary Clinton likes to talk about th...
3,786054986534969344,False,Thank you Florida- a MOVEMENT that has never b...
4,786007502639038464,False,Join me Thursday in Florida &amp; Ohio!West Pa...


In [26]:
# Spot-check the raw text of a single tweet.
data['TEXT'][100]

'Certainly has been an interesting 24 hours!'

In [27]:
# Summary statistics of tweet length, measured in characters.
data['TEXT'].map(len).describe()

count    10622.000000
mean       141.512709
std         70.206293
min          5.000000
25%         99.000000
50%        135.000000
75%        150.000000
max        315.000000
Name: TEXT, dtype: float64

In [28]:
# Join every tweet into one space-separated corpus string.

tweet_txt = data['TEXT'].str.cat(sep=' ')
print('{} : total characters in our dataset'.format(len(tweet_txt)))


1513769 : total characters in our dataset


In [29]:
# Build the character vocabulary and the two lookup tables.
# These are global variables used throughout the rest of the notebook.

# Optional alternative / extra corpora (currently disabled):
# with open('../Load_Tweets/data/ArtOfTheDeal.txt') as f:
#     book_txt = f.read()

# tweet_txt = tweet_txt + book_txt
# path_to_file = tf.keras.utils.get_file(
#     'shakespeare.txt',
#     'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

# # Read, then decode for py2 compat.
# tweet_txt = open(path_to_file, 'rb').read().decode(encoding='utf-8')

# Sorted so the id assigned to each character is deterministic.
chars = sorted(set(tweet_txt))
char_to_index = {c: i for i, c in enumerate(chars)}
index_to_char = np.array(chars)
print("Number of unique characters: ", len(chars))
# Window length (in characters) used by on_epoch_end. The original note
# mentions 141, the average tweet length in this data, as the earlier choice.
maxlen = 30


Number of unique characters:  369


In [30]:
# Encode the whole corpus as an array of integer character ids.
tweet_int = np.array([char_to_index[symbol] for symbol in tweet_txt])

In [31]:
# First 20 encoded ids, as a quick sanity check.
tweet_int[:20]

array([46, 31, 55,  0, 50, 45,  0, 46, 42, 31, 55,  0, 46, 45, 42, 39, 50,
       39, 33, 49])

In [32]:
# Number of input characters per training example.
seq_length = 100
# NOTE(review): the chunks built in a later cell are seq_length+1 long, so
# this slightly overestimates the number of examples -- confirm it is intended.
examples_per_epoch = len(tweet_txt)//seq_length
# Dataset that yields the corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(tweet_int)


In [33]:
# Decode the first five ids back to characters to check the mapping.
for char_id in char_dataset.take(5):
    print(index_to_char[char_id.numpy()])

P
A
Y
 
T


In [34]:
# Group the id stream into chunks of seq_length + 1 ids; a trailing partial
# chunk is dropped.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first five chunks decoded back to text.
for chunk in sequences.take(5):
    decoded = ''.join(index_to_char[chunk.numpy()])
    print(repr(decoded))

'PAY TO PLAY POLITICS. #CrookedHillary [URL] Very little pick-up by the dishonest media of incredible '
'information provided by WikiLeaks. So dishonest! Rigged system! Crooked Hillary Clinton likes to talk'
" about the things she will do but she has been there for 30 years - why didn't she do them? Thank you"
' Florida- a MOVEMENT that has never been seen before and will never be seen again. Lets get out &amp;'
'… [URL] Join me Thursday in Florida &amp; Ohio!West Palm Beach, FL at noon:[URL]Cincinnati, OH this 7'


In [35]:
# Here we actually build the data.

def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, i.e. the input shifted one step ahead.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [36]:
# Show one (input, target) pair decoded back to text.
for input_ids, target_ids in dataset.take(1):
    print('Input data: ', repr(''.join(index_to_char[input_ids.numpy()])))
    print('Target data:', repr(''.join(index_to_char[target_ids.numpy()])))

Input data:  'PAY TO PLAY POLITICS. #CrookedHillary [URL] Very little pick-up by the dishonest media of incredible'
Target data: 'AY TO PLAY POLITICS. #CrookedHillary [URL] Very little pick-up by the dishonest media of incredible '


In [37]:
# Batch size
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Size of the shuffle buffer.
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` instead of re-assigning `dataset` from
# itself. Re-running this cell then gives the same result rather than
# batching data that is already batched.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [38]:
# Choose the recurrent layer class that build_model (a tf.keras.Sequential
# model) will use: CuDNNLSTM when a GPU is available, otherwise LSTM.
if tf.test.is_gpu_available():
    rnn = tf.keras.layers.CuDNNLSTM
    print("We are on the GPU!!!")
else:
    # functools is only referenced by the commented-out partial below.
    import functools
    rnn = tf.keras.layers.LSTM
#     functools.partial(
#     tf.keras.layers.LSTM, recurrent_activation='sigmoid')
    
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the embedding -> recurrent -> dense character model.

    Uses the module-level ``rnn`` layer class selected earlier in this cell.

    Args:
        vocab_size: Number of distinct characters; sets both the embedding
            input size and the width of the final Dense layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the recurrent layer.
        batch_size: Fixed batch size baked into the input shape (the
            recurrent layer is stateful).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The last layer is a
        Dense layer with no activation specified.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None])
    recurrent = rnn(
        rnn_units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
        # bias_regularizer=tf.keras.regularizers.l1(l=0.01),
        stateful=True)
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])


We are on the GPU!!!


In [39]:
# Hyperparameters for the training model. These globals are reused later
# when the generation model is built.
vocab_size = len(chars)
embedding_dim = 256
rnn_units = 1024
batch_size=BATCH_SIZE



# Training model, built for the training batch size.
model = build_model(
  vocab_size = vocab_size, 
  embedding_dim=embedding_dim, 
  rnn_units=rnn_units, 
  batch_size=batch_size)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (64, None, 256)           94464     
_________________________________________________________________
cu_dnnlstm (CuDNNLSTM)       (64, None, 1024)          5251072   
_________________________________________________________________
dense_2 (Dense)              (64, None, 369)           378225    
Total params: 5,723,761
Trainable params: 5,723,761
Non-trainable params: 0
_________________________________________________________________


In [40]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw logits.

    ``from_logits=True`` is passed because the model's final Dense layer has
    no activation specified.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

# Compile the training model with Adam (default settings) and the loss above.
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss
)

In [41]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; {epoch} is filled in per epoch.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save weights only (not the full model) through this callback.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

EPOCHS = 30

# NOTE(review): validation_data is the same `dataset` object used for
# training, so the validation loss is not measured on held-out data --
# confirm whether a separate split was intended.
history = model.fit(
    dataset.repeat(),
    validation_data=dataset,
    validation_steps=30,
    epochs=EPOCHS, 
    steps_per_epoch=steps_per_epoch, 
    callbacks=[checkpoint_callback])


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


562775 : total characters in Trumps book


In [42]:
# Rebuild the same architecture with batch size 1 for text generation.
model_g = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Load the weights from the most recent checkpoint in checkpoint_dir.
model_g.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Input shape: batch of 1, variable sequence length.
model_g.build(tf.TensorShape([1, None]))

model_g.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (1, None, 256)            94464     
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (1, None, 1024)           5251072   
_________________________________________________________________
dense_3 (Dense)              (1, None, 369)            378225    
Total params: 5,723,761
Trainable params: 5,723,761
Non-trainable params: 0
_________________________________________________________________


In [45]:
def generate_text(model, start_string, num_generate=250, temperature=1.0):
    """Generate text one character at a time from a trained model.

    Relies on the module-level ``char_to_index`` and ``index_to_char``
    lookups defined in an earlier cell.

    Args:
        model: Model built for batch size 1 (the batch dimension is squeezed
            away below). Its states are reset before generation starts.
        start_string: Non-empty seed text. Every character must be a key of
            ``char_to_index``; an unknown character raises ``KeyError``.
        num_generate: Number of characters to generate after the seed.
        temperature: Strictly positive divisor applied to the logits. Low
            values give more predictable text, high values more surprising
            text.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            strictly positive.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be > 0, got %r' % (temperature,))

    # Convert the start string to ids and add a batch dimension.
    input_eval = [char_to_index[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Generated characters are collected here and joined at the end.
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch dimension.
        predictions = tf.squeeze(predictions, 0)

        # Sample the next character id from the temperature-scaled logits.
        # Only the prediction for the last time step is used.
        predictions = predictions / temperature
        predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()

        # Feed only the sampled character back in; earlier context is
        # carried by the model's state, which is reset once before the loop.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(index_to_char[predicted_id])

    return (start_string + ''.join(text_generated))

In [59]:
# Generate a sample from the batch-size-1 model, seeded with "@FoxNews".
print(generate_text(model_g, start_string=u"@FoxNews"))

@FoxNews at nobody bad bias is the Tariffs and that rough 2016. Congratulations to book! Is it doing really quickly out, Judicial Watch on Hillary Clinton and 21 people comes to the lawmakers of garift. They are fast! Just landed in South Carolina at 7pm! #T


In [53]:
# Save the training model (`model`, not the batch-size-1 `model_g`).
# NOTE(review): `model` was compiled with the custom `loss` function, so
# loading this file back may need that function supplied -- confirm.

model.save('../Saved_models/second_model.h5')



In [60]:
# Also store the weights alone, in a separate file from the full-model save.
model.save_weights('../Saved_models/second_model_weights.h5')

In [None]:
""" HERE I AM DOING SOME MODEL TESTING """

In [None]:
# NOTE(review): this rebinds `model`, replacing the network trained above.
# It loads first_char_model.h5, which is a different file from the
# second_model.h5 written earlier in this notebook.
model = load_model('../Saved_models/first_char_model.h5')

In [None]:
# NOTE(review): `X` and `y` are not defined in any cell visible in this
# notebook, so this fails on a fresh kernel. Unpacking two values also
# presumes the loaded model was compiled with exactly one metric -- confirm.
cross_entropy_loss, accuracy = model.evaluate(X, y, batch_size=128)