**RNN to generate tweets, using character-level generation.**

In [32]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import sys
import pickle
import csv
import os
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, RNN, Softmax, Flatten, Dropout, Input
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

# Switch TensorFlow to eager mode before any tensors are created.
# The dataset inspection cells below call `.numpy()` on dataset elements.
tf.enable_eager_execution()

In [2]:
def sample(preds, temperature=1.0):
    """Sample one index from a probability array, reshaped by a temperature.

    The probabilities are converted to log-space, divided by ``temperature``
    and re-normalised with a softmax before a single multinomial draw.
    Temperatures below 1 sharpen the distribution, temperatures above 1
    flatten it.

    Args:
        preds: 1-D array-like of class probabilities (zeros are allowed).
        temperature: Scaling applied to the log-probabilities. ``0`` is
            treated as the greedy limit and returns the most likely index
            without any random draw.

    Returns:
        The sampled index (the arg-max of the one-hot multinomial draw).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Dividing by zero would only produce NaNs; the limit of
        # temperature -> 0 is simply picking the most probable class.
        return np.argmax(preds)
    # log(0) == -inf is the correct logit for an impossible class, so the
    # divide-by-zero warning it triggers is just noise.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0: the softmax is unchanged, but exp()
    # can no longer underflow every entry to 0 (which gave 0/0 = NaN).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [3]:
# The code in this cell is copied from a Keras example file available on GitHub.
# Reference: https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py

def on_epoch_end(epoch, _):
    """Append generated sample text to ``twitter_epoch_test.log`` after an epoch.

    Meant to be used as the ``on_epoch_end`` hook of a Keras ``LambdaCallback``.
    A random ``maxlen``-character window of ``tweet_txt`` is used as the seed;
    for each sampling temperature ("diversity") 400 further characters are
    generated one at a time and written to the log.

    Reads these notebook-level names: ``tweet_txt``, ``maxlen``, ``chars``,
    ``char_to_index``, ``index_to_char``, ``model`` and ``sample``.

    Args:
        epoch: Index of the epoch that just finished; written to the log.
        _: Second argument supplied by the callback; unused.
    """
    start_index = random.randint(0, len(tweet_txt) - maxlen - 1)
    seed = tweet_txt[start_index: start_index + maxlen]

    # `with` closes the log even if prediction raises part-way through.
    # UTF-8 is explicit because the corpus contains non-ASCII characters and
    # the platform default encoding may not be able to represent them.
    with open('twitter_epoch_test.log', 'a', encoding='utf-8') as f:
        f.write('\n')
        f.write('----- Generating text after Epoch: %d\n' % epoch)
        for diversity in [0.2, 0.5, 1.0, 1.2]:
            print()
            print('---- Generating text to file: twitter_epoch_test.log ----')
            print('---- with diversity: %f\n' % diversity)
            f.write('----- diversity: %f\n' % diversity)

            # Every diversity setting starts from the same seed window.
            sentence = seed
            f.write('----- Generating with seed: "' + sentence + '"\n')
            f.write(sentence)

            for i in range(400):
                # One-hot encode the current window of characters.
                x_pred = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, char_to_index[char]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = index_to_char[next_index]

                # Slide the window: drop the oldest character, append the new one.
                sentence = sentence[1:] + next_char

                f.write(next_char)
                # Flush per character so the log can be followed while training runs.
                f.flush()
            f.write('\n\n')
    

In [4]:
# Load the tweet CSV from a path relative to the notebook's working directory.
# The preview below shows the columns ID, RETWEET and TEXT.
data = pd.read_csv("../Load_Tweets/data/tweet_data.csv") # this will break if this file is moved!
data.head()

Unnamed: 0,ID,RETWEET,TEXT
0,786204978629185536,False,PAY TO PLAY POLITICS. #CrookedHillary [URL]
1,786201435486781440,False,Very little pick-up by the dishonest media of ...
2,786189446274248704,False,Crooked Hillary Clinton likes to talk about th...
3,786054986534969344,False,Thank you Florida- a MOVEMENT that has never b...
4,786007502639038464,False,Join me Thursday in Florida &amp; Ohio!West Pa...


In [5]:
# Summary statistics of the tweet lengths, measured in characters.
data['TEXT'].map(len).describe()

count    10622.000000
mean       141.512709
std         70.206293
min          5.000000
25%         99.000000
50%        135.000000
75%        150.000000
max        315.000000
Name: TEXT, dtype: float64

In [6]:
# Join every tweet into one long corpus string, separated by single spaces.

tweet_txt = data['TEXT'].str.cat(sep=' ')
print(len(tweet_txt), ': total characters in our dataset')

1513769 : total characters in our dataset


In [20]:
# Build the sorted character vocabulary and the two lookup tables.
# These are global variables used throughout the rest of the notebook.

chars = sorted(set(tweet_txt))
char_to_index = {ch: idx for idx, ch in enumerate(chars)}
index_to_char = np.array(chars)
print("Number of unique characters: ", len(chars))
# Width, in characters, of the one-hot input window used by the LSTM cells further down.
# The mean tweet length in this data is ~141 characters, but the value in use here is 30.
maxlen = 30


Number of unique characters:  369


In [8]:
# Encode the whole corpus as an array of vocabulary indices (one per character).
tweet_int = np.array(list(map(char_to_index.__getitem__, tweet_txt)))

In [9]:
# Peek at the first 20 encoded characters.
tweet_int[:20]

array([46, 31, 55,  0, 50, 45,  0, 46, 42, 31, 55,  0, 46, 45, 42, 39, 50,
       39, 33, 49])

In [25]:
# Number of input characters per training sequence.
seq_length = 100
# Approximate number of sequences in one pass over the corpus.
# NOTE(review): the chunks are batched at seq_length + 1 characters, so this slightly
# overestimates the true count — harmless while the dataset is repeated during fit.
examples_per_epoch = len(tweet_txt)//seq_length
# Dataset that yields the encoded corpus one character index at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(tweet_int)


In [11]:
# Decode and show the first five elements of the character dataset.
for char_id in char_dataset.take(5):
    print(index_to_char[char_id.numpy()])

Instructions for updating:
Colocations handled automatically by placer.
P
A
Y
 
T


In [21]:
# Group the character stream into fixed-size chunks of seq_length + 1 indices;
# a trailing partial chunk is dropped.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode the first five chunks back to text as a sanity check.
for chunk in sequences.take(5):
    chunk_text = ''.join(index_to_char[chunk.numpy()])
    print(repr(chunk_text))

'PAY TO PLAY POLITICS. #CrookedHillary [URL] Very little pick-up by the dishonest media of incredible '
'information provided by WikiLeaks. So dishonest! Rigged system! Crooked Hillary Clinton likes to talk'
" about the things she will do but she has been there for 30 years - why didn't she do them? Thank you"
' Florida- a MOVEMENT that has never been seen before and will never be seen again. Lets get out &amp;'
'… [URL] Join me Thursday in Florida &amp; Ohio!West Palm Beach, FL at noon:[URL]Cincinnati, OH this 7'


In [23]:
# Here we actually build the training data.

def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so each target item is the item that
    follows the corresponding input item.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) training pair.
dataset = sequences.map(split_input_target)

In [24]:
# Show one decoded (input, target) pair to confirm the one-character shift.
for input_example, target_example in dataset.take(1):
    for label, ids in (('Input data: ', input_example), ('Target data:', target_example)):
        print(label, repr(''.join(index_to_char[ids.numpy()])))

Input data:  'PAY TO PLAY POLITICS. #CrookedHillary [URL] Very little pick-up by the dishonest media of incredible'
Target data: 'AY TO PLAY POLITICS. #CrookedHillary [URL] Very little pick-up by the dishonest media of incredible '


In [26]:
# Number of sequences per training batch.
BATCH_SIZE = 64
# Number of batches that make up one epoch.
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Size of the buffer that shuffle() draws from.
BUFFER_SIZE = 10000

# NOTE: this rebinds `dataset`, so the cell is not idempotent — running it a second
# time would shuffle and batch the already-batched dataset again.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Display the dataset to check the resulting shapes and dtypes.
dataset

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [27]:
# Choose the recurrent layer used by build_model, which assembles a
# tf.keras.Sequential stack (not the functional API).
if tf.test.is_gpu_available():
    # GPU available: use the CuDNNGRU layer.
    rnn = tf.keras.layers.CuDNNGRU
else:
    import functools
    # No GPU: use the regular GRU layer with a sigmoid recurrent activation.
    rnn = functools.partial(tf.keras.layers.GRU, recurrent_activation='sigmoid')
    
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the embedding -> recurrent -> dense character model.

    Args:
        vocab_size: Number of distinct characters; sets the embedding input
            size and the width of the final dense layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the recurrent layer.
        batch_size: Fixed batch size baked into the input shape (the
            recurrent layer is created with ``stateful=True``).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose dense layer has no
        activation.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    # `rnn` is the notebook-level layer factory chosen just above this function.
    recurrent = rnn(
        rnn_units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
        stateful=True)
    output_layer = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, output_layer])


In [29]:
# Instantiate the GRU model; the vocabulary size comes from the character list
# and the batch size must match the batching of `dataset`.
model = build_model(
  vocab_size = len(chars), 
  embedding_dim=256, 
  rnn_units=1024, 
  batch_size=BATCH_SIZE)

# Print the layer shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 256)           94464     
_________________________________________________________________
gru_1 (GRU)                  (64, None, 1024)          3935232   
_________________________________________________________________
dense_1 (Dense)              (64, None, 369)           378225    
Total params: 4,407,921
Trainable params: 4,407,921
Non-trainable params: 0
_________________________________________________________________


In [30]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` is passed because the model's final dense layer has
    no activation.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Compile the GRU model with the Adam optimizer and the logits-based loss defined above.
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

In [33]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name template of the checkpoint files; "{epoch}" is left as a placeholder in the
# path handed to ModelCheckpoint.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights (not the full model) to the checkpoint path.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# Number of training epochs.
EPOCHS = 3

# The dataset is repeated indefinitely, so steps_per_epoch defines the epoch length.
history = model.fit(
    dataset.repeat(), 
    epochs=EPOCHS, 
    steps_per_epoch=steps_per_epoch, 
    callbacks=[checkpoint_callback])


Epoch 1/3
 32/236 [===>..........................] - ETA: 19:03 - loss: 4.1893

KeyboardInterrupt: 

In [None]:
# Here we define the second model (a single LSTM layer over one-hot windows) and compile it.
# NOTE(review): this rebinds `model`, replacing the GRU model built earlier in the notebook.
        
model=Sequential()

# Each input sample is a window of maxlen one-hot encoded characters.
shape = (maxlen, len(chars))
# model.add(LSTM(128, input_shape=shape, return_sequences=True))
# model.add(Dropout(0.1))

# The average length of a tweet is 141 characters so that is the number I will choose.
model.add(LSTM(units=141, input_shape=shape))
model.add(Dropout(0.1))
# Softmax over the vocabulary: one probability per possible next character.
model.add(Dense(len(chars), activation="softmax"))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=optimizer)
model.summary()
print()
print("---------------")
print("Data Dimensions")
print("---------------")
# NOTE(review): X_train / y_train are not created in any cell visible in this notebook —
# confirm where they come from before relying on a fresh Restart & Run All.
print("X: ", X_train.shape)
print("y: ", y_train.shape)

In [None]:
# Run on_epoch_end after every epoch so sample text is written to the log file during training.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# NOTE(review): X_train / y_train are not defined in the visible cells — TODO confirm their origin.
model.fit(X_train, y_train, 
          batch_size=1000,
          epochs=2,
          callbacks=[print_callback]
         )

In [None]:
# Evaluate the model; the result is unpacked into the loss and the accuracy metric
# that the LSTM model was compiled with.
# NOTE(review): X_test / y_test are not defined in the visible cells — TODO confirm their origin.

cross_entropy_loss, accuracy = model.evaluate(X_test, y_test, batch_size=128)

In [None]:
# Now that we have our model trained, let's see how well it is able to generate text.
# We give it a starting string of `maxlen` characters, randomly chosen from the
# entirety of the tweet texts, and extend it one sampled character at a time.
# The seed (30 characters) plus 120 generated characters give one tweet of 150 characters.


start_index = random.randint(0, len(tweet_txt) - maxlen - 1)
# Slice with maxlen rather than a literal 30 so the seed always matches the width of x_pred.
starter = tweet_txt[start_index : start_index + maxlen]
# To use a hand-written seed instead, assign a string of at most maxlen characters, e.g.:
# starter = "Hillary is a bad actor"
generated = starter

for i in range(0, 120):
    # One-hot encode the current window. The builtin `bool` is used because the
    # `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
    x_pred = np.zeros((1, maxlen, len(chars)), dtype=bool)
    for t, char in enumerate(starter):
        x_pred[0, t, char_to_index[char]] = 1

    pred = model.predict(x_pred)[0]
    next_index = sample(pred)
    next_char = index_to_char[next_index]

    generated += next_char
    # Slide the window: drop the oldest character, append the new one.
    starter = starter[1:] + next_char

print(generated)


In [None]:
# Here we save the model.
# Keras models are saved through their own `save` method; the previous `save.model(...)`
# referenced a `save` object that is not defined anywhere in this notebook.

model.save('first_model.h5')

In [None]:
""" HERE I AM DOING SOME MODEL TESTING """

In [None]:
# Load a previously saved model from disk; this rebinds `model`.
# NOTE(review): this path differs from the 'first_model.h5' written by the save cell —
# confirm this is the intended file.
model = load_model('../Saved_models/first_char_model.h5')

In [None]:
# Evaluate the loaded model; the result is unpacked as (loss, accuracy).
# NOTE(review): X / y are not defined in the visible cells — TODO confirm their origin.
cross_entropy_loss, accuracy = model.evaluate(X, y, batch_size=128)