** RNN to generate tweets, using character level generation. **

In [14]:
import pandas as pd
import numpy as np
import random
import sys
import pickle
import csv
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, RNN, Softmax, Flatten, Dropout, Input
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

In [2]:
def build_data(inpt, maxlen=30, step=3):
    """ Build data from String to list of characters.

    Here we cut the data into overlapping sequences of characters.
    Tweets have a max length of 150 but we want to understand how to write a 
    tweet so we should pick a length smaller that that. Let's choose a random number, 
    how about 30.
                 
    -----------
    Args:
    -----------
    
        inpt -> String of text

        maxlen -> the maximum character length each input will be before we 
                  predict the next character.

        step -> The jump we make till we start our next group. For example
                If our list what [a, d, c, r, r, e, y, d, d ,s], with a maxlen
                of 3 and step of 2, then we would have lists, [a, d, c], [c, r, r]
                [r, r, e], and so on.
                
    -----------
    Returns:
    -----------
        
        Tuple of 2 elements -> (sentences, next_char)
    
        sentences -> a list of the character strings of length maxlen

        next_char -> a list of the next characters to be predicted. i.e. after t  30 characters
                 have been placed in the model, it should predict the 31st character.
    """
    
    sentences = []
    next_chars = []
    for i in range(0, len(tweet_txt) - maxlen, step):
        sentences.append(tweet_txt[i:i + maxlen])
        next_chars.append(tweet_txt[i + maxlen])
    
    return sentences, next_chars


In [3]:
def build_array(sentences, next_chars, test_size=0.20):
    """ Build data from lists to numpy vectors.
    
    We want to make a 3-dimensional array that has the shape
    (len(sentences), maxlen, len(chars)) a small example matrix might look like this:

          shape (3, 4, 4)

               / 0 1 0 0 /
              / 1 0 1 0 /
             / 0 0 0 1 /    Level 1.
            / 1 2 3 4 / 

             -------------

               / 1 0 0 0 /
              / 0 0 1 0 /
             / 0 1 0 1 /    Level 2.
            / 1 2 3 4 /  

             -------------

               / 0 0 1 0 /
              / 1 0 0 1 /
             / 0 1 0 0 /    Level 3.
            / 1 2 3 4 /                  


    -----------
    Args:
    -----------
    
        sentences -> a list of the character strings of length maxlen

        next_char -> a list of the next characters to be predicted. i.e. after t  30 characters
                     have been placed in the model, it should predict the 31st character.
                
        test_size ->  A number between 0 and 1 that represents the percentage of data that 
                      should be set aside to train on.
                
    -----------
    Returns:
    -----------
        
        Tuple of 4 elements - X_train, X_test, y_train, y_test
    """
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_to_index[char]] = 1
            y[i, char_to_index[next_chars[i]]] = 1 
            
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
            
    return X_train, X_test, y_train, y_test

In [4]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [5]:
# The code in this cell is copied from A Keras example file available on github.
# Reference: https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py

def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    f = open('twitter_epoch_test.log', 'a')
    
    start_index = random.randint(0, len(tweet_txt) - maxlen - 1)
    f.write('\n')
    f.write('----- Generating text after Epoch: %d\n' % epoch)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('---- Generating text to file: twitter_epoch_test.log ----')
        print('---- with diversity: %f\n' % diversity)
        f.write('----- diversity: %f\n' % diversity)


        generated = ''
        sentence = tweet_txt[start_index: start_index + maxlen]
        generated += sentence
        f.write('----- Generating with seed: "' + sentence + '"\n')
        f.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_to_index[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = index_to_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            f.write(next_char)
            f.flush()
        f.write('\n\n')
    f.close()
    

In [6]:
data = pd.read_csv("../Load_Tweets/data/tweet_data.csv") # this will break if this file is moved!
data.head()

Unnamed: 0,ID,RETWEET,TEXT
0,786204978629185536,False,PAY TO PLAY POLITICS. #CrookedHillary [URL]
1,786201435486781440,False,Very little pick-up by the dishonest media of ...
2,786189446274248704,False,Crooked Hillary Clinton likes to talk about th...
3,786054986534969344,False,Thank you Florida- a MOVEMENT that has never b...
4,786007502639038464,False,Join me Thursday in Florida &amp; Ohio!West Pa...


In [7]:
data['TEXT'].apply(lambda x: len(x)).describe()

count    10622.000000
mean       141.512709
std         70.206293
min          5.000000
25%         99.000000
50%        135.000000
75%        150.000000
max        315.000000
Name: TEXT, dtype: float64

In [8]:
# Put all the tweets into one string

tweet_txt = data['TEXT'][:].str.cat(sep=' ')
len(tweet_txt)

1513769

In [9]:
# Get all the unique characters used, and make a character mapping. 
# Here we set Global Variables that are used throughout the code.

chars = list(set(tweet_txt))
chars.sort()
char_to_index = dict((c, i) for i, c in enumerate(chars))
index_to_char = dict((i, c) for i, c in enumerate(chars))
print("Number of unique characters: ", len(chars))
maxlen = 141 # Chosen because the average length of a tweet in our data is 141 characters.

Number of unique characters:  369


In [17]:
# Make list of unedited twitter tweets.
tweets_lst = []
path = '/Users/schuylerjackson/text_generator/Load_Tweets/data/tweet_data.csv'
with open(path) as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        tweets_lst.append(row[2])

In [23]:
# build a dictionary to dump into a file.
pass_data_to_web = {
    'tweets_lst': tweets_lst,
    'chars': chars,
    'char_to_index': char_to_index,
    'index_to_char': index_to_char,
    'maxlen': maxlen
}
pass_data_to_web['maxlen']

141

In [24]:
# Making a file that can be loaded into the website framework.
# dump into file

with open('/Users/schuylerjackson/text_generator/website/flask_site/data/model.data', 'wb') as file:
    pickle.dump(pass_data_to_web, file)



In [10]:
# Here we actual build the data.

sentences, next_chars = build_data(inpt=tweet_txt, maxlen=maxlen, step=15)
X_train, X_test, y_train, y_test = build_array(sentences, next_chars, test_size=0.20)
print("Data:\n\ttrain: {}\n\ttest: {}".format(X_train.shape, X_test.shape))
print("Target:\n\ttrain: {}\n\ttest: {}".format(y_train.shape, y_test.shape))


Data:
	train: (80727, 141, 369)
	test: (20182, 141, 369)
Target:
	train: (80727, 369)
	test: (20182, 369)


In [15]:
# # Here we define the model, and compile it.
        
model=Sequential()

shape = (maxlen, len(chars))
# model.add(LSTM(128, input_shape=shape, return_sequences=True))
# model.add(Dropout(0.1))

# The average length of a tweet is 141 characters so that is the number I will choose.
model.add(LSTM(units=141, input_shape=shape))
model.add(Dropout(0.1))
model.add(Dense(len(chars), activation="softmax"))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=optimizer)
model.summary()
print()
print("---------------")
print("Data Dimensions")
print("---------------")
print("X: ", X_train.shape)
print("y: ", y_train.shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 141)               288204    
_________________________________________________________________
dropout_1 (Dropout)          (None, 141)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 369)               52398     
Total params: 340,602
Trainable params: 340,602
Non-trainable params: 0
_________________________________________________________________

---------------
Data Dimensions
---------------
X:  (80727, 141, 369)
y:  (80727, 369)


In [16]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(X_train, y_train, 
          batch_size=1000,
          epochs=2,
          callbacks=[print_callback]
         )

Instructions for updating:
Use tf.cast instead.
Epoch 1/2
---- Generating text to file: twitter_epoch_test.log ----
---- with diversity: 0.200000


---- Generating text to file: twitter_epoch_test.log ----
---- with diversity: 0.500000


---- Generating text to file: twitter_epoch_test.log ----
---- with diversity: 1.000000


---- Generating text to file: twitter_epoch_test.log ----
---- with diversity: 1.200000

Epoch 2/2
---- Generating text to file: twitter_epoch_test.log ----
---- with diversity: 0.200000


---- Generating text to file: twitter_epoch_test.log ----
---- with diversity: 0.500000


---- Generating text to file: twitter_epoch_test.log ----
---- with diversity: 1.000000


---- Generating text to file: twitter_epoch_test.log ----
---- with diversity: 1.200000



<tensorflow.python.keras.callbacks.History at 0x7f0f7ca3c2e8>

In [None]:
# evalutate the model

cross_entropy_loss, accuracy = model.evaluate(X_test, y_test, batch_size=128)

In [32]:
# Now that we have our model trained, let's see how well it was able to predict.
# Here I will give it a starting string of 30 characters long, randomly chosen from 
# the entirety of the tweet texts, and we will see what it outputs! This is exciting!!
# We will start by producing one tweet, which is 150 characters long.


start_index = random.randint(0, len(tweet_txt) - maxlen - 1)
starter =  tweet_txt[start_index : start_index + 30]
# starter = "Hillary is a bad actor"
generated = starter

# x_pred = np.zeros((1, maxlen, len(chars)), dtype=np.bool)
# for t, char in enumerate(starter):
#     x_pred[0, t, char_to_index[char]] = 1

# y_hat = model.predict(x_pred)[0]

# y_hat
for i in range(0, 120):
    x_pred = np.zeros((1, maxlen, len(chars)), dtype=np.bool)
    for t, char in enumerate(starter):
        x_pred[0, t, char_to_index[char]] = 1
        
    pred = model.predict(x_pred)[0]
    next_index = sample(pred)
    next_char = index_to_char[next_index]
    
    generated += next_char
    starter = starter[1:] + next_char
    
print(generated)


ting to the White House.”  So pledited him #Aaller' Shutd down does before concorined tusnNes &amp; calldoel as wish the great also for '@CTRJud’ slat


In [33]:
# Here we save the model

save.model('first_model.h5')

NameError: name 'save' is not defined

In [None]:
""" HERE I AM DOING SOME MODEL TESTING """

In [4]:
model = load_model('../Saved_models/first_char_model.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [15]:
cross_entropy_loss, accuracy = model.evaluate(X, y, batch_size=128)

