**RNN to generate tweets, using character-level generation.**

In [None]:
import pandas as pd
import numpy as np
import random
import sys
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, RNN, Softmax, Flatten, Dropout, Input
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import LambdaCallback

In [None]:
# Load the tweet table. The path is relative, so this will break if this
# file (or the data directory) is moved!
data = pd.read_csv("../Load_Tweets/data/tweet_data.csv")

In [None]:
# Preview the first few rows of the loaded DataFrame.
data.head()

In [None]:
# Join every entry of the TEXT column into a single string, with one space
# between consecutive tweets, then show how many characters we ended up with.

tweet_txt = data['TEXT'].str.cat(sep=' ')
len(tweet_txt)

In [None]:
# Ok, let's check out the first 150 characters of the combined tweet text.

print(tweet_txt[:150])

In [None]:
# Build the vocabulary: every distinct character in the corpus, in sorted order
# so that the character <-> index mapping is reproducible between runs.

chars = sorted(set(tweet_txt))
print("Number of unique characters: ", len(chars))

In [None]:
# Two lookup tables over the vocabulary: character -> position in `chars`,
# and the inverse, position -> character.

char_to_index = {c: i for i, c in enumerate(chars)}
index_to_char = dict(enumerate(chars))

In [None]:
# We want to cut the data into overlapping sequences of characters.
# Tweets have a max length of 150 but we want to understand how to write a
# tweet, so we should pick a length smaller than that. Let's choose a random number,
# how about 30.

# maxlen -> the maximum character length each input will be before we 
#           predict the next character.
#
# step -> The jump we make until we start our next group. For example,
#         if our list was [a, d, c, r, r, e, y, d, d, s], with a maxlen
#         of 3 and step of 2, then we would have lists [a, d, c], [c, r, r],
#         [r, e, y], and so on.
#
# sentences -> a list of the character strings of length maxlen
#
# next_chars -> a list of the next characters to be predicted, i.e. after 30 characters
#               have been placed in the model, it should predict the 31st character.


maxlen = 30
step = 3

# Every window start position: 0, step, 2*step, ... stopping early enough that
# the character right after the window (index start + maxlen) still exists.
window_starts = range(0, len(tweet_txt) - maxlen, step)

# Input windows of exactly maxlen characters, and the character following each.
sentences = [tweet_txt[start:start + maxlen] for start in window_starts]
next_chars = [tweet_txt[start + maxlen] for start in window_starts]

In [None]:
# We want to make a 3-dimensional array that has the shape
# (len(sentences), maxlen, len(chars)) a small example matrix might look like this:
#
#       shape (3, 4, 4)
#
#            / 0 1 0 0 /
#           / 1 0 1 0 /
#          / 0 0 0 1 /    Level 1.
#         / 1 2 3 4 / 

#          -------------

#            / 1 0 0 0 /
#           / 0 0 1 0 /
#          / 0 1 0 1 /    Level 2.
#         / 1 2 3 4 /  

#          -------------

#            / 0 0 1 0 /
#           / 1 0 0 1 /
#          / 0 1 0 0 /    Level 3.
#         / 1 2 3 4 / 


# One-hot encode the training data.
#   X[i, t, k] is True when character t of sentence i is chars[k].
#   y[i, k]    is True when the character following sentence i is chars[k].
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_index[char]] = 1
    y[i, char_to_index[next_chars[i]]] = 1

In [None]:
# Here we define the model, and compile it.
#
# Here is the model:
#
#         --------- LSTM ----------
#            |        |       |
#           \ /      \ /     \ /
#         --------- Dropout -------
#            |        |       |
#           \ /      \ /     \ /
#         --------- LSTM ----------
#            |        |       |
#           \ /      \ /     \ /
#         --------- Dropout -------
#            |        |       |
#           \ /      \ /     \ /
#         --------- DENSE ----------

model = Sequential()

# Each sample is a window of maxlen one-hot character vectors.
shape = (maxlen, len(chars))

# return_sequences=True so the second LSTM receives the output at every
# timestep rather than only the last one.
model.add(LSTM(128, input_shape=shape, return_sequences=True))
model.add(Dropout(0.1))
# Only the first layer needs an input shape; later layers infer theirs.
model.add(LSTM(128))
model.add(Dropout(0.1))
# Softmax over the vocabulary: a probability for each possible next character.
model.add(Dense(len(chars), activation="softmax"))

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.summary()
print()
print("---------------")
print("Data Dimensions")
print("---------------")
print("X: ", X.shape)
print("y: ", y.shape)

In [None]:
def sample(preds, temperature=1.0):
    """Randomly pick an index from a probability vector, reshaped by temperature.

    The probabilities are raised to the power ``1 / temperature`` (done in log
    space) and renormalised, so temperatures below 1 sharpen the distribution
    and temperatures above 1 flatten it. One multinomial draw is then taken.

    Args:
        preds: Array-like of probabilities, one per candidate index.
        temperature: Divisor applied to the log-probabilities.

    Returns:
        The index selected by the draw.
    """
    probabilities = np.asarray(preds, dtype='float64')
    scaled_log_probs = np.log(probabilities) / temperature
    weights = np.exp(scaled_log_probs)
    normalised = weights / weights.sum()
    # A single trial yields a one-hot row; argmax recovers the drawn index.
    draw = np.random.multinomial(1, normalised, 1)
    return np.argmax(draw)

In [None]:
# It should be mentioned that this code is not my own; it is used for testing purposes and does not
# reflect the group's code base.


def on_epoch_end(epoch, _):
    """Print text generated by the current model; meant to run after each epoch.

    A single random maxlen-character seed is cut from ``tweet_txt`` and reused
    for every diversity (sampling temperature) value. For each diversity the
    seed is echoed and then 400 further characters are generated one at a
    time, each written to stdout as soon as it is sampled.

    Args:
        epoch: Epoch number, used only in the printed banner.
        _: Ignored second callback argument.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    # The seed is chosen once so all diversities start from the same text.
    seed_start = random.randint(0, len(tweet_txt) - maxlen - 1)
    seed_text = tweet_txt[seed_start: seed_start + maxlen]
    vocab_size = len(chars)

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed_text + '"')
        sys.stdout.write(seed_text)

        # Sliding window holding the most recent maxlen characters.
        window = seed_text
        for _step in range(400):
            # One-hot encode the current window as a batch of size one.
            encoded = np.zeros((1, maxlen, vocab_size))
            for position, symbol in enumerate(window):
                encoded[0, position, char_to_index[symbol]] = 1.

            distribution = model.predict(encoded, verbose=0)[0]
            predicted = index_to_char[sample(distribution, diversity)]

            # Drop the oldest character and append the new one.
            window = window[1:] + predicted

            sys.stdout.write(predicted)
            sys.stdout.flush()
        print()

In [None]:
# Wrap on_epoch_end so Keras calls it at the end of every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# The callback must be passed to fit() explicitly; constructing it alone
# does nothing, and no sample text would ever be printed during training.
model.fit(
    X,
    y,
    batch_size=100000,
    epochs=50,
    callbacks=[print_callback],
)

In [None]:
# Now that we have our model trained, let's see how well it was able to predict.
# Here I will give it a starting string of 30 characters long, randomly chosen from 
# the entirety of the tweet texts, and we will see what it outputs! This is exciting!!
# We will start by producing one tweet, which is 150 characters long.


# Seed the generator with a random window of maxlen characters from the corpus.
# The window length must equal maxlen because that is the input size the model
# was built with.
start_index = random.randint(0, len(tweet_txt) - maxlen - 1)
starter = tweet_txt[start_index:start_index + maxlen]
generated = starter

# Generate 120 more characters, one at a time.
for i in range(120):
    # One-hot encode the current window as a batch of size one. The builtin
    # `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
    x_pred = np.zeros((1, maxlen, len(chars)), dtype=bool)
    for t, char in enumerate(starter):
        x_pred[0, t, char_to_index[char]] = 1

    # verbose=0 keeps Keras from printing a progress bar for every character.
    pred = model.predict(x_pred, verbose=0)[0]
    next_index = sample(pred)
    next_char = index_to_char[next_index]

    generated += next_char
    # Slide the window: drop the oldest character, append the new one.
    starter = starter[1:] + next_char

print(generated)


In [None]:
# Here we save the trained model to an HDF5 file in the working directory.

model.save('first_model.h5')