**RNN to generate tweets, using character-level generation.**

In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, RNN, Softmax, Flatten, Dropout, Input

In [2]:
# NOTE: this is a relative path, so the load breaks if this notebook (or the
# CSV it points to) is moved.
data = pd.read_csv(filepath_or_buffer="../Load_Tweets/data/tweet_data.csv")

In [3]:
# Preview the first five rows to see which columns the CSV provides.
data.head(n=5)

Unnamed: 0,ID,RETWEET,TEXT
0,786204978629185536,False,PAY TO PLAY POLITICS. #CrookedHillary [URL]
1,786201435486781440,False,Very little pick-up by the dishonest media of ...
2,786189446274248704,False,Crooked Hillary Clinton likes to talk about th...
3,786054986534969344,False,Thank you Florida- a MOVEMENT that has never b...
4,786007502639038464,False,Join me Thursday in Florida &amp; Ohio!West Pa...


In [4]:
# Build the training corpus: take the text of the first 1000 tweets and
# join them into one long string, separated by single spaces.
tweet_txt = data['TEXT'].head(1000).str.cat(sep=' ')

# Corpus size in characters (shown as the cell output).
len(tweet_txt)

103764

In [5]:
# Sanity-check the corpus by printing its first 150 characters. Because the
# tweets were concatenated, this shows the opening tweet(s) rather than
# exactly one tweet.
print(tweet_txt[0:150])

PAY TO PLAY POLITICS. #CrookedHillary [URL] Very little pick-up by the dishonest media of incredible information provided by WikiLeaks. So dishonest! 


In [6]:
# Build the vocabulary: every distinct character in the corpus, sorted so
# that the ordering (and therefore each character's index) is deterministic.
chars = sorted(set(tweet_txt))
print("Number of unique characters: ", len(chars))

Number of unique characters:  106


In [7]:
# Lookup tables between each character and its position in `chars`.
char_to_index = {char: index for index, char in enumerate(chars)}
index_to_char = dict(enumerate(chars))

In [8]:
# Cut the corpus into overlapping, fixed-length character windows. Each
# window is one training input and the character right after it is the target.
# The window should be shorter than a whole tweet so the model learns how a
# tweet continues; 30 characters is an arbitrary choice.
#
# maxlen     -> number of characters in each input window, i.e. how much
#               context the model sees before predicting the next character.
#
# step       -> how far the window start moves between consecutive windows.
#               Example: for [a, d, c, r, r, e, y, d, d, s] with maxlen = 3
#               and step = 2 the windows are [a, d, c], [c, r, r], [r, e, y]
#               and [y, d, d].
#
# sentences  -> list of the window strings, each of length maxlen.
#
# next_chars -> list of the characters to predict: next_chars[k] is the
#               character that immediately follows sentences[k] (with
#               maxlen = 30, the 31st character counted from the window start).

maxlen = 30
step = 3

# Stop early enough that every window still has a following character.
window_starts = range(0, len(tweet_txt) - maxlen, step)
sentences = [tweet_txt[start:start + maxlen] for start in window_starts]
next_chars = [tweet_txt[start + maxlen] for start in window_starts]

In [9]:
# One-hot encode the training data.
#
# X has shape (len(sentences), maxlen, len(chars)): for sentence i and
# position t, X[i, t, :] is all False except at the index of the character
# found at that position.
# y has shape (len(sentences), len(chars)): y[i, :] is all False except at
# the index of the character that follows sentence i.
#
# Tiny example of one slice X[i] for chars = ['a', 'b', 'c', 'd'] and the
# sentence "bad" (rows are positions, columns are characters):
#
#          a b c d
#     'b'  0 1 0 0
#     'a'  1 0 0 0
#     'd'  0 0 0 1
#
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.

X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_index[char]] = True
    y[i, char_to_index[next_chars[i]]] = True

In [11]:
# Model: a single 128-unit LSTM reads one (maxlen, len(chars)) one-hot window,
# and a softmax Dense layer maps its output to one probability per character
# in the vocabulary (the prediction for the next character).
shape = (maxlen, len(chars))
model = Sequential([
    LSTM(128, input_shape=shape),
    Dense(len(chars), activation="softmax"),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

# Report the shapes of the training arrays under a small banner.
print("---------------")
print("Data Dimensions")
print("---------------")
print("X: ", X.shape)
print("y: ", y.shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               120320    
_________________________________________________________________
dense_1 (Dense)              (None, 106)               13674     
Total params: 133,994
Trainable params: 133,994
Non-trainable params: 0
_________________________________________________________________
---------------
Data Dimensions
---------------
X:  (34578, 30, 106)
y:  (34578, 106)


In [12]:
# Train on the one-hot windows: mini-batches of 128 samples, for up to
# 50 passes over the data.
model.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=50,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50

KeyboardInterrupt: 

In [57]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, with temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)``,
    renormalised, and a single index is drawn from the result.

    Args:
        preds: 1-D array-like of non-negative probabilities (for example a
            softmax output).
        temperature: Values below 1 sharpen the distribution towards the
            most likely index, values above 1 flatten it. A value ``<= 0``
            is treated as fully greedy and returns the argmax of ``preds``.

    Returns:
        The sampled index as a Python ``int``.

    Raises:
        ValueError: If ``preds`` contains no positive, finite probability.
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by a zero/negative temperature is undefined; the limit of
    # temperature -> 0 is greedy selection, so return that directly.
    if temperature <= 0:
        return int(np.argmax(preds))

    # log(0) = -inf is fine here (it becomes probability 0 again below), so
    # silence the warning instead of letting it flood the notebook output.
    with np.errstate(divide='ignore', invalid='ignore'):
        logits = np.log(preds) / temperature

    peak = np.max(logits)
    if not np.isfinite(peak):
        raise ValueError(
            "preds must contain at least one positive, finite probability"
        )

    # Subtract the largest logit before exponentiating. This leaves the
    # normalised result unchanged but prevents every term from underflowing
    # to 0 (and the division below from producing NaN) at low temperatures.
    exp_preds = np.exp(logits - peak)
    preds = exp_preds / np.sum(exp_preds)

    # One multinomial draw yields a one-hot vector; its argmax is the index.
    probas = np.random.multinomial(1, preds, 1)
    return int(np.argmax(probas))

In [61]:
# Generate text with the trained model.
# Seed it with a window of `maxlen` characters taken from a random position
# in the corpus, then repeatedly predict the next character, append it to the
# output, and slide the window forward by one character. With maxlen = 30,
# the seed plus 120 generated characters gives 150 characters in total.

start_index = random.randint(0, len(tweet_txt) - maxlen - 1)
# Slice by `maxlen` rather than a literal 30 so the seed always matches the
# model's input length, even if `maxlen` is changed.
starter = tweet_txt[start_index : start_index + maxlen]
generated = starter

for i in range(0, 120):
    # One-hot encode the current window. The builtin `bool` is used because
    # the `np.bool` alias was removed in NumPy 1.24.
    x_pred = np.zeros((1, maxlen, len(chars)), dtype=bool)
    for t, char in enumerate(starter):
        x_pred[0, t, char_to_index[char]] = True

    # verbose=0: recent Keras versions otherwise print progress output for
    # every single predict call, i.e. once per generated character.
    pred = model.predict(x_pred, verbose=0)[0]
    next_index = sample(pred)
    next_char = index_to_char[next_index]

    # Append the new character and drop the oldest one from the window.
    generated += next_char
    starter = starter[1:] + next_char

print(generated)


ortion Demagogue #VPdebate [URL] Than oo ls eanaly gerulirid:… Perhing sald nille! Merseenhetit to&as yhet mume. . Em Orersiunt - thel bet fhe poror g
