### TrumpTweet Notebook

In [62]:
#Prepare environment
import re
import numpy as np
import tensorflow as tf
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import GRU
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

#### Data Prep

In [63]:
#Get the data file with the Tweets
df_tt = pd.read_csv('inputdata/tweets_11-06-2020.csv')
df_tt['date'] = pd.to_datetime(df_tt['date'])

#Get the number of tweets sent by Trump himself
#print('The number of tweets and retweets: ')
#print(df_tt.groupby('isRetweet').size())
df_tt = df_tt[df_tt['isRetweet']=='f']
df_tt = df_tt[df_tt['date'].between('2020-01-01','2020-11-19')]

# Output just the Tweets to a text file
df_tt['text'].to_csv('inputdata/tweets.txt',  header=None, index=None, sep=' ', mode='a')

# load ascii text and remove url's from the tweets
filename = "inputdata/tweets.txt"
raw_text = open(filename, 'r', encoding='utf-8').read()
raw_text = re.sub(r'http\S+', '', raw_text)

# Create a file with the url's removed
with open('inputdata/clean_tweet.txt', "w", encoding="utf-8") as f:
    f.write(raw_text)


In [64]:
len(raw_text)

724933

In [65]:
# tokenize the text
tok = keras.preprocessing.text.Tokenizer(char_level=True, filters='!"#$%&()*+,-./:;<=>?[\\]^_`{|}~\t\n')
fit_text = tok.fit_on_texts([raw_text])

max_id = len(tok.word_index)
ds_size = tok.document_count

print('Number of distinct characters {}'.format(max_id))
print('The dataset size is {}'.format(ds_size))

#reduce max characters to 100
#max_id = 100

# Encode the full text so each char is represented by its ID
[encoded] = np.array(tok.texts_to_sequences([raw_text])) - 1 # -1 so we have 0-38

#split the dataset for training and test if needed
train_size = int(ds_size*.9)
dataset = tf.data.Dataset.from_tensor_slices(encoded)

# Create the character sequences
n_steps = 140 # each character sequence is 100 steps
window_length = n_steps + 1 #target is the input steps + 1
dataset = dataset.window(window_length, shift=1, drop_remainder=True)
print('Total number of data windows: {}'.format(len(dataset)))

# flatten to 2d
dataset = dataset.flat_map(lambda window: window.batch(window_length))

batch_size = 32
dataset = dataset.shuffle(1000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:,:-1],windows[:,1:]))

# one-hot encode unique characters (48 of them)
dataset = dataset.map(lambda x_batch, y_batch: (tf.one_hot(x_batch, depth=max_id), y_batch))

# pre-fetch
dataset = dataset.prefetch(1)

Number of distinct characters 194
The dataset size is 1
Total number of data windows: 724793


In [36]:
len(raw_text)

6898013

#### Build the Character Generation Model
Stateless RNN model

In [None]:
#%%timeit

# define the model
model = keras.models.Sequential([
    keras.layers.GRU(512, return_sequences=True, input_shape=[None, max_id], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation='softmax'))
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam')

# Save a checkpoint after every epoch
EPOCHS = 20
checkpoint_filepath = 'checkpoints/weights.{epoch:02d}.hdf5'
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    save_freq='epoch',
    monitor='val_loss',
    mode='min',
    save_best_only=False)

#Fit the model
history = model.fit(dataset,epochs=EPOCHS,callbacks=[model_checkpoint])

Epoch 1/20
Epoch 2/20
  481/22650 [..............................] - ETA: 21:36:49 - loss: 1.0885

In [None]:
# Save model
model.save('model.h5')

##### Generate some tweets

In [52]:
# Create function to process text for prediction (seed text)
def preprocess(texts):
    x = np.array(tok.texts_to_sequences(texts)) - 1
    return tf.one_hot(x, max_id)

# Create a function to create next character using temperature
def next_char(text, temperature=1):
    x_new = preprocess([text])
    y_proba = model.predict(x_new)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba)/temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tok.sequences_to_texts(char_id.numpy())[0]

# Function to recursively generate text
def complete_text(text, n_chars=140, temperature=.2):
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

# Create a single letter prediction
seed_text='t'
x_seed = preprocess([seed_text])
y_pred = model.predict_classes(x_seed)
tok.sequences_to_texts(y_pred + 1)[0][-1]

# Create a long sequence of text
print(complete_text('grab them by the pussy', temperature=0.2))

grab them by the pussy to the democrats and state that the democrats and a great the fake a great the fake a great and the work be a great the was the work a grea
