# Trump Tweet Bot (Keras)
---


In [2]:
%load_ext lab_black
import sys, random, os, gc, re, json
from string import punctuation

import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dropout, Input, Dense
from tensorflow.compat.v1.keras.layers import CuDNNLSTM

from eda_func import tweet_cleaner, rmv_uncommon
from bot_func import DataGenerator, TextCorpus, one_hot_features

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [3]:
# Location of the JSON tweet archive loaded below.
DATA_FILE = "./data/archive.json"

# Read the archive into a DataFrame; later cells use its "text" and
# "is_retweet" columns.
df = pd.read_json(DATA_FILE)

### Data Cleaning
----


In [4]:
# Run every tweet through the project's two cleaning helpers.
df["text"] = df["text"].apply(tweet_cleaner)
df["text"] = df["text"].apply(rmv_uncommon)

# Remove tweets that are empty or whitespace-only after cleaning.
# A row is kept only when its text has some non-whitespace content; OR-ing
# `!= ""` with `!= " "` would be true for every row and filter nothing.
df = df[df["text"].str.strip() != ""]

# Remove retweets (element-wise comparison, so `== False` is intentional).
df = df[df.is_retweet == False]

# Join all tweets into one string, using "@" as the end-of-tweet marker.
# Presumably every "@" was stripped by the cleaning helpers above, so the
# symbol is free to act as a separator -- TODO confirm against eda_func.
corpus = "@".join(df["text"].values)

### Pipeline Set-up
----

In [5]:
# Wrap the joined tweet string in the project's corpus helper.
full_corp = TextCorpus(corpus)

# Split once: the leading TRAIN_FRACTION of the corpus is used for training,
# the remainder for validation.
TRAIN_FRACTION = 0.9
split_idx = int(len(full_corp) * TRAIN_FRACTION)
train_corp = full_corp[:split_idx]
test_corp = full_corp[split_idx:]

SAMPLE_LEN = 32  # characters per input window (the model's time dimension)
STEP_SIZE = 3  # passed to DataGenerator; presumably the stride between windows
# NOTE(review): BATCH_SIZE is defined but not passed to anything in this cell;
# confirm whether DataGenerator should receive it.
BATCH_SIZE = 32

train_gen = DataGenerator(train_corp, SAMPLE_LEN, STEP_SIZE)
test_gen = DataGenerator(test_corp, SAMPLE_LEN, STEP_SIZE, shuffle=False)

# Size of the character vocabulary; used as both the one-hot input width and
# the number of softmax outputs.
nchars = full_corp.get_num_chars()

# Two stacked LSTM layers with dropout, followed by a softmax over characters.
model = Sequential(name="test_bot1")
model.add(Input(shape=(SAMPLE_LEN, nchars), dtype=np.float32))
model.add(CuDNNLSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(CuDNNLSTM(256))
model.add(Dropout(0.2))
model.add(Dense(nchars, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam")

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

Model: "test_bot1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm (CuDNNLSTM)       (None, 32, 256)           346112    
_________________________________________________________________
dropout (Dropout)            (None, 32, 256)           0         
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 256)               526336    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 80)                20560     
Total params: 893,008
Trainable params: 893,008
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
def decode(arr, corp):
    """Print a human-readable view of one generator batch.

    Parameters
    ----------
    arr : sequence
        ``arr[0]`` holds the one-hot encoded input windows (first axis indexes
        the samples) and ``arr[1]`` the one-hot encoded target character for
        each sample.
    corp : object
        Anything exposing an ``indicies_to_char`` lookup from a character
        index to the character itself.

    Returns
    -------
    None
        The decoded samples and targets are written to stdout.
    """

    def _hot_char(one_hot):
        # Index of the first entry equal to 1, mapped back to its character.
        return corp.indicies_to_char[np.nonzero(one_hot == 1)[0][0]]

    for idx, window in enumerate(arr[0]):
        print("-" * 40)
        print("Batch " + str(idx))
        text = "".join(_hot_char(step) for step in window)
        target = _hot_char(arr[1][idx])
        print("SAMPLE:", text)
        print("TARGET: ({})".format(target))


# Sanity check: decode and print every sample in the first training batch.
decode(train_gen[0], full_corp)

----------------------------------------
Batch 0
SAMPLE: I would like to wish everyone A 
TARGET: (H)
----------------------------------------
Batch 1
SAMPLE: ould like to wish everyone A HAP
TARGET: (P)
----------------------------------------
Batch 2
SAMPLE: d like to wish everyone A HAPPY 
TARGET: (A)
----------------------------------------
Batch 3
SAMPLE: ike to wish everyone A HAPPY AND
TARGET: ( )
----------------------------------------
Batch 4
SAMPLE:  to wish everyone A HAPPY AND HE
TARGET: (A)
----------------------------------------
Batch 5
SAMPLE:  wish everyone A HAPPY AND HEALT
TARGET: (H)
----------------------------------------
Batch 6
SAMPLE: sh everyone A HAPPY AND HEALTHY 
TARGET: (N)
----------------------------------------
Batch 7
SAMPLE: everyone A HAPPY AND HEALTHY NEW
TARGET: ( )
----------------------------------------
Batch 8
SAMPLE: ryone A HAPPY AND HEALTHY NEW YE
TARGET: (A)
----------------------------------------
Batch 9
SAMPLE: ne A HAPPY AND HEALTHY NE

## Train and Predict
---

In [None]:
# Positions of every "@" end-of-tweet marker in the raw corpus string.
tweet_ends = np.flatnonzero(np.asarray(list(corpus)) == "@")

# Train one epoch at a time so sample tweets can be generated after each epoch.
for epoch in range(1, 31):
    print("-" * 40)
    print("Epoch", epoch)
    model.fit(
        train_gen,
        steps_per_epoch=train_gen.epoch_size,
        epochs=1,
        validation_data=test_gen,
        validation_steps=test_gen.epoch_size,
    )

    # Seed generation from the character right after a randomly chosen marker,
    # i.e. from the start of a tweet.
    seed_index = np.random.choice(tweet_ends, 1)[0] + 1
    # NOTE(review): `genertate_tweet` is not among the names imported in the
    # notebook's import cell -- confirm it is defined or imported (and spelled
    # this way) before running on a fresh kernel.
    for diversity in (0.2, 0.7, 1.2):
        genertate_tweet(model, seed_index, diversity, full_corp)
    print("-" * 40)

----------------------------------------
Epoch 1
