In [1]:
!ls data/trump_tweet_data_archive/unzipped_condensed_json/

condensed_2009.json
condensed_2010.json


# Loading assets/Data

In [2]:
import json
from os import listdir
import numpy as np

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import download
download('punkt')

from collections import Counter
import re
import tensorflow as tf
import tensorflow.keras.utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Concatenate, Embedding, LSTM, Dense, Dropout, BatchNormalization, Input, TimeDistributed, Dot, RepeatVector, Activation, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
import pydot

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\allen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
data_dir = './data/trump_tweet_data_archive/unzipped_condensed_json/'

In [4]:
data = []
for filename in listdir(data_dir):
    with open(data_dir + filename, 'r') as f:
        data += json.loads(f.read())
        
type(data), len(data), data[0], data[0]['text']

(list,
 198,
 {'source': 'Twitter Web Client',
  'id_str': '6971079756',
  'text': 'From Donald Trump: Wishing everyone a wonderful holiday & a happy, healthy, prosperous New Year. Let’s think like champions in 2010!',
  'created_at': 'Wed Dec 23 17:38:18 +0000 2009',
  'retweet_count': 28,
  'in_reply_to_user_id_str': None,
  'favorite_count': 12,
  'is_retweet': False},
 'From Donald Trump: Wishing everyone a wonderful holiday & a happy, healthy, prosperous New Year. Let’s think like champions in 2010!')

# Pre-processing data, creating training data/validation data

In [5]:
num_pattern = re.compile(r'\d+$')
year_pattern = re.compile(r'[12][90]\d\d$')
big_num = re.compile('\d[\d,]+$')

    
def apply_filters(x):
    if year_pattern.match(x):
        return '<year>'
    elif num_pattern.match(x):
        return '<num>'
    elif big_num.match(x):
        return '<bignum>'
    elif len(x) > 1 and '.' in x:
        return '<url>'
    else:
        return x

In [6]:
def special_filter(exclude = '', base_filters = '!"#$%&&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'):
    return ''.join(set(base_filters) - set(exclude))    
    
special_filter('!?.$&()"')

']\n:_;+{|%-[`@\t>*,</#\\=^~}'

In [7]:
def filter_function(word):
    return word not in special_filter('!?.,$&():"')

tweet_text = map(lambda x : x['text'].replace('’', "'"), data)
nltk_tokenized_tweets = map(word_tokenize, tweet_text)
filtered_tokenized_tweets = map(lambda x : list(filter(filter_function, x)), nltk_tokenized_tweets)
lowered_tokenized_tweets = map(lambda x : list(map(str.lower, x)), filtered_tokenized_tweets)
placeholder_subbed_tweets = map(lambda x : list(map(apply_filters, x)), lowered_tokenized_tweets)

processed_text = list(placeholder_subbed_tweets)

print(processed_text[0:20])

[['from', 'donald', 'trump', ':', 'wishing', 'everyone', 'a', 'wonderful', 'holiday', '&', 'a', 'happy', ',', 'healthy', ',', 'prosperous', 'new', 'year', '.', 'let', "'s", 'think', 'like', 'champions', 'in', '<year>', '!'], ['trump', 'international', 'tower', 'in', 'chicago', 'ranked', '6th', 'tallest', 'building', 'in', 'world', 'by', 'council', 'on', 'tall', 'buildings', '&', 'urban', 'habitat', 'http', ':', '<url>'], ['wishing', 'you', 'and', 'yours', 'a', 'very', 'happy', 'and', 'bountiful', 'thanksgiving', '!'], ['donald', 'trump', 'partners', 'with', 'tv1', 'on', 'new', 'reality', 'series', 'entitled', ',', 'omarosa', "'s", 'ultimate', 'merger', ':', 'http', ':', '<url>'], ['--', 'work', 'has', 'begun', ',', 'ahead', 'of', 'schedule', ',', 'to', 'build', 'the', 'greatest', 'golf', 'course', 'in', 'history', ':', 'trump', 'international', '–', 'scotland', '.'], ['--', 'from', 'donald', 'trump', ':', '``', 'ivanka', 'and', 'jared', "'s", 'wedding', 'was', 'spectacular', ',', 'and'

In [8]:
wordcounts = Counter()
for processed_tweet in processed_text:
    for word in processed_tweet:
        wordcounts[word] += 1

In [9]:
VOCAB_SIZE = 20000

vocab = list(map(lambda x : x[0], wordcounts.most_common(VOCAB_SIZE)))
vocab.append('<end>')
vocab.append('<unk>')
word_index = {
    word : index + 1 for index, word in enumerate(vocab)
}
index_word = {index : word for word, index in word_index.items()}
print(index_word[1], ', ', index_word[2])

<url> ,  the


In [10]:
def word_to_seq(word):
    if word in word_index:
        return word_index[word]
    else:
        return word_index['<unk>']
    
processed_seq = [ [word_to_seq(word) for word in tweet] for tweet in processed_text]
print(processed_seq[0:3])

[[36, 19, 8, 4, 106, 68, 7, 195, 261, 37, 7, 58, 5, 262, 5, 263, 29, 196, 3, 389, 13, 69, 59, 197, 17, 96, 18], [8, 60, 97, 17, 264, 390, 391, 392, 393, 17, 78, 70, 394, 9, 395, 396, 37, 397, 398, 10, 4, 1], [106, 40, 11, 399, 7, 79, 58, 11, 400, 198, 18]]


In [11]:
def seq_to_samples(seq, end_token_index):
    samples = [
        (seq[:i], seq[i])
        for i in range(len(seq))
    ]
    samples.append((seq, end_token_index))
    return list(zip(*samples))

X, Y = seq_to_samples([1,2,3,4], 5)
print(X, Y)

([], [1], [1, 2], [1, 2, 3], [1, 2, 3, 4]) (1, 2, 3, 4, 5)


In [12]:
samples = []
labels = []
for seq in processed_seq:
    s, l = seq_to_samples(seq, word_index['<end>'])
    samples.extend(s)
    labels.extend(l)
    
len(samples), len(labels)

(4711, 4711)

In [13]:
for i in range(10):
    print(samples[i], ' '.join([index_word[word] for word in samples[i]]), labels[i], index_word[labels[i]])

[]  36 from
[36] from 19 donald
[36, 19] from donald 8 trump
[36, 19, 8] from donald trump 4 :
[36, 19, 8, 4] from donald trump : 106 wishing
[36, 19, 8, 4, 106] from donald trump : wishing 68 everyone
[36, 19, 8, 4, 106, 68] from donald trump : wishing everyone 7 a
[36, 19, 8, 4, 106, 68, 7] from donald trump : wishing everyone a 195 wonderful
[36, 19, 8, 4, 106, 68, 7, 195] from donald trump : wishing everyone a wonderful 261 holiday
[36, 19, 8, 4, 106, 68, 7, 195, 261] from donald trump : wishing everyone a wonderful holiday 37 &


In [14]:
max_seq_len = max([len(seq) for seq in processed_seq])
max_seq_len, len(processed_seq)

X = pad_sequences(samples, maxlen= max_seq_len, padding = 'pre')
X[0:10], X.shape

(array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  36],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  36,  19],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  36,  19,   8],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0, 

In [15]:
num_classes = max(labels) + 1
Y = to_categorical(labels, num_classes= num_classes)
Y.shape

(4711, 972)

In [16]:
X.shape, Y.shape

((4711, 39), (4711, 972))

# Define Models

In [17]:
lstm_drop = 0.2
dense_drop = 0.2
embed_dim = 300
lstm_dims = 128
dense_dim = num_classes

deeptrump = tf.keras.Sequential([
    Input(shape = (X.shape[1])),
    Embedding(num_classes, embed_dim, mask_zero = True),
    LSTM(lstm_dims, return_sequences = True),
    BatchNormalization(),
    Dropout(lstm_drop),
    LSTM(lstm_dims),
    BatchNormalization(),
    Dropout(lstm_drop),
    Dense(dense_dim, activation = 'relu'),
    BatchNormalization(),
    Dropout(dense_drop),
    Dense(dense_dim, activation = 'relu'),
    BatchNormalization(),
    Dropout(dense_drop),
    Dense(num_classes, activation = 'softmax')
])

optimizer = Adam(0.001)
deeptrump.compile(loss= 'categorical_crossentropy', optimizer = optimizer, metrics = ['acc'])
deeptrump.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 39, 300)           291600    
_________________________________________________________________
lstm (LSTM)                  (None, 39, 128)           219648    
_________________________________________________________________
batch_normalization (BatchNo (None, 39, 128)           512       
_________________________________________________________________
dropout (Dropout)            (None, 39, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
batch_normalization_1 (Batch (None, 128)               512       
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0

In [37]:
%reload_ext tensorboard
import datetime

In [38]:
logdir = "logs\\fit\\" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = logdir, histogram_freq=1)
print(logdir)
deeptrump.fit(X,Y, epochs = 2, callbacks = [tensorboard_callback])

logs\fit\20191111-172616
Train on 4711 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1cb05a6f0b8>

In [20]:
lstm_drop = 0.5
dense_drop = 0.5
embed_dim = 300
lstm_dims = 128
dense_dim = num_classes

deep_bi_trump = tf.keras.Sequential([
    Input(shape = (X.shape[1],)),
    Embedding(num_classes, embed_dim, mask_zero = True),
    tf.keras.layers.Bidirectional(LSTM(lstm_dims, return_sequences = True)),
    BatchNormalization(),
    Dropout(lstm_drop),
    LSTM(2*lstm_dims),
    BatchNormalization(),
    Dropout(lstm_drop),
    Dense(dense_dim, activation = 'relu'),
    BatchNormalization(),
    Dropout(dense_drop),
    Dense(dense_dim, activation = 'relu'),
    BatchNormalization(),
    Dropout(dense_drop),
    Dense(num_classes, activation = 'softmax')
])

optimizer = Adam(0.001)
deep_bi_trump.compile(loss= 'categorical_crossentropy', optimizer = optimizer, metrics = ['acc'])
deep_bi_trump.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 39, 300)           291600    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 39, 256)           439296    
_________________________________________________________________
batch_normalization_8 (Batch (None, 39, 256)           1024      
_________________________________________________________________
dropout_8 (Dropout)          (None, 39, 256)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 256)               525312    
_________________________________________________________________
batch_normalization_9 (Batch (None, 256)               1024      
_________________________________________________________________
dropout_9 (Dropout)          (None, 256)              

# Attention model

In [124]:
class MaskedSoftmax(tf.keras.layers.Layer):
    
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        
    def call(self, z, mask = None):
        
        z = tf.keras.backend.exp(z)
        
        if not mask is None:
            assert(mask.shape == z.shape), 'Mask has incorrect dimensions: ' + str(z.shape) + ' vs. ' + str(mask.shape)
            z = tf.multiply(z, tf.dtypes.cast(mask, 'float32'))
        
        return tf.divide(z, tf.reduce_sum(z, axis = -1, keepdims=True))

In [141]:
class Attention(tf.keras.layers.Layer):
    
    def __init__(self, dense_units, **kwargs):
        super(Attention, self).__init__(**kwargs)
        assert(type(dense_units) in [tuple, list]), 'Argument dense units must be an iterable (10,5,2) etc.'
        self.dense_units = dense_units
        
    def build(self, input_shape):
        self.input_densor = Dense(self.dense_units[0], activation = 'tanh')
        if len(input_shape) > 1:
            self.densors = [Dense(nodes, activation = 'tanh') for nodes in self.dense_units[1:]]
        else:
            self.densors = []
        self.output_densor = Dense(1, activation = 'relu')
        self.dot = Dot(axes = 1)
        self.activator = MaskedSoftmax()
        self.repeater = RepeatVector(input_shape[1])
        self.concatenator = Concatenate(axis = -1)
    
    def call(self, a, s_prev = None, mask = None):
                 
        if not s_prev is None:
            
            assert(s_prev.shape[-1] == a.shape[-1]), 's_prev must have same last dimension as a_prev.'
            
            alpha = self.concatenator([a, self.repeater(s_prev)])
            
        else:
            alpha = a
        
        alpha = self.input_densor(alpha)
        
        for densor in self.densors:
            alpha = densor(alpha)
        
        alpha = self.output_densor(alpha)
        
        alpha = tf.squeeze(alpha, axis = -1)
        
        alpha = self.activator(alpha, mask = mask)
        
        print(alpha, a)
            
        context = self.dot([alpha, a])
        
        return context

In [145]:
embed = Embedding(10, 5, mask_zero=True)
rnn = Bidirectional(LSTM(3, return_sequences=True))
attn = Attention((10,10))

In [148]:
X = np.array([[0,0,0,5,8],[1,0,0,3,3]])


X = embed(X)
X = rnn(X)
X = attn(X, np.zeros((2,6)))

X

tf.Tensor(
[[0.         0.         0.         0.5002208  0.4997792 ]
 [0.33333334 0.         0.         0.33333334 0.33333334]], shape=(2, 5), dtype=float32) tf.Tensor(
[[[ 0.          0.          0.         -0.00092819  0.00463552
   -0.0011662 ]
  [ 0.          0.          0.         -0.00092819  0.00463552
   -0.0011662 ]
  [ 0.          0.          0.         -0.00092819  0.00463552
   -0.0011662 ]
  [-0.0058745   0.00543387 -0.00510851 -0.00092819  0.00463552
   -0.0011662 ]
  [-0.00405702 -0.0014659  -0.00272797  0.00436322 -0.00468217
   -0.00298495]]

 [[ 0.0016803   0.003492   -0.00670708 -0.00411446 -0.01846154
   -0.00603914]
  [ 0.0016803   0.003492   -0.00670708  0.00380335 -0.02956325
   -0.00619356]
  [ 0.0016803   0.003492   -0.00670708  0.00380335 -0.02956325
   -0.00619356]
  [ 0.01034939 -0.00875904  0.00422394  0.00380335 -0.02956325
   -0.00619356]
  [ 0.01443224 -0.01694424  0.01321637  0.00337907 -0.01684443
   -0.00367955]]], shape=(2, 5, 6), dtype=float32)


<tf.Tensor: id=33159, shape=(2, 6), dtype=float32, numpy=
array([[-4.9661640e-03,  1.9855106e-03, -3.9187628e-03,  1.7163439e-03,
        -2.1265354e-05, -2.0751706e-03],
       [ 8.8206455e-03, -7.4037602e-03,  3.5777446e-03,  1.0226513e-03,
        -2.1623071e-02, -5.3040832e-03]], dtype=float32)>