In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, BatchNormalization
# Read the tweet archive from the CSV in the working directory, then preview the first rows.
tweets = pd.read_csv(filepath_or_buffer='realdonaldtrump.csv')
tweets.head(n=15)

Using TensorFlow backend.


Unnamed: 0,id,link,content,date,retweets,favorites,mentions,hashtags
0,1698308935,https://twitter.com/realDonaldTrump/status/169...,Be sure to tune in and watch Donald Trump on L...,2009-05-04 13:54:25,501,879,,
1,1701461182,https://twitter.com/realDonaldTrump/status/170...,Donald Trump will be appearing on The View tom...,2009-05-04 20:00:10,33,271,,
2,1737479987,https://twitter.com/realDonaldTrump/status/173...,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 08:38:08,12,20,,
3,1741160716,https://twitter.com/realDonaldTrump/status/174...,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 15:40:15,10,26,,
4,1773561338,https://twitter.com/realDonaldTrump/status/177...,"""My persona will never be that of a wallflower...",2009-05-12 09:07:28,1373,1954,,
5,1776419923,https://twitter.com/realDonaldTrump/status/177...,"Miss USA Tara Conner will not be fired - ""I've...",2009-05-12 14:21:55,28,28,,
6,1786560616,https://twitter.com/realDonaldTrump/status/178...,Listen to an interview with Donald Trump discu...,2009-05-13 12:38:28,14,17,,
7,1796477499,https://twitter.com/realDonaldTrump/status/179...,"""Strive for wholeness and keep your sense of w...",2009-05-14 11:30:40,17,27,,
8,1806258917,https://twitter.com/realDonaldTrump/status/180...,"Enter the ""Think Like A Champion"" signed book ...",2009-05-15 09:13:13,14,9,,
9,1820624395,https://twitter.com/realDonaldTrump/status/182...,"""When the achiever achieves, it's not a platea...",2009-05-16 17:22:45,19,49,,


In [2]:
# Keep only the tweets from election day 2016 to the next year, for space purposes.
# The comparison is done on the raw `date` strings (they look like
# 'YYYY-MM-DD HH:MM:SS' in the preview above, so string order matches time order).
# Note: any timestamp on 2017-11-08 itself sorts after the bare '2017-11-08' bound
# and is therefore excluded.
in_window = ('2016-11-08' <= tweets.date) & (tweets.date <= '2017-11-08')

# Select rows and the single `content` column in one .loc call and take an explicit
# copy, so that later `tweets['content'] = ...` assignments operate on an independent
# frame instead of a slice of the original (avoids SettingWithCopyWarning).
tweets = tweets.loc[in_window, ['content']].copy()
tweets.head(10)

Unnamed: 0,content
30887,Today we are going to win the great state of M...
30888,TODAY WE MAKE AMERICA GREAT AGAIN!
30889,VOTE TODAY! Go to http://vote.gop to find your...
30890,We need your vote. Go to the POLLS! Let's cont...
30891,# ElectionDay http://vote.gop pic.twitter.com/...
30892,I will be watching the election results from T...
30893,"Just out according to @ CNN: ""Utah officials r..."
30894,"Don't let up, keep getting out to vote - this ..."
30895,Still time to # VoteTrump! # iVoted # Election...
30896,Watching the returns at 9:45pm. # ElectionNigh...


In [3]:
# (rows, columns) of the `tweets` frame at this point in the notebook.
tweets.shape

(2143, 1)

In [4]:
import string 


def isAscii(s):
    """Return True when every character of `s` is found in `string.printable`.

    An empty `s` yields True, since there is no character to reject.
    """
    return all(c in string.printable for c in s)

def _clean_tweet(text):
    """Normalise one tweet for character-level modelling.

    The text is lower-cased (to reduce the character count) and split on whitespace;
    a word is dropped when it
      * contains 'http' (website links),
      * contains 'pic.twitter.com' (linked pictures), or
      * has any character outside `string.printable` (checked by `isAscii`).
    The surviving words are re-joined with single spaces.
    """
    kept_words = [
        word
        for word in text.lower().split()
        if 'http' not in word
        # Match the picture host explicitly: testing for the bare substring 'pic.'
        # also threw away ordinary words such as "topic." or "epic.".
        and 'pic.twitter.com' not in word
        and isAscii(word)
    ]
    return ' '.join(kept_words)


tweets['content'] = tweets['content'].apply(_clean_tweet)

# One newline-separated text for the whole corpus, so it can be sliced into
# fixed-length character windows later on.
corpus_text = '\n'.join(tweets['content'].values)

print(corpus_text[:2000])

today we are going to win the great state of michigan and we are going to win back the white house! thank you mi! bei gerald r. ford international airport (grr)
today we make america great again!
vote today! go to to find your polling location. we are going to make america great again! # votetrump #
we need your vote. go to the polls! let's continue this movement! find your poll location: # electionday #
# electionday
i will be watching the election results from trump tower in manhattan with my family and friends. very exciting!
just out according to @ cnn: "utah officials report voting machine problems across entire country"
don't let up, keep getting out to vote - this election is far from over! we are doing well but there is much time left. go florida!
still time to # votetrump! # ivoted # bei trump tower
watching the returns at 9:45pm. # electionnight # bei trump tower
such a beautiful and important evening! the forgotten man and woman will never be forgotten again. we will all com

In [5]:
# Vocabulary: every distinct character of the corpus, in sorted order.
listofchars = sorted(set(corpus_text))
print(listofchars)

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '@', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}']


In [6]:
# Two lookup tables over the vocabulary: character -> unique integer, and the reverse.
char_to_indx = {char: position for position, char in enumerate(listofchars)}
indx_to_char = dict(enumerate(listofchars))

In [7]:
# Cut the text into overlapping windows of `seq` characters, starting a new window
# every `steps` characters; the character right after each window is its target.
seq = 40
steps = 3
sentences = []
next_chars = []

for i in range(0, len(corpus_text) - seq, steps):
    sentences.append(corpus_text[i: i + seq])
    next_chars.append(corpus_text[i + seq])

# Boolean one-hot tensors. The builtin `bool` is used as the dtype because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24, where
# referencing it raises AttributeError.
x = np.zeros((len(sentences), seq, len(listofchars)), dtype=bool)  # samples, timesteps, features
y = np.zeros((len(sentences), len(listofchars)), dtype=bool)  # samples, features

# One-hot encode the matrices: x marks each character of the window, y the next character.
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_to_indx[char]] = 1
    y[i, char_to_indx[next_chars[i]]] = 1

print(len(sentences))

83095


In [8]:
# Model architecture: two stacked LSTM layers, each followed by dropout and batch
# normalisation, then a softmax layer with one output per vocabulary character.
model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence of outputs.
    LSTM(128, input_shape=x.shape[1:], return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),
    LSTM(64),
    Dropout(0.2),
    BatchNormalization(),
    Dense(y.shape[1], activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='RMSprop',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 40, 128)           96768     
_________________________________________________________________
dropout_1 (Dropout)          (None, 40, 128)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 40, 128)           512       
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 64)                256       
_________________________________________________________________
dense_1 (Dense)              (None, 60)               

In [9]:
# Train on the one-hot windows: 8 passes over the data in mini-batches of 128 samples.
model.fit(x=x, y=y, batch_size=128, epochs=8)



Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [31]:
#use trained model to predict characters
def predict_next_chars(model, seed, num_to_predict, temperature=None):
    """Generate `num_to_predict` characters that continue `seed`.

    At every step the current window is one-hot encoded, `model` is asked for its
    scores over the vocabulary, one character is chosen and appended to the result,
    and the window slides forward by one character. The function relies on the
    notebook-level names `seq`, `listofchars`, `char_to_indx` and `indx_to_char`.

    Parameters
    ----------
    model : object
        Anything with a Keras-style ``predict(x, verbose=0)`` method whose first
        returned row holds one score per vocabulary character.
    seed : str
        Starting text. Only its last `seq` characters are used. A shorter seed is
        encoded from timestep 0 and the remaining timesteps are left all-zero.
    num_to_predict : int
        Number of characters to generate.
    temperature : float or None, optional
        ``None`` (the default) always picks the highest-scoring character.
        A positive number samples the next character from the predicted
        distribution rescaled by that temperature (below 1 sharpens it, above 1
        flattens it), which breaks the repetition loops greedy decoding falls into.

    Returns
    -------
    str
        The generated characters only (the seed is not included).

    Raises
    ------
    KeyError
        If `seed` contains a character missing from `char_to_indx`.
    ValueError
        If `temperature` is given but is not positive.
    """
    if temperature is not None and temperature <= 0:
        raise ValueError('temperature must be a positive number or None')

    # The input tensor has exactly `seq` timesteps; encoding a longer seed would
    # index past its end, so keep only the most recent `seq` characters.
    if len(seed) > seq:
        seed = seed[len(seed) - seq:]

    generated = []

    for _ in range(num_to_predict):
        # Transform the current window into a one-hot (1, seq, vocabulary) tensor.
        x_test = np.zeros((1, seq, len(listofchars)))
        for t, char in enumerate(seed):
            x_test[0, t, char_to_indx[char]] = 1

        # Predict from the window and translate the chosen index back to a character.
        scores = model.predict(x_test, verbose=0)[0]
        if temperature is None:
            next_index = int(np.argmax(scores))
        else:
            # Clamp before the log so zero probabilities do not produce -inf.
            clipped = np.maximum(np.asarray(scores, dtype='float64'), 1e-12)
            logits = np.log(clipped) / temperature
            weights = np.exp(logits - np.max(logits))
            weights = weights / np.sum(weights)
            next_index = int(np.random.choice(len(weights), p=weights))

        next_char = indx_to_char[next_index]
        generated.append(next_char)
        # Slide the window: drop the oldest character, append the new one.
        seed = seed[1:] + next_char

    return ''.join(generated)


# Corpus offsets from which the seed windows are taken.
start_inds = [8228, 12400, 15003, 347]

for start_index in start_inds:
    seed = corpus_text[start_index: start_index + seq]
    predictedtext = predict_next_chars(model, seed, num_to_predict=50)

    # Show the seed in square brackets, followed by the generated continuation.
    tweet = '[{}]{}'.format(seed, predictedtext)
    print(tweet)


[ she then said, "we have to accept the r]epublican and the working the working the working 
[ real work begins. america will start wi]ll be and the working the working the working the 
[oeing is building a brand new 747 air fo]r the fake news and the working the working the wo
[inue this movement! find your poll locat]ion the working the working the working the workin


In [None]:
"""
Because of the low number of epochs (and maybe the small corpus size, since I only took one year of tweets), the model
reverts to the same text after a couple of characters, and any time words like "the"/"of" are predicted. That could also be
due to the year of tweets I chose. The president was embroiled in a lot of scandals right after his election,
and chose to hammer home a lot of similar talking points in his tweets, which made it harder to train
with a small number of epochs.
With more epochs and/or a larger sample of tweets, this tweet predictor should perform
significantly better, but it works okay considering the time/memory constraints. It predicts characters that form actual
English words. Maybe if we introduced a function that generated more creative/diverse text, that could help.
"""