<a href="https://colab.research.google.com/github/Da-Pen/CS486-twitter-bot/blob/main/LSTM/CS486_LSTM_word_level.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np
from collections import defaultdict

# CONSTANTS
NEWS_ORGS_DATA_FILE_NAME = '/content/data/newsorgs_data'
TRUMP_DATA_FILE_NAME = '/content/data/donald_trump_data'
ONLY_LOWERCASE = True
SKIP_URLS = True
SKIP_ELLIPSES = True
SKIP_RETWEETS = True
SKIP_REPLIES = True     # it seems like Trump often has tweets where he simply replies to another Twitter user or quotes them. They usually start with '@' or '"@'. If this is set to true, then ignore those tweets.
MIN_TWEET_LENGTH = 50 # characters

# returns a string minus all the urls in it
def ignore_urls(s):
    return ' '.join([x for x in s.split() if 'http' not in x])


# returns True for words like 'Hello' and 'hello' but not 'HELLO' or 'HelLo'
def is_normal_capitalization(word):
    return word[1:].islower()

# replaces 'Abcdef' with 'abcdef' but leaves 'ABCDEF' and 'AbCdeF' intact
def replace_first_caps(sentence):
    return ' '.join([word.lower() if is_normal_capitalization(word) else word for word in sentence.split(' ')])

# gets a list of strings representing the tweets in the given file.
# can limit the number of tweets to get using upto.
# replaces 'NEWLINE's with actual \n characters.
def get_tweets_list(filename, upto=None):
    f = open(filename, 'r')
    lines = f.read().split('\n')[:upto]
    f.close()
    # replace NEWLINE's and ignore all lines that do not have spaces (because they are probably just a link)
    lines = [line.replace('NEWLINE', '\n') for line in lines if line.strip().find(' ') != -1]
    if ONLY_LOWERCASE:
        lines = [replace_first_caps(line) for line in lines]
    if SKIP_ELLIPSES:  # skip tweets with the '…' character, which indicates that it has been truncated
        lines = [line for line in lines if line.find('…') == -1]
    if SKIP_URLS:
        lines = [ignore_urls(line) for line in lines]
    if SKIP_RETWEETS:
        lines = [line for line in lines if line[:2] != 'RT']
    if SKIP_REPLIES:
        lines = [line for line in lines if len(line) > 0 and line[0] != '@' and line[:2] != '"@']
    # # check what percentage of characters are valid: if less than MIN_VALID_CHAR_PERCENT are valid, then ignore this tweet. Otherwise, delete invalid characters.
    # lines = [filter_invalid_chars(line) for line in lines if filter_invalid_chars(line) is not None]
    return np.array(lines)

# given a list of tweets, gets a map of words to occurrences
def get_words(tweets):
    all_words = defaultdict(lambda: 0)
    for tweet in tweets:
        words = tweet.split(' ')
        for word in words:
            all_words[word] += 1
    return all_words

def get_words_list(words_map):
    min_occurrence = 5
    words_list = []
    for word in words_map.keys():
        if words_map[word] > min_occurrence:
            words_list.append(word)
    return words_list


def filter_words(tweet, words_set):
    return ' '.join([word for word in tweet.split(' ') if word in words_set])

# tweets = get_tweets_list(TRUMP_DATA_FILE_NAME)
tweets = get_tweets_list(NEWS_ORGS_DATA_FILE_NAME)

words_list = get_words_list(get_words(tweets))
word_to_index = dict((c, i) for i, c in enumerate(words_list))
index_to_word = dict((i, c) for i, c in enumerate(words_list))
print('there are', len(words_list), 'words')
print(words_list)
words_set = set(words_list)
# ignore all invalid words in tweets
print("BEFORE filtering, there were", len(tweets), "tweets")
new_tweets = []
for tweet in tweets:
    filtered_tweet = filter_words(tweet, words_set)
    if len(filtered_tweet) > 0.8*len(tweet):
        new_tweets.append(filtered_tweet)
tweets = new_tweets

# filter short tweets
tweets = [tweet for tweet in tweets if len(tweet) > MIN_TWEET_LENGTH]

print("AFTER filtering, there are", len(tweets), "tweets")


def main():
    pass    # do nothing (may comment out if we want to test something)

if __name__ == '__main__':
    main()


there are 7806 words
BEFORE filtering, there were 18632 tweets
AFTER filtering, there are 10685 tweets


Train Model

In [None]:
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Bidirectional, BatchNormalization, Activation
from keras.layers import Dropout
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.utils.data_utils import get_file
import random
import io
from google.colab import files
!pip3 install truecase
import truecase

INPUT_LENGTH = 5  # based on INPUT_LENGTH characters, our model generates the next character
GENERATED_TWEET_LENGTH = 20 # words


def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def get_truecase(sentence):
    old_words = sentence.split(' ')
    new_words = truecase.get_true_case(sentence).split(' ')
    print('raw truecase:', ' '.join(new_words))
    for old_word in old_words:
        if not is_normal_capitalization(old_word):
            var1 = old_word.lower()
            var2 = var1[0].upper() + var1[1:]
            if var1 in new_words:
                new_words[new_words.index(var1)] = old_word
            elif var2 in new_words:
                new_words[new_words.index(var2)] = old_word
    return ' '.join(new_words)
    # return ' '.join([new_words[i] if is_normal_capitalization(old_words[i]) else old_words[i] for i in range(len(old_words))])

def on_epoch_end(epoch, _, data, model):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)
    for _ in range(2):     # use 10 different tweets as samples
        tweet = np.random.choice(data) # select random tweet
        start_index = 0

        for diversity in [0.2, 0.4, 0.6, 1.0]:
        # for diversity in [0.1, 0.2, 0.3, 0.4]:
        # for diversity in [0.3, 0.4, 0.5]:
            print('----- diversity:', diversity)

            generated = ''
            sentence = tweet.split(' ')[start_index: start_index + INPUT_LENGTH]
            generated += ' '.join(sentence)
            print('----- Generating with seed: "' + ' '.join(sentence) + '"')
            # sys.stdout.write(generated)

            for i in range(GENERATED_TWEET_LENGTH):
                x_pred = np.zeros((1, INPUT_LENGTH, len(words_list)))
                for t, word in enumerate(sentence):
                    x_pred[0, t, word_to_index[word]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_word = index_to_word[next_index]
                generated += ' ' + next_word
                sentence = sentence[1:] + [next_word]

                # sys.stdout.write(next_word)
                # sys.stdout.flush()
            print(generated)
            print('with truecase:')
            # preserve
            print(get_truecase(generated))
            print()
    # save and download the model
    model.save('/content/model')
    !zip -r /content/model.zip /content/model
    files.download('/content/model.zip')


def train_from_data(data, train_limit=None):
    # convert the raw tweets list to input and output
    # input is equal to INPUT_LENGTH characters, output is a single character
    if train_limit:
        data = data[:train_limit]
    sentences = []
    next_words = []
    for tweet in data:
        tweet_words = tweet.split(' ')
        for i in range(0, len(tweet_words) - INPUT_LENGTH):
            sentences.append(tweet_words[i: i + INPUT_LENGTH])
            next_words.append(tweet_words[i + INPUT_LENGTH])
    print('# training samples:', len(sentences))
    # for i in range(10):
    #     print(sentences[i],'->',next_words[i])

    # vectorize the data
    print('Vectorization...')
    x = np.zeros((len(sentences), INPUT_LENGTH, len(words_list)), dtype=np.bool)
    y = np.zeros((len(sentences), len(words_list)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, word in enumerate(sentence):
            x[i, t, word_to_index[word]] = 1
        y[i, word_to_index[next_words[i]]] = 1

    # build the model
    print('Build model...')
    model = Sequential()
    model.add(LSTM(128, input_shape=(INPUT_LENGTH, len(words_list))))
    # model.add(LSTM(len(VALID_CHARS) * 7, input_shape=(INPUT_LENGTH, len(VALID_CHARS))))
    
    model.add(BatchNormalization())
    model.add(Activation('selu'))

    model.add(Dense(128))
    model.add(Activation('selu'))

    # model.add(Dense(len(VALID_CHARS)*4))
    # model.add(BatchNormalization())
    # model.add(Activation('selu'))

    # model.add(Bidirectional(LSTM(128), input_shape=(INPUT_LENGTH, len(VALID_CHARS))))
    model.add(Dense(len(words_list), activation='softmax'))

    # optimizer = RMSprop(lr=0.01)
    optimizer = Adam()
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    epochs = 10
    
    print_callback = LambdaCallback(on_epoch_end=lambda a, b: on_epoch_end(a, b, data, model))

    # train the model
    model.fit(x, y,
            epochs=epochs,
            callbacks=[print_callback]
            )

    # save and download the model
    model.save('/content/model')
    !zip -r /content/model.zip /content/model
    files.download('/content/model.zip')

def main():
    print("number of tweets:", len(tweets))
    train_from_data(tweets)


if __name__ == '__main__':
    main()

number of tweets: 8104
# training samples: 106272
Vectorization...
Build model...
Epoch 1/10
----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "thank you nevada! #AmericaFirst #MakeAmericaGreatAgain"
thank you nevada! #AmericaFirst #MakeAmericaGreatAgain #Trump2016 #SuperTuesday #MakeAmericaGreatAgain #Trump2016 #SuperTuesday #MakeAmericaGreatAgain #Trump2016 #SuperTuesday #MakeAmericaGreatAgain #Trump2016 #SuperTuesday #MakeAmericaGreatAgain #Trump2016 #MakeAmericaGreatAgain #Trump2016 #SuperTuesday #MakeAmericaGreatAgain #Trump2016 #SuperTuesday #MakeAmericaGreatAgain
with truecase:
raw truecase: Thank you Nevada! #Americafirst #Makeamericagreatagain #Trump2016 #Supertuesday #Makeamericagreatagain #Trump2016 #Supertuesday #Makeamericagreatagain #Trump2016 #Supertuesday #Makeamericagreatagain #Trump2016 #Supertuesday #Makeamericagreatagain #Trump2016 #Makeamericagreatagain #Trump2016 #Supertuesday #Makeamericagreatagain #Trump2016 #Supertuesday #Mak

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 2/10
----- Generating text after Epoch: 1
----- diversity: 0.2
----- Generating with seed: "wise words from my in"
wise words from my in new york post office in D.C. on the world of the world class amenities in the world of the world
with truecase:
raw truecase: Wise words from my in New York post office in D. C. on the world of the world class amenities in the world of the world
Wise words from my in New York post office in D. C. on the world of the world class amenities in the world of the world

----- diversity: 0.4
----- Generating with seed: "wise words from my in"
wise words from my in new york poll is going to be great american flag &amp; they have a complete waste. total lie to the
with truecase:
raw truecase: Wise words from my in New York poll is going to be great American flag& they have a complete waste. Total lie to the
Wise words from my in New York poll is going to be great American flag& they have a complete waste. Total lie to the

----- diversity: 0.6
----- Gene

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 3/10
----- Generating text after Epoch: 2
----- diversity: 0.2
----- Generating with seed: "looking forward to returning to"
looking forward to returning to the state of the union speech was amazing - and a great time to get the great state of the
with truecase:
raw truecase: Looking forward to returning to the state of the Union speech was amazing- and a great time to get the great state of the
Looking forward to returning to the state of the Union speech was amazing- and a great time to get the great state of the

----- diversity: 0.4
----- Generating with seed: "looking forward to returning to"
looking forward to returning to the state of the union speech was amazing people. will be back in arizona and will be a disaster for
with truecase:
raw truecase: Looking forward to returning to the state of the Union speech was amazing people. will be back in Arizona and will be a disaster for
Looking forward to returning to the state of the Union speech was amazing people. will be back

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 4/10
----- Generating text after Epoch: 3
----- diversity: 0.2
----- Generating with seed: "the trump organization purchase of"
the trump organization purchase of doral in miami. trump tower during the trump campaign has been a total disaster for the fact that I am
with truecase:
raw truecase: The Trump organization purchase of Doral in Miami. Trump tower during the Trump campaign has been a total disaster for the fact that I am
The Trump organization purchase of Doral in Miami. Trump tower during the Trump campaign has been a total disaster for the fact that I am

----- diversity: 0.4
----- Generating with seed: "the trump organization purchase of"
the trump organization purchase of doral in miami. I will be back soon! #Trump2016 #MakeAmericaGreatAgain #Trump2016 #FITN #Trump2016 #IACaucus #FITN #Trump2016 #MakeAmericaGreatAgain #FITN #Trump2016 #MakeAmericaGreatAgain
with truecase:
raw truecase: The Trump organization purchase of Doral in Miami. I will be back soon! #Trump2016 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 5/10
----- Generating text after Epoch: 4
----- diversity: 0.2
----- Generating with seed: "money should be at work"
money should be at work for a long time &amp; sadly, he gets nothing done. loser! @VanityFair are a way he is a third party
with truecase:
raw truecase: Money should be at work for a long time& sadly, he gets nothing done. loser! @Vanityfair are a way he is a third party
Money should be at work for a long time& sadly, he gets nothing done. loser! @Vanityfair are a way he is a third party

----- diversity: 0.4
----- Generating with seed: "money should be at work"
money should be at work is a big deal. -- the art of the deal about me! will be the finest in the U.S. history.
with truecase:
raw truecase: Money should be at work is a big deal.-- the art of the deal about me! will be the finest in the U. S. history.
Money should be at work is a big deal.-- the art of the deal about me! will be the finest in the U. S. history.

----- diversity: 0.6
----- Generating with s

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 6/10
----- Generating text after Epoch: 5
----- diversity: 0.2
----- Generating with seed: "germany is going through massive"
germany is going through massive attacks to its people by the wrong almost at the debate? just can't make the deal with iran deal, especially
with truecase:
raw truecase: Germany is going through massive attacks to its people by the wrong almost at the debate? just can't make the deal with Iran deal, especially
Germany is going through massive attacks to its people by the wrong almost at the debate? just can't make the deal with Iran deal, especially

----- diversity: 0.4
----- Generating with seed: "germany is going through massive"
germany is going through massive attacks to its people by the U.S.! is he was a bad deal. but i'm smart results to those who
with truecase:
raw truecase: Germany is going through massive attacks to its people by the U. S.! is he was a bad deal. But I'm smart results to those who
Germany is going through massive attacks to its 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 7/10
----- Generating text after Epoch: 6
----- diversity: 0.2
----- Generating with seed: "the record 13th season of"
the record 13th season of ‘All star’ @CelebApprentice. where they're coming from. #IACaucus location at so important to vote! #MakeAmericaGreatAgain #Trump2016 #MakeAmericaGreatAgain #Trump2016 #IACaucus #FITN
with truecase:
raw truecase: The record 13th season of ‘ all Star ’ @Celebapprentice. where they're coming from. #Iacaucus location at so important to vote! #Makeamericagreatagain #Trump2016 #Makeamericagreatagain #Trump2016 #Iacaucus #Fitn
The record 13th season of ‘ all Star ’ @Celebapprentice. where they're coming from. #Iacaucus location at so important to vote! #Makeamericagreatagain #Trump2016 #Makeamericagreatagain #Trump2016 #Iacaucus #Fitn

----- diversity: 0.4
----- Generating with seed: "the record 13th season of"
the record 13th season of ‘All star’ @CelebApprentice. where they're coming from. try to create a situation. -- sun tzu more than your

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 8/10
----- Generating text after Epoch: 7
----- diversity: 0.2
----- Generating with seed: "opportunities only present themselves if"
opportunities only present themselves if you are out there and never had a job, has had a tough time. that's politics! where we explain that
with truecase:
raw truecase: Opportunities only present themselves if you are out there and never had a job, has had a tough time. That's politics! where we explain that
Opportunities only present themselves if you are out there and never had a job, has had a tough time. That's politics! where we explain that

----- diversity: 0.4
----- Generating with seed: "opportunities only present themselves if"
opportunities only present themselves if you take advantage of yourself and NY times that is a senator like a dog. at he calls me I
with truecase:
raw truecase: Opportunities only present themselves if you take advantage of yourself and NY times that is a Senator like a dog. At he calls me I
Opportunities only pre

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 9/10
----- Generating text after Epoch: 8
----- diversity: 0.2
----- Generating with seed: "make it special! no better"
make it special! no better place to celebrate the eve than the most elite hotel in downtown @TrumpSoHo is a top destination located on the
with truecase:
raw truecase: Make it special! no better place to celebrate the eve than the most elite hotel in downtown @Trumpsoho is a top destination located on the
Make it special! no better place to celebrate the eve than the most elite hotel in downtown @Trumpsoho is a top destination located on the

----- diversity: 0.4
----- Generating with seed: "make it special! no better"
make it special! no better place to celebrate the eve than the most elite hotel in downtown toronto is the most elite in a winner
with truecase:
raw truecase: Make it special! no better place to celebrate the eve than the most elite hotel in downtown Toronto is the most elite in a winner
Make it special! no better place to celebrate the eve than t

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 10/10
----- Generating text after Epoch: 9
----- diversity: 0.2
----- Generating with seed: "I will be interviewed by"
I will be interviewed by @MariaBartiromo at 6:00 A.M. @FoxBusiness. enjoy! #Trump2016 #MakeAmericaGreatAgain #Trump2016 #MakeAmericaGreatAgain #FITN #FITN carolina. #MakeAmericaGreatAgain #Trump2016 #SuperTuesday #MakeAmericaGreatAgain #Trump2016 #IACaucus #FITN
with truecase:
raw truecase: I will be interviewed by @Mariabartiromo at 6:00 a. M. @Foxbusiness. enjoy! #Trump2016 #Makeamericagreatagain #Trump2016 #Makeamericagreatagain #Fitn #Fitn Carolina. #Makeamericagreatagain #Trump2016 #Supertuesday #Makeamericagreatagain #Trump2016 #Iacaucus #Fitn
I will be interviewed by @Mariabartiromo at 6:00 a. M. @Foxbusiness. enjoy! #Trump2016 #Makeamericagreatagain #Trump2016 #Makeamericagreatagain #Fitn #Fitn Carolina. #Makeamericagreatagain #Trump2016 #Supertuesday #Makeamericagreatagain #Trump2016 #Iacaucus #Fitn

----- diversity: 0.4
----- Generating with seed: "I wi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:tensorflow:Assets written to: /content/model/assets
updating: content/model/ (stored 0%)
updating: content/model/saved_model.pb (deflated 89%)
updating: content/model/variables/ (stored 0%)
updating: content/model/variables/variables.index (deflated 65%)
updating: content/model/variables/variables.data-00000-of-00001 (deflated 9%)
updating: content/model/assets/ (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Code to generate tweets after model is trained

In [16]:
from tensorflow import keras


NUM_TWEETS_TO_GENERATE = 100
TEMPERATURE = 0.7

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# --- LOAD THE MODEL --- #
# !unzip /content/model.zip
!unzip -o /content/model.zip
model = keras.models.load_model('/content/content/model')

GENERATED_TWEET_LENGTH = 30

f = open('/content/model-output.txt', 'w')
for i in range(NUM_TWEETS_TO_GENERATE):
    tweet = np.random.choice(tweets) # select random tweet
    start_index = 0
    generated = ''
    sentence = tweet.split(' ')[start_index: start_index + INPUT_LENGTH]
    generated += ' '.join(sentence)
    print('----- Generating with seed: "' + ' '.join(sentence) + '"')

    for i in range(GENERATED_TWEET_LENGTH):
        x_pred = np.zeros((1, INPUT_LENGTH, len(words_list)))
        for t, word in enumerate(sentence):
            x_pred[0, t, word_to_index[word]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, TEMPERATURE)
        next_word = index_to_word[next_index]
        generated += ' ' + next_word
        sentence = sentence[1:] + [next_word]

    print(generated)
    f.write(generated + '\n')
    print()
f.close()


Archive:  /content/model.zip
  inflating: content/model/saved_model.pb  
  inflating: content/model/variables/variables.index  
  inflating: content/model/variables/variables.data-00000-of-00001  
----- Generating with seed: "#Analysis: as canadian provinces line"


ValueError: ignored