<a href="https://colab.research.google.com/github/Da-Pen/CS486-twitter-bot/blob/main/LSTM/CS486_LSTM_word_level.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import numpy as np
from collections import defaultdict

# CONSTANTS
# Paths of the raw tweet dumps (one tweet per line; read by get_tweets_list).
NEWS_ORGS_DATA_FILE_NAME = '/content/newsorgs_data'
TRUMP_DATA_FILE_NAME = '/content/donald_trump_data'
# When True, get_tweets_list lowercases words whose only capital is the first
# letter ('Hello' -> 'hello'); all-caps and mixed-case words are left intact.
ONLY_LOWERCASE = True
# When True, get_tweets_list strips every whitespace-separated token that
# contains 'http' from each tweet.
SKIP_URLS = True
# When True, get_tweets_list drops tweets containing the ellipsis marker,
# which indicates that the tweet text was truncated.
SKIP_ELLIPSES = True
# When True, get_tweets_list drops tweets whose first two characters are
# 'RT' or 'rt'.
SKIP_RETWEETS = True
# It seems like Trump often has tweets where he simply replies to another
# Twitter user or quotes them. They usually start with '@' or '"@'. If this is
# set to True, those tweets (and empty lines) are ignored.
SKIP_REPLIES = True
# NOTE(review): not referenced anywhere in the visible part of this file --
# presumably applied by later cells; confirm.
MIN_TWEET_LENGTH = 50 # characters

def ignore_urls(s):
    """Return ``s`` with every URL-like token removed.

    The string is split on runs of whitespace and any token containing the
    substring 'http' is discarded. The surviving tokens are re-joined with
    single spaces, so the original whitespace (including newlines) is
    collapsed.
    """
    kept = []
    for token in s.split():
        if 'http' in token:
            continue
        kept.append(token)
    return ' '.join(kept)


def is_normal_capitalization(word):
    """Tell whether everything after the first character of ``word`` is lowercase.

    True for words like 'Hello' and 'hello'; False for 'HELLO' or 'HelLo'.
    Because ``str.islower`` is False when there are no cased characters, the
    result is also False for the empty string and for one-character words
    such as 'A'.
    """
    tail = word[1:]
    return tail.islower()

def replace_first_caps(sentence):
    """Lowercase the normally-capitalized words of ``sentence``.

    The sentence is split on single spaces. A word for which
    ``is_normal_capitalization`` is true ('Abcdef') becomes 'abcdef'; words
    such as 'ABCDEF' or 'AbCdeF' are left untouched. The words are re-joined
    with single spaces, so the spacing of the input is preserved.
    """
    converted = []
    for token in sentence.split(' '):
        if is_normal_capitalization(token):
            token = token.lower()
        converted.append(token)
    return ' '.join(converted)

def get_tweets_list(filename, upto=None):
    """Load and clean the tweets stored one-per-line in ``filename``.

    Processing steps, in order:
      1. Read the file and keep at most ``upto`` raw lines (all when None).
         The limit is applied before filtering, so fewer tweets may be
         returned.
      2. Drop lines that contain no space once stripped (probably just a
         link) and replace each 'NEWLINE' marker with a real '\\n'.
      3. If ONLY_LOWERCASE: lowercase normally-capitalized words via
         ``replace_first_caps``.
      4. If SKIP_ELLIPSES: drop tweets containing the ellipsis character,
         which indicates that the tweet has been truncated.
      5. If SKIP_URLS: remove URL tokens via ``ignore_urls`` (this also
         collapses all whitespace, including newlines, to single spaces).
      6. If SKIP_RETWEETS: drop tweets starting with 'RT' or 'rt'.
      7. If SKIP_REPLIES: drop empty tweets and tweets starting with '@'
         or '"@'.

    Args:
        filename: path of the text file to read (decoded as UTF-8).
        upto: optional maximum number of raw lines to consider.

    Returns:
        A numpy array of the cleaned tweet strings.
    """
    # The context manager closes the file even if reading/decoding raises.
    # Tweets contain non-ASCII text, so the encoding is stated explicitly
    # instead of depending on the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')[:upto]
    # replace NEWLINE's and ignore all lines that do not have spaces (because they are probably just a link)
    lines = [line.replace('NEWLINE', '\n') for line in lines if ' ' in line.strip()]
    if ONLY_LOWERCASE:
        lines = [replace_first_caps(line) for line in lines]
    if SKIP_ELLIPSES:
        # Match the real horizontal ellipsis (U+2026) as well as the
        # three-character sequence it turns into when its UTF-8 bytes are
        # mis-decoded as Mac Roman, so truncated tweets are skipped whichever
        # form the data carries.
        truncation_markers = ('\u2026', '\u201a\u00c4\u00b6')
        lines = [
            line for line in lines
            if not any(marker in line for marker in truncation_markers)
        ]
    if SKIP_URLS:
        lines = [ignore_urls(line) for line in lines]
    if SKIP_RETWEETS:
        lines = [line for line in lines if not line.startswith(('RT', 'rt'))]
    if SKIP_REPLIES:
        lines = [line for line in lines if line and not line.startswith(('@', '"@'))]
    # # check what percentage of characters are valid: if less than MIN_VALID_CHAR_PERCENT are valid, then ignore this tweet. Otherwise, delete invalid characters.
    # lines = [filter_invalid_chars(line) for line in lines if filter_invalid_chars(line) is not None]
    return np.array(lines)

def get_words(tweets):
    """Count how often each word occurs across ``tweets``.

    Every tweet is split on single spaces (so consecutive spaces yield
    empty-string "words", which are counted too). Returns a defaultdict
    mapping word -> number of occurrences; looking up an unseen word
    yields 0.
    """
    tally = defaultdict(lambda: 0)
    every_token = (token for tweet in tweets for token in tweet.split(' '))
    for token in every_token:
        tally[token] = tally[token] + 1
    return tally

def get_words_list(words_map, min_occurrence=5):
    """Return the words of ``words_map`` that are frequent enough to keep.

    Args:
        words_map: mapping of word -> occurrence count (e.g. the result of
            ``get_words``).
        min_occurrence: frequency threshold. The comparison is strict: a
            word is kept only when it occurs MORE than ``min_occurrence``
            times. Defaults to 5, the value that used to be hard-coded.

    Returns:
        A list of the kept words, in the iteration order of ``words_map``.
    """
    return [word for word, count in words_map.items() if count > min_occurrence]


def filter_words(tweet, words_set):
    """Drop every token of `tweet` that is not contained in `words_set`.

    The tweet is split on single spaces and the surviving tokens are joined
    back together with single spaces, in their original order.
    """
    kept = []
    for token in tweet.split(' '):
        if token in words_set:
            kept.append(token)
    return ' '.join(kept)

# Load and clean the Trump tweet corpus; the filters applied are controlled by
# the ONLY_LOWERCASE / SKIP_* constants defined at the top of the file.
tweets = get_tweets_list(TRUMP_DATA_FILE_NAME)
# Alternative corpus: uncomment to use the news-organizations tweets instead.
# tweets = get_tweets_list(NEWS_ORGS_DATA_FILE_NAME)

# Vocabulary: every space-delimited token that occurs more than 5 times in the corpus.
words_list = get_words_list(get_words(tweets))

# words_list = ['organizers', 'announced', 'that', 'canadian', 'singer', 'the', 'will', 'perform', 'at', '2021', 'super', 'show.', 'new', 'projections', 'by', "ontario's", 'science', 'advisory', 'table', 'show', 'pandemic', 'is', 'worsening', 'across', 'province', 'quebec', 'premier', 'fran√ßois', 'legault', 'says', 'government', 'must', 'consider', 'all', 'options', 'to', 'curb', 'spread', 'of', 'COVID-19,', 'including', 'winter', 'holiday', 'break', 'in', 'international', 'force', 'peace', 'agreement', 'said', 'eight', 'were', 'killed', 'when', 'one', 'its', 'crashed', 'during', 'a', 'mission', 'number', 'COVID-19', 'cases', 'per', 'day', 'on', 'rise', 'states,', 'and', 'deaths', 'from', 'federal', 'has', 'plans', 'help', 'more', 'people', 'living', 'hong', 'kong', 'come', 'canada', 'as', 'chinese', 'down', 'pro-democracy', 'movement', 'may', 'have', 'broken', 'law', 'protests', 'RCMP', '@cattunneycbc', 'A', 'nova', 'scotia', 'man', 'launched', 'lawsuit', 'against', 'organization', 'runs', 'where', 'he', 'others', 'abused', 'for', 'years.', 'knew', 'it', 'was', 'wrong', 'her', 'sister', 'with', 'yet', 'she', 'went', 'great', 'cover', 'up', 'this', 'information.', 'canadians', 'deserve', 'better', 'their', 'elected', '‚Äî', 'conservative', 'critic', 'michael', '@AshleyBurkeCBC', 'U.S.', 'food', 'drug', 'administration', 'approved', 'promising', 'antibody', 'therapy', 'scientists', 'treat', 'mild', 'moderate', 'coronavirus', 'infections.', '@JPTasker', '"There', 'no', 'time.', 'we', 'act', 'now.', 'need', 'something', 'strong', 'mandatory', 'order', 'dr.', 'windsor', 'regional', 'hospital', 'CEO', 'david', 'drove', 'pick', 'an', 'patient', "didn't", 'enough', 'stock', 'available', 'typhoon', 'areas', 'passed', 'over', 'two', 'million', 'are', 'without', 'power', 'metro', 'surrounding', 'area.', 'told', 'pay', 'balance', 'after', 'speaking', 'someone', 'virgin', 'mobile', 'about', 'expensive', 'president-elect', 'joe', 'biden', 'appointed', 'top', 'democratic', 
'official', 'ron', 'klain', 'his', 'chief', 'staff.', 'charge', 'obama', "administration's", 'response', 'health', 'crisis', 'independent', 'And', "they're", 'advocates', 'say', 'step', 'efforts', 'fleeing', 'conflict', 'abroad', 'despite', 'lockdowns', 'travel', 'restrictions.', '@OttawaReporter', 'opposition', 'final', 'before', 'protest', 'four', 'every', 'remembrance', 'what', 'soldiers', 'some', 'already', 'project', 'ontario', 'surpass', '2,000', 'early', '@CBCQueensPark', 'james', 'song', 'keep', 'point', 'view', 'would', 'fears', 'transmission', 'around', 'st.', 'really', 'set', 'into', 'motion', 'first', 'also', 'leave', 'position', 'legal', 'analyst', 'CNN.', "province's", 'newest', 'public', 'sets', 'indoor', 'outdoor', 'size', 'five', 'people,', 'agency', 'own', 'authority', 'not', 'nations', 'do', 'things,', 'expert', 'says.', 'restrictions', 'could', 'include', 'things', 'like', 'capacity', 'services', 'activities', 'among', 'them', 'nearly', '200', 'queens', 'had', 'potential', 'start', 'there', 'few', 'but', 'still', 'development', 'rapid', 'testing', 'vaccines', 'novel', 'ensure', 'summer', 'games', 'can', 'successfully', 'take', 'place', 'tokyo', 'next', 'year,', 'starting', 'july', 'toward', 'dropped', '29', 'cent', 'pre-pandemic', '21', 'border', 'get', 'discovery', 'conservation', 'asked', 'court', 'father', 'return', 'role', 'global', 'management', 'agreed', 'share', 'company', 'price', 'very', 'good', "here's", 'latest', 'how', 'rest', 'world.', 'success', 'keeping', 'rates', 'low', 'change', 'coming', 'weeks,', 'according', '@TO_jwo', 'newly', 'confirmed', 'infections', 'push', 'seven-day', 'average', 'daily', 'highest', 'been', 'any', 'pandemic.', 'former', 'parks', 'manager', 'anonymous', 'restaurants', 'grocery', 'private', 'being', 'planned', 'long-term', 'care', 'facilities', 'home', "canada's", 'oldest', 'many', 'whom', 'particularly', 'high', 'risk', 'COVID-19.', 'centre', 'be', 'spot', 'police', 'community', 'groups', 'drop', 'off', 
'person', 'overnight', 'instead', 'sending', '15', 'lawmakers', 'following', 'move', 'fellow', 'storms', 'longer', 'ocean', 'expert.', '@NebulousNikki', 'death', 'toll', 'again', 'begun', 'than', '600', 'since', 'late', 'august,', 'half', 'which', 'occurred', 'just', 'three', 'weeks.', 'who', 'should', 'vaccine', 'extremely', 'challenging', 'data', '@adamsmiller', '"It', 'so', 'said,', 'through', 'face', 'LIVE', '|', 'CBC', 'news', 'special', 'made', 'covered', 'took', '300', 'hours', 'our', 'annual', 'going', 'virtual', 'way', 'make', 'experience', 'future', '@Murray_Brewster', 'protests.', 'now', 'they', 'both', 'veterans', 'second', 'world', 'war.', 'fox', 'remarks', 'those', 'trump.', 'chose', 'toronto', 'because', 'demand', 'seeing', 'users', 'country,', 'executive', 'scott', "tuesday's", 'numbers', 'continue', 'phase', 'largely', 'lower', 'strict', 'guidelines', 'place.', 'hundreds', 'died', 'air', 'fighting', 'escalating', 'fear', 'civil', 'war', 'given', 'deep', 'between', 'prime', 'minister', 'abiy', 'comes', 'largest', 'ethnic', 'provides', 'opportunity', 'used', 'same', 'or', 'measures', 'under', 'section', '22', 'city', 'require', 'meeting', 'event', 'halls', 'other', 'remain', 'massive', 'explosion', 'port', 'district', 'buildings', 'within', 'killing', 'justices', 'heard', 'arguments', 'appeal', 'coalition', '20', 'states', 'california', 'york', 'house', 'representatives', 'hoping', 'preserve', 'doctors', 'health-care', 'workers', 'warning', 'hospitals', 'able', 'handle', 'rising', 'cases.', 'staffers', 'employees', 'them,', 'appearances', 'publicly', 'work.', 'amazon', 'fines', 'billion', 'US', 'faces', 'antitrust', 'results', 'showing', 'won', 'election,', 'dismissed', 'questions', 'whether', 'lost', 'judge', 'election', 'president', "trump's", 'claims', 'fraud', 'polls.', 'statement', 'officials', 'reporting', 'called', 'country.', '"We', 'truly', 'fight', 'officer', 'brent', 'tuesday', 'morning', 'announcement', 'brian', 'total', 'includes', 
'consecutive', 'well', 'peel', 'region,', '100', 'region', '50', 'NDP', 'leader', 'thinks', 'basic', 'income', 'pilot', 'newfoundland', 'labrador', 'fuel', 'national', 'program.', "it's", '1', 'trial.', "he's", 'last', 'him', 'months', 'clear', 'name.', 'alberta', 'calling', 'two-week', 'lockdown', 'homes', 'repeatedly', 'violated', 'provincial', 'rules.', 'martin', "country's", 'congress', 'corruption', 'update:', 'trudeau,', 'address', 'defence', 'staff', 'general', 'asking', 'join', 'campaign', 'award', 'cross', 'soldier', 'you', 'university', 'professor', '$1', 'gold', 'work', '@mle_chung', '#Analysis', 'committed', 'climate', 'change.', 'pressure', 'more.', '73', 'years,', 'john', 'visited', 'grave', 'fallen', 'immigration', 'lawyer', 'pandemic,', 'detention', 'don', 'mortgage', 'bad', 'interest', '2020', 'prize', 'awarded', 'collection', 'remove', 'parts', 'voters', 'chance', 'polls', '9', 'a.m.', '8', 'p.m.', 'friday.', 'monday,', 'support', 'known', 'bill', 'believed', 'always', 'represents', 'regardless', 'why', 'wear', 'it.', 'believe', 'conversation', 'choice', 'signed', '70', 'control', 'armenian', 'forces', 'backed', 'armenia', 'ended', 'estimated', '30,000', 'died.', 'deal', 'monday', 'sell', 'partnership', 'british', 'columbia', 'briefing', 'weekend', 'sweeping', 'vancouver', 'coastal', 'regions', 'period', 'surge.', 'list', 'eligible', 'people.', 'recall', 'certain', '2018', 'expected', 'storm', 'likely', 'marks', 'beginning', 'trump', 'conceded', 'claiming', 'evidence', 'conspiracy', 'democrats', 'vote', 'tally', "biden's", 'analysis', 'far', 'looked', 'seven', 'days', 'pfizer', 'look', '14', 'adds', 'uncertainty', 'transition', 'nov.', '3', 'vote.', 'refused', 'concede', "week's", 'biden,', 'jan.', 'association', 'nothing', 'lawyers', 'making', 'financial', 'various', 'political', 'donations', 'test', 'positivity', 'rate', 'manitoba', 'rose', 'these', 'men', 'reported', 'alleged', 'He', 'justin', 'trudeau', 'spoke', 'today', 'shared', 'battle', 
'attorney', 'criminal', 'investigation', 'donald', 'defamation', 'women', 'sexual', 'liberal', 'spend', 'dollars', 'connect', 'most', 'high-speed', 'internet', 'approval', 'process', 'confidence', 'safe', 'effective.', 'markets', 'result', 'elections,', 'saw', 'democrat', 'win', 'presidency.', "pfizer's", 'trial', 'secured', 'millions', 'said.', 'filed', 'alleging', 'job', 'allow', 'widespread', 'use', 'hyperloop', 'completed', "world's", 'passenger', 'ride', 'system,', 'executives', 'hit', 'las', 'kids', 'phone', 'handling', 'increased', 'even', 'difficult', 'demands', 'tropical', 'eta', 'landfall', 'key', 'sunday', 'night,', 'leaving', 'scores', 'dead', 'missing', 'mexico', 'central', 'america.', '#Analysis:', 'radio', 'host', 'topped', 'supporters', 'skeptical', 'progressive', 'policies', 'economy', '@markgollom', 'airline', 'industry', 'contingent', 'providing', 'passengers', 'whose', 'flights', 'cancelled', 'suggests', 'shots', '90', 'effective', 'preventing', "doesn't", 'mean', 'woman', 'officers', 'prison', 'heading', 'supreme', 'end', 'practice', 'halifax', 'students', 'raised', 'thousands', 'victims', 'mass', 'shooting', 'while', 'walking', 'route', 'aides', 'allies', 'acknowledged', 'privately', 'best', 'outbreak', 'infected', '11', 'residents', 'report', 'cases,', 'family', 'members', 'out', 'B.C.', 'focus', 'social', 'group', 'exercise', 'decisions', 'united', 'recorded', 'million,', 'seen', 'ont.', 'rallied', 'weekend.', 'came', 'observers', 'see', 'prince', 'william', 'changes', 'him.', "georgia's", 'so-called', 'state,', 'reliable', 'republican', 'presidential', 'wake', 'real', 'estate', 'experts', 'search', 'space.', 'foreign', 'affairs', 'advance', 'theresa', 'tam', 'several', 'accelerated', 'schools', 'spent', 'trying', 'sort', 'quality', 'issues.', 'prove', 'another', 'challenge.', 'hockey', 'night', 'icon', 'NHL', 'star,', 'age', 'falling', 'short', 'diversity', 'numbers.', 'TV', 'alex', 'trebek', 'long', 'career', 'back', 'years', 'upcoming', 
#  'ban', 'single-use', 'celebrations', 'put', 'plans.', 'BREAKING:', '80', 'edmonton', 'heavy', 'saturday.', 'couple', 'left', 'brunswick', 'gained', 'media', 'never', 'go', 'away.', 'mark', 'nation', '@Alex_Panetta', 'shows', 'viewers', 'tree', 'hot', 'days,', 'tweet', 'led', 'actor', 'arctic', 'space', 'station', '40', 'ago,', 'ken', 'record,', 'named', 'red', 'sunday,', 'sun', 'showed', 'up.', 'sure', 'exactly', 'happened', 'ont.,', 'launching', 'reading', 'service', 'proposed', 'drivers', 'downtown', 'critics', 'hurt', 'tiny', 'alaska,', 'attend', 'school', 'actually', 'getting', 'does', 'winnipeg', 'warn', 'curfew', 'amount', 'play', 'love', 'received', 'island', 'until', 'budget', 'sign', 'doug', 'ford', 'addresses', 'americans', 'time', 'star', 'paul', 'lee', 'emotional', 'moment', 'once', 'nation,', 'cities', 'worst', 'infection', 'canada.', "can't", "won't", 'become', 'again.', '@AaronWherry', 'quickly', 'turn', 'surge', 'least', 'prevent', 'worse.', 'looks', 'forward', 'building', 'relationship', 'biden.', 'crossed', '270', 'electoral', 'college', 'votes', 'pennsylvania.', 'documents', 'obtained', 'news,', 'patients', 'intensive', 'units', 'friday', 'june.', 'america', 'wins', 'election.', 'state', 'results.', 'politics,', 'energy', 'firms', 'jobs', 'struggling', '@KyleBakx', '@TonySeskus', 'mail-in', 'ballots', 'slim', 'candidates', 'delay', 'voter', 'here', 'answers', 'your', 'questions.', 'registered', 'voters,', 'though', 'turnout', 'recent', "there's", 'turned', 'beat', 'friday,', '28', 'care.', 'sitting', 'room', 'watching', 'movie', 'wearing', 'masks', 'usual', 'protocols', 'series', 'turner', 'pulled', 'game', 'tested', 'positive', "There\'s", 'generation', 'along', 'maybe', "don't", 'idea', 'lot', 'if', 'japanese', 'discovered', 'middle', 'record', 'little', 'bit', 'chaotic', 'COVID', 
#  'elections.', 'changing', 'market', 'recovery', 'october,', 'only', 'jobs.', 'added', 'smallest', 'began', 'month.', 'FBI', 'virginia', 'philadelphia', 'police.', 'timeline', 'warns', 'doses', 'first.', 'presidency', 'might', 'relations', 'russia', 'appeared', 'propaganda', 'based', 'whole', 'foods', 'poppies', "weren't", 'allowed', 'recently', 'updated', 'locations', "wouldn't", 'rejected', 'requests', 'minneapolis', 'charged', 'george', 'ordered', 'tried', 'together', 'airlines', 'schedule', 'tens', 'monthly', 'cancel', 'vast', 'majority', 'weeks', "It's", 'strategy', 'free', 'cannabis', 'concerned', 'stop', 'using', 'opioid', 'treatment.', 'places', 'enormous', 'support.', 'fact', "pennsylvania's", 'county', '2016', 'explain', 'born', 'leads', 'pennsylvania', 'conference', 'important', 'unfold', "he'll", 'illegal', 'businesses', 'The', 'claimed', 'responsible', 'losses', 'suffered', '2008', 'outbreak.', '#Opinion:', 'judges', 'neither', 'nor', 'ultimately', 'saying', 'is,', 'be.', 'figure', 'means', 'fewer', 'paid', 'did', 'animals', 'change,', 'MPs', 'growing', 'online', 'security', 'threats', 'directed', 'public.', 'wait', 'numbers,', 'suggested', 'burden', 'cannot', '@don_pittis', 'month', 'started', 'candidate', 'ground', 'battleground', 'georgia', 'parliament', 'bring', 'existing', 'suicide', 'prevention', 'looking', 'ways', 'meet', "aren't", 'broke', 'season', 'blue', 'detailed', 'case', 'due', 'technical', 'problems', 'unclear', 'letter', 'door', '"The', 'situation', 'needs', 'bonnie', 'henry', 'active', 'risen', 'multiple', 'studies', 'shown', 'light', 'found', 'streets', 'affect', 'august', 'fall', 'season,', 'production', 'season.', 'elections', 'spokesperson', 'having', 'nonpartisan', 'politics', 'voting', 'process.', 'small', 'amid', 'wave', 'tax', 'announce', 'direct', 'funding', 'billions', 'aid', 'helped', 'replace', 'costs', 'companies', 'facing', 'revenue', 'limit', 'puts', 'strain', 'areas.', 'officially', 'paris', 'countries', 'increases', 
'below', 'century.', 'delayed', 'almost', 'handful', 'maintains', 'lead', 'experts,', 'intervene', 'stage', 'seemed', 'seem', 'know', 'counted.', 'germany', '20,000', 'day,', 'level', 'yet.', 'israel', 'village', 'west', 'such', '"I', 'think', 'meant', 'arm', 'GM', 'pickup', 'assembly', 'plant', 'labour', 'union', '@p_evans', 'town', 'bracing', 'rally', 'england', 'entering', 'venues', 'stores', 'selling', 'round', 'save', 'life.', 'crusade', 'cut', 'spending', 'biggest', 'deficit', 'history.', 'all-time', 'day.', 'major', '100,000', 'store', 'call', 'recommended', 'filter', 'mask,', 'bought', 'shopping', 'telescope', 'B.C.,', 'astronomers', 'powerful', 'fast', 'presence', 'team', 'philadelphia,', 'dozens', 'protesters', 'ready', 'down,', 'tensions', 'counting', 'votes.', 'engaged', 'recount', 'wisconsin', 'filing', 'lawsuits', 'pennsylvania,', 'michigan', 'georgia.', 'economists', 'worry', 'divide', 'dire', 'economic', 'follow', '2018,', 'canada,', '13', 'lives', 'coal', 'taken', 'line.', 'vote-counting', 'centers', 'detroit', 'phoenix', 'returns', 'states.', 'moved', 'closer', 'victory', 'race', 'wisconsin.', 'danish', 'virus', 'became', '500', 'jones', 'afternoon', 'fared', 'better.', 'begins', 'count', 'workplace', 'declared', 'linked', 'makes', 'us', 'accurate', 'picture', 'reached', 'parties', 'provide', 'rent', 'relief', 'fashion', 'bid', 'over.', 'posted', 'himself', 'twitter', 'front', 'map', 'term', "we're", 'want', 'falsely', 'apply', 'week.', 'extra', 'required', 'previously', 'flow', 'goods', 'statistics', 'ahead.', 'rules', 'ballots,', 'causing', 'delays', 'access', 'numerous', 'opening', 'ballots.', 'anyone', 'less', 'popular', 'information', 'mayor', 'threatened', 'divided', 'masks.', 'chair', 'task', 'efficacy', 'safety', 'advanced', 'investigating', 'rare', 'flu', 'detected', 'There', '27', 'influenza', 'concern', 'threshold', 'too', 'single', '"murder', 'advised', 'hurricane', 'continues', 'coast', 'much', 'tuesday,', 'setting', 'deadly', 
'landslides', 'ballot', 'proposal', 'uber', 'status', 'rather', 'employees.', '7', 'remained', 'morning.', 'carbon', 'emissions', 'car', 'earth', 'times.', 'formally', 'promise', 'six', 'states:', 'nevada,', 'georgia,', 'arizona,', 'michigan,', '@EricGrenierCBC', 'miss', 'possible', 'glass', 'oregon', 'amounts', 'street', 'drugs', 'huawei', 'suing', 'agencies', 'try', 'release', 'believes', 'reveal', 'behind', 'arrest', 'post-election', 'october', 'resort', 'buy', 'created', 'lottery', 'them.', 'cold', 'weather', 'mental', '5', 'prepare', 'reports', 'close', 'expressed', 'tight', 'ongoing', 'counts', 'swing', 'shares', 'trading', 'hands', 'opened', 'above', 'offers', 'investors', 'offer', 'served', 'flight', 'royal', 'american', 'continued', 'wednesday', 'winner', 'declaration', 'race.', 'joining', 'coverage', 'find', 'here:', 'prospects', 'stimulus', 'recover', 'americans.', 'wisconsin,', 'coast,', 'races', 'contested', 'north', 'carolina,', '"This', 'large', 'challenge', 'poll', 'surveyed', 'system', 'marked', 'hand', 'maintaining', 'physical', 'distance', 'part', 'sustained', 'urging', 'stay', 'home,', 'live', 'appear', 'site', 'in,', 'president,', 'senate', 'house.', 'winds', 'kilometres', 'center.', 'U.S.,', 'recommendations', 'officials.', 'history,', 'ottawa', 'policy', 'streaming', 'platforms', 'faced', 'traditional', 'jason', 'listen', 'advice', 'main', 'recommending', 'choose', 'intended', 'clarity', 'level.', 'gain', 'senate,', 'flip', 'depending', 'presidency,', 'deciding', 'monitoring', 'urged', 'additional', 'single-day', 'province.', 'regulators', 'critical', 'reaching', 'virus.', 'teams', 'option', 'considered', 'erin', "O'Toole", 'conservatives', 'simply', 'open', 'lose', 'training', 'bias', 'internal', 'showdown', '2000', 'lessons', "today's", 'life', 'construction', 'raise', 'build', 'down.', 'She', 'months,', 'away', 'young', 'girl', 'rescued', 'rubble', 'collapsed', 'izmir,', 'earthquake', 'seasonal', 'type', 'depression', 'brought', 'winter,', 
'that‚Äôs', 
#  'produced', 'unusual', 'moments', 'windows', 'white', 'children', 'got', 'haul', 'initial', 'eastern', 'long,', 'dark', 'winter.', 'rainfall', 'predicted', 'region.', 'awards', 'book', 'human', 'rights', 'area', 'targeted', 'gives', 'shut', 'block', 'road', 'near', 'rural', 'bodies', 'fourth', 'injured', 'secure', '2016,', 'moving', 'wife', 'kelly', 'peaceful', 'loses', 'that,', 'trump,', 'anything', 'tend', 'considering', 'survive', 'member', 'household', 'symptoms', 'entire', 'transport', 'detect', 'endangered', 'atlantic', 'right', 'interior', 'vienna', 'terror', 'attack.', "B.C.'s", 'historic', 'preparing', 'plan', 'giving', 'bloc', 'blanchet', 'doubling', 'draw', 'line', "party's", 'values', 'kept', 'diagnosis', 'secret', 'alarm', 'reported.', 'waiting', 'hosted', 'fair', 'attended', 'iranian', 'spokesman', 'involved', 'voluntarily', 'taking', 'review', 'rideau', 'culture.', 'readers', 'send', 'protect', 'full', 'patients.', 'remains', 'chances', 'hillary', 'ago.', 'Donald', 'pull', 'ruled', 'owner', 'published', 'rescue', 'girls', 'alive', 'apartment', 'turkish', '21,', 'stand', 'charges', 
#  'assault', '2017', 'alcohol', 'system.', 'note', 'prepared', 'earlier', 'year', 'deputy', 'greater', 'intelligence', 'here,', 'gathered', 'family,', 'friends', 'colleagues', 'formula', 'driver', 'lewis', 'hamilton', 'grand', 'different', 'nine', 'finally', 'tell', 'coronavirus,', 'story.', 'consumer', 'germany,', 'avoid', 'travel.', 'crown', 'washington', "isn't", 'impact', 'processes', 'determine', 'disease', 'equivalent', 'category', 'boris', 'johnson', 'current', 'threaten', 'laid', 'children.', 'outcome', 'networks', 'soon', 'call.', 'local', 'cult', 'journalists', 'past', '24', 'began.', 'finance', 'trail', 'territory', 'permanent', "nation's", 'bringing', "city's", 'intense', 'capital', 'appears', 'arrived', 'later,', '2020.', 'finds.', 'arrested', 'male', 'suspect', 'injured,', 'sent', 'crashing', 'province,', 'hospitalized', 'fire', 'chris', 'decided', '2018.', 'gatherings', 'learned', 'bank', 'governor', 'reducing', 'medical', 'business', 'owners', "they've", 'shot', 'military', 'essential', 'close,', 'immune', 'board', 'receive', 'regular', 'ends', 'hour', 'sleep', 'worth', 'it?', 'economy.', 'prospect', 'big', 'halloween', 'watch', 'movies', 'it,', 'run', 'mind', 'events', 'distancing,', 'florida.', 'antibodies', 'sean', 
#  'bond', 'threatening', 'bob', 'money', 'mississippi', "she's", 'politically', '2019.', '1,', '@cbcasithappens', 'felt', '"the', 'recovered', 'earlier.', 'changed', 'bay', "hasn't", 'suggest', 'golf', 'played', 'struggled', 'problem', 'worse', 'northern', 'P.E.I.', "man's", 'website', 'allows', 'visitors', 'read', '@trevorjdunn', 'mother', 'equipment', 'necessary', 'stronger', 'anxiety', 'looming', 'montreal', 
#  'scrambling', 'warming', 'spaces', 'beds', 'airport', 'program', '12', 'projects', 'party', 'emergency', 'aim', 'It\'s', 'politics.', "that's", 'kind', 'nature', 'turning', 'updates', 'alert', 'app', 'exposure', 'date', 'actress', 'sentence', 'authorities', "alberta's", 'health,', 'outbreaks', 'ahead', 'older', 'brother', 'released', 'third', 'plastic', 'identified', 'citizen', 'spreading', '@ybrend', 'thought', '17', 'sound', 'protected', 
#  'immigrants', 'jack', 'black', 'ad', 'aimed', 'turkey,', 'aegean', 'prompted', 'write', 'action', 'coronavirus.', 'commons', 'gave', 'easier', 'seek', 'london,', 'personal', 'worker', 'failed', 'pet', 'investigators', 'attack', 'french', 'church', 'grew', 'domestic', 'But', 'overall', 'activity', 'february', 'sea', 'turkey', 'western', 'pushing', 'brink', '1,000', 'saskatchewan', 'finished', 'victories', 
#  'basis', 'false', 'contained', 'trend', 'towards', 'haven‚Äôt', 'up."', 'narrow', 'my', 'I', 'speak', 'language', 'expects', 'increase', '2021.', 'immediate', 'taylor,', 'army', 'peter', 'massachusetts',
#   "year's", 'director', 'cameron', 'resident', 'law,', 'convicted', 'crime', 'holds', '10', 'serving', 'world,', 'guns', 'upon', 'ruling', 'justice', 'reality', 'allegations', 'abuse', "state's", 'continuing', 'calls', 'medicine', 'prices', 'cancer', 'account', 'granted', 'request', 'privacy', 'mail', 'sunday.', 'population', 'researchers', 'study', 'party,', 'senators', 're-elected', '3.', 'quarter,', 'previous', 'one.', 'european', 'leaders', 'faith', 'digital', 'collected', 'images', 'technology', 'knowledge', 'commissioner', 'probe', 'teachers', 'stress', 'related', 'finding', '@jonmontpetit', 'committee', 'stuck', 'WE', 'charity', 'coral', 'reef', 'barrier', 'healthy', '120', 'thursday,', 'france', 'fresh', 'path', 'growth', 'operations', 'giant', 'quarantine', 'zeta', 'louisiana', 'argues', 'mix', 'competing', 'shifts', 'wreak', 'mountain', 'battles', 'then', 'nurses', 'influx', 'resurgence', 'hospitals.', 'armed', 'knife', 'nice,', 'It', 'dog', 'week,', 'projected', 'night.', 'reopening', 'caught', 'rock', 'popularity', 'TikTok.', 'responded', 'complaints', 'masks,', 'issues', 'heavily', 'longtime', '25', 'water', 'northwestern', 'clean', '30', 'stranded', 'friend', 'miles', 'taylor', 'working', 'department', 'combat', 'misconduct', 'gripped', 'described', 'relatives', 'suffering', '2010', 'body', 'episode', 'closing', 'bars,', 'sharp', 'governments', 'sought', 'fill', 'demanding', 'reporter', 'south', 'failure', 'gets', 'there.', 'ability', 'underway', 'ads', 'widely', 'media.', 'answering', 'This', 'time,', 'testing.', 'offering', 'controversial', 'pages', 'cast', 'terms', 'takes', 'declare', 'milestone', '10,000', 'That', 'true', 'ties', 'platform', 'dozen', 'senior', 'me', 'voters.', 'power.', 'trouble', 'president.', 'designed', 'slow', 'throughout', 'claim', 'superior', 'court,', 'oxford', 'retail', 'duty', 'natural', 'gas', 'reviewed', "quebec's", 'environmental', 'worried', 'meng', "ottawa's", 'reduce', 'community.', 'effort', 'species', 'artists', 
"president's", 'collins,', 'tom', 'office', 'doctor', 'author', 'shift', 'physician', 'delivered', 'writing', 'hearing', 'court.', 'hotel', 'bush', 'kill', '"an', 'waste', 'professional', 'raniere,', 'sex', 'keith', 'sentencing', 'female', 'followers', 'infectious', 'homeless', 'inspired', 'research', 'london', 'protection', 'southern', 'commission', 'sparked', 'uptick', 'cuts', 'adding', 'alta.,', 'hope', 'negative', 'consequences', 'right-wing', 'version', 'appearing', 'e-commerce', 'TikTok', 'products', 'app.', 'hearings', 'crimes', "government's", 'eyes', 'signs', 'bomb', 'blast', 'islamic', 'message', 'ignore', 'mandate', 'spring,', 'treatments', 'odds', 'further', 'beyond', 'think,', 'stretched', 'anywhere', 'florida', 'say.', 'reason', 'levels', 'disputed', 'brings', 'vacant', 'resigned', 'august.', 'sports', 'hall', 'fame', 'closures', 'lifted', 'thursday.', 'andrew', 'studio', 'depend', 'naval', 'oil', 'english', 'asian', 'hornet', 'nest', 'surface', 'ice', 'NASA', 'held', 'today.', 'head', 'steady', 'climb', 'mask', 'post', 'requested', 'devices', "NASA's", 'OSIRIS-REx', 'spacecraft', 'asteroid', 'particles', 'determined', 'washington,', 'question', 'accurately', 'azerbaijan', 'accusing', 'each', 'violating', 'Nagorno-Karabakh.', 'mike', 'pence', 'session', 'demanded', 'transportation', 'it‚Äôs', 'wind', 'year.', 'record-breaking', 'campaigning', 'combined', 'nationwide', 'hopes', 'battery', 'display', 'entirely', 'putting', 'funeral', 'comfort', 'restore', 'park', 'now,', 'aims', 'battered', 'campaign.', 'none', 'boost', 'data.', 'princess', 'raising', 'concerns', 'admitted', 'million.', '@sophiaharrisCBC', 'spike', 'straight', 'software', 'warned', 'reasons', 'campaigns', 'student', 'chess', 'player', 'wireless', 'entrepreneur', 'guard', 'imposed', 'closure', 'compared', 'hard', 'suit', 'ask', 'veteran', 'poland', 'influence', 'acting', 'priority', 'commercial', 'sector', 'accept', 'diagnosed', 'one-day', 'seeking', 're-election', 'later', 'challenges', 
'secretary', 'democracy', 'integrity', 'snap', 'constitution', 'act.', 'saturday,', '@bethanylindsay', 'programs.', 'skin', 'risks', 'parents', 'children,', 'begin', 'january', 'contract', 'aware', 'expanding', 'disease,', 'track', 'currently', 'video', 'hollywood', 'remaining', '&amp;', 'same.', 'demonstrations', 'unit', 'explore', 'severe', 'forecast', 'arrive', 'environment', 'celebrating', 
#  'council', 'festival', 'separated', 'border.', 'famous', 'sold', 'US,', 'ever', 'bird', 'hate', 'suspected', 'unknown', 'election?', 'inform', 'coverage.', 'email', 'legislation', 'harder', 'country', 'toronto,', '@LaurenPelley', 'thursday', 'evening', 'imagine', 'point.', 'U.K.', 'EU', 'negotiations', 'trade', 'lays', '‚Äúthe', 'greatest', 'lead.', 'identify', 'workers,', 'brief', 'affordable', 'housing', 'especially', 'serious', 'reveals', 'repeat', 'marched', 'carrying', 'withstand', 'animal', 'studying', 'planes', 'buildings.', 'willing', "island's", 'So', 'attempts', 'holding', 'advantages', 'cash', 'and,', 'hiring', 'staff,', 'creating', 'plan,', "china's", 'definition', 'unveiled', 'spring', 'works', 'offered', 'land', 'calgary', 'picked', 'speed', '14th', 'century', 'merely', 'temperatures', 'days.', 'debate', 'carried', 'ET.', 'itself', 'tim', 'burger', 'king', 'customers', 'minnesota', 'murder', 'second-degree', 'judiciary', 'voted', 'favour', 'nomination', 'amy', 'coney', 'barrett', 'learn', 'residential', 'education', 'korean', 'gunman', 'astronaut', 'provinces', 'artist', 'fierce', 'winning', 'barack', 'views', 'philadelphia.', 'rudy', 'giuliani', 'featured', 'borat', 'scene', 'demonstrating', 'cities,', 'feared', 'accused', 'senate.', 'race,', 'helping', 'effects', 'contain', 'june', 'progress', 'registration', 'caused', 'opposed', 'barrett,', 'appeals', 'confirmation', 'expand', 'judicial', 'records', 'wednesday,', '@sarahcrgr', 'iran', 'met', 'delegation', 'week', 'talks', 'figures', 'michigan.', 'postal', 'employees,', 'vehicles', 'appearance', 'tonight', 'month,', 'restrictions,', 'criticism', 'approach.', 'effectively', 'paying', 'stopped', 'purdue', 'wealthy', 'advisers', 'chosen', '4', 'give', 'preliminary', 'september', 'sales', 'running', 'ending', 'double', 'halt', 'pope', 'francis', 'endorsed', 'same-sex', 'unions', 'corporate', 'individuals', 'liberals', 'found.', 'representing', 'families', 'google', 'moves', 'tech', 'china', 'reach', 
'disinformation', 'measure', 'backing', 'resources', 'assistant', 'olympics', 'significant', 'brutality', 'japan', 'bill.', 'steadily', 'fake', 'encouraging', 'busy', 'freeze', 'nuclear', 'extend', 'arms', 'treaty', 'jeff', 'bridges', 'voice', 'character', '"As', 'sept.', 'launch', 'lobster', 'sued', 'advertising', 'competition', 'seniors', 'staying', 'safe,', 'nursing', 'included', 'turns', 'out,', 'professors', 'defending', 'colleague', 'suspended', 'class', 'benefits', 'youth', 'programs', 'playing', 'appointment', 'infections,', 'useful', 'probably', 'hardest', 'decision', 'pumpkin', 'condition', 'statue', 'times', 'larger', 'summer.', 'approach', 'advocacy', 'wants', 'conversations', 'vulnerable', 'shutting', 'effect', 'damage', 'degrees', 'music', 'usually', 'hotels', 'rates,', 'classes', 'home.', 'dealing', 'anthony', 'fauci', 'agree', 'mostly', 'uncertainty,', 'survey', 'november,', 'russian', 'attacks', 'wide', 'range', 'champagne', 'wall', 'migrants', 'narrowly', 'funded', 'managed', 'gary', 'league', 'fans', 'closed', 'countries.', 'original', 'selected', 'network', 'moon,', 'humans', "john's", 'indigenous', 'impose', 'apart', 'messages', 'telling', 'stories', 'gaining', 'strength', 'consumers', 'auto', 'attacked', 'violent', 'crucial', 'racism', 'shows.', 'blocking', 'thai', 'What', 'fire.', 'widow', 'locked', 'apple', 'cabinet', 'ministers', 'joined', 'hold', 'discuss', 'southwest', 'native', 'perfect', 'counted', 'childhood', 'starts', 'engaging', 'wounded', 'UN', '"a', 'name', 'monday.', 'not.', 'contact', 'tracing', 'depends', 'texas', 'physically', 'distancing', 'in-person', 'attention', 'Tuesday.', 'sample', 'reopen', 'oct.', 'however,', 'scheduled', 'story', 'two-thirds', 'laura', 'ones', 'inside', 'treatment', 'not,', 'normal', 'school.', 'done', 'standard', 'edward', 'county.', 'complete', 'wildlife', 'estimate', '2015', 'volunteers', 'expect', 'doing', 'decades', 'socially', 'ever.', 'on,', '60', 'zealand', 'jacinda', 'office.', 'polling', 
"wasn't", 'steps', 'happening', 'delivery', 'orders', 'govern', 'operation', 'fully', 'canada‚Äôs', 'hear', 'leaders,', 'systemic', 'healthcare', 'developed', 'school,', 'monitor', 'san', 'lucky', 'opportunities', 'emails', 'tied', 'lived', 'son', 'fund', 'produce', 'involving', 'conducted', 'daughter', 'decades.', 'patrol', 'retired', 'argued', 'expecting', 'flying', 'fossil', 'heat', '700', 'fifth', 'violence', 'harassment', 'sen.', 'ben', 'christian', 'clinical', 'visit', 'dying', 'visits', 'syndrome', 'boards', 'model', 'remote', 'leading', 'doesn‚Äôt', 'drive', 'benefit', 'measures,', 'wondering', 'control.', 'maps', 'names', 'industry.', 'firm', 'market.', 'tony', 'jury', 'refuses', 'gov.', 'gretchen', 'whitmer', 'plot', 'tonight,', 'debate,', 'temporarily', 'restricted', 'three-day', 'As', 'today,', 'country‚Äôs', 'field', 'milwaukee', 'suburb', "barrett's", 'nomination.', 'grant', 'asylum', 'criticized', 'jerry', 'dispute', 'virtually', 'queen', 'elizabeth', 'associated', 'kamala', "harris's", 'vice-presidential', 'refusing', 'customer', 'restaurant', 'september,', 'electric', 'vehicle', 'economy,', 'writes', 'thomas', 'marathon', 'promised', 'supposed', 'networks.', "thailand's", 'banned', 'targeting', 'limited', 'rapidly', 'way.', '16', 'instagram', 'twitter.', 'bear', "who's", 'facility', 'box', 'months.', 'outside', 'applications', 'issue', 'ten', '2016.', 'tourism', 'represent', 'group,', 'window', 'deliver', 'issued', 'camp', 'melania', 'revealed', 'communications', 'hired', '18', 'feeling', 'points', "tonight's", 'debate.', 'huge', 'margin', 'senator', 'cheaper', 'antigen', 'tests', 'losing', "haven't", 'taxes.', 'NBC', 'opponent', '911', 'york,', 'prosecutors', 'debates', 'empty', 'isolation', 'pain', 'racist', 'photo', 'driven', 'moon', 'maintain', 'trick-or-treating', 'accounts', "russia's", 'pose', 'remember', 'although', 'rejecting', 'placed', 'higher', 'design', 'orange', 'hospitalizations', 'outbreaks.', 'crowds', 'phones', 'faster', '5G', 
'europe', 'package', 'chain', 'leader,', 'settlement', 'smaller', 'rival', 'seats', 'saudi', 'uses', 'attempting', 'create', 'investigate', 'issue.', 'declined', 'v.', 'properly', 'highly', 'rely', 'dogs', 'europe,', 'sick.', 'kidnap', 'detained', 'desperate', 'late-stage', 'paused', 'decades,', 'thanks', 'jean', 'remembered', 'turmoil', 'exposed', 'ohio', 'wanted', "i'm", 'revealing', 'china,', 'suburban', 'state.', 'grow', 'length,', 'mouth', 'consume', 'interview', 'london.', 'pleaded', 'guilty', 'ruth', 'bader', 'ginsburg', 'spots', 'defeating', 'robert', 'nobel', 'theory', 'era', 'connected', 'sharing', 'landmark', 'filled', 'completely', 'republicans', 'fired', 'city.', 'normally', 'thanksgiving', 'away,', 'poised', 'reports.', 'dreams', 'museum', 'alone', 'easy', 'returned', 'performances', 'damaging', 'health.', 
#  'insisted', 'rally.', 'come.', 'blocked', 'receiving', 'concluded', 'airborne', 'followed', 'happens', 'exchange', 'diplomats', 'on.', 'act,', 'city‚Äôs', 'tourist', 'add', 'acres', 'seeks', 'moral', 'removed', 'videos', 'green', 'says,', 'rich', 'upheld', 'driving', 'minister,', 'replaced', 'tools', 'east', 'Some', 'saturday', 'delta', 'unprecedented', 'prominent', "what's", 'stake', 'journalist', 'reduced', 'processing', 'built', 'eating', 'illness', 'death.', 'experimental', 'experiences', 'feel', 'visiting', 'rapper', 'los', 'angeles', 'deaths.', 'protecting', 'experienced', 'addressing', 'barriers', 'laws', 'daniel', 'details', 'brutal', 'modern', 'increasing', 'atmosphere.', 'gender', 
#  'QAnon', 'netflix', '$2', 'attempt', 'serve', 'reminder', 'I\'m', 'literature', 'individual', 'defended', 'harris', 'history', 'funds', 'goal', 'finish', 'urge', 'celebrate', 'soaring', 'concept', 'vice', 'farm', 'positions', 'guy', 'battling', 'lung', 'november', 'government.', 'facebook', 'weed', 'britain,', 'ET', 'billionaire', 'steve', 'generations', 'old', 'subject', 'cost', 'argue', 'undermine', 'dangerous', '2', 'developing', 'method', 'trust', 'weighing', 'decade', 'studies.', 'speakers', 'convention', 'tasked', 'throne', 'speech', 'becomes', '2021,', 'viral', 'sharply', 'chairman', 'joint', 'gen.', 'van', '"If', 'university.', 'australian', 'mainland', 'surging', 'smith', 'in.', 'fought', 'vaccine.', '2022.', 'needed.', 'newsletter:', 'side', 'tough', 'introduced', 'lab', 'Joe', 'present', 'too.', 'nominee', 'incident', 'organizations', 'jobs,', 'severity', 'protocols,', 'contracting', 'confident', 'seat', 'burned', 'press', 'misleading', 'wrong.', 'gun', 'die', 'either', 'birth', 'denied', 'him,', 'party.', '‚Ä¢', 'breaks', 'ground.', 'charles', 'shark', 'others.', 'society', 'develop', 'leadership', 'banks', 'feed', 'accepted', 'quite', 'say,', 'stronghold', 'raises', 'title', 'inner', 'circle', "who've", 'matter', 'fell', 'ill', 'possibly', 'win.', 
#  "they'll", 'proving', 'valley', 'bars', 'missed', 'work,', 'fall.', 'sense', 'leader.', 'minutes', 'ambulance', 'needed', 'E.', 'contest', 'complications', 'screening', 'increasingly', 'lady', 'navy', 'direction', 'conduct', 'yorkers', 'rarely', 'resume', 'midst', 'everything', 'extended', 'hitting', 'forest', 'respond', 'findings', 'CNN', 'struggle', 'word', 'contracted', 'quietly', 'experiencing', 'lisa', 'parole', 'unable', 'Here', 'drew', 'worldwide', 'afternoon,', 'attempted', 'proud', 'allowing', 'sit', 'maine', 'advantage', 'learning', 'employee', 'then,', 'deemed', 'serves', 'distribution', 'child', 'chrissy', 'husband,', 'loss', 'wrote', 'france,', 'greenhouse', 'fraud,', 'voting.', 'sick', 'brand', 'vatican', 'citing', 'strip', 'often', 'alexei', 'recovering', 'nerve', 'vladimir', '$10', 'infrastructure', 'initiatives', 'weigh', 'television', 'separate', 'italy', 'summer,', 'rocket', '545', 'doubled', 'tells', 'aside', 'parent', 'directly', 'survived', 'wednesday.', 'responding', 'analyzed', 'here‚Äôs', 'taste', 'certainly', 'am', 'woman,', 'For', 'challenger', 'accusations', 'britain‚Äôs', 'articles', 'universal', 'dismiss', 'talk', 'dream', 'band', 'blow', 'urgent', 'sale', 'weekly', 'belarusian', 'alexander', 'populous', 'taxes', 'first-ever', 'glitches', 'crisis.', 'roughly', 'update', 'lines', 'courts', '‚Äì', 'breonna', 'raid', 'wildfires', 'destroyed', 'complain', 'taught', 'let', 'fatally', 'televised', 'fine', 'matters', 'ago', 'immunity', 'italian', 'planet', 'participate', 'hall.', 'cancer,', 'happen', 'At', 'attacks.', 'marriage', 'fires', 'famed', 'wine', "california's", 'county,', 'forcing', 'goes', 'lake', 'facts', 'onto', 'affected', 'en', 'deadline', 'clearly', 'pass', 'case.', 'pregnancy', 'scale', 'doors', 'herself', 'ministry', 'analysis:', 'divisions', 'generally', 'impossible', 'simple', "you're", 'teacher', 'emmanuel', 'retailers', 'posts', 'pacific', 'baby', 'sources', 'surpassed', 'january.', 'pounds', 'government‚Äôs', 
'pledge', 'demonstrators', 'three-quarters', 'meaning', 'muslim', 'hair', 'religious', 'complex', 'russia,', 'investigations', 'evangelical', 'GDP', 'shutdowns', 'intentionally', 'supporting', 'regulatory', 'reporters', '"He', 'insurance', 'normal.', 'activists', 'lukashenko', 'briefly', 'that.', 'kinds', 'korea', 'troops', 'age.', 'transfer', 're-election.', 'chaos', 'promises', 'thousand', 'They', 'de', 'vote,', 'presidents', 'limiting', 'tourists', 'forced', "britain's", 'form', 'credit', 'xi', 'burning', 'mounting', 'actions', 'minority', 'highlights', 'dutch', 'australia', 'died,', 'derek', 'kentucky', 'unite', 'roberts', 'words', 'describe', 'german', 'treating', 'nation.', 'command', 'plenty', 'rain', 'awaiting', 'called.', 'encounter', 'april', 'courts.', 'payette', 'checks', '6', "family's", 'coffins', 'colleges', 'commitment', 'headed', 'journey', 'entry', 'extraordinary', 'legacy', 'carry', 'thing', 'march', 'mitt', 'romney', 'slammed', 'offices', 'life,', 'factors', "australia's", 'campus', 'painting', 'square', 'delivering', 'together.', 'tapped', 'constitutional', 'addition', 'interfere', 'NFL', 'connection', 'britain', 'unless', 'jersey', 'anxious', 'lack', 'cloud', 'jonathan', 'comedy', 'completing', 'sweep', 'enforcement', 'addressed', 'airs', 'fundamental', 'cars', 'parking', 'crowd', 'afghan', 'hangs', 'for.', 'politician', 'immediately', 'edition', 'film', 'guidance', 'resource', 'outstanding', 'casting', 'july,', 'cyber', 'else', 'lots', 'disabilities', 'treasury', 'indian', 'Now,', 'yet,', 'prestigious', 'jim', 'worked', 'risk.', 'ambitious', 'unfounded', 'fraud.', 'wish', 'provided', '50,000', 'barr', 'bottle', 'blamed', 'politicians', 'One', 'help.', 'outlook', 'possibility', 'threat', 'tests,', 'positive.', 'trip', 'activist', 'agenda', 'anybody', 'count.', 'populations', 'ultimate', 'hunter', 'sometimes', "we've", 'china.', 'ran', 'man.', 'plane', 'max', 'knowing', 'gulf', 'panel', 'audio', 'flooding', 'city,', 'slowed', 'source', 
'representative', 'championship', 'allegedly', 'smoke', 'covering', 'reforms', "taylor's", 'beirut', 'protects', 'diseases', 'types', 'out.', 'you.', 'commander', 'universities', 'twice', 'supported', 'operating', 'established', 'debt', 'sees', 'deeply', 'coach', 'viewed', 'found,', 'cemetery', '2020,', 'governors', 'week‚Äôs', 'hurricane,', 'center', 'atmosphere', 'deliberately', 'know,', 'analysts', "company's", 'wildfire', "Here's", 'stretch', 'boat', 'graham', 'bureau', 'discarded', 'safer', 'returning', 'weapons', 'resignation', 'conditions', 'deleted', 'exploring', 'efficient', '2019,', 'bin', 'al', 'disposable', 'boston', 'creative', '2019', 'fix', 'roles', 'sites', 'register', 'catching', 'marking', 'bold', 'similar', 'fate', 'dangers', 'husband', 'seems', 'aggressive', 'reverse', '75', 'december', 'march.', 'afghanistan', 'greece', 'founder', 'wipes', 'string', 'survivors', 'extreme', 'target', 'classified', 'india', 'sexually', 'her.', '1960s', 'proved', 'promoted', 'emerged', 'defend', 'fire,', 'california,', 'spooky', 'stars', 'successful', 'scientist', 'towns', 'tracking', 'improve', 'initially', 'reported,', 'finds', 'pace', 'refusal', 'throw', 'swept', 'ranks', 'gone', 'encourage', 'screen', 'shifted', 'trump‚Äôs', 'quick', 'cause', 'standing', 'understand', 'stocks', 'hood', 'online.', '24.', 'catholic', 'aboard', 'All', 'racial', 'longest', "court's", 'parliamentary', 'rape', 'art', 'charlie', 'institute', 'killed,', 'CDC', 'feature', 'emotionally', 'importance', 'football', 'began,', 'may.', 'didn‚Äôt', 'brief:', 'features', 'severely', 'draws', 'catch', 'communities', 'minds', 'themselves', 'hours.', 'ship', 'stark', 'blood', 'barely', 'planning', 'patience', 'thin', '45', 'weekend,', 'yes,', 'everyone', 'tonight.', 'live.', 'stickers', 'crew', 'signal', 'amazon,', 'quarter', 'february,', 'archaeologists', 'safely', 'familiar', 'code', 'lets', 'becoming', 'disaster', 'balance,', 'reform', '85-year-old', 'brooklyn', 'comments', 'restrained', 
'heads', 'cruise', 'storage', 'stolen', 'jurors', 'content', 'hole', 'solar', 'cook', 'doubt', 'vaccine,', 'trials', 'metropolitan', 'know:', 'drive-thru', 'announced.', 'sizes', 'surrounded', 'jacob', 'volunteer', 'eric', 'instead.', 'house,', 'baron', 'degree', 'shrinking', 'statements', 'deal.', 'gains', 'camera', 'midwest', 'harry', 'eye', 'Read', 'more:', 'seemingly', 'temporary', 'page', 'struck', 'recession', 'italy,', 'common', 'happy', 'touched', 'far-right', 'piece', 'covid-19', 'blasio', "america's", '@CNNOpinion', 'co-founder', '#SilenceIsNotAnOption', 'podcast:', 'covid-19,', 'homeland', 'resign', 'banning', 'controlled', 'rep.', 'alaska', 'president-elect.', 'johns', 'hopkins', 'closely', 'texas,', 'grim', 'counties', 'arizona', 'project.', 'state‚Äôs', 'stood', 'US.', 'truth', '@donlemon', 'tweets', 'potentially', 'covid-19.', 'chicago', 'course', 'On', 'cuomo', 'thoughts', 'üéß', 'listen:', 'recognize', 'GOP', 'pentagon', 'defense', 'inauguration', '--', 'today‚Äôs', '@DrSanjayGupta', 'McConnell', 'reject', 'montana', 'threatens', 'elderly', 'adults', 'census', 'adviser', 'campaign,', 'acknowledge', 'baseless', 'iPhone', 'mini', 'us,', 'ET/PT', 'passing', 'biden‚Äôs', 'decline', 'suggesting', 'divisive', 'fueled', 'americans,', 'refuse', 'victory,', 'guests', 'üì©', 'inbox', 'daily:', 'today:', 'UK', 'brexit', '@lukemcgee', 'win,', 'wild', 'talking', '@StCollinson', 'tennessee', 'sworn', 'primary', 'shanghai', 'entirety', 'relocated', 'dubbed', '"walking', 'machine."', 'virus,', 'protests,', '@jgriffiths', 'you,', 'W.', 'collapse', 'hurricanes', 'loved', 'ones.', 'board.', 'traveled', 'unlikely', 'CNN‚Äôs', 'labor', 'bernie', '‚ÄúIf', 'brad', 'discusses', '‚ÄúWe', 'statewide', 'confirm', 'Covid-19.', 'lost.', 'champion', '"President', 'placing', '@CillizzaCNN', 'el', 'paso', 'culture', 'results,', 
#  '20,', 'check:', 'baselessly', 'utah', 'india,', 'JUST', 'IN:', 'loeffler', 'perdue', 'traffic', 'navajo', 'all,', 'FDA', 'surrogate', 'McCain', 'cindy', 'regime', "CNN's", 'writes.', 'illinois,', 'academy', 'nationally', 'grown', '@ForecasterEnten', 'revenge', 'morning,', 'first:', 'warren', 'We', 'victory.', 'authorized', 'drugmaker', 'ceremony', 'event.', 'confederate', 'traveling', 'PROJECTION:', '#CNNElection', 'context', 'rampant', 'devastating', 'dominated', 'women,', 'mexico,', 'kong,', 'beijing', 'powers', 'mink', 'stacey', 'abrams', 'historically', 'graduates', 'producing', 'maryland', 'featuring', 'hall,', 'discover', 'check', 'whip', 'protections', 'correspondents', 'seriously', 'election:', 'la', 'adopted', 'fauci,', 'allergy', 'diseases,', 'pro', 'grounded', 'misinformation', 'cal', 'thom', 'dramatically', "democrats'", 'eventually', 'trump,"', 'surprise', 'globally', 'reacts', 'pair', 'recep', 'tayyip', 'erdogan', 'congratulated', 'updates:', 'posing', 'capture', 'runoff', 'performed', '"Jeopardy!"', 'way,', '‚ÄúI', 'can‚Äôt', 'citizens', 'mitch', 'chamber', 'behavior', 'republican,', 'Biden-Harris', "team's", 'jr.', 'illinois', 'florida,', 'via', 'broadcast', 'invalidate', 'prepares', 'administration.', '"What', 'next,', 'dana', 'waves', 'II', '(and', 'iconic', 'photographs', 'prosecutor', 'far.', 'pressing', 'striking', 'gay', 'trusted', 'egypt', 'trebek,', '90%', 'effective,', 'polarized', '@CNNOpinion.', 'urban', 'carson', 'atlanta,', 'recreational', 'marijuana', 'disappearing', 'watched', 'no.', "harris'", '50%', 'confront', 'eli', 'firing', '@JohnKingCNN', 'podcast', 'fastest', 'jon', 'ossoff,', 'books', 'it,"', 'congress.', '‚ÄúIt', 'effectiveness', 'incumbent', 'sewage', 'covid', 'jared', 'k-pop', 'base', 'investigative', 'collins', 'latino', 'differences', 'latinos', 'life-threatening', 'defeat', 'heal', 'america?', 'devoted', '"You', 'color', 'democrat,', 'enten', 'landscape', '...', 'unity', 'kushner', 'biden:', 'favorite', 'fan', 
'"For', 'message.', 'office,', '"When', 'hometown', 'wilmington,', 'Watch', 'Follow', 'harris,', '‚ÄúThis', 'career.', 'tweeted', '‚ÄúYou', 'delaware,', '‚ÄúIt‚Äôs', 'nation‚Äôs', '46th', 'meadows,', 'helps', 'decide', 'meadows', 'know.', 'trajectory', 'suggests.', 'counted,', 'madison', 'records.', 'congressional', 'sarah', 'transgender', 'explains', 'freedom', 'deadlines', 'correct', 'ballot,', 'martha', 'votes,', 'christie', 'edge', 'stands', 'hasn‚Äôt', '‚ñ™Ô∏é', 'nevada', 'carolina', 'alabama', 'focusing', 'cori', 'bush,', 'here.', 'missouri', 'expectations', 'far,', 'breaking', '‚ÄúThe', 'USPS', 'color.', 'what‚Äôs', 'topic', 'shaping', 'unemployment', 'fort', 'department.', 'jump', 'YouTube', 'unpopular', 'oklahoma', 'warnings', 'reelection', 'election-related', 'podcast,', 'wall"', 'labeled', 'legalize', 'jersey,', 'susan', 'dakota', 'delaware', 'updates.', '#CNNelection', 'absentee', 'abortion', 'undecided', 'hawaii', 'nebraska,', 'alexandria', 'Ocasio-Cortez', 'iowa', 'flag', 'colorado', 'proposition', 'lindsey', 'jaime', 'hampshire', 'marjorie', 'LGBTQ', 'houston', 'wait.', 'performance', 'other,', 'here!', 'winning.', 'they‚Äôre', 'america,', 'deborah', 'alarming', 'disinformation?', 'lives,', "night's", 'chancellor', 'man,', 'interactive', '-', 'america‚Äôs', 'economy?', 'again,', 'ant', 'questioning', 'tuesday.', 'voting,', 'park,', 'approaches', 'materials', 'unrest', 'diverse', 'surprised', 'backlash', 'monarchy', '"But', 'civic', 'profound', 'african', 'chapter', 'electors', 'districts', 'favor', 'eta,', 'assailant', 'Day.', 'horrific', 'sentenced', 'questioned', 'players', 'candidates,', 'solo', 'carolina.', '@JohnAvlon', 'immigrant', 'consistently', 'rallies', 'feet', 'opinion', 'focused', 'forgot', 'surge,', 'catastrophic', 'celebrated', 'atlanta', 'citizenship', 'systems', 'wolf', "university's", 'pledging', "saturn's", 'molecule', 'measuring', 'sail', 'terrorist', 'lightning', 'sounds', 'bats', 'naturally', 'little-known', 'automaker', 
'snatched', 'walter', 'wallace', 'pepper', 'spray', '@MaeveReston', '1918', 'worst,', 'flu.', 'scientific', 'tall', 'relationship,', 'alliance,', 'approaches,', 'seize', 'US-Mexico', 'wall,', 'digestive', 'feces.', 'Cities', 'hornets"', 'captured', 'kate', 'rubins', 'clinton', 'interview.', 'run,', 'ventilation', '99%', 'viruses,', 'harvard', 'accepting', 'travelers', 'gloves', 'connery', 'beverages', 'heart', 'origin', 'reportedly', 'administration,', 'memory', 'black,', 'nancy', 'sue', 'ohio,', 'Friday.', 'marijuana.', 'fracking', 'acquired', 'pete', 'buttigieg', 'prior', 'enter', '‚ÄúHe', 'shape', 'danger', 'photographer', 'jinping', 'jeffrey', 'hackers', 'scheme', 'percentage', 'competitive', 'racing', 'right,', 'pivotal', 'iowa,', 'fatal', 'place,', 'brett', 'hoped', 'pre-election', 'struggles', 'handing', 'milky', 'facebook.', 'czech', 'unexpected', 'surpassing', 'ballot.', 'xinjiang,', 'In', 'lasting', 'transmit', 'answer', 'speaker', 'pelosi', 'consequential', 'If', 'adds.', 'pres.', 'battlegrounds', 'near-Earth', 'fit', 'speaks', 'profit', '14%', 'jr.,', 'scandal', 'contentious', 'facebook,', 'perhaps', '‚Äúnot', 'aren‚Äôt', 'hispanic', 'Here‚Äôs', 'theater', 'first-time', 'google,', 'decapitated', 'he‚Äôs', 'we‚Äôre', 'conclude', 'dodgers', 'kavanaugh', 'jet', 'substantial', 'partial', 'CIA', 'disputes', 'tomorrow', 'upper', 'canceled', 
#  'cultural', 'war,', 'all-new', '#FirstLadies,', 'soccer', 'turnout.', 'sacha', 'tampa', 'rays', 'firefighters', 'younger', 'point,', 'i‚Äôm', 'folks', 'reaction', 'classic', 'likes', 'apologized', 'don‚Äôt', 'pandemic:', 'audience', 'Biden.', 'latin', 'he‚Äôll', 'solid', 'roosevelt', 'comply', 'emily', 'chronic', '@ddale8', 'magazine', 'columnist', 'firmly', 'uyghurs', 'motivation', 'justify', 'shield', 'republicans.', 'vietnam', 'stance', 'article', 'we‚Äôve', 'peak', 'paper', 'republicans,', 'boycott', '14-year-old', '$25,000', 'Anika', "chebrolu's", 'invention', 'in-silico', 'methodology', 'selectively', 'bind', 'protein', 'SARS-CoV-2', 'pence‚Äôs', 'challenger,', 'last-ditch', 'policy.', 'located', 'insufficient', 'render', 'ineffective,', 'specialist', 'peru,', 'phrase', 'lesley', 'eleanor', 'elevate', 'instead,', 'African-American', 'diet', 'tea', 'pressure,', 'bigger', 'trove', 'ancient', 'women.', '4.', 'randi', 'kaye', 'founded', 'disasters', 'anger', 'luck', 'year‚Äôs', 'racially', 'francisco', 'stadium', 'it,‚Äù', '‚ÄúThey', 'rule', '‚ÄúA', 'McConnell,', 'editors', 'enters', 'With', 'plotting', 'predominantly', '60%', 'pushed', 'writer', '#Debates2020', 'topics', 'times,', "thursday's", 'colorado,', 'typical', 'customs', 'undocumented', 'lowest', 'won‚Äôt', 'lagos', 'official,', 'worrying', 'transformed', 'scattershot', 'insult', 'products.', 'abruptly', 'zion', 'suburbs', 'isn‚Äôt', 'exploded', 'epsilon', 'referred', '6%', 'paint', 'pursue', 'worsened', 'openly', 'boxes', 'harnessing', 'agents', 'kansas', 'lay', 'person?', 'kim', 'trumpism', 'suburbs.', 'periods', 'Thursday.', 'tries', 'repeated', 'vowed', 'reagan', 'See', 'guide', 'all.', 'averaging', 'remarkable', 'interviews', 'QAnon,', 'likelihood', 'songs', 'electricity', 'tallies', 'live,', 'europe.', 'nick', 'saban', 'she‚Äôs', 'heated', 'president‚Äôs', 'giuliani,', 'films', 'dueling', 'ABC', 'complicated', '‚Äúa', 'you‚Äôre', 'editor,', 'economist', '‚ÄúI‚Äôm', 'gaming', 'them.‚Äù', 
'barrett‚Äôs', 'there‚Äôs', 'news:', 'flipped', 'county:', 'que', 'un', 'del', 'presidente', 'talked', 'percent', 'daily.', 'china‚Äôs', 'hundred', 'prompting', 'majority.', 'FTI', 'scenes', 'cracking', 'world‚Äôs', 'Opinion', 'President', 'threw', 'georgia‚Äôs', 'armistice', 'aung', 'suu', 'harris‚Äôs', 'paths', 'lael', 'party‚Äôs', 'defeat,', 'como', 'una', 'y', 'para', 'errors', 'maricopa', 'latest:', 'swung', 'unsubstantiated', 'why?', 'europe‚Äôs', 'defeated', 'weakness', 'democrats‚Äô', 'france‚Äôs', 'democrats.', 'habitable', 'whatever', 'asks', 'live:', 'wasn‚Äôt', 'early,', 'you‚Äôd', 'decisive', 'falsehoods', '#Election2020', 'plumbing', 'partisan', 'midwestern', 'Miami-Dade', 'people‚Äôs', 'Eastern', 'demographic', 'india‚Äôs', '#ElectionDay', 'newsletter', '#election2020', 'editor', 'administration‚Äôs', 'africa', 'implementing', 'writes,', 'Times/Siena', '‚Äúlong', 'working-class', 'forecasting', 'üó≥', 'thursday‚Äôs', 'fridge', 'college-educated', 'dollar', 'Find', 'üìå', 'africa‚Äôs', 'monopoly', 'peru', 'value', 'bound', 'inflame', '@1843mag', 'assesses', '‚ÄúMoney', 'talks‚Äù', '@AnneMcElvoy', 'asks‚Äù', 'roll-out', '@tomstandage', 'ahead‚Äù', '‚ÄúChecks', 'balance‚Äù', '-@jonfasman', '-@gelliottmorris', '-@DSORennie', 'intelligence‚Äù', 'booze', '‚ÄúBabbage‚Äù', 'rigorous', 'fair-minded', 'intelligence‚Äù:', "economist's", 'circulation"', 'involve?', 'OpenRAN', "kelly's", 'all-Democratic', 'trot', 'front-runners', 'sistine', '@projectlincoln', '@NHJennifer', '@KnCukier', '@T_Wainwright', 'galicia', 'nigel', 'mr', 'intelligence‚Äù,', 'editor-in-chief', 'hub', '@jonfasman', 'tigray', 'economist‚Äôs', 'ahmed', 'treasuries', 'mapped', '‚ÄúDemocracies', 'partisan.', 'unique...But', 'tribalised.‚Äù', 'partisanship', '@BoFrankln', 'voted?', 'ethiopia‚Äôs', 'fake.', '"Babbage"', 'wandered', "philadelphia's", '‚Äúworking', 'hotel‚Äù']

# Lookup tables between each vocabulary word and its position in words_list.
word_to_index = {word: idx for idx, word in enumerate(words_list)}
index_to_word = {idx: word for idx, word in enumerate(words_list)}
print('there are', len(words_list), 'words')
print(words_list)
words_set = set(words_list)

# Run every tweet through filter_words and keep the filtered version only when
# it is still longer than 80% of the unfiltered tweet's length.
print("BEFORE filtering, there were", len(tweets), "tweets")
filtered_pairs = ((raw, filter_words(raw, words_set)) for raw in tweets)
new_tweets = [kept for raw, kept in filtered_pairs if len(kept) > 0.8 * len(raw)]
tweets = new_tweets

# Drop tweets that are not longer than MIN_TWEET_LENGTH.
tweets = list(filter(lambda kept: len(kept) > MIN_TWEET_LENGTH, tweets))

print("AFTER filtering, there are", len(tweets), "tweets")




def main():
    """Placeholder entry point for this cell; it intentionally does nothing.

    Swap the body out temporarily when something needs to be tested.
    """
    return None


if __name__ == '__main__':
    main()


there are 5414 words
BEFORE filtering, there were 20001 tweets
AFTER filtering, there are 8104 tweets


In [26]:
# Load the tweets stored in '/content/processed_trump_test_data'. The rest of
# this cell keeps only the tweets whose first 5 words are all in the
# vocabulary (words_set) and writes the survivors to '/content/out'.
tweets_list = get_tweets_list('/content/processed_trump_test_data')
def first_5_words_in_dict(text):
    """Tell whether a tweet opens with 5 in-vocabulary words.

    The text is split on single spaces. Returns False when that yields fewer
    than 6 tokens; otherwise returns True only if each of the first 5 tokens
    is a member of the module-level ``words_set``.
    """
    tokens = text.split(' ')
    if len(tokens) <= 5:
        return False
    return all(token in words_set for token in tokens[:5])
# Keep only the tweets accepted by first_5_words_in_dict.
tweets_list = [t for t in tweets_list if first_5_words_in_dict(t)]
# Write the surviving tweets one per line. The `with` block guarantees the
# file is closed even if one of the writes raises.
with open('/content/out', 'w') as f:
    for t in tweets_list:
        f.write(t + '\n')
    print('asdf')


asdf


Train Model

In [32]:
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Bidirectional, BatchNormalization, Activation
from keras.layers import Dropout
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.utils.data_utils import get_file
import random
import io
from google.colab import files
!pip3 install truecase
import truecase

INPUT_LENGTH = 5  # number of consecutive words given to the model as input; the model predicts the word that follows them
GENERATED_TWEET_LENGTH = 20 # number of words generated after the seed words


def sample(preds, temperature=1.0):
    """Randomly pick an index from a probability array, with temperature scaling.

    The probabilities are converted to float64, their logarithms are divided
    by ``temperature``, and the result is exponentiated and renormalised.
    One multinomial draw is then taken from that distribution and the index
    of the drawn outcome is returned.
    """
    scaled_logs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logs)
    distribution = weights / np.sum(weights)
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)

def get_truecase(sentence):
    """Truecase a sentence while preserving words with unusual capitalization.

    The sentence is passed through ``truecase.get_true_case`` and the raw
    result is printed. Then, for every space-separated word of the input that
    is_normal_capitalization rejects (e.g. 'GREAT' or '@FoxNews'), the first
    truecased token equal to either its lowercase form or its
    first-letter-capitalized form is replaced by the original word.

    Returns the resulting tokens joined with single spaces.
    """
    old_words = sentence.split(' ')
    new_words = truecase.get_true_case(sentence).split(' ')
    print('raw truecase:', ' '.join(new_words))
    for old_word in old_words:
        # Empty tokens (from leading, trailing or doubled spaces) carry no
        # casing to restore, and taking their first character below would
        # raise IndexError, so skip them along with normally-cased words.
        if not old_word or is_normal_capitalization(old_word):
            continue
        lowered = old_word.lower()
        capitalized = lowered[0].upper() + lowered[1:]
        if lowered in new_words:
            new_words[new_words.index(lowered)] = old_word
        elif capitalized in new_words:
            new_words[new_words.index(capitalized)] = old_word
    # NOTE: matching is done by value rather than by position because the
    # truecased token list is not guaranteed to line up index-for-index with
    # old_words (the truecased text can be split into a different number of
    # tokens).
    return ' '.join(new_words)

def on_epoch_end(epoch, _, data, model):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)
    for _ in range(2):     # use 10 different tweets as samples
        tweet = np.random.choice(data) # select random tweet
        start_index = 0

        for diversity in [0.2, 0.4, 0.6, 1.0]:
        # for diversity in [0.1, 0.2, 0.3, 0.4]:
        # for diversity in [0.3, 0.4, 0.5]:
            print('----- diversity:', diversity)

            generated = ''
            sentence = tweet.split(' ')[start_index: start_index + INPUT_LENGTH]
            generated += ' '.join(sentence)
            print('----- Generating with seed: "' + ' '.join(sentence) + '"')
            # sys.stdout.write(generated)

            for i in range(GENERATED_TWEET_LENGTH):
                x_pred = np.zeros((1, INPUT_LENGTH, len(words_list)))
                for t, word in enumerate(sentence):
                    x_pred[0, t, word_to_index[word]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_word = index_to_word[next_index]
                generated += ' ' + next_word
                sentence = sentence[1:] + [next_word]

                # sys.stdout.write(next_word)
                # sys.stdout.flush()
            print(generated)
            print('with truecase:')
            # preserve
            print(get_truecase(generated))
            print()
    # save and download the model
    model.save('/content/model')
    !zip -r /content/model.zip /content/model
    files.download('/content/model.zip')


def train_from_data(data, train_limit=None):
    # convert the raw tweets list to input and output
    # input is equal to INPUT_LENGTH characters, output is a single character
    if train_limit:
        data = data[:train_limit]
    sentences = []
    next_words = []
    for tweet in data:
        tweet_words = tweet.split(' ')
        for i in range(0, len(tweet_words) - INPUT_LENGTH):
            sentences.append(tweet_words[i: i + INPUT_LENGTH])
            next_words.append(tweet_words[i + INPUT_LENGTH])
    print('# training samples:', len(sentences))
    # for i in range(10):
    #     print(sentences[i],'->',next_words[i])

    # vectorize the data
    print('Vectorization...')
    x = np.zeros((len(sentences), INPUT_LENGTH, len(words_list)), dtype=np.bool)
    y = np.zeros((len(sentences), len(words_list)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, word in enumerate(sentence):
            x[i, t, word_to_index[word]] = 1
        y[i, word_to_index[next_words[i]]] = 1

    # build the model
    print('Build model...')
    model = Sequential()
    model.add(LSTM(128, input_shape=(INPUT_LENGTH, len(words_list))))
    # model.add(LSTM(len(VALID_CHARS) * 7, input_shape=(INPUT_LENGTH, len(VALID_CHARS))))
    
    model.add(BatchNormalization())
    model.add(Activation('selu'))

    model.add(Dense(128))
    model.add(Activation('selu'))

    # model.add(Dense(len(VALID_CHARS)*4))
    # model.add(BatchNormalization())
    # model.add(Activation('selu'))

    # model.add(Bidirectional(LSTM(128), input_shape=(INPUT_LENGTH, len(VALID_CHARS))))
    model.add(Dense(len(words_list), activation='softmax'))

    # optimizer = RMSprop(lr=0.01)
    optimizer = Adam()
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    epochs = 10
    
    print_callback = LambdaCallback(on_epoch_end=lambda a, b: on_epoch_end(a, b, data, model))

    # train the model
    model.fit(x, y,
            epochs=epochs,
            callbacks=[print_callback]
            )

    # save and download the model
    model.save('/content/model')
    !zip -r /content/model.zip /content/model
    files.download('/content/model.zip')

def main():
    """Print how many tweets are available, then train the model on all of them."""
    tweet_count = len(tweets)
    print("number of tweets:", tweet_count)
    train_from_data(tweets)


if __name__ == '__main__':
    main()

Collecting truecase
[?25l  Downloading https://files.pythonhosted.org/packages/07/00/061ba5033c2b8632174946d2664c881d1283a321258cb47da1f79721adb6/truecase-0.0.11-py3-none-any.whl (28.4MB)
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28.4MB 154kB/s 
Installing collected packages: truecase
Successfully installed truecase-0.0.11
number of tweets: 8104
# training samples: 106272
Vectorization...
Build model...
Epoch 1/10
----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "tomorrow on the @MissUniverse facebook"
tomorrow on the @MissUniverse facebook is the last night on fox news morning at 9 P.M. on fox news morning at 7:00 P.M. enjoy! @FoxNews
with truecase:
raw truecase: Tomorrow on the @Missuniverse Facebook is the last night on Fox news morning at 9 P. M. On Fox news morning at 7:00 P. M. enjoy! @Foxnews
Tomorrow on the @Missuniverse Facebook is the last night on Fox news morning at 9 P. M

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 2/10
----- Generating text after Epoch: 1
----- diversity: 0.2
----- Generating with seed: "credit the bloomberg administration for"
credit the bloomberg administration for the american people have been treated badly in the first responders and the truth last night. they are not going
with truecase:
raw truecase: Credit the Bloomberg administration for the American people have been treated badly in the first responders and the truth last night. They are not going
Credit the Bloomberg administration for the American people have been treated badly in the first responders and the truth last night. They are not going

----- diversity: 0.4
----- Generating with seed: "credit the bloomberg administration for"
credit the bloomberg administration for the massive oil that the election. we can do better. we need to be the right direction. don't let you
with truecase:
raw truecase: Credit the Bloomberg administration for the massive oil that the election. We can do better. We need to be the

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 3/10
----- Generating text after Epoch: 2
----- diversity: 0.2
----- Generating with seed: "has done a GREAT job"
has done a GREAT job as we are doing a GREAT job as a job as the as you are doing a great job for
with truecase:
raw truecase: Has done a great job as we are doing a great job as a job as the as you are doing a great job for
Has done a GREAT job as we are doing a GREAT job as a job as the as you are doing a great job for

----- diversity: 0.4
----- Generating with seed: "has done a GREAT job"
has done a GREAT job in the U.S. than the best of south carolina! #MakeAmericaGreatAgain #Trump2016 #MakeAmericaGreatAgain #Trump2016 #SuperTuesday #MakeAmericaGreatAgain #Trump2016 #MakeAmericaGreatAgain #SuperTuesday #Trump2016 #MakeAmericaGreatAgain
with truecase:
raw truecase: Has done a great job in the U. S. than the best of South Carolina! #Makeamericagreatagain #Trump2016 #Makeamericagreatagain #Trump2016 #Supertuesday #Makeamericagreatagain #Trump2016 #Makeamericagreatag

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 4/10
----- Generating text after Epoch: 3
----- diversity: 0.2
----- Generating with seed: "great reviews on the new"
great reviews on the new hampshire and the very dishonest media is that they are not even close! the other people of the U.S. is
with truecase:
raw truecase: Great reviews on the New Hampshire and the very dishonest media is that they are not even close! the other people of the U. S. is
Great reviews on the New Hampshire and the very dishonest media is that they are not even close! the other people of the U. S. is

----- diversity: 0.4
----- Generating with seed: "great reviews on the new"
great reviews on the new celebrity apprentice - but will be great! see you soon! #MakeAmericaGreatAgain #Trump2016 #MakeAmericaGreatAgain #Trump2016 #IACaucus #FITN #FITN #NHPrimary #MakeAmericaGreatAgain #Trump2016
with truecase:
raw truecase: Great reviews on the new celebrity apprentice- but will be great! see you soon! #Makeamericagreatagain #Trump2016 #Makeamericagreatagain

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 5/10
----- Generating text after Epoch: 4
----- diversity: 0.2
----- Generating with seed: "my @gretawire interview re: the"
my @gretawire interview re: the most followers. we are holding up over the next year. the year. I love steve new hotel. it's important to
with truecase:
raw truecase: My @Gretawire interview re: the most followers. We are holding up over the next year. the year. I love Steve new hotel. It's important to
My @Gretawire interview re: the most followers. We are holding up over the next year. the year. I love Steve new hotel. It's important to

----- diversity: 0.4
----- Generating with seed: "my @gretawire interview re: the"
my @gretawire interview re: the most competitive economy in the world. it is holding up next year. the year. let‚Äôs make america great again! you
with truecase:
raw truecase: My @Gretawire interview re: the most competitive economy in the world. It is holding up next year. the year. Let ‚Äô s make America great again! you
My @Gretawire int

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 6/10
----- Generating text after Epoch: 5
----- diversity: 0.2
----- Generating with seed: "the radical left dems are"
the radical left dems are working hard, but THE PEOPLE are unable to get tough with china. china &amp; then now! #MAGAüá∫üá∏ in june &amp;
with truecase:
raw truecase: The radical left Dems are working hard, but the people are unable to get tough with China. China& then now! #Maga üá∫ üá∏ in June&
The radical left Dems are working hard, but THE PEOPLE are unable to get tough with China. China& then now! #Maga üá∫ üá∏ in June&

----- diversity: 0.4
----- Generating with seed: "the radical left dems are"
the radical left dems are suffering badly. we are getting dumber pundits on T.V. hard to watch, zero talent! @CNN now being I in the
with truecase:
raw truecase: The radical left Dems are suffering badly. We are getting dumber pundits on T. V. hard to watch, zero talent! @Cnn now being I in the
The radical left Dems are suffering badly. We are getting dumber 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 7/10
----- Generating text after Epoch: 6
----- diversity: 0.2
----- Generating with seed: "it‚Äôs thursday. how much has"
it‚Äôs thursday. how much has OPEC did not treated me winning the debate. so much he is doing approval rating polls. shows planned for it
with truecase:
raw truecase: It ‚Äô s Thursday. how much has OPEC did not treated me winning the debate. so much he is doing approval rating polls. shows planned for it
It ‚Äô s Thursday. how much has OPEC did not treated me winning the debate. so much he is doing approval rating polls. shows planned for it

----- diversity: 0.4
----- Generating with seed: "it‚Äôs thursday. how much has"
it‚Äôs thursday. how much has OPEC did not treated the election against our very fair and fair beautiful. judge apprentice. great golf hotels in no.
with truecase:
raw truecase: It ‚Äô s Thursday. how much has OPEC did not treated the election against our very fair and fair beautiful. Judge apprentice. great golf hotels in no.
It ‚Äô s Thur

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 8/10
----- Generating text after Epoch: 7
----- diversity: 0.2
----- Generating with seed: "my wonderful son, eric, will"
my wonderful son, eric, will no longer be allowed to raise the day for our economy. we are making us strong and rich again. he
with truecase:
raw truecase: My wonderful son, Eric, will no longer be allowed to raise the day for our economy. We are making us strong and rich again. He
My wonderful son, Eric, will no longer be allowed to raise the day for our economy. We are making us strong and rich again. He

----- diversity: 0.4
----- Generating with seed: "my wonderful son, eric, will"
my wonderful son, eric, will no longer be allowed to raise the energy 4th july with a big crowd expected. it's all talk action. our
with truecase:
raw truecase: My wonderful son, Eric, will no longer be allowed to raise the energy 4TH July with a big crowd expected. It's all talk action. Our
My wonderful son, Eric, will no longer be allowed to raise the energy 4TH July with a bi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 9/10
----- Generating text after Epoch: 8
----- diversity: 0.2
----- Generating with seed: "read about last night's apprentice"
read about last night's apprentice on entertainment weekly comfort @GolfChannel at @TrumpTowerNY. prices open. don't to those for have been asking for weeks for USMC
with truecase:
raw truecase: Read about last night's apprentice on entertainment weekly comfort @Golfchannel at @Trumptowerny. prices open. Don't to those for have been asking for weeks for Usmc
Read about last night's apprentice on entertainment weekly comfort @Golfchannel at @Trumptowerny. prices open. Don't to those for have been asking for weeks for USMC

----- diversity: 0.4
----- Generating with seed: "read about last night's apprentice"
read about last night's apprentice on entertainment weekly comfort @GolfChannel at @CBSNews morning. enjoy! MAKE AMERICA GREAT AGAIN! TRUMP donald J. this is holding up
with truecase:
raw truecase: Read about last night's apprentice on entertainment we

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 10/10
----- Generating text after Epoch: 9
----- diversity: 0.2
----- Generating with seed: "young entrepreneurs ‚Äì always remember"
young entrepreneurs ‚Äì always remember in negotiations that sometimes the best deal you make is the right result. they don't let the all down, it's
with truecase:
raw truecase: Young entrepreneurs ‚Äì always remember in negotiations that sometimes the best deal you make is the right result. They don't let the all down, it's
Young entrepreneurs ‚Äì always remember in negotiations that sometimes the best deal you make is the right result. They don't let the all down, it's

----- diversity: 0.4
----- Generating with seed: "young entrepreneurs ‚Äì always remember"
young entrepreneurs ‚Äì always remember everything we just had wasting money on building up while many jobs. that were would not want to run for
with truecase:
raw truecase: Young entrepreneurs ‚Äì always remember everything we just had wasting money on building up while many jobs. that were

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:tensorflow:Assets written to: /content/model/assets
updating: content/model/ (stored 0%)
updating: content/model/saved_model.pb (deflated 89%)
updating: content/model/variables/ (stored 0%)
updating: content/model/variables/variables.index (deflated 65%)
updating: content/model/variables/variables.data-00000-of-00001 (deflated 9%)
updating: content/model/assets/ (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Code to generate tweets after the model is trained

In [33]:
from tensorflow import keras


# Number of tweets to generate. NOTE(review): no active code in this cell reads
# this value; generation currently produces one tweet per seed line instead.
NUM_TWEETS_TO_GENERATE = 100
# Sampling temperature passed to sample() when picking each next word.
TEMPERATURE = 0.7

def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    Args:
        preds: 1-D array-like of non-negative weights (typically a softmax
            output). Entries equal to zero are never selected.
        temperature: Scaling applied to the log-probabilities before
            re-normalising. Values below 1 sharpen the distribution, values
            above 1 flatten it. A temperature of exactly 0 selects the most
            likely index deterministically (greedy decoding).

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by a zero temperature is undefined; its limit is greedy argmax.
    if temperature == 0:
        return np.argmax(preds)

    # log(0) legitimately yields -inf (probability 0 after exp), so silence the
    # divide-by-zero warning instead of letting it spam the output.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Shift by the maximum before exponentiating: the softmax is mathematically
    # unchanged, but at very small temperatures this stops every term from
    # underflowing to 0, which would otherwise produce 0/0 = NaN.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    probabilities = exp_preds / np.sum(exp_preds)

    # One multinomial draw gives a one-hot vector; argmax recovers its index.
    probas = np.random.multinomial(1, probabilities, 1)
    return np.argmax(probas)

# --- LOAD THE MODEL --- #
# `!` is an IPython/Colab shell escape, so the unzip line below only works when
# this cell is run inside a notebook, not as a plain Python script.
# Earlier variant without -o (unzip then prompts before overwriting files):
# !unzip /content/model.zip
# -o overwrites already-extracted files without prompting, so the cell can be
# re-run non-interactively.
!unzip -o /content/model.zip
# The archive stores its entries under content/model/ (see the unzip log in the
# cell output), so extracting it -- presumably with /content as the working
# directory -- produces the nested /content/content/model path loaded here.
model = keras.models.load_model('/content/content/model')

# Number of words appended to each seed.
GENERATED_TWEET_LENGTH = 30
# Number of seed words taken from each line and fed to the model per step.
INPUT_LENGTH = 5

# Read all seed lines first. The context manager guarantees the handle is
# closed, and reading before opening the output file means a missing input
# no longer truncates a previous /content/model-output.txt.
with open('/content/out', 'r') as f2:
    lines = f2.read().split('\n')

# Generate one tweet per seed line; each tweet is printed and written to the
# output file on its own line. The file is closed even if generation raises.
with open('/content/model-output.txt', 'w') as f:
    for tweet in lines:
        # A blank line (e.g. the one produced by a trailing newline) carries
        # no seed words, so there is nothing to generate from.
        if not tweet.strip():
            continue

        start_index = 0
        sentence = tweet.split(' ')[start_index: start_index + INPUT_LENGTH]
        generated_words = list(sentence)
        print('----- Generating with seed: "' + ' '.join(sentence) + '"')

        for _ in range(GENERATED_TWEET_LENGTH):
            # One-hot encode the current window: (1, INPUT_LENGTH, vocab size).
            # Positions beyond a short seed stay all-zero.
            x_pred = np.zeros((1, INPUT_LENGTH, len(words_list)))
            for t, word in enumerate(sentence):
                x_pred[0, t, word_to_index[word]] = 1.

            # predict() returns a batch; take the single row for our one input.
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, TEMPERATURE)
            next_word = index_to_word[next_index]
            generated_words.append(next_word)
            # Slide the window: drop the oldest word, append the new one.
            sentence = sentence[1:] + [next_word]

        generated = ' '.join(generated_words)
        print(generated)
        f.write(generated + '\n')
        print()


Archive:  /content/model.zip
  inflating: content/model/saved_model.pb  
  inflating: content/model/variables/variables.index  
  inflating: content/model/variables/variables.data-00000-of-00001  
----- Generating with seed: "georgia won‚Äôt let us look"
georgia won‚Äôt let us look like a horrible times. OPEC continues to rip us off. not worth new leadership worth your half enthusiasm way OPEC ripping us at our economy and again. it is ripping

----- Generating with seed: "stock market getting very close"
stock market getting very close to @MittRomney tonight. the deal of american people as to fix its own problems, of which there are many, and most well received survive at CPAC next two weeks @BarackObama

----- Generating with seed: "the radical left democrats, working"
the radical left democrats, working and fix this. often just like with an residential $5 million order to robbing us at our economy never seen before! level in deficit has topped $1T for a year

----- Generating with s