In [3]:
#
# This script will walk through all the tweet id files and
# hydrate them with twarc. The line oriented JSON files will
# be placed right next to each tweet id file.
#
# Note: you will need to install twarc, tqdm, and run twarc configure
# from the command line to tell it your Twitter API keys.
#
# Special thanks to Github users edsu and SamSamhuns for contributing to this file. This file was repurposed from our other
# data repository on COVID-19 related tweets : https://github.com/echen102/COVID-19-TweetIDs
#

import gzip
import json

from tqdm import tqdm
from twarc import Twarc
from pathlib import Path

twarc = Twarc()
data_dirs = ['C:\\Users\\Afnan Anwar\\Desktop\\SPROJ\\Cleaning\\']


def main():
    for data_dir in data_dirs:
       for path in Path(data_dir).iterdir():
            if path.name.endswith('.txt'):
                hydrate(path)


def _reader_generator(reader):
    b = reader(1024 * 1024)
    while b:
        yield b
        b = reader(1024 * 1024)


def raw_newline_count(fname):
    """
    Counts number of lines in file
    """
    f = open(fname, 'rb')
    f_gen = _reader_generator(f.raw.read)
    return sum(buf.count(b'\n') for buf in f_gen)


def hydrate(id_file):
    print('hydrating {}'.format(id_file))

    gzip_path = id_file.with_suffix('.jsonl.gz')
    if gzip_path.is_file():
        print('skipping json file already exists: {}'.format(gzip_path))
        return

    num_ids = raw_newline_count(id_file)

    with gzip.open(gzip_path, 'w') as output:
        with tqdm(total=num_ids) as pbar:
            for tweet in twarc.hydrate(id_file.open()):
                output.write(json.dumps(tweet).encode('utf8') + b"\n")
                pbar.update(1)


if __name__ == "__main__":
    main()

  0%|                                                                                        | 0/10001 [00:00<?, ?it/s]

hydrating C:\Users\Afnan Anwar\Desktop\SPROJ\Cleaning\target.txt


 85%|█████████████████████████████████████████████████████████████████▊           | 8548/10001 [02:58<00:30, 47.93it/s]


In [1]:
# use this code after alteration to convert your downloaded tweet id files to a manageable number of ids

import pandas as pd
import os
import glob

#path = os.getcwd()

humza = pd.read_csv("C:\\Users\\Afnan Anwar\\Desktop\\SPROJ\\Cleaning\\2020-11\\Election.txt")
humza.columns = ['id'] 


In [2]:
target = list(humza.loc[:10000,'id'])
os.chdir("C:\\Users\\Afnan Anwar\\Desktop\\SPROJ\\Cleaning\\2020-11")

new_file = open('target.txt','w')

for id in target:
    new_file.write(str(id)+"\n")
    
new_file.close()
#len(humza)

In [4]:
# Run this after running the second cell which hydrated the ids. Use the directory named based on folder name where you stored id file. 
# Format for directory is YYYY-MM

path = 'C:\\Users\\Afnan Anwar\\Desktop\\SPROJ\\Cleaning'
all_files = glob.glob(path + "/*.gz")

li = []

for filename in all_files:
    df = pd.read_json(filename,compression='infer',lines=True)
    li.append(df)

data = pd.concat(li, axis=0, ignore_index=True)

In [5]:
import numpy as np
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import glob
import os
from pycorenlp import StanfordCoreNLP
import matplotlib.pyplot as plt

# The keywords used to extract this data were Trump, Biden and elections2020

In [6]:
# Checking how many unique users
len(data.id.unique())

8548

In [7]:
data.created_at = pd.to_datetime(data.created_at)

In [55]:
#Litchman's key : social unrest
temp_frame = data.copy()
temp_frame['full_text'] = temp_frame['full_text'].str.lower() 
unrest = temp_frame[temp_frame.full_text.str.contains('unrest')]
protest = temp_frame[temp_frame.full_text.str.contains('protest')]
violence = temp_frame[temp_frame.full_text.str.contains('violence')]
demonstration = temp_frame[temp_frame.full_text.str.contains('demonstration')]
turmoil = temp_frame[temp_frame.full_text.str.contains('turmoil')]
outcry = temp_frame[temp_frame.full_text.str.contains('outcry')]
riot = temp_frame[temp_frame.full_text.str.contains('riot')]
revolt = temp_frame[temp_frame.full_text.str.contains('revolt')]
Floyd = temp_frame[temp_frame.full_text.str.contains('floyd')]
George  = temp_frame[temp_frame.full_text.str.contains('George')]
immigrants = temp_frame[temp_frame.full_text.str.contains('immigrant')]
muslims = temp_frame[temp_frame.full_text.str.contains('muslim')]
protest  = temp_frame[temp_frame.full_text.str.contains('protest')]
BLM = temp_frame[temp_frame.full_text.str.contains('BLM')]

 


#print(f"The number of tweets containing unrest are {len(unrest)}, The number of tweets containing protest are {len(protest)}, The number of tweets containing demonstration are {len(demonstration)}, The number of tweets containing turmoil are {len(turmoil)}, The number of tweets containing outcry are {len(outcry)}, The number of tweets containing riot are {len(riot)}, The number of tweets containing revolt are {len(revolt)}")
unrest =  len(unrest)+ len(protest)+ len(demonstration)+ len(turmoil) + len(outcry) + len(riot) + len(revolt)+ len(Floyd)+len(George)+len(immigrants)+len(muslims)+len(protest)+len(BLM)

unrest

topicdominance = (unrest/8548)*100
#topicdominance

#Litchman's key: Mid-term gains
#temp_frame['full_text'] = temp_frame['full_text'].str.lower() 

democrats = temp_frame[temp_frame.full_text.str.contains('democrats')]
republicans = temp_frame[temp_frame.full_text.str.contains('republicans')]
seats = temp_frame[temp_frame.full_text.str.contains('seats')]
speaker = temp_frame[temp_frame.full_text.str.contains('Nancy Pelosi')]
congress = temp_frame[temp_frame.full_text.str.contains('congress')]

mtg =  len(democrats)+ len(republicans)+ len(seats)+ len(speaker) + len(congress)


#mtg

topdom2 = (mtg/8548)*100
#topdom2



#Litchman's key: No third party
party = temp_frame[temp_frame.full_text.str.contains('party')]
third = temp_frame[temp_frame.full_text.str.contains('third')]
libertarians = temp_frame[temp_frame.full_text.str.contains('Libertarian')]
greens= temp_frame[temp_frame.full_text.str.contains('Greens')]
bi_partisan = temp_frame[temp_frame.full_text.str.contains('bipartisan')]


ntp =  len(party)+ len(third)+ len(libertarians)+ len(greens) + len(bi_partisan) 
#ntp

ntpdom = (ntp/8548)*100
#ntpdom

# Litchman's key: No primary contest

primary = temp_frame[temp_frame.full_text.str.contains('primary')]
contest = temp_frame[temp_frame.full_text.str.contains('contest')]
rocky_de_la_fuente = temp_frame[temp_frame.full_text.str.contains('Rocky De La Fuente')]
caucuses = temp_frame[temp_frame.full_text.str.contains('Caucuses')]


prim_contest =  len(primary)+ len(contest)+ len(rocky_de_la_fuente)+ len(caucuses)
#prim_contest

pc = (prim_contest/8548)*100
#pc

#Litchman's key: Incumbent seeking re-election
reelection = temp_frame[temp_frame.full_text.str.contains('re elect')]
four_more_years = temp_frame[temp_frame.full_text.str.contains('4 more')]
keep_america_great = temp_frame[temp_frame.full_text.str.contains('keep America')]
MAGA = keep_america_great = temp_frame[temp_frame.full_text.str.contains('MAGA')]
Trump2020 = temp_frame[temp_frame.full_text.str.contains('Trump')]
great = temp_frame[temp_frame.full_text.str.contains('great')]

incumbent =  len(reelection)+ len(four_more_years)+ len(keep_america_great)+ len(MAGA)+ len(Trump2020)+len(great)
#incumbent

reelection = (incumbent/8548)*100
#reelection

#Litchman's key: Strong short-term economy
unemployment = temp_frame[temp_frame.full_text.str.contains('unemployment')]
recession = temp_frame[temp_frame.full_text.str.contains('recession')]
stock = temp_frame[temp_frame.full_text.str.contains('stock')]
economy = temp_frame[temp_frame.full_text.str.contains('economy')]
market = temp_frame[temp_frame.full_text.str.contains('market')]
trade = temp_frame[temp_frame.full_text.str.contains('trade')]

short_term =  len(unemployment)+ len(recession)+ len(stock)+ len(economy)+ len(market)+len(trade)
#short_term

st = (short_term/8548)*100
#st



# Strong long-term economy
GDP = temp_frame[temp_frame.full_text.str.contains('GDP')]
economy = temp_frame[temp_frame.full_text.str.contains('economy')]
growth = temp_frame[temp_frame.full_text.str.contains('growth')]
boom = temp_frame[temp_frame.full_text.str.contains('boom')]
market = temp_frame[temp_frame.full_text.str.contains('market')]
shares = temp_frame[temp_frame.full_text.str.contains('shares')]
stocks = temp_frame[temp_frame.full_text.str.contains('stocks')]

long_term =  len(GDP)+ len(economy)+ len(growth)+ len(boom)+ len(market)+len(shares)+len(stocks)

lt = (long_term/8548)*100

lt
# Major policy change.
executive = temp_frame[temp_frame.full_text.str.contains('executive')]
order = temp_frame[temp_frame.full_text.str.contains('order')]
Obama_care = temp_frame[temp_frame.full_text.str.contains('Obama care')]
fracking  = temp_frame[temp_frame.full_text.str.contains('fracking')]
environment = temp_frame[temp_frame.full_text.str.contains('environment')]
Health_care = temp_frame[temp_frame.full_text.str.contains('Health')]
Care= temp_frame[temp_frame.full_text.str.contains('Care')]
peace = temp_frame[temp_frame.full_text.str.contains('peace')]
funding = temp_frame[temp_frame.full_text.str.contains('funding')]
immigration = temp_frame[temp_frame.full_text.str.contains('immigration')]
Supreme = temp_frame[temp_frame.full_text.str.contains('Supreme Court')]
tax = temp_frame[temp_frame.full_text.str.contains('tax')]

mpc =  len(executive)+ len(order)+ len(Obama_care)+ len(fracking)+ len(environment)+len(Health_care)+len(Care)+len(peace)+len(funding)+len(immigration)+ len(Supreme)+len(tax)
#mpc

mdom = (mpc/8548)*100
#mdom

#No scandal
impeachment = temp_frame[temp_frame.full_text.str.contains('impeachment')]
scandal   = temp_frame[temp_frame.full_text.str.contains('scandal')]
Ukraine = temp_frame[temp_frame.full_text.str.contains('ukraine')]
transcript  = temp_frame[temp_frame.full_text.str.contains('transcript')]
interference    = temp_frame[temp_frame.full_text.str.contains('interference')]
call = temp_frame[temp_frame.full_text.str.contains('call')]
Hunter = temp_frame[temp_frame.full_text.str.contains('Hunter')]
Biden = temp_frame[temp_frame.full_text.str.contains('Biden')]
allegation = temp_frame[temp_frame.full_text.str.contains('allegation')]
impeachable = temp_frame[temp_frame.full_text.str.contains('impeachable')]
aid  = temp_frame[temp_frame.full_text.str.contains('aid')]
whistleblower = temp_frame[temp_frame.full_text.str.contains('whistleblower')]
controversy = temp_frame[temp_frame.full_text.str.contains('controversy')]


ns =  len(impeachment)+len(scandal)+len(Ukraine)+len(transcript)+len(interference)+len(call)+len(Hunter)+len(Biden)+len(allegation)+len(impeachable)+len(aid)+len(whistleblower)+len(controversy)

nsdom = (ns/8548)*100
nsdom
# No foreign/military failure
china = temp_frame[temp_frame.full_text.str.contains('china')]
northkorea = temp_frame[temp_frame.full_text.str.contains('north korea')]
russia   = temp_frame[temp_frame.full_text.str.contains('russia')]
iran = temp_frame[temp_frame.full_text.str.contains('iran')]
bombing = temp_frame[temp_frame.full_text.str.contains('bombing')]
middle_east = temp_frame[temp_frame.full_text.str.contains('middle east')]
WW3 = temp_frame[temp_frame.full_text.str.contains('WW3')]
war = temp_frame[temp_frame.full_text.str.contains('war')]






military = len(china)+len(northkorea)+len(russia)+len(peace)+len(iran)+len(bombing)+len(middle_east)+len(WW3)+len(war)
m = (military/8548)*100
#m

# Military success
nato = temp_frame[temp_frame.full_text.str.contains('nato')]
nobel = temp_frame[temp_frame.full_text.str.contains('Nobel')]
prize = temp_frame[temp_frame.full_text.str.contains('prize')]
peace = temp_frame[temp_frame.full_text.str.contains('peace')]
Israel = temp_frame[temp_frame.full_text.str.contains('Israel')]


ms = len(nato) +len(nobel)+len(prize)+len(peace)+len(Israel)

msdom = (ms/8548)*100
msdom

#Charismatic incumbent
atinfa = temp_frame[temp_frame.full_text.str.contains('antifa')]
Anon = temp_frame[temp_frame.full_text.str.contains('anon')]
Q = temp_frame[temp_frame.full_text.str.contains('Q')]
cult = temp_frame[temp_frame.full_text.str.contains('cult')]
great = temp_frame[temp_frame.full_text.str.contains('great')]
powerful  = temp_frame[temp_frame.full_text.str.contains('powerful')]
leader = temp_frame[temp_frame.full_text.str.contains('leader')]

ch = len(atinfa)+len(Anon)+len(Q)+len(cult)+len(great)+len(powerful)+len(leader)

chdom = (ms/8548)*100
chdom

#Uncharismatic challenger

sleepy = temp_frame[temp_frame.full_text.str.contains('sleepy')]
joe = temp_frame[temp_frame.full_text.str.contains('joe')]
basement = temp_frame[temp_frame.full_text.str.contains('basement')]
Quid_Pro= temp_frame[temp_frame.full_text.str.contains('quid Pro')]
slow = temp_frame[temp_frame.full_text.str.contains('slow')]
creepy = temp_frame[temp_frame.full_text.str.contains('creepy')]
old = temp_frame[temp_frame.full_text.str.contains('old')]
violent = temp_frame[temp_frame.full_text.str.contains('violent')]
socialist = temp_frame[temp_frame.full_text.str.contains('socialist')]
stutter = temp_frame[temp_frame.full_text.str.contains('stutter')]
stammer = temp_frame[temp_frame.full_text.str.contains('stammer')]


unch = len(sleepy)+len(joe)+ len(basement)+len(Quid_Pro)+len(slow)+len(creepy)+len(old)+len(violent)+len(socialist)+len(stutter)+len(stammer)

undom = (unch/8548)*100
undom
 

17.477772578380907

In [56]:
# Segmenting Data based on tweet text containing Trump and Biden 

temp_frame['full_text'] = temp_frame['full_text'].str.lower() 
trump = temp_frame[temp_frame.full_text.str.contains('trump')]
biden = temp_frame[temp_frame.full_text.str.contains('biden')]

print(f"The number of tweets containing Trump are {len(trump)} while those containing Biden are {len(biden)}.")

The number of tweets containing Trump are 4998 while those containing Biden are 2013.


In [57]:
# Grouping based on Unique Users
temp_frame.columns
# Dropping duplicates
temp_frame = temp_frame.drop_duplicates('id')
temp_frame

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,extended_entities,source,in_reply_to_status_id,...,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,lang,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_status
0,2020-11-02 22:59:55+00:00,1323399466439892999,1323399466439892992,rt @realdonaldtrump: https://t.co/cnrzjewxko,False,"[0, 44]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 1323351620948762624, 'id_str...","<a href=""http://twitter.com/download/android"" ...",,...,6390,0,False,False,0.0,und,,,,
1,2020-11-02 22:59:55+00:00,1323399467064872961,1323399467064872960,rt @realdonaldtrump: just landed in traverse c...,False,"[0, 71]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""http://twitter.com/download/iphone"" r...",,...,20829,0,False,False,,en,,,,
2,2020-11-02 22:59:55+00:00,1323399467140292608,1323399467140292608,rt @realdonaldtrump: i prepaid millions of dol...,False,"[0, 75]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,...,20714,0,False,False,,en,,,,
3,2020-11-02 22:59:55+00:00,1323399467379396608,1323399467379396608,rt @realdonaldtrump: as christians throughout ...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,...,25536,0,False,False,,en,,,,
4,2020-11-02 22:59:55+00:00,1323399467408777216,1323399467408777216,rt @realdonaldtrump: just landed in ohio. see ...,False,"[0, 68]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""http://twitter.com/download/android"" ...",,...,13057,0,False,False,,en,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8543,2020-11-02 23:02:30+00:00,1323400119773880320,1323400119773880320,rt @chrisklemens: trump supporters are a cance...,False,"[0, 65]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""http://twitter.com/download/iphone"" r...",,...,1590,0,False,False,,en,,,,
8544,2020-11-02 23:02:30+00:00,1323400119887298561,1323400119887298560,rt @realdonaldtrump: ¡mi #americandreamplan es...,False,"[0, 140]","{'hashtags': [{'text': 'AmericanDreamPlan', 'i...",,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,...,18032,0,False,False,,es,,,,
8545,2020-11-02 23:02:30+00:00,1323400120004739072,1323400120004739072,rt @realdonaldtrump: make america great again!,False,"[0, 46]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""http://twitter.com/download/android"" ...",,...,47083,0,False,False,,en,,,,
8546,2020-11-02 23:02:30+00:00,1323400120013103106,1323400120013103104,rt @realdonaldtrump: my #americandreamplan is ...,False,"[0, 140]","{'hashtags': [{'text': 'AmericanDreamPlan', 'i...",,"<a href=""http://twitter.com/download/iphone"" r...",,...,13776,0,False,False,,en,,,,


In [58]:
import pandas as pd
import numpy as np
import re
import gensim
from gensim.utils import simple_preprocess
# NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter
#from wordcloud import WordCloud
# Plotting

import matplotlib.pyplot as plt

#Taking a look at the data
temp_frame.head()

#Checking the col names
#for col in df.columns: 
#    print(col)

#Checking out the 'text' col
    #print(df['text'])

#Remove '@ User from the 'text' col
    
def remove_users(tweet, pattern1, pattern2):
    r = re.findall(pattern1, tweet)
    for i in r:
        tweet = re.sub(i, '', tweet)

    r = re.findall(pattern2, tweet)
    for i in r:
        tweet = re.sub(i, '', tweet)
    return tweet

df['tidy_tweet'] = df['full_text'].apply(lambda x: remove_users(x,"@ [\w]*", "@[\w]*"))

#Normalization
df['tidy_tweet'] = df['tidy_tweet'].str.lower()
#print(df['tidy_tweet'])

# Remove all the hashtags from the text
def remove_hashtags(tweet, pattern1, pattern2):
    r = re.findall(pattern1, tweet)
    for i in r:
        tweet = re.sub(i, '', tweet)

    r = re.findall(pattern2, tweet)
    for i in r:
        tweet = re.sub(i, '', tweet)
    return tweet

df['tidy_tweet'] = df['tidy_tweet'].apply(lambda x: remove_hashtags(x,"# [\w]*", "#[\w]*"))

# Remove all links & URLs
def remove_links(tweet):
    tweet_no_link = re.sub(r"http\S+", "", tweet)
    return tweet_no_link

df['tidy_tweet'] = df['tidy_tweet'].apply(remove_links)

# Removing Punctuations, Numbers, and Special Characters
df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")

#Removing words with less than 3 characters
df['tidy_tweet'] = df['tidy_tweet'].apply(lambda x:' '.join([w for w in x.split() if len(w)>3]))

#Tokenization
def tokenize(tweet):
    for word in tweet:
        yield(gensim.utils.simple_preprocess(str(word), deacc=True))  

df['tidy_tweet_tokens'] = list(tokenize(df['tidy_tweet']))

# Prepare Stop Words
stop_words = stopwords.words('english')
stop_words.extend(['from', 'https', 'twitter', 'religions', 'pic','twitt',])

# REMOVE STOPWORDS
def remove_stopwords(tweets):
    return [[word for word in simple_preprocess(str(tweet)) if word not in stop_words] for tweet in tweets]

df['tokens_no_stop'] = remove_stopwords(df['tidy_tweet_tokens'])

# REMOVE TWEETS LESS THAN 3 TOKENS
df['length'] = df['tokens_no_stop'].apply(len)
df = df.drop(df[df['length']<3].index)
df = df.drop(['length'], axis=1)

df = df[['created_at','tokens_no_stop','full_text',]]
#df.reset_index(drop=True, inplace=True)


[nltk_data] Downloading package stopwords to C:\Users\Afnan
[nltk_data]     Anwar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
'''
# Calculating Compound Sentiment Scores for Social unrest

# Vader Sentiment function

def sentiment_analyzer_scores(sentence):
    analyser = SentimentIntensityAnalyzer()
    score = analyser.polarity_scores(sentence)
    return score['compound']

# The following two lines will take ages to execute on millions of tweets so I'd suggest run it on a couple 100 with slicing to see if this works for you guys

unrest_sent = unrest['full_text'][:100].map(sentiment_analyzer_scores)
protest_sent = protest['full_text'][:100].map(sentiment_analyzer_scores)
turmoil_sent = turmoil['full_text'][:100].map(sentiment_analyzer_scores)
demonstration_sent = demonstration['full_text'][:100].map(sentiment_analyzer_scores)
outcry_sent = outcry['full_text'][:100].map(sentiment_analyzer_scores)
riot_sent = riot['full_text'][:100].map(sentiment_analyzer_scores)
revolt_sent = revolt['full_text'][:100].map(sentiment_analyzer_scores)
'''
# Calculating Compound Sentiment Scores for Trump and Biden

# Vader Sentiment function

def sentiment_analyzer_scores(sentence):
    analyser = SentimentIntensityAnalyzer()
    score = analyser.polarity_scores(sentence)
    return score['compound']

# The following two lines will take ages to execute on millions of tweets so I'd suggest run it on a couple 100 with slicing to see if this works for you guys

trump_sent = trump['full_text'][:100].map(sentiment_analyzer_scores)
biden_sent = biden['full_text'][:100].map(sentiment_analyzer_scores)

In [60]:
# Bag of Words Conversion
dictionary = gensim.corpora.Dictionary(df['tokens_no_stop'])

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

bow_corpus = [dictionary.doc2bow(tweet) for tweet in df['tokens_no_stop']]

In [61]:
# Topic Modelling using BOW

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=25, id2word=dictionary, passes=2, workers=2)



In [63]:
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic : {idx} \nWords: {topic}\n")

Topic : 0 
Words: 0.055*"biden" + 0.028*"election" + 0.024*"america" + 0.021*"like" + 0.020*"state" + 0.019*"trump" + 0.017*"home" + 0.015*"people" + 0.014*"prison" + 0.014*"vaccine"

Topic : 1 
Words: 0.055*"people" + 0.051*"trump" + 0.043*"biden" + 0.041*"america" + 0.034*"great" + 0.030*"make" + 0.022*"american" + 0.022*"time" + 0.018*"vote" + 0.017*"donald"

Topic : 2 
Words: 0.089*"trump" + 0.029*"biden" + 0.027*"covid" + 0.025*"lebron" + 0.025*"spreader" + 0.025*"james" + 0.021*"school" + 0.020*"obama" + 0.020*"children" + 0.019*"gave"

Topic : 3 
Words: 0.087*"trump" + 0.046*"country" + 0.042*"great" + 0.039*"remember" + 0.038*"built" + 0.038*"christians" + 0.037*"souls" + 0.036*"went" + 0.035*"celebrate" + 0.035*"throughout"

Topic : 4 
Words: 0.102*"biden" + 0.052*"harris" + 0.048*"kamala" + 0.018*"china" + 0.017*"communism" + 0.016*"calls" + 0.015*"michigan" + 0.015*"equality" + 0.014*"mocked" + 0.014*"outcome"

Topic : 5 
Words: 0.072*"americans" + 0.070*"education" + 0.069*

In [16]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.7599964141845703	 
Topic: 0.115*"trump" + 0.062*"biden" + 0.026*"lady" + 0.025*"gaga" + 0.020*"pennsylvania" + 0.019*"millions" + 0.016*"covid" + 0.015*"votes" + 0.014*"election" + 0.013*"deaths"

Score: 0.010000149719417095	 
Topic: 0.173*"vote" + 0.058*"trump" + 0.042*"biden" + 0.017*"tomorrow" + 0.016*"proud" + 0.014*"ballot" + 0.011*"wins" + 0.011*"police" + 0.011*"voting" + 0.011*"gonna"

Score: 0.010000149719417095	 
Topic: 0.073*"country" + 0.071*"great" + 0.069*"remember" + 0.068*"christians" + 0.068*"souls" + 0.068*"built" + 0.067*"went" + 0.067*"celebrate" + 0.067*"throughout" + 0.035*"biden"

Score: 0.010000149719417095	 
Topic: 0.049*"president" + 0.045*"trump" + 0.025*"biden" + 0.021*"year" + 0.021*"every" + 0.020*"remember" + 0.019*"great" + 0.019*"voting" + 0.019*"country" + 0.016*"vote"

Score: 0.010000149719417095	 
Topic: 0.078*"trump" + 0.054*"biden" + 0.030*"harris" + 0.020*"kamala" + 0.018*"states" + 0.018*"lebron" + 0.014*"united" + 0.012*"vote" + 0.012*