# Topic Model Visualization

Uses the tmplot topic-modeling visualization library to visualize the topic models generated by `biterm_topic_modeling.py`.

In [1]:
# Number of topics in the pre-saved model; also bounds the per-topic export loop.
NUM_TOPICS = 50
# Folder the model pickle is read from and the HTML exports are written to.
MODEL_FOLDER = "{}_topics_model".format(NUM_TOPICS)

In [2]:
import tmplot as tmp
import pandas as pd
import os
import pickle
import altair_saver

In [3]:
###############
# IMPORT DATA #
###############
# Read text from all S&P 500 tweets into a Python list.
data_folder = "../data/tweets/ten_years"
texts = []
# NOTE(review): os.listdir() returns entries in arbitrary order. The position of
# each tweet in `texts` presumably has to match the document order the model was
# trained on -- confirm against biterm_topic_modeling.py before sorting here.
for comp_csv in os.listdir(data_folder):
    # Skip anything that is not a CSV file (e.g. `.ipynb_checkpoints`, `.DS_Store`);
    # such entries would otherwise make the read below fail.
    if not comp_csv.lower().endswith(".csv"):
        continue
    print(f"Reading tweets from CSV: {comp_csv}")
    # Only '\n' is treated as a row terminator.
    # NOTE(review): presumably so carriage returns inside tweet text do not split rows -- confirm.
    df = pd.read_csv(f"{data_folder}/{comp_csv}", lineterminator='\n')
    # Append this company's tweets, with surrounding whitespace stripped.
    texts += df['text'].str.strip().tolist()

Reading tweets from CSV: pncbank_tweets.csv
Reading tweets from CSV: teradyneinc_tweets.csv
Reading tweets from CSV: bakerhughesco_tweets.csv
Reading tweets from CSV: ceridian_tweets.csv
Reading tweets from CSV: aiginsurance_tweets.csv
Reading tweets from CSV: expediamedia_tweets.csv
Reading tweets from CSV: costargroup_tweets.csv
Reading tweets from CSV: cboe_tweets.csv
Reading tweets from CSV: skyworksinc_tweets.csv
Reading tweets from CSV: bathbodyworks_tweets.csv
Reading tweets from CSV: raytheontech_tweets.csv
Reading tweets from CSV: kelloggcompany_tweets.csv
Reading tweets from CSV: truistnews_tweets.csv
Reading tweets from CSV: royalcaribbean_tweets.csv
Reading tweets from CSV: eastmanchemco_tweets.csv
Reading tweets from CSV: dominos_tweets.csv
Reading tweets from CSV: edwardslifesci_tweets.csv
Reading tweets from CSV: labcorp_tweets.csv
Reading tweets from CSV: allegionplc_tweets.csv
Reading tweets from CSV: kraftheinzco_tweets.csv
Reading tweets from CSV: evergypower_tweets.

Reading tweets from CSV: ultabeauty_tweets.csv
Reading tweets from CSV: atmosenergy_tweets.csv
Reading tweets from CSV: okcupid_tweets.csv
Reading tweets from CSV: esteelauder_tweets.csv
Reading tweets from CSV: csx_tweets.csv
Reading tweets from CSV: abbvie_tweets.csv
Reading tweets from CSV: equinix_tweets.csv
Reading tweets from CSV: dupont_news_tweets.csv
Reading tweets from CSV: solaredgepv_tweets.csv
Reading tweets from CSV: cvshealth_tweets.csv
Reading tweets from CSV: lowes_tweets.csv
Reading tweets from CSV: idexcorp_tweets.csv
Reading tweets from CSV: aligntechinc_tweets.csv
Reading tweets from CSV: b_binsurance_tweets.csv
Reading tweets from CSV: teleflex_tweets.csv
Reading tweets from CSV: mgmresortsintl_tweets.csv
Reading tweets from CSV: fleetcor_tweets.csv
Reading tweets from CSV: mandt_bank_tweets.csv
Reading tweets from CSV: sbasite_tweets.csv
Reading tweets from CSV: incyte_tweets.csv
Reading tweets from CSV: resmed_tweets.csv
Reading tweets from CSV: waterscorp_tweet

Reading tweets from CSV: mdlz_tweets.csv
Reading tweets from CSV: edison_energy_tweets.csv
Reading tweets from CSV: bioradlifesci_tweets.csv
Reading tweets from CSV: zoetis_tweets.csv
Reading tweets from CSV: henryschein_tweets.csv
Reading tweets from CSV: comcast_tweets.csv
Reading tweets from CSV: servicenownews_tweets.csv
Reading tweets from CSV: gm_tweets.csv
Reading tweets from CSV: cloroxco_tweets.csv
Reading tweets from CSV: pentair_tweets.csv
Reading tweets from CSV: quanta_services_tweets.csv
Reading tweets from CSV: southerncompany_tweets.csv
Reading tweets from CSV: bostonsci_tweets.csv
Reading tweets from CSV: biotechne_tweets.csv
Reading tweets from CSV: marathonpetroco_tweets.csv
Reading tweets from CSV: interpublicipg_tweets.csv
Reading tweets from CSV: microsoft_tweets.csv
Reading tweets from CSV: psegdelivers_tweets.csv
Reading tweets from CSV: ptc_tweets.csv
Reading tweets from CSV: sempra_tweets.csv
Reading tweets from CSV: questdx_tweets.csv
Reading tweets from CSV:

In [4]:
##########################
# LOAD A PRE-SAVED MODEL #
##########################
# Deserialize the trained topic model stored in MODEL_FOLDER.
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
print("Loading presaved model")
model_path = f"{MODEL_FOLDER}/model.pkl"
with open(model_path, "rb") as model_file:
    model = pickle.load(model_file)

Loading presaved model


In [5]:
#####################
# VISUALIZE RESULTS #
#####################
# Run the interactive report interface.
# The report is built from the loaded model and the tweet texts, and is
# rendered as this cell's output.
tmp.report(model=model, docs=texts)

VBox(children=(VBox(children=(HBox(children=(HTML(value='<b>Select a topic</b>:'), Dropdown(options=((0, 0), (‚Ä¶

In [6]:
##################################################################
# SAVE SCATTERPLOT OF TOPICS BASED ON INTERTOPIC DISTANCE VALUES #
##################################################################

In [7]:
# Calculate the coordinates of topics based on intertopic distance values.
# By default, the combination of t-distributed Stochastic Neighbor Embedding (t-SNE)
# and symmetric Kullback-Leibler divergence is used to calculate topic coordinates in 2D.
# NOTE(review): t-SNE is stochastic, so the layout may differ between runs unless
# a seed is fixed -- check whether tmplot allows one to be passed through.
topics_coords = tmp.prepare_coords(model)

In [8]:
# Plot topics as a scatterplot, taking marker size and text label from the
# 'size' and 'label' columns of the coordinates table.
topic_scatterplot = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label')
# Save as HTML through the chart's own save() method, the same way the per-topic
# term charts are exported in this notebook; HTML export needs no separate saver package.
topic_scatterplot.save(f"{MODEL_FOLDER}/intertopicdistance_tsne.html")

In [9]:
############################################
# PHI MATRIX (WORDS VS TOPICS PROBABILITY) #
############################################
# Extract the words-vs-topics probability matrix from the model
# (indexed by word, one column per topic).
phi = tmp.get_phi(model)
# Preview only the first rows rather than the whole matrix.
phi.head()

topics,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
‚Äç,1.122633e-08,7.543783e-09,8.310515e-09,5.063248e-09,5.356695e-09,9.505678e-09,1.01873e-08,1.042539e-08,7.150831e-09,6.657103e-09,...,6.126449e-09,5.043646e-09,6.170437e-09,5.46584e-09,3.54681e-09,8.251773e-09,9.393368e-09,3.869435e-09,7.574972e-09,1.171146e-08
‚Äç@,1.122633e-08,7.543783e-09,8.310515e-09,5.063248e-09,5.356695e-09,9.505678e-09,1.01873e-08,1.042539e-08,7.150831e-09,6.657103e-09,...,6.126449e-09,5.043646e-09,6.170437e-09,5.46584e-09,3.54681e-09,8.251773e-09,9.393368e-09,3.869435e-09,7.574972e-09,1.171146e-08
‚Äçp,1.122633e-08,7.543783e-09,8.310515e-09,5.063248e-09,5.356695e-09,9.505678e-09,1.01873e-08,1.042539e-08,7.150831e-09,6.657103e-09,...,6.126449e-09,5.043646e-09,6.170437e-09,5.46584e-09,3.54681e-09,8.251773e-09,9.393368e-09,3.869435e-09,7.574972e-09,1.171146e-08
‚Äçt,1.122633e-08,7.543783e-09,8.310515e-09,5.063248e-09,5.356695e-09,9.505678e-09,1.01873e-08,1.042539e-08,7.150831e-09,6.657103e-09,...,6.126449e-09,5.043646e-09,6.170437e-09,5.46584e-09,3.54681e-09,8.251773e-09,9.393368e-09,3.869435e-09,7.574972e-09,1.171146e-08
‚Äç‚Äç,1.122633e-08,7.543783e-09,8.310515e-09,5.063248e-09,5.356695e-09,9.505678e-09,1.01873e-08,1.042539e-08,7.150831e-09,6.657103e-09,...,6.126449e-09,5.043646e-09,6.170437e-09,5.46584e-09,3.54681e-09,8.251773e-09,9.393368e-09,3.869435e-09,7.574972e-09,1.171146e-08


In [13]:
#######################################
# FOR EACH TOPIC, SAVE RELEVANT WORDS #
#######################################
# Weight passed as `lambda_` when computing each topic's term ratios.
# NOTE(review): presumably the LDAvis-style relevance weight -- confirm in the tmplot docs.
relevance_lambda = 0.60
for topic_id in range(NUM_TOPICS):
    topic_terms = tmp.calc_terms_probs_ratio(phi, topic=topic_id, lambda_=relevance_lambda)
    terms_chart = tmp.plot_terms(topic_terms)
    # One HTML chart per topic, written into the model folder.
    terms_chart.save(f"{MODEL_FOLDER}/topic{topic_id}_relevantwords.html")

In [11]:
##################################################
# THETA MATRIX (TOPICS VS DOCUMENTS PROBABILITY) #
##################################################
# Extract the topics-vs-documents probability matrix from the model
# (indexed by topic, one column per document).
theta = tmp.get_theta(model)
# Preview only the first rows; the matrix has one column per tweet, so it is very wide.
theta.head()

docs,0,1,2,3,4,5,6,7,8,9,...,1004174,1004175,1004176,1004177,1004178,1004179,1004180,1004181,1004182,1004183
topics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.008496,0.0004190155,0.007215,0.004104,0.010844,0.003224,0.000692,0.023247,0.003367,0.027114,...,0.0,0.007459,0.099934,0.001634,0.028477,0.0004364889,0.000857,0.001941,0.002965,0.0
1,0.030035,0.032829,0.001988,0.004772,0.007032,0.021364,0.000361,0.012171,0.019154,0.023248,...,0.0,0.018211,0.003118,0.004687,0.000236,0.002108498,0.004286,0.000245,0.077908,0.0
2,0.011068,5.957034e-05,0.02709,0.008768,0.000175,0.004295,5.3e-05,0.006682,0.004373,0.032464,...,0.0,0.005248,0.001845,0.000329,0.005428,3.028636e-07,0.001656,0.000328,0.004261,0.0
3,0.011663,7.447948e-09,0.024118,0.012973,0.010109,0.002516,0.000441,0.047507,0.000442,0.029814,...,0.0,0.135543,0.00058,0.029977,0.030733,7.708663e-05,0.0035,0.172282,0.145928,0.0
4,0.002631,0.0005330783,0.006495,0.004518,1.6e-05,0.002236,0.000337,0.007751,0.018097,0.028531,...,0.0,0.011952,0.00313,0.00289,0.185355,4.312373e-06,0.001623,0.001529,0.009774,0.0


In [12]:
##################################################################
# Get documents with maximum probabilities P(t|d) for each topic #
##################################################################
# Displays a table with one column per topic (topic0, topic1, ...) whose cells
# are tweet texts taken from `texts`.
tmp.get_top_docs(texts, model=model)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,...,topic40,topic41,topic42,topic43,topic44,topic45,topic46,topic47,topic48,topic49
0,How is machine learning being used to improve ...,PostyFest is coming to your living room today!...,Get the inside scoop on potential disruptors t...,"At VF we are committed to Inclusion, Diversity...",#PCA Vincennes donated boxes to the Evansville...,Ice melt ‚úì Chimney sweep ‚úì Wreathes ‚úì A winter...,8 tips for a better life at home this year: ht...,How can #Automotive system designers ensure th...,"Join two conservation groups, the @AmForestFnd...",. @cellectis Details Proof-of-Concept for ‚ÄúSma...,...,"20932 Shine Dr., Saugus, CA $2,300/Month, 3 BR...",We‚Äôre gonna REWIND and BRING BACK TWO previous...,‚ÄúWhile 2020 will be remembered as the year of ...,"Portland workshop, 9/6/13: Exposing the Potent...",Restock your shelves with new discounts on ant...,Chadd and Val share a bit of Illidan Stormrage...,It‚Äôs National Pollinators Week! Did you know t...,@3dr_Amendment Thanks for the RTs. We apprecia...,Thx for following us @14landase @SnackIndustry...,On-Premises vs. Cloud? How @Equinix &amp; @Ora...
1,2 years. $400T in assets. 1 global phenomenon.,Lessons from #tech products that failed early....,Genomic biomarkers lead to a reduction in trea...,"We have an average household income of $100,00...",When Webb begins to beam back images of the ea...,Putting a ring on it? Nice! Here's how you can...,Patients with #diabetes are more likely to suf...,Parker can help avoid dry running in fluid tra...,Happy National Cat Day from those of us at Zoe...,Do you know the #4Rs of #NutrientStewardship? ...,...,$1550 / 3br - 29676 Woodlands Ave - Classy Hom...,We're excited to be kicking off #SNAXPO2018 in...,Making gels for a western blot today @BioRadLi...,Our friends at @bostonsci gave their best toda...,Foresight filter plates and columns are prepac...,What's your favorite Activision game of all ti...,#Xylem deploys teams in #China to meet the nee...,Our CEO Kristin Peck shares what each of her p...,It‚Äôs Day 2 (Tuesday) of #ISPD2022 and we want ...,Enterprises can now provision @Alibaba_Cloud E...
2,Transitional Design using NERI Industrial #Cha...,"It‚Äôs #tbt Throwback Thursday! In January 1960,...",The #packaging industry has so many great care...,@Ableroofing brightened a newlywed couple's ou...,#inthenews: Our Linea Lite line scan cameras f...,"Brian Duperrault, President &amp; CEO of @AIGi...",Not every corporate headquarters has a bottlin...,Looking to convert your #Diesel fleet to #NatG...,"There are different types of leukemia, includi...",Do you need to refresh your knowledge of #Flow...,...,@roofstock @americanhomes4 @firstkeyhomes @Inv...,BIO-RAD: Accelerate Analysis of Immune Respons...,Now on @ReutersInsider https://t.co/zxjMLGqjhp...,ICYMI: @ConstellationEG signed an agreement w/...,Only 30 minutes until our #KBtribechat fun hos...,No call for a moratorium on CRISPR babies at t...,Prostate Cancer ‚ÄúSpews‚Äù Tumor-Promoting Protei...,"Do you believe, Cleveland? \n\n#ALLin216 https...",RT @Pursuitof300: Weed mgmt. is important for ...,"Sincere congratulations to Andreas Lutz, DuPon..."
3,UP's new online magazine for #transportation a...,Parker's #Mobile #IoT a digital integrated sol...,We're glad that we could be good company! Look...,May is women's health month and my friend @She...,@Ruaan_L @davide_2010 @Anka_T @EurocharmGroup ...,700 miles and counting. See how a researcher i...,RT @4Rnutrients: @MosaicCompany and @nature_or...,Watch the video on how Parker is making #Minin...,"Santa, carolers &amp; holiday merriment at Gra...","Say üëã to Vanessa, our #reallifescientist of th...",...,How would you use this spacious backyard? To s...,INSIGHT: The future goals around #carbondioxid...,RT @FFAfoundation: @ADMupdates marks 55 years ...,The team from Constellation played in yesterda...,TOMORROW at #KBtribechat -\n\nThe Laundry Room...,Onsite Account Specialist Brendan Doyle's late...,"Congrats to Alex Davis, one of this year's rec...",BRING BACK THE TIME‚Äù IS OUT NOW!! Listen here:...,What is T-cell therapy? Watch to learn how thi...,Congratulations to our 2020 DuPont Laureates!\...
4,"Brea Oasis not a mirage:$2.5M sculpture, tribu...",We bought a gene gun! Thanks @BioRadLifeSci! #...,Not a bad afternoon after a good two days of c...,No plans for the weekend? Write a science blog...,@farasatk @Race2Diversity @fleejack @Vaillanco...,Reminder that we stop accepting orders via htt...,"Launching at #CES2020, this tiny drive offers ...","Attending #bauma2022? Meet us at Booth 215, Ha...",Check out one of our long-time grant partners ...,"Yes, humans already have the ability to edit g...",...,What do you think about this beautiful bathroo...,That‚Äôs one good lookin‚Äô spud! Our colleagues @...,"#ICYMI: ""The work to be performed here will ex...","Jayshree Ullal, Andy Bechtolsheim named semifi...",The Extreme Makeover: Home Edition designers k...,What does one wear to a moon rocket launch? üöÄ...,A warranted honor: The @USArmy recognizes our ...,LIVE on #Periscope: Live on Periscope with @JR...,Thx for following our Twitter! @directsupply1 ...,Download this @TechTarget success story and re...
