# Topic Model Visualization

Uses the `tmplot` topic-model visualization library to visualize the topic models generated by `biterm_topic_modeling.py`.

In [1]:
# Number of topics; it selects the saved-model folder below and bounds the
# per-topic export loop later in the notebook.
NUM_TOPICS: int = 50
# Folder that holds the pickled model and receives the exported HTML charts.
MODEL_FOLDER: str = "{}_topics_model".format(NUM_TOPICS)

In [2]:
import tmplot as tmp
import pandas as pd
import os
import pickle
import altair_saver

In [3]:
###############
# IMPORT DATA #
###############
# Read text from all S&P 500 tweets into a Python list.
# NOTE(review): this list is later handed to tmplot together with the model, so
# its order presumably has to match the document order used when the model was
# trained. The os.listdir() order and the rows of each file are therefore left
# untouched -- confirm against biterm_topic_modeling.py before sorting entries
# or dropping empty rows.
data_folder = "../data/tweets/ten_years_en"
texts = []
for comp_csv in os.listdir(data_folder):
    # os.listdir() returns every directory entry; skip anything that is not a
    # CSV file (e.g. hidden OS files or sub-directories), which would otherwise
    # make pd.read_csv or the 'text' column lookup fail.
    if not comp_csv.lower().endswith(".csv"):
        continue
    print(f"Reading tweets from CSV: {comp_csv}")
    # lineterminator='\n' is kept from the original read call.
    df = pd.read_csv(os.path.join(data_folder, comp_csv), lineterminator='\n')
    # Strip surrounding whitespace from each tweet and append them in file order.
    texts.extend(df['text'].str.strip().tolist())

Reading tweets from CSV: pncbank_tweets.csv
Reading tweets from CSV: teradyneinc_tweets.csv
Reading tweets from CSV: bakerhughesco_tweets.csv
Reading tweets from CSV: ceridian_tweets.csv
Reading tweets from CSV: aiginsurance_tweets.csv
Reading tweets from CSV: expediamedia_tweets.csv
Reading tweets from CSV: costargroup_tweets.csv
Reading tweets from CSV: cboe_tweets.csv
Reading tweets from CSV: skyworksinc_tweets.csv
Reading tweets from CSV: bathbodyworks_tweets.csv
Reading tweets from CSV: raytheontech_tweets.csv
Reading tweets from CSV: kelloggcompany_tweets.csv
Reading tweets from CSV: truistnews_tweets.csv
Reading tweets from CSV: royalcaribbean_tweets.csv
Reading tweets from CSV: eastmanchemco_tweets.csv
Reading tweets from CSV: dominos_tweets.csv
Reading tweets from CSV: edwardslifesci_tweets.csv
Reading tweets from CSV: labcorp_tweets.csv
Reading tweets from CSV: allegionplc_tweets.csv
Reading tweets from CSV: kraftheinzco_tweets.csv
Reading tweets from CSV: evergypower_tweets.

Reading tweets from CSV: cvshealth_tweets.csv
Reading tweets from CSV: lowes_tweets.csv
Reading tweets from CSV: idexcorp_tweets.csv
Reading tweets from CSV: aligntechinc_tweets.csv
Reading tweets from CSV: b_binsurance_tweets.csv
Reading tweets from CSV: teleflex_tweets.csv
Reading tweets from CSV: mgmresortsintl_tweets.csv
Reading tweets from CSV: fleetcor_tweets.csv
Reading tweets from CSV: mandt_bank_tweets.csv
Reading tweets from CSV: sbasite_tweets.csv
Reading tweets from CSV: incyte_tweets.csv
Reading tweets from CSV: resmed_tweets.csv
Reading tweets from CSV: waterscorp_tweets.csv
Reading tweets from CSV: ventasreit_tweets.csv
Reading tweets from CSV: federalrealty_tweets.csv
Reading tweets from CSV: epamsystems_tweets.csv
Reading tweets from CSV: aon_plc_tweets.csv
Reading tweets from CSV: campbellsoupco_tweets.csv
Reading tweets from CSV: charlesschwab_tweets.csv
Reading tweets from CSV: amwater_tweets.csv
Reading tweets from CSV: klacorp_tweets.csv
Reading tweets from CSV: r

Reading tweets from CSV: cloroxco_tweets.csv
Reading tweets from CSV: pentair_tweets.csv
Reading tweets from CSV: quanta_services_tweets.csv
Reading tweets from CSV: southerncompany_tweets.csv
Reading tweets from CSV: bostonsci_tweets.csv
Reading tweets from CSV: biotechne_tweets.csv
Reading tweets from CSV: marathonpetroco_tweets.csv
Reading tweets from CSV: interpublicipg_tweets.csv
Reading tweets from CSV: microsoft_tweets.csv
Reading tweets from CSV: psegdelivers_tweets.csv
Reading tweets from CSV: ptc_tweets.csv
Reading tweets from CSV: sempra_tweets.csv
Reading tweets from CSV: questdx_tweets.csv
Reading tweets from CSV: cocacolaco_tweets.csv
Reading tweets from CSV: digitalrealty_tweets.csv
Reading tweets from CSV: travelers_tweets.csv
Reading tweets from CSV: camdenliving_tweets.csv
Reading tweets from CSV: campbells_tweets.csv
Reading tweets from CSV: iff_tweets.csv
Reading tweets from CSV: silversea_tweets.csv
Reading tweets from CSV: devonenergy_tweets.csv
Reading tweets fro

In [4]:
##########################
# LOAD A PRE-SAVED MODEL #
##########################
# Unpickle the topic model stored in MODEL_FOLDER.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load model files that come from a trusted source.
print("Loading presaved model")
with open(f"{MODEL_FOLDER}/model.pkl", mode="rb") as model_file:
    model = pickle.load(model_file)

Loading presaved model


In [5]:
#####################
# VISUALIZE RESULTS #
#####################
# Launch tmplot's interactive report widget for the loaded model and the tweet texts.
tmp.report(docs=texts, model=model)

VBox(children=(VBox(children=(HBox(children=(HTML(value='<b>Select a topic</b>:'), Dropdown(options=((0, 0), (…

In [6]:
##################################################################
# SAVE SCATTERPLOT OF TOPICS BASED ON INTERTOPIC DISTANCE VALUES #
##################################################################

In [7]:
# Calculate the coordinates of topics based on intertopic distance values.
# By default, the combination of t-distributed Stochastic Neighbor Embedding
# and symmetric Kullback-Leibler divergence is used to calculate topics coordinates in 2D.
# NOTE(review): t-SNE is a stochastic method and no seed is passed here, so the
# layout saved below may differ between runs -- confirm whether tmplot exposes a
# random_state option if reproducible coordinates are needed.
topics_coords = tmp.prepare_coords(model)

In [8]:
# Plot topics and write the chart to an HTML file in the model folder.
# The chart's own save() method is used, matching how the per-topic term charts
# are exported in this notebook, so the HTML export does not depend on the
# separate altair_saver package.
topic_scatterplot = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label')
topic_scatterplot.save(f"{MODEL_FOLDER}/intertopicdistance_tsne.html")

In [9]:
############################################
# PHI MATRIX (WORDS VS TOPICS PROBABILITY) #
############################################
# Extract the words-by-topics matrix from the model, then preview its first five rows.
phi = tmp.get_phi(model)
phi.head(n=5)

topics,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
‍,1.122633e-08,7.543783e-09,8.310515e-09,5.063248e-09,5.356695e-09,9.505678e-09,1.01873e-08,1.042539e-08,7.150831e-09,6.657103e-09,...,6.126449e-09,5.043646e-09,6.170437e-09,5.46584e-09,3.54681e-09,8.251773e-09,9.393368e-09,3.869435e-09,7.574972e-09,1.171146e-08
‍@,1.122633e-08,7.543783e-09,8.310515e-09,5.063248e-09,5.356695e-09,9.505678e-09,1.01873e-08,1.042539e-08,7.150831e-09,6.657103e-09,...,6.126449e-09,5.043646e-09,6.170437e-09,5.46584e-09,3.54681e-09,8.251773e-09,9.393368e-09,3.869435e-09,7.574972e-09,1.171146e-08
‍p,1.122633e-08,7.543783e-09,8.310515e-09,5.063248e-09,5.356695e-09,9.505678e-09,1.01873e-08,1.042539e-08,7.150831e-09,6.657103e-09,...,6.126449e-09,5.043646e-09,6.170437e-09,5.46584e-09,3.54681e-09,8.251773e-09,9.393368e-09,3.869435e-09,7.574972e-09,1.171146e-08
‍t,1.122633e-08,7.543783e-09,8.310515e-09,5.063248e-09,5.356695e-09,9.505678e-09,1.01873e-08,1.042539e-08,7.150831e-09,6.657103e-09,...,6.126449e-09,5.043646e-09,6.170437e-09,5.46584e-09,3.54681e-09,8.251773e-09,9.393368e-09,3.869435e-09,7.574972e-09,1.171146e-08
‍‍,1.122633e-08,7.543783e-09,8.310515e-09,5.063248e-09,5.356695e-09,9.505678e-09,1.01873e-08,1.042539e-08,7.150831e-09,6.657103e-09,...,6.126449e-09,5.043646e-09,6.170437e-09,5.46584e-09,3.54681e-09,8.251773e-09,9.393368e-09,3.869435e-09,7.574972e-09,1.171146e-08


In [10]:
#######################################
# FOR EACH TOPIC, SAVE RELEVANT WORDS #
#######################################
# One HTML chart per topic index is written into MODEL_FOLDER.
for i in range(NUM_TOPICS):
    # NOTE(review): lambda_ presumably weights a term's in-topic probability
    # against its lift when ranking terms -- confirm in the tmplot docs.
    terms_probs = tmp.calc_terms_probs_ratio(phi, topic=i, lambda_=0.6)
    terms_chart = tmp.plot_terms(terms_probs)
    terms_chart.save(f"{MODEL_FOLDER}/topic{i}_relevantwords.html")

In [11]:
##################################################
# THETA MATRIX (TOPICS VS DOCUMENTS PROBABILITY) #
##################################################
# Extract the topics-by-documents matrix from the model, then preview its first five rows.
theta = tmp.get_theta(model)
theta.head(n=5)

docs,0,1,2,3,4,5,6,7,8,9,...,1004174,1004175,1004176,1004177,1004178,1004179,1004180,1004181,1004182,1004183
topics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.008496,0.0004190155,0.007215,0.004104,0.010844,0.003224,0.000692,0.023247,0.003367,0.027114,...,0.0,0.007459,0.099934,0.001634,0.028477,0.0004364889,0.000857,0.001941,0.002965,0.0
1,0.030035,0.032829,0.001988,0.004772,0.007032,0.021364,0.000361,0.012171,0.019154,0.023248,...,0.0,0.018211,0.003118,0.004687,0.000236,0.002108498,0.004286,0.000245,0.077908,0.0
2,0.011068,5.957034e-05,0.02709,0.008768,0.000175,0.004295,5.3e-05,0.006682,0.004373,0.032464,...,0.0,0.005248,0.001845,0.000329,0.005428,3.028636e-07,0.001656,0.000328,0.004261,0.0
3,0.011663,7.447948e-09,0.024118,0.012973,0.010109,0.002516,0.000441,0.047507,0.000442,0.029814,...,0.0,0.135543,0.00058,0.029977,0.030733,7.708663e-05,0.0035,0.172282,0.145928,0.0
4,0.002631,0.0005330783,0.006495,0.004518,1.6e-05,0.002236,0.000337,0.007751,0.018097,0.028531,...,0.0,0.011952,0.00313,0.00289,0.185355,4.312373e-06,0.001623,0.001529,0.009774,0.0


In [12]:
##################################################################
# Get documents with maximum probabilities P(t|d) for each topic #
##################################################################
# The tweet texts are passed together with the model; the returned table
# (one column per topic) is displayed as the cell output.
# NOTE(review): entries of `texts` are presumably matched to the model's
# documents by position, so `texts` must be in the same order that was used
# when the model was trained -- confirm.
tmp.get_top_docs(texts, model=model)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,...,topic40,topic41,topic42,topic43,topic44,topic45,topic46,topic47,topic48,topic49
0,CME Group Monthly Market Statistics\n📂 October...,@FMCCorp #tapsnap #fmc #GetNerdyPHL http://t....,"#Trucking regs in limbo: What will emerge, and...",Which one are you driving? #OReillyPowered\n\n...,Thx for following! @raleighdowntown @Packaging...,@ben_deadman @SBungers @sirfrasersays @pchir...,#genomes100k #ashg2017 https://t.co/J12pBfcDbj,How Do I Avoid Clingy and Desperate Men? &gt;&...,Vrooom vroom vroom vroom #StressReliefIn4Words,@dentalarch @northshoremedsp @MoeAlq @theright...,...,Which would you choose? #dreamhome #thisorthat...,˙sıɥʇ ʇnoqɐ pǝʇıɔxǝ ʎɹǝʌ ǝɹɐ ǝʍ https://t.co/f...,@toothimplants @RutherfordLab @fofhealthcenter...,We’ll be at the RISC-V Summit tomorrow! See yo...,Zoetis $ZTS declares Q4'16 #dividend https://t...,𝙺𝙽𝙾𝚆 𝚈𝙾𝚄𝚁 𝙷𝙸𝚂𝚃𝙾𝚁𝚈 𝙾𝚁 𝙱𝙴 𝙳𝙾𝙾𝙼𝙴𝙳 𝚃𝙾 𝚁𝙴𝙿𝙴𝙰𝚃 𝙸𝚃. \...,01101100 01101111 01110110 01100101 00100000 0...,@GregBunker313 @jamal3804 @sathoyaiaong5 @chen...,Thanks for following! @assetmachinery @GAshPho...,Thank you for your RT's! @aPlusPromoSolut @Ala...
1,CME Group Monthly Market Statistics\n\n📂 Novem...,Smiles from Kisumu with love!\n#smileswithlove...,@brandoral @SoundDentistry @graigbrowndds @v...,#FF TY for following! @ATLretailXperts @NetLea...,Teledyne #DALSA Launches its Next Generation V...,@its_missmandy @alastairbaker1 @DuttonChemis...,@sweettoothdj @jobsinmedicalUK @commonmedical ...,You cannot have one without the other. #dating...,@blakandwite @DrSatpreetS @drnickseddon @DrNor...,1105 Briarcliff Rd Atlanta 4BR 3.5BA 2198SF $3...,...,Which is your favorite? #thisorthat #dreamhome...,Thanks for following! @ew_Groess @lisasavanna ...,@GregoryLaMorte @RBlackhurstDDS @newarkdedenti...,𝗗𝗲𝗳𝗶𝗻𝗶𝘁𝗲𝗹𝘆 #MustSeeTV📺\n\n𝙒𝙚 𝙘𝙖𝙣 𝙖𝙡𝙧𝙚𝙖𝙙𝙮 𝙨𝙚𝙚 𝙒...,Zoetis $ZTS declares Q2'16 #dividend https://t...,𝙺𝙽𝙾𝚆 𝚈𝙾𝚄𝚁 𝙷𝙸𝚂𝚃𝙾𝚁𝚈 𝙾𝚁 𝙱𝙴 𝙳𝙾𝙾𝙼𝙴𝙳 𝚃𝙾 𝚁𝙴𝙿𝙴𝙰𝚃 𝙸𝚃. \...,01000110 01101100 01101111 00100000 01000010 0...,@engagedrapp @pangwuxin @freshdentallond @oral...,@dental_workflow @SolangeSfeir1 @Ann_Marie_LC...,@quimicasandiego @MaroSaucedo @nikko310 @TEDpa...
2,gagttttatc gcttccatga cgcagaagtt aacactttcg ga...,"#DYK that Norfolk Southern has 4,160 employees...",Thx for the follow! @GCDies @TimeOutMarks3zt @...,.@UCONN establishes @SYFNews Center of Excelle...,Teledyne DALSA Introduces Wafer Level-Packagin...,@kimberlyanngeo @Sedgwick @AXA @onecallcm @saf...,Today is World Lymphoma Awareness Day #WLAD. F...,Unable to Stop Thinking About Your Ex? Here's ...,@ladydentaldds @socalhyg @OraMD @escalateinst ...,@kamilalkatib @forbesortho @Theexeterdental @D...,...,Which one would you choose?! #dreamhome #thiso...,@dr_piazze @shinedentalca @drmoezik @SuttonOrt...,👫 ❤️ 🍷 https://t.co/boTsXbAIFc,"Catch these livestreams from @magicityhippies,...",Nordson Corporation Declares Dividend $NDSN #s...,@thepeppermint1 @melillodental @pshannon4 @...,@sarajames01mak @SurvivorSmiles1 @AubrieRicket...,Thank you @IngersollRand for your #generous #d...,Agree! RT @elaineperez22: Great Panel &amp; Mo...,Pleased to see your RTs! Thank you @marymtairy...
3,"Torrence, Pedregon, Enders and Savoie Secure V...",Why You Should Try “Simmering” To Improve Your...,Show us your #truck @JBHuntCarrier! #LTLTrucke...,In the news! DXC Technology Collaborates with ...,@geagestionaerea @ojas_mehta74 @wilmtoday @n...,Spectrum Networks Averaged 2.7 Million Daily V...,RT @UnitedWayTC: @mnhungerwalk Marshalls @amyk...,Dating At Midlife: Are You Too Picky…Or Not Pi...,Welcome! @mabiagini @mandiengram @ClairemontTC...,@career_dental @models4dds @DntlRcdScanning @S...,...,Are you a FAN of this FLOORPLAN?! #dreamhome #...,Looking Beyond the Clouds: A U.S. Cyber Insura...,BIG NEWS lol jk still Twitter,Catch these @Veeps livestreams from @magicityh...,Nordson Declares Second Quarter Dividend #stoc...,Let the Ladies show you how we do this! @fatbe...,It's the Swiss Army knife of radars. Discover ...,"Crunch, Boom, and Chill @Boomchickapop! \n#fi...",@alldaychemist_ @Digibel_md @assure_dental @Dr...,Pleased to see your shares! Thank you @aditivo...
4,Thanks for the follow! @AreaDevelopment @JanEs...,@drshonw @maison_dentaire @VersahLLC @GordonNY...,C.H. Robinson with Successful Rail Shipments a...,@AXIOMMED2017 @JorgensenLabs @Drsalmansdental ...,@ironorehopper @rsamii @kevinerwilkins @Wendel...,Planning the Perfect Oscars Viewing Party in Y...,"#PremiereAlert: Axl, Sue and Brick are back TO...",Wait.... is there not a combo where we can get...,Watching Vern become 3D at @Corning CES booth!...,@trioseinc @KimberFnp @jacobdriggers @Boynton...,...,Do you ADORE the FLOORS in this home?! #dreamh...,Congrats Simon Chikumbu on the COVER Excellenc...,Disney animator Eric Goldberg remembers Robin ...,@aristanetworks Free Seminar Series on #SDN. J...,$NDSN Declares First Quarter Dividend; Authori...,Ready to get the @BMWchamps underway! @columbi...,It's the Swiss Army knife of radars. Discover ...,Learn more about our 6-year partnership with @...,Thanks for the follow! @MaglineInc @hytrol @Co...,@Cruchitas @quimicasandiego @RoseSPerkins @du...
