In [None]:
# DOING SEMI SUPERVISED TOPIC MODELING WITH COREX

# Code to do corex models is inspired by
# Anchored CorEx: Hierarchical Topic Modeling with Minimal Domain Knowledge
# https://github.com/gregversteeg/corex_topic
# CorEx implementation example: https://gist.github.com/patrickvankessel/0d5bd690910edece831dbdf32fb2fb2d



In [33]:
# Loading previously used packages to do LDA modeling - using gensim, and using gensim mallet wrapper.
import re
import pandas as pd
import numpy as np
import networkx as nx
import itertools # peoblwm
import collections # problwm
import spacy
from pprint import pprint # problem 


# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.gensim
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet
# NLTK
from nltk import bigrams
from nltk.stem import PorterStemmer
sns.set(font_scale=1.5)
sns.set_style("whitegrid")
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
#%matplotlib inlin


In [34]:
# Now we also need to import some other packages to do CorEx topic modeling (Correlation Explanation)
# pip install corextopic
import numpy as np
import scipy.sparse as ss
from corextopic import corextopic as ct


In [46]:
# LOAD LATEST VERSION OF PREPROCESSED DATA
# Now open each of the 4 versions of the preprocessed df as df 1-4

# df1 = data minus (query) 
#df1 = pd.read_pickle('updated_stemmedlemmas_data_minus_q.pkl')

# df2 = data minus (query + vaccine + vaccines)
#df2 = pd.read_pickle('updated_stemmedlemmas_data_minus_q_vaccine.pkl')

# df3 = data minus (query + vaccine + vaccines + covid)
#df3 = pd.read_pickle('updated_stemmedlemmas_data_minus_q_vaccine_covid.pkl')

# df4 = data minus (query + vaccine + vaccines + covid + pfizer)
#df4 = pd.read_pickle('updated_stemmedlemmas_data_minus_q_vaccine_covid_pfizer.pkl')

#tweets_df = pd.read_pickle('oralexamupdated_final_data_version_minustop10.pkl')
#tweets_df.head()
tweets_df = pd.read_pickle('final_data_version_minustop10.pkl')


In [43]:
######### COREX TOPIC MODELLING ############# 
 # use lemmas_string and reduced lemmas(tokenized)

In [154]:
# make new df with only tweet text
# (because: Define a matrix where rows are samples (docs/tweets) and columns are features (lemmas)
lemmas_df = tweets_df[['lemmas_string']].copy()

# save to csv file:
lemmas_df.to_csv('lemmas_df.csv', header=True, mode='a')


In [123]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    max_df=.5,
    min_df=10,
    max_features=None,
    ngram_range=(1, 2),
    norm=None,
    binary=True,
    use_idf=False,
    sublinear_tf=False
)
vectorizer = vectorizer.fit(lemmas_df['lemmas_string'])
tfidf = vectorizer.transform(lemmas_df['lemmas_string'])
vocab = vectorizer.get_feature_names()
print(len(vocab))
# lemmas: 851

851


In [25]:
from corextopic import corextopic as ct

In [255]:
# First we try to run the corex model on the tweets without anchors
# 10 topics (use seed to reproduce results)
anchors = []
model = ct.Corex(n_hidden=10, seed=42)
model = model.fit(
    tfidf,
    words=vocab
)

In [256]:
# evaluate model using corex' tc (topic correlation)
model.tc
# tc stands for topic correlation and is a measure of how well topics correlate, it is not the same as coherence score, and it is mainly used by the model as a guide to find the best topics, and can be used to compare different corex models. And also as a measure to see how many topics to include - if additional topics does not only improve tc a tiny bit then this point should be considered the "cutoff" point for not adding more topics

0.8641318742744378

In [226]:
# Return the 10 words with the most mutual information 
# "The topic words are those with the highest *mutual information* with the topic, rather than those with highest probability within the topic as in LDA."  - Ryan J. Gallagher, Anchored CorEx: Topic Modeling with Minimal Domain Knowledge

for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i, ", ".join(topic_ngrams)))

# Gives pretty good topics!!! 

Topic #0: health, care, worker, health care, health official, care worker, vaccinate, official, center, care facility
Topic #1: stock, emergency, own stock, own, emergency authorization, authorization, trump stock, authorize emergency, grant, buy stock
Topic #2: new, strain, new strain, case, new case, variant, virus, operation_warp, operation_warp speed, new variant
Topic #3: trump, money, order, invest, try, make, desantis, probably, bet, purchase
Topic #4: expect, authorize, ship, newly, recommend, panel, initial, expect arrive, vote, fedex
Topic #5: live, morning, update, stimulus, watch, store, coronavirus, live update, package, watch live
Topic #6: arrive, next, approval, news, announce, continue, tomorrow, break
Topic #7: second, state, distribute, government, set, million, total
Topic #8: datum, phase, group, study, efficacy, old, prevent, accord, safety, age
Topic #9: mean, people, even, high, chief, severe, drug, due, low, possible


In [277]:
# Evaluate the model by getting coherence score using gensim's CoherenceModel
##########################
# NOT WORKING......
############################
# Evaluate model:
# get coherence score corex
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora

# documents
documents = tweets_df['lemmas_reduced']
# Creating the term dictionary, where every unique term is assigned an index
dictionary = corpora.Dictionary(documents)
#print(dictionary)
# Creating corpus using dictionary prepared above (term document frequency)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Get top words for each topic from the trained corex model
topics = model.get_topics(n_words=100)
corex_topic_words = [[word for word, tc in topic] for topic in topics]

# Get coherence score
cm_corex = CoherenceModel(topics=corex_topic_words, texts=documents, corpus=corpus, dictionary=dictionary, coherence='c_v')
cm_corex.get_coherence()

KeyError: 'emergency'

In [279]:
# Top 10 frequent lemmas (when query is removed) 
#vaccine         7445 - already gone in df4
#covid           3013 - already gone in df4 
#dose            1368-
#receive         1240-
#first            941-
#approve          663-
#week             608-
#begin            575-
#shipment         563-
#today            535-

# top 10 frequent lemmas of updated data
#stock            431
#trump            401
#people           359
#new              313
#know             308
#say              285
#get              278
#health           266
#expect           255
#arrive           249

# get most frequent words 
frequentw_list = pd.Series(' '.join(tweets_df.lemmas_string).split()).value_counts()[:50]print(frequentw_list)

stock            431
trump            401
people           359
new              313
know             308
say              285
get              278
health           266
expect           255
arrive           249
distribution     233
work             230
second           217
think            211
start            209
state            207
go               203
ship             199
hospital         193
good             187
emergency        181
thank            180
need             178
wait             177
worker           177
trial            176
vaccinate        172
money            154
company          151
take             151
update           150
care             148
live             144
vaccination      144
virus            138
shot             138
own              133
next             127
morning          123
year             123
time             123
official         122
authorize        121
administer       121
read             121
news             120
already          119
authorization

In [274]:

anchors = [

    ["emergency", "authorization", "approval", "government"],
    ["health care", "care", "health", "worker", "care facility", "facility", "hospital"],
    ["stock", "buy", "company", "invest"],
    ["live", "update", "news", "break"],
    ["new strain", "variant", "virus", "mutation", "mutate"],
    ["datum", "phase", "study", "group", "efficacy", "research"],
    ["distribution", "distribute" "deliver", "expect", "arrive", "ship"],
    ["side_effect", "severe"],
    ["freezer", "storage"]

]
# vaccinate, vaccination, new, expect
anchors = [
    [a for a in topic if a in vocab]
    for topic in anchors
]

model = ct.Corex(n_hidden=10, seed=42)
model = model.fit(
    tfidf,
    words=vocab,
    anchors=anchors, # Pass the anchors in here
    anchor_strength=3 # Tell the model how much it should rely on the anchors (setting anchor_strength between 1 and 3 gently nudges a topic towards the anchor words, and setting it above 5 more strongly encourages the topic towards the anchor words. We encourage users to experiment with anchor_strength for their own purposes - quote from gregversteeg's github)
)

In [273]:
# Get topic correlation as evaluation metric 
# Anchor model performs better than the model without any specified anchors
model.tc

3.725143474760344

In [275]:
# # Return the 10 words with the most mutual information with topic 
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: emergency, authorization, approval, government, emergency authorization, second, authorize, authorize emergency, grant emergency, grant
Topic #2: health, hospital, worker, care, facility, health care, care facility, health official, care worker, official
Topic #3: stock, company, invest, buy, own stock, own, trump, trump stock, buy stock, trump own
Topic #4: update, live, news, break, live update, watch live, watch, good news, stimulus bill, break news
Topic #5: virus, new strain, variant, mutation, new, mutate, strain, new case, new variant, test
Topic #6: datum, phase, study, group, efficacy, research, phase trial, trial, placebo, rate
Topic #7: expect, arrive, distribution, ship, expect arrive, distribution center, shot expect, leave distribution, state expect, ship second
Topic #8: side_effect, severe, case death, immune, reaction, people die, death, die, response, allergic_reaction
Topic #9: storage, freezer, operation_warp, operation_warp speed, speed, temperature, temp

In [None]:
Topic #1: emergency, authorization, approval, government, emergency authorization, second, authorize, authorize emergency, grant emergency, grant
Topic #2: health, hospital, worker, care, facility, health care, care facility, health official, care worker, official
Topic #3: stock, company, invest, buy, own stock, own, trump, trump stock, buy stock, trump own
Topic #4: update, live, news, break, live update, watch live, watch, good news, stimulus bill, break news
Topic #5: virus, new strain, variant, mutation, new, mutate, strain, new case, new variant, test
Topic #6: datum, phase, study, group, efficacy, research, phase trial, trial, placebo, rate
Topic #7: expect, arrive, distribution, ship, expect arrive, distribution center, shot expect, leave distribution, state expect, ship second
Topic #8: side_effect, severe, case death, immune, reaction, people die, death, die, response, allergic_reaction
Topic #9: storage, freezer, operation_warp, operation_warp speed, speed, temperature, temp, chief, wear, store
Topic #10: money, order, hold, wait, let, probably, go

In [214]:
# create df with tweet and topic membership

# "CorEx is a *discriminative* model, whereas LDA is a *generative* model. This means that while LDA outputs a probability distribution over each document, CorEx instead estimates the probability a document belongs to a topic given that document's words. As a result, the probabilities across topics for a given document do not have to add up to 1." - Ryan J. Gallagher, Anchored CorEx: Topic Modeling with Minimal Domain Knowledge
# He also states that: "Since CorEx does not prescribe a probability distribution of topics over each document, this means that a document could possibly belong to no topics" 

topic_df = pd.DataFrame(
    model.transform(tfidf), 
    columns=["topic_{}".format(i+1) for i in range(10)]
).astype(float)
topic_df.index = lemmas_df.index
output_df = pd.concat([lemmas_df, topic_df], axis=1)

In [215]:
output_df.sample(5, random_state=42)


Unnamed: 0,lemmas_string,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10
8034,stimulate strong immune response fever chill,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3729,read stimulus bragging financially lay path ye...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5164,hospital prepare arrival,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4368,tell fund version,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5252,friend spend day hospital home back,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [216]:
sum_t1 = output_df['topic_1'].sum()
sum_t2 = output_df['topic_2'].sum()
sum_t3 = output_df['topic_3'].sum()
sum_t4 = output_df['topic_4'].sum()
sum_t5 = output_df['topic_5'].sum()
sum_t6 = output_df['topic_6'].sum()
sum_t7 = output_df['topic_7'].sum()
sum_t8 = output_df['topic_8'].sum()
sum_t9 = output_df['topic_9'].sum()
sum_t10 = output_df['topic_10'].sum()



In [219]:
# Summming up topic prevalence 
print("tweets captured by topics 1-10 in order: ", sum_t1, sum_t2, sum_t3, sum_t4, sum_t5, sum_t6, sum_t7, sum_t8, sum_t9, sum_t10)
number = sum_t1 + sum_t2 + sum_t3 + sum_t4 + sum_t5 + sum_t6 + sum_t7 + sum_t8 + sum_t9 + sum_t10.sum()
tot_len = len(output_df['lemmas_string'])
print("And the total number of tweets captured by the specified topics/model are: %d out of %d" %(number, tot_len))

#tweets captured by topics 1-10 in order:  426.0 665.0 678.0 420.0 314.0 321.0 848.0 128.0 160.0 2929.0
#And the total number of tweets captured by the specified topics/model are: 6889 out of 8406

tweets captured by topics 1-10 in order:  426.0 665.0 678.0 420.0 314.0 321.0 848.0 128.0 160.0 2929.0
And the total number of tweets captured by the specified topics/model are: 6889 out of 8406


In [254]:
# Another faile attempt to get coherence score from the anchor model ... 

from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
from tqdm import tqdm

# Creating the term dictionary, where every unique term is assigned an index
documents = tweets_df['lemmas_reduced'] #documents.tolist()
#documents = list(tweets_df["lemmas_reduced"])

dictionary = corpora.Dictionary(documents)
 
# Creating corpus using dictionary prepared above
corpus = [dictionary.doc2bow(doc) for doc in tqdm(documents)]

# Get top words for each topic from the trained corex model
topics = model.get_topics(n_words=100)
corex_topic_words = [[word for word, tc in topic] for topic in topics]

# Get coherence score
cm_corex = CoherenceModel(topics=corex_topic_words, texts=documents, corpus=corpus, dictionary=dictionary, coherence='c_v')
cm_corex.get_coherence()


100%|██████████| 8406/8406 [00:00<00:00, 70154.75it/s]


KeyError: 'emergency'