In [1]:
%pylab inline
import pandas as pd
import pickle
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from scipy import sparse as sp

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv('data/top_health_posts.csv', index_col=0)

In [3]:
df.head(10)

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created
0,Congresswoman forces CDC to agree to offer fre...,1735,fhszdr,Health,https://losangeles.cbslocal.com/2020/03/12/oc-...,58,,1584097000.0
1,'I had all symptoms of Coronavirus. I could ha...,1647,fijk9f,Health,https://www.telegraph.co.uk/news/2020/03/14/ha...,170,,1584227000.0
2,The first coronavirus case in the U.S. and Sou...,1602,fl9r23,Health,https://www.reuters.com/article/us-health-coro...,114,,1584652000.0
3,"As coronavirus spreads, the people who prepare...",1409,fdwr56,Health,https://www.washingtonpost.com/national/as-cor...,132,,1583451000.0
4,Those who intentionally spread coronavirus cou...,1273,for03u,Health,https://www.politico.com/news/2020/03/24/coron...,83,,1585175000.0
5,Drive-thru coronavirus testing site in Denver ...,1054,fj1leg,Health,https://www.denverpost.com/2020/03/14/colorado...,95,,1584309000.0
6,A Seattle lab uncovered Washington's coronavir...,1035,fi47nl,Health,https://theweek.com/speedreads/901405/seattle-...,45,,1584153000.0
7,Coronavirus can live on surfaces for up to 3 d...,995,fh5t03,Health,https://ktla.com/news/coronavirus/coronavirus-...,86,,1583995000.0
8,"A coronavirus patient refused to quarantine, s...",991,fk8r7f,Health,https://www.cnn.com/2020/03/17/us/kentucky-ref...,163,,1584493000.0
9,Alibaba's Jack Ma Sends Boxes of Coronavirus T...,951,fjkouq,Health,https://time.com/5803791/jack-ma-alibaba-coron...,44,,1584393000.0


In [4]:
df.shape

(330, 8)

In [5]:
# remove unused columns
df = df.drop(columns= ['score', 'id', 'url', 'num_comments', 'body', 'created','subreddit'])

In [6]:
df.head(10)

Unnamed: 0,title
0,Congresswoman forces CDC to agree to offer fre...
1,'I had all symptoms of Coronavirus. I could ha...
2,The first coronavirus case in the U.S. and Sou...
3,"As coronavirus spreads, the people who prepare..."
4,Those who intentionally spread coronavirus cou...
5,Drive-thru coronavirus testing site in Denver ...
6,A Seattle lab uncovered Washington's coronavir...
7,Coronavirus can live on surfaces for up to 3 d...
8,"A coronavirus patient refused to quarantine, s..."
9,Alibaba's Jack Ma Sends Boxes of Coronavirus T...


In [7]:
# remove punctuation
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

In [8]:
df['title'] = df['title'].apply(lambda x: remove_punctuation(x))

In [9]:
# remove empty cells
import numpy as np
df['title'] = df['title'].replace(r'^\s*$', np.nan, regex=True)

In [10]:
df = df.dropna()

In [11]:
df

Unnamed: 0,title
0,Congresswoman forces CDC to agree to offer fre...
1,I had all symptoms of Coronavirus I could have...
2,The first coronavirus case in the US and South...
3,As coronavirus spreads the people who prepare ...
4,Those who intentionally spread coronavirus cou...
...,...
325,More than 140 nursing homes have reported coro...
326,‘He’s gonna get us all killed’ sense of unease...
327,Trump berates NBCs Peter Alexander over corona...
328,Coronavirus death estimates now reduced by 95


In [14]:
# collect docs as array
docs = array(df['title'])
print(len(docs))
print(type(docs))
type(docs[1])

330
<class 'numpy.ndarray'>


str

In [15]:
# stopwords function
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

# preprocessing
def docs_preprocessor(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower() #convert to lowercase
        docs[idx] = tokenizer.tokenize(docs[idx]) #split into words
    
    # remove stopwords
    docs = [[token for token in doc if token not in stopwords.words('english')] for doc in docs]
    
    # remove numbers
    docs = [[token for token in doc if not token.isdigit()] for doc in docs]
    
    # remove words < 1 character
    docs = [[token for token in doc if len(token) > 3] for doc in docs]
    
    # lemmatize
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
    
    return docs

In [16]:
docs = docs_preprocessor(docs)

In [17]:
# comput bigrams/trigrams
from gensim.models import Phrases
# add bigrams and trigrams to docs - only ones that appear 10 times or more
bigram = Phrases(docs, min_count=10)
trigram = Phrases(bigram[docs])

for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            #token is a bigram - add to doc
            docs[idx].append(token)
    for token in trigram[docs[idx]]:
        if '_' in token:
            #token is trigram - add to doc
            docs[idx].append(token)

In [18]:
# remove rare and common tokens
from gensim.corpora import Dictionary

# create a dictionary representation of the documents
dictionary = Dictionary(docs)
print(f"Number of unique words in initial docs: {len(dictionary)}")

# filter out words that occur in less than 10 or more than 20%
dictionary.filter_extremes(no_below=10, no_above=0.2)
print(f"Number of unique words after removing rare and common words: {len(dictionary)}")


Number of unique words in initial docs: 1432
Number of unique words after removing rare and common words: 29


In [19]:
#vectorize - get a 'bag-of-words' representation 
corpus = [dictionary.doc2bow(doc) for doc in docs]
print(f"Number of unique tokens: {len(dictionary)}")
print(f"Number of documents: {len(corpus)}")

Number of unique tokens: 29
Number of documents: 330


In [20]:
# train LDA model
from gensim.models import LdaModel

#set training params
num_topics = 4
chunksize = 500 # size of doc each pass examines
passes = 20 # num of passes
iterations = 400
eval_every = 1 # do not eval model perplexity

#make index to word dict
temp = dictionary[0] # load dict
id2word = dictionary.id2token

# track the CPU time and let the model run
%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                      alpha='auto', eta='auto', \
                      iterations=iterations, num_topics=num_topics, \
                      passes=passes, eval_every=eval_every)


CPU times: user 1.89 s, sys: 12 ms, total: 1.9 s
Wall time: 1.9 s


In [21]:
# visualization
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [22]:
pyLDAvis.gensim.prepare(model, corpus, dictionary)

In [23]:
# evaluation
from sklearn.metrics.pairwise import cosine_similarity

# split each doc into 2 parts
df['tokens'] = docs
docs1 = df['tokens'].apply(lambda l: l[:int0(len(l)/2)])
docs2 = df['tokens'].apply(lambda l: l[int0(len(l)/2):])

In [24]:
# transform data using LDA model
corpus1 = [dictionary.doc2bow(doc) for doc in docs1]
corpus2 = [dictionary.doc2bow(doc) for doc in docs2]

lda_corpus1 = model[corpus1]
lda_corpus2 = model[corpus2]

In [25]:
from collections import OrderedDict
#(LDA) matrix transformation of docs in the topic space
def get_doc_topic_dist(model, corpus, kwords=False):
    top_dist = []
    keys = []

    for d in corpus:
        tmp = {i:0 for i in range(num_topics)}
        tmp.update(dict(model[d]))
        vals = list(OrderedDict(tmp).values())
        top_dist += [array(vals)]
        if kwords:
            keys +- [arrays(vals).argmax()]
    return array(top_dist), keys    

In [26]:
top_dist1, _ = get_doc_topic_dist(model, lda_corpus1)
top_dist2, _ = get_doc_topic_dist(model, lda_corpus2)

print("Intra similarity: cosine similarity for corresponding parts of a doc(higher is better):")
print(mean([cosine_similarity(c1.reshape(1, -1), c2.reshape(1, -1))[0][0] for c1,c2 in zip(top_dist1, top_dist2)]))

random_pairs = np.random.randint(0, len(df['title']), size=(400, 2))

print("Inter similarity: cosine similarity between random parts (lower is better):")
print(np.mean([cosine_similarity(top_dist1[i[0]].reshape(1, -1), top_dist2[i[1]].reshape(1, -1)) for i in random_pairs]))


Intra similarity: cosine similarity for corresponding parts of a doc(higher is better):
0.9696374
Inter similarity: cosine similarity between random parts (lower is better):
0.9603348


In [27]:
# explore frequent terms in topic
def explore_topic(lda_model, topic_number, topn, output=True):
    #prints formatted list of the topn terms
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    return terms

In [28]:
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(model,topic_number=i, topn=10, output=True )
#     print tmp[:5]
    topic_summaries += [tmp[:5]]
    print

term                 frequency

Topic 0 |---------------------

test                 0.183
people               0.183
mask                 0.123
state                0.123
positive             0.103
first                0.089
american             0.063
death                0.049
week                 0.026
home                 0.021
Topic 1 |---------------------

covid19              0.218
risk                 0.095
doctor               0.095
patient              0.087
study                0.087
drug                 0.087
pandemic             0.078
disease              0.072
health               0.052
home                 0.037
Topic 2 |---------------------

say                  0.176
outbreak             0.115
could                0.112
world                0.106
patient              0.071
virus                0.071
week                 0.066
pandemic             0.063
hospital             0.045
need                 0.037
Topic 3 |---------------------

case                 0.278
tes