In [1]:
%pylab inline
import pandas as pd
import pickle
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from scipy import sparse as sp

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv('data/raw_comments_from_top_health_posts.csv', index_col=0)

In [3]:
df.head(10)

Unnamed: 0,comment_id,comment_score,comment_created,comment_body
0,fkd6xyq,189,1584099000.0,Happy I had the privilege to vote for Katie Po...
1,fkdb1ca,84,1584102000.0,This serves as a reminder that the pomp and ci...
2,fkdjt6m,29,1584110000.0,And how many won’t have jobs tomorrow if they ...
3,fkdid6b,19,1584109000.0,Katie Porter rocks!
4,fkebcd7,14,1584138000.0,I see a lot of comments from people who may be...
5,fkdem27,24,1584105000.0,Treatment is what is needed as well. I know pe...
6,fkdniru,9,1584115000.0,Now that is a woman/person that I find amazing...
7,fke1n5m,7,1584132000.0,That’s great except we don’t have any tests to...
8,fkdvnri,4,1584126000.0,Getting him to say 'yes' felt like pulling teeth.
9,fkf10fr,4,1584154000.0,Can we have HER as president?! \n\nIn an earl...


In [4]:
df.shape

(5005, 4)

In [5]:
# remove unused columns
df = df.drop(columns= ['comment_id', 'comment_score', 'comment_created'])

In [6]:
df.head(10)

Unnamed: 0,comment_body
0,Happy I had the privilege to vote for Katie Po...
1,This serves as a reminder that the pomp and ci...
2,And how many won’t have jobs tomorrow if they ...
3,Katie Porter rocks!
4,I see a lot of comments from people who may be...
5,Treatment is what is needed as well. I know pe...
6,Now that is a woman/person that I find amazing...
7,That’s great except we don’t have any tests to...
8,Getting him to say 'yes' felt like pulling teeth.
9,Can we have HER as president?! \n\nIn an earl...


In [7]:
# remove [deleted] or [none] rows
rowsToDrop = df[(df['comment_body'] == "[removed]") | (df['comment_body'] == "[deleted]")].index
df.drop(rowsToDrop, inplace=True)

In [8]:
# remove newline chars
def remove_newlines(txt):
    no_nl = txt.replace('\n',' ')
    return no_nl
df['comment_body'] = df['comment_body'].apply(lambda x: remove_newlines(x))

In [9]:
# remove urls
df['comment_body'] = df['comment_body'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

In [10]:
# remove punctuation
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

In [11]:
df['comment_body'] = df['comment_body'].apply(lambda x: remove_punctuation(x))

In [12]:
# remove empty cells
import numpy as np
df['comment_body'] = df['comment_body'].replace(r'^\s*$', np.nan, regex=True)

In [13]:
df = df.dropna()

In [14]:
df

Unnamed: 0,comment_body
0,Happy I had the privilege to vote for Katie Po...
1,This serves as a reminder that the pomp and ci...
2,And how many won’t have jobs tomorrow if they ...
3,Katie Porter rocks
4,I see a lot of comments from people who may be...
...,...
4999,Wuhan endless queues for ashes of coronaviru...
5000,Do you have a link from something like the New...
5002,More than 140 nursing homes in the United St...
5003,Federal Officials I am sure many people will w...


In [15]:
# collect docs as array
docs = array(df['comment_body'])
print(len(docs))
print(type(docs))
type(docs[1])
docs

4819
<class 'numpy.ndarray'>


array(['Happy I had the privilege to vote for Katie Porter',
       'This serves as a reminder that the pomp and circumstance some of Congress members put on during trials and meetings can ultimately serve a purpose  We need truth  We need results  We need our representatives to go to bat for us otherwise nothing is accomplished',
       'And how many won’t have jobs tomorrow if they take today off sick Not worth getting tested They’ll go in and work until they fall over America is great',
       ...,
       '  More than 140 nursing homes in the United States have at least one resident who tested positive for the coronavirus according to federal government figures released earlier this week but exactly which homes are affected and where they are federal officials won’t say    Their refusal to release the information has angered families industry watchdogs and emergency personnel who say it deprives them of critical information as they try to ensure the safety of nursing home residents 

In [16]:
# stopwords function
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

# preprocessing
def docs_preprocessor(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower() #convert to lowercase
        docs[idx] = tokenizer.tokenize(docs[idx]) #split into words
    
    # remove stopwords
    docs = [[token for token in doc if token not in stopwords.words('english')] for doc in docs]
    
    # remove numbers
    docs = [[token for token in doc if not token.isdigit()] for doc in docs]
    
    # remove words < 1 character
    docs = [[token for token in doc if len(token) > 3] for doc in docs]
    
    # lemmatize
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
    
    return docs

In [17]:
docs = docs_preprocessor(docs)

In [18]:
# comput bigrams/trigrams
from gensim.models import Phrases
# add bigrams and trigrams to docs - only ones that appear 10 times or more
bigram = Phrases(docs, min_count=10)
trigram = Phrases(bigram[docs])

for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            #token is a bigram - add to doc
            docs[idx].append(token)
    for token in trigram[docs[idx]]:
        if '_' in token:
            #token is trigram - add to doc
            docs[idx].append(token)

In [19]:
# remove rare and common tokens
from gensim.corpora import Dictionary

# create a dictionary representation of the documents
dictionary = Dictionary(docs)
print(f"Number of unique words in initial docs: {len(dictionary)}")

# filter out words that occur in less than 10 or more than 20%
dictionary.filter_extremes(no_below=10, no_above=0.2)
print(f"Number of unique words after removing rare and common words: {len(dictionary)}")


Number of unique words in initial docs: 9977
Number of unique words after removing rare and common words: 1455


In [20]:
#vectorize - get a 'bag-of-words' representation 
corpus = [dictionary.doc2bow(doc) for doc in docs]
print(f"Number of unique tokens: {len(dictionary)}")
print(f"Number of documents: {len(corpus)}")

Number of unique tokens: 1455
Number of documents: 4819


In [21]:
# train LDA model
from gensim.models import LdaModel

#set training params
num_topics = 4
chunksize = 500 # size of doc each pass examines
passes = 20 # num of passes
iterations = 400
eval_every = 1 # do not eval model perplexity

#make index to word dict
temp = dictionary[0] # load dict
id2word = dictionary.id2token

# track the CPU time and let the model run
%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                      alpha='auto', eta='auto', \
                      iterations=iterations, num_topics=num_topics, \
                      passes=passes, eval_every=eval_every)


CPU times: user 34.8 s, sys: 105 ms, total: 34.9 s
Wall time: 34.9 s


In [23]:
# visualization
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [24]:
pyLDAvis.gensim.prepare(model, corpus, dictionary)

In [25]:
# evaluation
from sklearn.metrics.pairwise import cosine_similarity

# split each doc into 2 parts
df['tokens'] = docs
docs1 = df['tokens'].apply(lambda l: l[:int0(len(l)/2)])
docs2 = df['tokens'].apply(lambda l: l[int0(len(l)/2):])

In [26]:
# transform data using LDA model
corpus1 = [dictionary.doc2bow(doc) for doc in docs1]
corpus2 = [dictionary.doc2bow(doc) for doc in docs2]

lda_corpus1 = model[corpus1]
lda_corpus2 = model[corpus2]

In [27]:
from collections import OrderedDict
#(LDA) matrix transformation of docs in the topic space
def get_doc_topic_dist(model, corpus, kwords=False):
    top_dist = []
    keys = []

    for d in corpus:
        tmp = {i:0 for i in range(num_topics)}
        tmp.update(dict(model[d]))
        vals = list(OrderedDict(tmp).values())
        top_dist += [array(vals)]
        if kwords:
            keys +- [arrays(vals).argmax()]
    return array(top_dist), keys    

In [29]:
top_dist1, _ = get_doc_topic_dist(model, lda_corpus1)
top_dist2, _ = get_doc_topic_dist(model, lda_corpus2)

print("Intra similarity: cosine similarity for corresponding parts of a doc(higher is better):")
print(mean([cosine_similarity(c1.reshape(1, -1), c2.reshape(1, -1))[0][0] for c1,c2 in zip(top_dist1, top_dist2)]))

random_pairs = np.random.randint(0, len(df['comment_body']), size=(400, 2))

print("Inter similarity: cosine similarity between random parts (lower is better):")
print(np.mean([cosine_similarity(top_dist1[i[0]].reshape(1, -1), top_dist2[i[1]].reshape(1, -1)) for i in random_pairs]))


Intra similarity: cosine similarity for corresponding parts of a doc(higher is better):
0.9996776
Inter similarity: cosine similarity between random parts (lower is better):
0.9994464


In [30]:
# explore frequent terms in topic
def explore_topic(lda_model, topic_number, topn, output=True):
    #prints formatted list of the topn terms
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    return terms

In [31]:
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(model,topic_number=i, topn=10, output=True )
#     print tmp[:5]
    topic_summaries += [tmp[:5]]
    print

term                 frequency

Topic 0 |---------------------

people               0.036
like                 0.022
case                 0.013
also                 0.013
time                 0.012
week                 0.011
number               0.010
test                 0.010
still                0.010
coronavirus          0.009
Topic 1 |---------------------

work                 0.020
thing                0.020
health               0.017
care                 0.017
take                 0.016
home                 0.013
right                0.013
year                 0.013
health_care          0.013
risk                 0.012
Topic 2 |---------------------

healthcare           0.020
hand                 0.018
italy                0.017
system               0.017
wash_hand            0.014
healthcare_worker    0.013
corona_virus         0.013
face                 0.013
healthcare_system    0.013
immune_system        0.011
Topic 3 |---------------------

mask                 0.027
nee