In [8]:
import pandas as pd
import os
import wikipedia

articles = pd.read_csv('test.csv')
articles.head()

Unnamed: 0,Rank,Page title,Views,Daily average,Assessment,Importance
0,1,"Charles, Prince of Wales",2368520,76403,B,Low
1,2,COVID-19 pandemic,1373905,44319,C,Top
2,3,COVID-19 pandemic by country and territory,1145334,36946,List,Top
3,4,Coronavirus disease 2019,815584,26309,B,Top
4,5,UFC 256,792502,25564,Start,Low


In [9]:
text = wikipedia.page("Coronavirus").content
text

'Coronaviruses are a group of related RNA viruses that cause diseases in mammals and birds. In humans and birds, they cause respiratory tract infections that can range from mild to lethal. Mild illnesses in humans include some cases of the common cold (which is also caused by other viruses, predominantly rhinoviruses), while more lethal varieties can cause SARS, MERS, and COVID-19. In cows and pigs they cause diarrhea, while in mice they cause hepatitis and encephalomyelitis.\nCoronaviruses constitute the subfamily Orthocoronavirinae, in the family Coronaviridae, order Nidovirales, and realm Riboviria. They are enveloped viruses with a positive-sense single-stranded RNA genome and a nucleocapsid of helical symmetry. The genome size of coronaviruses ranges from approximately 26 to 32 kilobases, one of the largest among RNA viruses. They have characteristic club-shaped spikes that project from their surface, which in electron micrographs create an image reminiscent of the solar corona, f

In [38]:
import re

new_text = re.sub('()[,\.!?]','', text).lower()
new_text

d = {'col1': [0], 'paper_text': [new_text]}

papers = pd.DataFrame(data = d)
papers

Unnamed: 0,col1,paper_text
0,0,coronaviruses are a group of related rna virus...


In [25]:
from wordcloud import WordCloud

In [26]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michaellam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

data = papers.paper_text.tolist()
data_words = list(sent_to_words(data))

data_words = remove_stopwords(data_words)

data_words[:1][0][:30]

['coronaviruses',
 'group',
 'related',
 'rna',
 'viruses',
 'cause',
 'diseases',
 'mammals',
 'birds',
 'humans',
 'birds',
 'cause',
 'respiratory',
 'tract',
 'infections',
 'range',
 'mild',
 'lethal',
 'mild',
 'illnesses',
 'humans',
 'include',
 'cases',
 'common',
 'cold',
 'also',
 'caused',
 'viruses',
 'predominantly',
 'rhinoviruses']

In [55]:
import gensim.corpora as corpora

id2word = corpora.Dictionary(data_words)

texts = data_words

corpus = [id2word.doc2bow(text) for text in texts]

print(corpus[:1][0][:30])

[(0, 4), (1, 2), (2, 1), (3, 1), (4, 3), (5, 2), (6, 3), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 9), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 4), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1)]


In [56]:
from pprint import pprint

num_topics = 10

lda_model = gensim.models.LdaMulticore(corpus = corpus, id2word = id2word, num_topics = num_topics)

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.011*"coronavirus" + 0.009*"human" + 0.007*"virus" + 0.006*"coronaviruses" '
  '+ 0.005*"protein" + 0.005*"cov" + 0.005*"host" + 0.005*"proteins" + '
  '0.005*"species" + 0.004*"viruses"'),
 (1,
  '0.040*"coronavirus" + 0.015*"virus" + 0.012*"species" + '
  '0.010*"coronaviruses" + 0.010*"human" + 0.008*"rna" + 0.008*"proteins" + '
  '0.008*"protein" + 0.007*"cell" + 0.007*"cov"'),
 (2,
  '0.025*"coronavirus" + 0.012*"coronaviruses" + 0.011*"virus" + '
  '0.008*"species" + 0.008*"cov" + 0.008*"host" + 0.008*"proteins" + '
  '0.007*"human" + 0.007*"viruses" + 0.006*"sars"'),
 (3,
  '0.022*"coronavirus" + 0.013*"coronaviruses" + 0.012*"human" + 0.011*"virus" '
  '+ 0.009*"cov" + 0.008*"species" + 0.008*"rna" + 0.007*"protein" + '
  '0.007*"host" + 0.006*"proteins"'),
 (4,
  '0.028*"coronavirus" + 0.017*"virus" + 0.013*"coronaviruses" + '
  '0.011*"species" + 0.010*"human" + 0.009*"protein" + 0.008*"proteins" + '
  '0.007*"rna" + 0.007*"cov" + 0.007*"host"'),
 (5,
  '0.023*"coron

In [57]:
import pyLDAvis.gensim
import pickle
import pyLDAvis

In [58]:
pyLDAvis.enable_notebook()

LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(num_topics))

if True:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
        
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
    
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_' + str(num_topics) + '.html')

LDAvis_prepared