In [1]:
import pandas as pd
import pickle
print('loading...')
with open('article_objects_noempty.pkl', 'rb') as file_in:
    text_objects = pickle.load(file_in)
df = pd.DataFrame(text_objects)
text_data = df['cleaned_text'].to_list()

loading...


In [2]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from unidecode import unidecode
import re
from tqdm.notebook import tqdm

stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
cachedStopWords = stopwords.words("english")

def preprocess(raw):
    result = unidecode(raw)
    result = re.sub(r'(https?:\/\/)?([a-zA-Z0-9-_]{1,}\.){1,}[a-zA-Z0-9-_]{1,}(\/[A-Za-z0-9-._~:?#\[\]@!$&\'()*+,;%=]{1,}){0,}\/?', '', result)
    result = re.sub(r"[^a-zA-Z]", " ", result)
    result = re.sub(' +', ' ', result)
    result = result.lower()
    result = stemmer.stem(result)
    result = [lemmatiser.lemmatize(w) for w in result.split(' ')]
    result = [w for w in result if w not in cachedStopWords]
    result = [w for w in result if len(w) > 4]
    return result

tp = []
for article in tqdm(text_data):
    preprocessed_article = preprocess(article)
    if len(preprocessed_article) >= 40:
        tp.append(preprocessed_article)

HBox(children=(FloatProgress(value=0.0, max=4467.0), HTML(value='')))




In [4]:
len(tp)

4138

In [5]:
import random
random.shuffle(tp)

tp_train = tp[500:]
tp_test = tp[:500]

In [6]:
import gensim

dictionary = gensim.corpora.Dictionary(tp_train)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in tp]

In [7]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [11]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [12]:
for idx, topic in lda_model_tfidf.print_topics(num_words=10):
    print('Topic: {} Word: {}\n'.format(idx, topic))

Topic: 0 Word: 0.005*"county" + 0.003*"china" + 0.003*"death" + 0.002*"california" + 0.002*"patient" + 0.002*"official" + 0.002*"reported" + 0.002*"friday" + 0.002*"positive" + 0.002*"total"

Topic: 1 Word: 0.004*"county" + 0.002*"patient" + 0.002*"death" + 0.002*"reported" + 0.002*"california" + 0.002*"symptom" + 0.002*"country" + 0.001*"official" + 0.001*"testing" + 0.001*"mutation"

Topic: 2 Word: 0.004*"vaccine" + 0.003*"china" + 0.002*"country" + 0.002*"death" + 0.002*"trump" + 0.002*"company" + 0.002*"chinese" + 0.002*"government" + 0.002*"patient" + 0.002*"county"

Topic: 3 Word: 0.003*"county" + 0.003*"death" + 0.002*"china" + 0.002*"patient" + 0.002*"vaccine" + 0.002*"positive" + 0.002*"antibody" + 0.002*"total" + 0.002*"country" + 0.002*"trump"

Topic: 4 Word: 0.005*"vaccine" + 0.004*"antibody" + 0.003*"trial" + 0.003*"patient" + 0.002*"study" + 0.002*"company" + 0.002*"treatment" + 0.002*"immune" + 0.002*"moderna" + 0.002*"phase"

Topic: 5 Word: 0.003*"patient" + 0.003*"plas

In [13]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
vis

In [34]:
from gensim.test.utils import datapath

lda_model_tfidf.save('lda.model')

In [36]:
import numpy
numpy.__version__

'1.18.5'

In [18]:
lda = gensim.models.LdaMulticore.load(temp_file)

In [20]:
tp_test[0]

['first',
 'since',
 'began',
 'reporting',
 'january',
 'chinese',
 'authority',
 'registered',
 'infection',
 'coronavirus',
 'brazil',
 'confirmed',
 'total',
 'coronavirus',
 'overtaking',
 'russia',
 'become',
 'world',
 'second',
 'hotspot',
 'covid',
 'million',
 'infant',
 'could',
 'vaccine',
 'preventable',
 'disease',
 'diphtheria',
 'measles',
 'polio',
 'disruption',
 'routine',
 'immunisation',
 'caused',
 'pandemic',
 'agency',
 'million',
 'people',
 'around',
 'world',
 'confirmed',
 'coronavirus',
 'according',
 'compiled',
 'hopkins',
 'university',
 'people',
 'globally',
 'million',
 'people',
 'recovered',
 'latest',
 'update',
 'woman',
 'raised',
 'question',
 'florida',
 'covid',
 'removed',
 'curator',
 'reprimanded',
 'several',
 'violating',
 'health',
 'department',
 'policy',
 'including',
 'posting',
 'political',
 'commentary',
 'information',
 'state',
 'record',
 'rebekah',
 'jones',
 'comment',
 'email',
 'researcher',
 'interview',
 'handful',
 'medi

In [30]:
test_corpus = [dictionary.doc2bow(doc) for doc in tp_test]
sorted(lda[test_corpus[8]], key=lambda x:-x[1])

[(4, 0.35516104),
 (7, 0.20677634),
 (0, 0.19640242),
 (8, 0.1613356),
 (9, 0.07622183)]

In [46]:
x = [[a, float(b)] for a, b in lda[test_corpus[8]]]
y = [[a, float(b)] for a, b in lda[test_corpus[9]]]

In [47]:
import time
ts1 = time.time()
for i in range(100 * 100):
    sim = gensim.matutils.cossim(a, b)
print(time.time() - ts1)
print(sim)

0.19605278968811035
0.4414360460028187


In [38]:
sim

0.3967043563884227