In [1]:
# load data
import pandas as pd

data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [12]:
print(len(documents))

1103665


In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import numpy as np
np.random.seed(2018)

In [3]:
import nltk
#nltk.download('wordnet')

In [8]:
# remove words who is in stopwords or length < 3
from nltk import PorterStemmer

def lemmatize_stemming(text):
    #return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
    return PorterStemmer().stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [9]:
# test preprocess
doc_sample = documents[documents['index'] == 4310].values[0][0]
print(doc_sample)
print(preprocess(doc_sample))

rain helps dampen bushfires
['rain', 'help', 'dampen', 'bushfir']


In [10]:
# preprocess all headlines
process_docs = documents['headline_text'].map(preprocess)

In [21]:
dictionary = gensim.corpora.Dictionary(process_docs)

In [22]:
print(len(dictionary.iteritems()))

62379


In [23]:
# filter out tokens that appear in

# 1. less than 15 documents (absolute number) or
# 2. more than 0.5 documents (fraction of total corpus size, not absolute number).
# 3. after the above two steps, keep only the first 100000 most frequent tokens.

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [32]:
# get all docs' word distribution
bow_corpus = [dictionary.doc2bow(doc) for doc in process_docs]

In [50]:
# test bag of words
cnt = 0
for doc in bow_corpus:
    print(doc)
    cnt += 1
    if cnt > 5:
        break

[(0, 1), (1, 1), (2, 1), (3, 1)]
[(4, 1), (5, 1), (6, 1)]
[(7, 1), (8, 1), (9, 1), (10, 1)]
[(11, 1), (12, 1), (13, 1), (14, 1)]
[(14, 1), (15, 1), (16, 1), (17, 1)]
[(18, 1), (19, 1), (20, 1), (21, 1)]


In [35]:
# generate tf_idf score
from gensim import corpora, models

tf_idf = models.TfidfModel(bow_corpus)
corpus_tfidf = tf_idf[bow_corpus]

In [49]:
# test tf_idf scores
cnt = 0
for doc in corpus_tfidf:
    print(doc)
    cnt += 1
    if cnt > 5:
        break

[(0, 0.5903602896750699), (1, 0.38524510107363613), (2, 0.4974556071174764), (3, 0.5055678583740412)]
[(4, 0.5950795791857478), (5, 0.6246275884087716), (6, 0.505688313326258)]
[(7, 0.38320252667383414), (8, 0.5606508321974041), (9, 0.4720788907709393), (10, 0.5621102994925704)]
[(11, 0.5300483107405607), (12, 0.4377844649314713), (13, 0.5401253870279202), (14, 0.48544630684935974)]
[(14, 0.4584894785270075), (15, 0.5645993838792257), (16, 0.3883597714037109), (17, 0.5658547709072431)]
[(18, 0.6495723838017874), (19, 0.46245282544959365), (20, 0.5028027888963171), (21, 0.3337401053422206)]


In [53]:
# use bag of words to perform lda
lda_model_bow = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [54]:
for idx, topic in lda_model_bow.print_topics():
    print('topic',idx)
    print(topic)

topic 0
0.029*"year" + 0.028*"charg" + 0.026*"court" + 0.020*"murder" + 0.017*"face" + 0.014*"interview" + 0.014*"woman" + 0.014*"jail" + 0.014*"south" + 0.014*"polic"
topic 1
0.021*"polic" + 0.019*"adelaid" + 0.019*"women" + 0.016*"donald" + 0.014*"investig" + 0.013*"life" + 0.013*"concern" + 0.013*"home" + 0.012*"test" + 0.012*"protest"
topic 2
0.019*"school" + 0.018*"market" + 0.016*"health" + 0.016*"rural" + 0.014*"hour" + 0.013*"share" + 0.013*"servic" + 0.012*"deal" + 0.012*"north" + 0.011*"worker"
topic 3
0.030*"govern" + 0.024*"australia" + 0.024*"world" + 0.020*"nation" + 0.017*"countri" + 0.016*"final" + 0.015*"open" + 0.014*"tasmanian" + 0.011*"leagu" + 0.010*"win"
topic 4
0.021*"canberra" + 0.021*"coast" + 0.014*"price" + 0.014*"gold" + 0.011*"time" + 0.010*"australia" + 0.010*"research" + 0.009*"marriag" + 0.009*"star" + 0.008*"like"
topic 5
0.021*"crash" + 0.019*"die" + 0.016*"tasmania" + 0.013*"break" + 0.012*"road" + 0.012*"abus" + 0.012*"bank" + 0.011*"show" + 0.011*"p

In [51]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [52]:
for idx, topic in lda_model_tfidf.print_topics():
    print('topic',idx)
    print(topic)

topic 0
0.011*"elect" + 0.010*"govern" + 0.008*"marriag" + 0.006*"say" + 0.006*"senat" + 0.006*"parti" + 0.006*"labor" + 0.005*"april" + 0.005*"liber" + 0.005*"updat"
topic 1
0.016*"charg" + 0.014*"murder" + 0.014*"crash" + 0.012*"polic" + 0.011*"woman" + 0.011*"court" + 0.010*"jail" + 0.009*"shoot" + 0.009*"assault" + 0.008*"sentenc"
topic 2
0.009*"health" + 0.008*"fund" + 0.008*"drum" + 0.007*"govern" + 0.006*"sport" + 0.005*"servic" + 0.005*"school" + 0.005*"budget" + 0.005*"mental" + 0.004*"indigen"
topic 3
0.024*"countri" + 0.022*"hour" + 0.020*"rural" + 0.015*"news" + 0.012*"podcast" + 0.009*"live" + 0.008*"busi" + 0.007*"john" + 0.007*"nation" + 0.007*"climat"
topic 4
0.015*"trump" + 0.009*"world" + 0.009*"final" + 0.008*"australia" + 0.007*"leagu" + 0.007*"market" + 0.007*"australian" + 0.007*"open" + 0.006*"grandstand" + 0.006*"share"
topic 5
0.009*"turnbul" + 0.008*"violenc" + 0.007*"korea" + 0.006*"tuesday" + 0.006*"kill" + 0.006*"domest" + 0.005*"syria" + 0.005*"say" + 0.00

In [62]:
# test a known doc
known_doc = bow_corpus[4000]

for index, score in lda_model_bow[known_doc]:
    print('score',score)
    print(lda_model.print_topic(index))
    
print()

for index, score in lda_model_tfidf[known_doc]:
    print('score',score)
    print(lda_model.print_topic(index))

score 0.02
0.031*"queensland" + 0.020*"countri" + 0.018*"hospit" + 0.017*"tasmanian" + 0.017*"hour" + 0.015*"work" + 0.014*"children" + 0.013*"student" + 0.013*"citi" + 0.012*"school"
score 0.020004459
0.029*"australia" + 0.023*"world" + 0.017*"market" + 0.012*"record" + 0.012*"share" + 0.011*"victoria" + 0.010*"industri" + 0.010*"australian" + 0.009*"port" + 0.008*"fall"
score 0.6193763
0.035*"trump" + 0.022*"kill" + 0.022*"north" + 0.020*"coast" + 0.016*"china" + 0.016*"attack" + 0.015*"adelaid" + 0.014*"west" + 0.013*"price" + 0.013*"gold"
score 0.22060585
0.027*"govern" + 0.017*"plan" + 0.015*"council" + 0.014*"rural" + 0.013*"say" + 0.012*"indigen" + 0.012*"chang" + 0.012*"turnbul" + 0.012*"water" + 0.011*"commun"
score 0.02
0.017*"tasmania" + 0.015*"time" + 0.015*"fight" + 0.014*"life" + 0.014*"leagu" + 0.013*"lose" + 0.010*"john" + 0.009*"premier" + 0.009*"unit" + 0.008*"compani"
score 0.02000162
0.027*"elect" + 0.023*"south" + 0.022*"nation" + 0.017*"final" + 0.016*"open" + 0.0

In [66]:
# test an unknown doc
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model_bow[bow_vector], key=lambda tup: -1*tup[1]):
    print('topic',index)
    print('score',score)
    #print(lda_model.print_topic(index))
    
print()

for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print('topic',index)
    print('score',score)
    #print(lda_model.print_topic(index))

topic 2
score 0.5166655
topic 6
score 0.18333334
topic 1
score 0.1833333
topic 5
score 0.016667439
topic 7
score 0.016667051
topic 3
score 0.016666686
topic 4
score 0.016666675
topic 0
score 0.016666668
topic 8
score 0.016666668
topic 9
score 0.016666668

topic 0
score 0.50218195
topic 4
score 0.19451046
topic 9
score 0.1866321
topic 5
score 0.016669193
topic 2
score 0.01666825
topic 8
score 0.01666818
topic 3
score 0.016667897
topic 7
score 0.016667632
topic 6
score 0.016667472
topic 1
score 0.016666887
