In [1]:
import pandas as pd
#import spacy
import re

import numpy as np
import lda
import lda.datasets
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from nltk.corpus import stopwords

In [3]:
#nlp = spacy.load('en')

In [3]:
data = pd.read_csv('data/reddit2017.csv')

In [4]:
data['Created'] = pd.to_numeric(data['Created'],downcast='integer')

In [5]:
titles = data['Title'].as_matrix()

In [6]:
titles

array(['Trump hires private attorney for Russia probe',
       'Study finds mushrooms are the safest recreational drug - People taking mushrooms in 2016 needed medical treatment less than for MDMA, LSD and cocaine, while one of the riskiest drugs was synthetic cannabis',
       "Isis Flag Raised In Philippines As Militants 'Take Over Hospital And Set Fire To Jail And School'",
       ...,
       'U.S., Iraqi sources say Islamic State leader Baghdadi alive despite death reports',
       "Week in Review: U.S. Ambushes Israel in UN, Xi's Power Grab, Iran Presses Advantage, and Much More",
       'Pakistan to hand over dossier on India to new UN chief'], dtype=object)

In [7]:
tokenized = [re.sub("[^a-zA-Z0-9 ]"," ",title).lower() for title in titles]

In [8]:
tokenized = [title.split() for title in tokenized]

In [9]:
stopwords = stopwords.words("english")
# Note: "not" is removed here, not sure if we want that
tokenized = [[word for word in title if not word in stopwords] for title in tokenized]

In [10]:
tokenized = np.asarray([" ".join(words) for words in tokenized])

In [13]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)
bow = vectorizer.fit_transform(tokenized)

In [12]:
np.savetxt('headlines2017.txt',tokenized,fmt="%s")
np.savetxt('headlines2017_info.txt',data[["Domain","Created"]].values,fmt="%s")

In [14]:
X = bow.todense()
vocab = vectorizer.get_feature_names()

In [14]:
_, idx = np.where(np.sum(bow,axis=1).squeeze()==0)
print(titles[idx])
print(tokenized_sent[idx])

["Mossack Fonseca's Reponse to ICIJ"
 'Italians Compare the Arrival of Starbucks to the Apocalypse'
 'The Planets are About to Align' 'Monty Python to reunite'
 'No Morsel Too Minuscule for All-Consuming N.S.A.'
 '"Czechs: we\'re not Chechens"' "Felix Baumgartner's skydive is postponed"
 'Cadbury can now trademark their hue of purple. '
 'Shenzhou-9 docks with Tiangong-1' 'Rags to riches..'
 'Jupiter and Venus conjunction dazzles amateur astrologists'
 'Confessions of a Stratfor subscriber'
 '\u200bI\u200br\u200ba\u200bn\u200b \u200bs\u200bt\u200bo\u200bp\u200bs\u200b \u200bo\u200bi\u200bl\u200b \u200be\u200bx\u200bp\u200bo\u200br\u200bt\u200bs\u200b \u200bt\u200bo\u200b \u200bs\u200bi\u200bx\u200b \u200bE\u200bU\u200b \u200bc\u200bo\u200bu\u200bn\u200bt\u200br\u200bi\u200be\u200bs\u200b.'
 "H\u200ba\u200bg\u200bu\u200be\u200b \u200bf\u200be\u200ba\u200br\u200bs\u200b \u200bI\u200br\u200ba\u200bn\u200b \u200bc\u200bo\u200bu\u200bl\u200bd\u200b \u200bs\u200bt\u200ba\u200br\u200bt\u200b 

In [15]:
# Remove zero rows from bag of words
_, idx = np.where(np.sum(bow,axis=1).squeeze()>0)
X = X[idx]

In [16]:
X = np.asarray(X)

In [19]:
X_r = lda.datasets.load_reuters()
vocab_r = lda.datasets.load_reuters_vocab()
titles_r = lda.datasets.load_reuters_titles()

In [45]:
model = lda.LDA(n_topics=40, n_iter=3000, random_state=1, alpha=0.05)
model.fit(X)

INFO:lda:n_documents: 99993
INFO:lda:vocab_size: 5000
INFO:lda:n_words: 969467
INFO:lda:n_topics: 40
INFO:lda:n_iter: 3000
INFO:lda:<0> log likelihood: -12857463
INFO:lda:<10> log likelihood: -9217884
INFO:lda:<20> log likelihood: -8584346
INFO:lda:<30> log likelihood: -8319647
INFO:lda:<40> log likelihood: -8180635
INFO:lda:<50> log likelihood: -8094527
INFO:lda:<60> log likelihood: -8038819
INFO:lda:<70> log likelihood: -8005029
INFO:lda:<80> log likelihood: -7971781
INFO:lda:<90> log likelihood: -7953448
INFO:lda:<100> log likelihood: -7935398
INFO:lda:<110> log likelihood: -7923506
INFO:lda:<120> log likelihood: -7913334
INFO:lda:<130> log likelihood: -7906243
INFO:lda:<140> log likelihood: -7899696
INFO:lda:<150> log likelihood: -7896234
INFO:lda:<160> log likelihood: -7889713
INFO:lda:<170> log likelihood: -7884926
INFO:lda:<180> log likelihood: -7882210
INFO:lda:<190> log likelihood: -7879779
INFO:lda:<200> log likelihood: -7875402
INFO:lda:<210> log likelihood: -7877567
INFO:ld

<lda.lda.LDA at 0x1a1c0d90e10>

In [46]:
topic_word = model.topic_word_  # model.components_ also works
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: power china energy nuclear world india solar fukushima
Topic 1: refugees germany migrants europe german refugee merkel eu
Topic 2: syria syrian us al assad weapons rebels chemical
Topic 3: uk eu brexit says may european britain british
Topic 4: minister turkey president prime turkish erdogan says former
Topic 5: rights court human war crimes international report un
Topic 6: climate change global world study could scientists report
Topic 7: food marijuana canada use ban cannabis drug city
Topic 8: new law bill government laws anti parliament australian
Topic 9: russian plane flight crash jet air missing malaysia
Topic 10: china us sea chinese japan south russia military
Topic 11: sex women pope child abuse children gay francis
Topic 12: africa south african japan china new world ivory
Topic 13: news bbc media tv internet facebook social twitter
Topic 14: year world 000 since years million record percent
Topic 15: police protest thousands protesters protests hong anti kong
Topic

In [60]:
idx = np.where(np.array(vocab)=="trump")[0][0]
print("Topic: {}".format(np.argmax(topic_word[:,idx])))
probs = topic_word[:,idx]
print((probs-np.mean(probs))/np.std(probs))

Topic: 21
[-0.1871394  -0.18713018 -0.18713935 -0.18712991 -0.18712667 -0.18713265
 -0.1871426  -0.18712519 -0.18713593 -0.18712105 -0.18712973 -0.18712889
 -0.18712468 -0.18713034 -0.18714602 -0.18713384 -0.18714022 -0.18713331
 -0.18713356  0.12435151 -0.18712474  6.21651134 -0.18714681 -0.18713305
 -0.18712562 -0.18712886 -0.18713172 -0.18713441 -0.18712725 -0.18712511
  0.21332113 -0.18712949 -0.18713975 -0.18714452 -0.1871314  -0.18714099
 -0.18713454 -0.18714135 -0.1871221   0.18260122]


In [50]:
doc_topic = model.doc_topic_
for i in range(10):
    print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))

Hospitals across England hit by large-scale cyber-attack (top topic: 35)
North Korea launches a new unidentified missile as tensions with US fester (top topic: 19)
FBI acting boss Andrew McCabe stands by Russia probe - BBC News (top topic: 4)
Facebook Shuts Down World’s Biggest Page For Atheists (top topic: 13)
U.S. in final stages of $100 billion arms deal for Saudi Arabia: White House official (top topic: 38)
Macron inaugurated at Elysee Palace (top topic: 29)
Trump revealed highly classified information to Russian foreign minister and ambassador (top topic: 22)
Pakistan: People who smoke, eat openly during Ramazan face 3-month imprisonment (top topic: 37)
French Dislike Donald Trump Even More Than Putin, Xi and Merkel, Poll Finds (top topic: 21)
Trump's disclosure endangered spy placed inside ISIS by Israel, officials say (top topic: 33)


In [36]:
model.loglikelihood()

-7861723.590164094

In [61]:
idxs = np.where(doc_topic[:,21]>0.8)[0]
for i in range(len(idxs)):
    print("Title: {}".format(titles[idxs[i]]))

Title: Worried world urges Trump not to pull out of Paris climate agreement
Title: White House advisors called Ottawa to urge Trudeau to help talk Trump down from scrapping NAFTA
Title: Suspend visa-free EU travel for U.S. citizens, lawmakers say
Title: Suspend Visa-free EU Travel for US Citizens, Lawmakers Say
Title: Netanyahu denies he had expressed support for US President Donald Trump’s push for a border wall with Mexico
Title: Russia is 'tearing down' world order, US ambassador to UN says in final speech
Title: Canada Hopeful TPP Can Survive Without U.S., Trade Minister Says - Canada will consider pursuing a new multilateral Pacific Rim trade deal now that President Donald Trump has signaled the U.S. is abandoning the Trans-Pacific Partnership.
Title: Trump Trade Strategy's Start Worries China: US to Withdraw From TPP, Renegotiate NAFTA Deal
Title: Xi to be first Chinese leader to attend Davos World Economic Forum
Title: Obama says China would not take change in U.S. policy on Tai