In [1]:
import pandas as pd
#import spacy
import re

import numpy as np
import lda
import lda.datasets
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from nltk.corpus import stopwords

In [3]:
#nlp = spacy.load('en')

In [4]:
data = pd.read_csv('reddit.csv')

In [5]:
data['Created'] = pd.to_numeric(data['Created'],downcast='integer')

In [6]:
titles = data['Title'].as_matrix()

In [7]:
titles

array(['Macron wins French presidency by decisive margin over Le Pen',
       'Theresa May announces snap election for June 8th',
       "Macron-Le Pen 'in French run-off'", ...,
       'Merkel to meet leaders of Turkey, United Arab Emirates ',
       'Jump-start economy: Give health care to all ',
       'The U.N. Mismanagement Program'], dtype=object)

In [8]:
alphanumeric = [re.sub("[^a-zA-Z0-9 ]"," ",title).lower() for title in titles]

In [9]:
split = [title.split() for title in alphanumeric]

In [10]:
stopwords = stopwords.words("english")
tokenized = [[word for word in title if not word in stopwords] for title in split]

In [11]:
tokenized_sent = np.asarray([" ".join(words) for words in tokenized])

In [12]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)
bow = vectorizer.fit_transform(tokenized_sent)

In [13]:
print(bow.shape)
X = bow.todense()
vocab = vectorizer.get_feature_names()

(51456, 5000)
(1, 5000)


In [59]:
_, idx = np.where(np.sum(bow,axis=1).squeeze()==0)
print(titles[idx])
print(tokenized_sent[idx])

['"Czechs: we\'re not Chechens"'
 'WHO classifies diesel fumes as a carcinogen' 'How Ironic. '
 'Devastation on Madeira  ' 'Cockermouth is flooded' 'Rotterdam, Eurabia'
 "It's the PAX!" "Don't mess with these chicks or you'll be sari"
 'Dig pinpoints Stonehenge origins' '● ● ● ▬ ▬ ▬ ● ● ●' 'The Matrix Poem'
 'Londoners oust Livingstone' 'The Fortress of Liechtenstein Is Wobbling']
['czechs chechens' 'classifies diesel fumes carcinogen' 'ironic'
 'devastation madeira' 'cockermouth flooded' 'rotterdam eurabia' 'pax'
 'mess chicks sari' 'dig pinpoints stonehenge origins' '' 'matrix poem'
 'londoners oust livingstone' 'fortress liechtenstein wobbling']


In [60]:
# Remove zero rows from bag of words
_, idx = np.where(np.sum(bow,axis=1).squeeze()>0)
X = X[idx]

IndexError: index 51443 is out of bounds for axis 0 with size 51443

In [85]:
X = np.asarray(X)

In [19]:
X_r = lda.datasets.load_reuters()
vocab_r = lda.datasets.load_reuters_vocab()
titles_r = lda.datasets.load_reuters_titles()

In [86]:
print(X.reshape(51443,5000).shape)
print(len(titles))
print(X[0])
print(len(vocab))
print('reuters')
print(X_r.shape)
print(X_r[0:3])
print(len(titles_r))
print(X_r[0])
print(len(vocab_r))

(51443, 5000)
51456
[0 0 0 ..., 0 0 0]
5000
reuters
(395, 4258)
[[1 0 1 ..., 0 0 0]
 [7 0 2 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
395
[1 0 1 ..., 0 0 0]
4258


In [112]:
model = lda.LDA(n_topics=30, n_iter=1500, random_state=1)
model.fit(X)

INFO:lda:n_documents: 51443
INFO:lda:vocab_size: 5000
INFO:lda:n_words: 512575
INFO:lda:n_topics: 30
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -6651135
INFO:lda:<10> log likelihood: -4839542
INFO:lda:<20> log likelihood: -4563456
INFO:lda:<30> log likelihood: -4453960
INFO:lda:<40> log likelihood: -4401469
INFO:lda:<50> log likelihood: -4371028
INFO:lda:<60> log likelihood: -4346722
INFO:lda:<70> log likelihood: -4327880
INFO:lda:<80> log likelihood: -4312978
INFO:lda:<90> log likelihood: -4301306
INFO:lda:<100> log likelihood: -4292240
INFO:lda:<110> log likelihood: -4285505
INFO:lda:<120> log likelihood: -4277939
INFO:lda:<130> log likelihood: -4271829
INFO:lda:<140> log likelihood: -4265097
INFO:lda:<150> log likelihood: -4260939
INFO:lda:<160> log likelihood: -4254980
INFO:lda:<170> log likelihood: -4253736
INFO:lda:<180> log likelihood: -4251166
INFO:lda:<190> log likelihood: -4247902
INFO:lda:<200> log likelihood: -4246344
INFO:lda:<210> log likelihood: -4244174
INFO:lda

<lda.lda.LDA at 0x254043c9c50>

In [113]:
topic_word = model.topic_word_  # model.components_ also works
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: war crimes iraq torture court former cia police
Topic 1: rights human un turkey says kim jong president
Topic 2: muslim attack muslims attacks pakistan suicide islamic people
Topic 3: news bbc media china chinese internet facebook google
Topic 4: law court new bill government ban internet laws
Topic 5: russia russian ukraine putin military says us nato
Topic 6: china world oil power energy food water gas
Topic 7: isis us al syria syrian iraq afghanistan killed
Topic 8: gaza saudi israel israeli arabia hamas aid fire
Topic 9: police video man shot dead fire woman officers
Topic 10: women sex gay pope children child abuse men
Topic 11: pay bank million tax billion money banks 000
Topic 12: drug mexico war world drugs marijuana president mexican
Topic 13: 000 years year people last million since 10
Topic 14: assange wikileaks us house julian bay swedish white
Topic 15: israel israeli palestinian west palestinians jewish bank east
Topic 16: germany united states eu european europe

In [118]:
doc_topic = model.doc_topic_
for i in range(10):
    print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))

(51443, 30)
Macron wins French presidency by decisive margin over Le Pen (top topic: 27)
Theresa May announces snap election for June 8th (top topic: 20)
Macron-Le Pen 'in French run-off' (top topic: 27)
900 suspected pedophiles arrested as darknet child porn kingpin jailed for 30yrs (top topic: 10)
Trump defends 'right' to share secrets - BBC News (top topic: 3)
Macron campaign emails appear to be leaked online (top topic: 3)
Norway's Progress Party calls for ban on circumcision of boys (top topic: 10)
CNN exclusive: Grand jury subpoenas issued in FBI's Russia investigation (top topic: 0)
Champs Elysées in Paris closed, reports of 2 police officers shot. (top topic: 2)
Hospitals across England hit by large-scale cyber-attack (top topic: 22)


In [114]:
model.loglikelihood()

-4217017.299497036

In [120]:
idxs = np.where(doc_topic[:,2]>0.6)[0]
for i in range(len(idxs)):
    print("Title: {}".format(titles[idxs[i]]))

Title: Bombing reported near church in Egyptian city of Tanta
Title: Plot to hit German shopping centre with multiple suicide bombers is foiled after police are tipped off about 'imminent attack'
Title: Alexandre Bissonnette identified as suspect in #QuebecShooting. Mohamed Khadir, originally identified as a suspect, has now been named as a witness.
Title: Berlin market attack suspect killed in Milan, reports say
Title: Istanbul nightclub attack: ISIS claims responsibility
Title: 28 dead, 54 injured in bomb attacks in Baghdad
Title: At least 22 killed after suicide bomber targets crowded Baghdad market
Title: Muslim cleric banned in Pakistan is preaching in UK mosques: It is feared that Syed Muzaffar Shah Qadri, who praised the murder of a politician, will incite hatred between Muslims
Title: Muslim mob attacks Mosque in Pakistan
Title: American university in Kabul under attack, reports of gunfire, explosion
Title: A suicide car bomb killed at least 21 people and wounded more than 32 a