### Carry out some vanilla LDA with sklearn

In [9]:
import pandas as pd
import pickle
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730
def display_topics(model, features, no_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic <index>:`` header line is
    printed, followed by one line holding the top words separated by spaces,
    ordered from the largest weight to the smallest.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, iterated row by row;
        each row must provide ``argsort()``.
    features : sequence of str
        Vocabulary, indexed by the column positions of ``components_``.
    no_top_words : int
        Number of words to show per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print("Topic %d:" % topic_idx)
        # argsort() is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:no_top_words]
        print(" ".join([features[i] for i in top_indices]))

# Load the news data. Run get_news to refresh
# NOTE(review): unpickling can execute arbitrary code, so only load a
# newsdata.p file that was produced locally by a trusted run.
with open('newsdata.p', 'rb') as news_file:
    # The context manager closes the file handle once the data is loaded.
    df = pickle.load(news_file)

# Make the 'document' as a concatenation of headline and body
df['document'] = df['title'] + ' ' + df['body']

In [10]:
# Make the publishing date into datetime format
def separate_date(timestamp):
    """Return the calendar date for a Unix timestamp given in seconds."""
    moment = pd.to_datetime(timestamp, unit='s')
    return moment.date()

# Derive a per-article 'date' column by converting each 'published_on' value
published_on = df['published_on']
df['date'] = published_on.map(separate_date)

In [12]:
# Bag-of-words term counts, restricted to the 1000 most frequent terms after
# dropping English stop words.
vectoriser = CountVectorizer(stop_words='english', max_features=1000)
doc = vectoriser.fit_transform(df['document'].values)
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when it exists and fall back to the old method on older releases.
if hasattr(vectoriser, 'get_feature_names_out'):
    # The replacement returns an array; keep `features` a plain list as before.
    features = list(vectoriser.get_feature_names_out())
else:
    features = vectoriser.get_feature_names()

The number of topics, alpha and beta can all be varied in an attempt to make the topics more interpretable. 

In [13]:
# Assign the parameters for LDA
topics = 27
# Symmetrical Dirichlet prior for documents to topics. The higher it is, the more topics per document.
alpha = 0.01
# Symmetrical Dirichlet prior for topics to words. The higher it is, the greater the mixture of words per topic.
beta = 0.2
# Fixed seed so that re-running the notebook reproduces the same topics;
# without it every fit starts from a different random initialisation.
seed = 0
# Fit LDA to the news data
LDA = LatentDirichletAllocation(
    n_components=topics,
    doc_topic_prior=alpha,
    topic_word_prior=beta,
    random_state=seed,
)
# fit() returns the estimator, so `news_lda` and `LDA` name the same fitted model.
news_lda = LDA.fit(doc)



In [5]:
# Number of highest-weighted words to list for each topic
no_top_words = 10
display_topics(model=news_lda, features=features, no_top_words=no_top_words)

Topic 0:
2018 government week futures coins leading biggest neo volume used
Topic 1:
blockchain technology based industry china smart world new global tech
Topic 2:
ethereum eth founder support capital trustnodes post appeared venture funding
Topic 3:
token decentralized platform tokens ecosystem projects based game io company
Topic 4:
ico platform project sale launches public appeared post pre announces
Topic 5:
price analysis usd technical ripple key newsbtc trend litecoin xrp
Topic 6:
8217 com post appeared bitcoinist morethe ceo media 8216 8211
Topic 7:
coin ico initial sec need day weeks offerings backed facebook
Topic 8:
ccn release press research service responsible product company post appeared
Topic 9:
network bitcoin news analysis post appeared charts amp guides development
Topic 10:
crypto digital cryptocurrencies currency investors says currencies markets post appeared
Topic 11:
users app internet solution hard fork going monero mobile new
Topic 12:
bitcoin cash post appear

For many values of the above parameters the model produces clearly identifiable topics, so the method works, though perhaps not exactly for the use case considered here.