# Part I Training LDA model (50 pts)

Use the bodies of articles in your Webhose dataset to train an LDA model.  Your program should output topic clusters and up to 10 keywords in each cluster. Clusters should not overlap, and the keywords should make it possible to approximate each cluster's meaning.

### preparation

#### read data

In [1]:
import webhoseio, os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk, re, spacy
from nltk.stem.wordnet import WordNetLemmatizer
import pyLDAvis
from pyLDAvis import sklearn

# English stopword set built from NLTK's stopwords corpus; tokenize() checks
# tokens against this module-level name.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [2]:
#read data
import json

# The context manager closes the file handle as soon as the lines are read;
# the JSON text is decoded as UTF-8 explicitly rather than with the platform
# default encoding.
with open("/Users/arlenehuang/OneDrive - Columbia University/Summer 2020/APAN 5430/Assignment 7/webhose_Citigroup_deduplicated.json", encoding="utf-8") as json_file:
    json_data = json_file.readlines()

# Each line of the file is parsed as one standalone JSON document.
newsfeeds = [json.loads(line) for line in json_data]

In [3]:
# Collect the 'text' field of every parsed feed record, in file order.
feeds = [record['text'] for record in newsfeeds]

#### define useful functions and prepare data

In [4]:
#Run in terminal: python3 -m spacy download en
# Load the spaCy 'en' model with the parser and NER components disabled.
# lemmatization() reads this module-level pipeline.
nlp = spacy.load('en', disable=['parser', 'ner'])

In [5]:
#tokenize, clean-up and lemmatize

def tokenize(text):
    """Split one article into lower-case, stopword-free tokens.

    The text is tokenized with gensim's ``simple_preprocess`` (``deacc=True``),
    each token is cleaned, and anything found in the module-level
    ``stopwords`` set is discarded.

    Args:
        text: Raw article text.

    Returns:
        List of cleaned tokens in their original order.
    """
    import gensim  #just to use simple_preprocess
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    filtered_tokens = []
    for token in tokens:
        # Expand a few contractions.
        # NOTE(review): simple_preprocess may already strip apostrophes, in
        # which case these replacements never match - confirm before removing.
        token = token.replace("'s", " ").replace("n’t", " not").replace("’ve", " have")
        # Clean and lower-case first so the stopword test sees the final form.
        token = re.sub(r'[^a-zA-Z0-9 ]', '', token).lower()
        # A replacement can introduce a space; split() separates the pieces
        # and also discards tokens that cleaning reduced to nothing.
        for piece in token.split():
            if piece not in stopwords:
                filtered_tokens.append(piece)
    return filtered_tokens

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each token list is joined into one string and run through the
    module-level spaCy pipeline ``nlp``.

    Args:
        texts: Iterable of token lists, one list per document.
        allowed_postags: Collection of ``token.pos_`` values to keep. Only
            membership tests are performed on it, so any container works.

    Returns:
        List of strings, one per document: the kept lemmas joined by spaces.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        # Drop the '-PRON-' placeholder lemma outright instead of emitting an
        # empty string, which would leave stray double spaces in the output.
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and token.lemma_ != '-PRON-'
        ]
        texts_out.append(" ".join(lemmas))
    return texts_out

def tokenize_lemmatize(text):
    """Tokenize and lemmatize a collection of documents.

    Args:
        text: Iterable of raw document strings. A single string is treated
            as one document rather than being iterated character by character.

    Returns:
        List of strings, one per document, as returned by ``lemmatization``.
    """
    if isinstance(text, str):
        text = [text]
    data_words = [tokenize(line) for line in text]
    # lemmatization() reads the module-level ``nlp`` pipeline. A pipeline
    # loaded into a local variable here would never be used, so none is loaded.
    return lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [6]:
# Tokenize and lemmatize every article body; the result holds one
# space-joined lemma string per article.
data_lemmatized = tokenize_lemmatize(feeds)

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


### modify parameters

#### modify max_iter

To modify max_iter, I will try max_iter = [5,10,50]. For now, I will start with max_features = 100 and min_df = 0.1, max_df = 0.9.

when max_iter = 5

In [13]:
# Experiment: max_iter = 5 (max_features = 100, min_df = 0.1, max_df = 0.9).
n_top_words = 10

# Count 2- to 4-grams over the lemmatized articles.
vectorizer = CountVectorizer(
    min_df=0.1,
    max_df=0.9,
    max_features=100,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

# Percentage of document-term cells holding a positive count.
data_dense = data_vectorized.todense()
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=5,
    learning_method='online',
    random_state=1,
    learning_offset=10.,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Store and print the n_top_words highest-weighted n-grams of every topic.
tf_feature_names = vectorizer.get_feature_names()
topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

print("Log Likelihood: ", lda_model.score(data_vectorized))
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Build the pyLDAvis view; the trailing bare expression renders it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
vis

Sparsicity:  49.55900996076064 %
Topic #0:
company stock | share company | share company stock | own share company | own share company stock | own share | company stock value | share company stock value | stock value | company stock worth
Topic #1:
research note | rating research note | rating research | hold rating | sell rating | move average | price target | neutral rating | buy rating | rating hold
Topic #2:
purchase additional | purchase additional share | purchase additional share last | share last | last quarter | additional share last | share last quarter | additional share last quarter | own share | additional share
Topic #3:
buy rating | average price | day move | day move average | move average | price target | hold rating | target price | move average price | day move average price
Topic #4:
purchase additional | purchase additional share | own share | share last | share last quarter | additional share last | additional share last quarter | last quarter | purchase additiona

when max_iter = 10

In [14]:
# Experiment: max_iter = 10 (max_features = 100, min_df = 0.1, max_df = 0.9).
n_top_words = 10

# Count 2- to 4-grams over the lemmatized articles.
vectorizer = CountVectorizer(
    min_df=0.1,
    max_df=0.9,
    max_features=100,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

# Percentage of document-term cells holding a positive count.
data_dense = data_vectorized.todense()
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=1,
    learning_offset=10.,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Store and print the n_top_words highest-weighted n-grams of every topic.
tf_feature_names = vectorizer.get_feature_names()
topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

print("Log Likelihood: ", lda_model.score(data_vectorized))
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Build the pyLDAvis view; the trailing bare expression renders it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
vis

Sparsicity:  49.55900996076064 %
Topic #0:
company stock | share company | share company stock | own share company | own share company stock | own share | company stock value | share company stock value | stock value | company stock worth
Topic #1:
research note | rating research note | rating research | hold rating | sell rating | move average | rating hold | neutral rating | buy rating | price target
Topic #2:
purchase additional | purchase additional share | share last | additional share last | additional share last quarter | share last quarter | last quarter | purchase additional share last | own share | additional share
Topic #3:
buy rating | average price | day move | day move average | move average | move average price | day move average price | price target | hold rating | target price
Topic #4:
purchase additional | purchase additional share | share last | share last quarter | additional share last | additional share last quarter | last quarter | own share | purchase additiona

when max_iter = 50

In [18]:
# Experiment: max_iter = 50 (max_features = 100, min_df = 0.1, max_df = 0.9).
n_top_words = 10

# Count 2- to 4-grams over the lemmatized articles.
vectorizer = CountVectorizer(
    min_df=0.1,
    max_df=0.9,
    max_features=100,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

# Percentage of document-term cells holding a positive count.
data_dense = data_vectorized.todense()
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=50,
    learning_method='online',
    random_state=1,
    learning_offset=10.,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Store and print the n_top_words highest-weighted n-grams of every topic.
tf_feature_names = vectorizer.get_feature_names()
topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

print("Log Likelihood: ", lda_model.score(data_vectorized))
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Build the pyLDAvis view; the trailing bare expression renders it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
vis

Sparsicity:  49.55900996076064 %
Topic #0:
company stock | share company | share company stock | own share company | own share company stock | company stock value | own share | share company stock value | stock value | company stock worth
Topic #1:
research note | rating research note | rating research | hold rating | sell rating | rating hold | move average | neutral rating | buy rating | rating buy
Topic #2:
purchase additional | purchase additional share | share last | additional share last | additional share last quarter | share last quarter | last quarter | purchase additional share last | additional share | own share
Topic #3:
buy rating | average price | day move | day move average | move average | move average price | day move average price | price target | hold rating | target price
Topic #4:
purchase additional | purchase additional share | share last | share last quarter | additional share last | additional share last quarter | last quarter | purchase additional share last |

From the results shown above, it seems that max_iter = 10 or 50 works best.
For simplicity, I will use max_iter = 10 for the following analysis.

#### modify max_features

To modify max_features, I will try max_features = [50,100,500,5000]

when max_features = 50

In [23]:
# Experiment: max_features = 50 (max_iter = 10, min_df = 0.1, max_df = 0.9).
n_top_words = 10

# Count 2- to 4-grams over the lemmatized articles.
vectorizer = CountVectorizer(
    min_df=0.1,
    max_df=0.9,
    max_features=50,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=1,
    learning_offset=10.,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Store and print the n_top_words highest-weighted n-grams of every topic.
tf_feature_names = vectorizer.get_feature_names()
topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

print("Log Likelihood: ", lda_model.score(data_vectorized))
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Build the pyLDAvis view; the trailing bare expression renders it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
vis

Topic #0:
research note | rating research | hold rating | sell rating | move average | buy rating | price target | earning share | neutral rating | own share
Topic #1:
stock worth | own share | share last | last quarter | share last quarter | additional share last | additional share last quarter | additional share | price target | buy rating
Topic #2:
company stock | share company | share company stock | own share company | own share | average price | company stock value | own share company stock | stock value | buy rating
Topic #3:
stock value | own share | share last | last quarter | additional share last | share last quarter | additional share last quarter | additional share | buy rating | price target
Topic #4:
share company | company stock | share company stock | own share company | own share company stock | own share | company stock value | last quarter | share last | additional share last
Topic #5:
research report | rating research | sell rating | hold rating | move average | ne

when max_features = 100

In [24]:
# Experiment: max_features = 100 (max_iter = 10, min_df = 0.1, max_df = 0.9).
n_top_words = 10

# Count 2- to 4-grams over the lemmatized articles.
vectorizer = CountVectorizer(
    min_df=0.1,
    max_df=0.9,
    max_features=100,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=1,
    learning_offset=10.,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Store and print the n_top_words highest-weighted n-grams of every topic.
tf_feature_names = vectorizer.get_feature_names()
topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

print("Log Likelihood: ", lda_model.score(data_vectorized))
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Build the pyLDAvis view; the trailing bare expression renders it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
vis

Topic #0:
company stock | share company | share company stock | own share company | own share company stock | own share | company stock value | share company stock value | stock value | company stock worth
Topic #1:
research note | rating research note | rating research | hold rating | sell rating | move average | rating hold | neutral rating | buy rating | price target
Topic #2:
purchase additional | purchase additional share | share last | additional share last | additional share last quarter | share last quarter | last quarter | purchase additional share last | own share | additional share
Topic #3:
buy rating | average price | day move | day move average | move average | move average price | day move average price | price target | hold rating | target price
Topic #4:
purchase additional | purchase additional share | share last | share last quarter | additional share last | additional share last quarter | last quarter | own share | purchase additional share last | additional share
T

when max_features = 500

In [25]:
# Experiment: max_features = 500 (max_iter = 10, min_df = 0.1, max_df = 0.9).
n_top_words = 10

# Count 2- to 4-grams over the lemmatized articles.
vectorizer = CountVectorizer(
    min_df=0.1,
    max_df=0.9,
    max_features=500,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=1,
    learning_offset=10.,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Store and print the n_top_words highest-weighted n-grams of every topic.
tf_feature_names = vectorizer.get_feature_names()
topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

print("Log Likelihood: ", lda_model.score(data_vectorized))
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Build the pyLDAvis view; the trailing bare expression renders it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
vis

Topic #0:
buy additional | buy additional share | own share | additional share | buy additional share last | share last | additional share last | last quarter | share last quarter | additional share last quarter
Topic #1:
value acquire | acquire additional | acquire additional share | stock value acquire additional | stock value acquire | value acquire additional | value acquire additional share | stock value | additional share | own share
Topic #2:
share company | company stock | share company stock | own share company | own share | own share company stock | sell share | average price | share company stock value | company stock value
Topic #3:
email address | receive daily | late news | enter email | daily summary | enter email address | enter email address receive | email address receive | summary late news | company marketbeat newsletter
Topic #4:
buy rating | day move | day move average | average price | move average | hold rating | move average price | price target | day move aver

when max_features = 5000

In [26]:
# Experiment: max_features = 5000 (max_iter = 10, min_df = 0.1, max_df = 0.9).
n_top_words = 10

# Count 2- to 4-grams over the lemmatized articles.
vectorizer = CountVectorizer(
    min_df=0.1,
    max_df=0.9,
    max_features=5000,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=1,
    learning_offset=10.,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Store and print the n_top_words highest-weighted n-grams of every topic.
tf_feature_names = vectorizer.get_feature_names()
topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

print("Log Likelihood: ", lda_model.score(data_vectorized))
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Build the pyLDAvis view; the trailing bare expression renders it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
vis

Topic #0:
share company | company stock | sell share | average price | own share | sell average | sell average price | own share company | price total | average price total
Topic #1:
buy rating | price target | analyst rating | news rating | email newsletter | company daily | daily email | daily email newsletter | company daily email | company daily email newsletter
Topic #2:
day move average | day move | move average price | day move average price | average price | hold rating | buy rating | research note | earning share | move average
Topic #3:
purchase additional | purchase additional share | additional share | own share | purchase additional share last | share last | additional share last quarter | share last quarter | additional share last | last quarter
Topic #4:
company stock | share company | share company stock | own share company | own share company stock | company stock worth | share company stock value | company stock value | share company stock worth | quarter own share co

Overall, it seems that max_features = 100 works the best.

#### modify min_df and max_df

To modify min_df and max_df, I will try [0.05,0.95], [0.1,0.9], [0.15,0.85], and [0.2,0.8]

when min_df and max_df = [0.05,0.95]

In [27]:
# Experiment: min_df = 0.05, max_df = 0.95 (max_iter = 10, max_features = 100).
n_top_words = 10

# Count 2- to 4-grams over the lemmatized articles.
vectorizer = CountVectorizer(
    min_df=0.05,
    max_df=0.95,
    max_features=100,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=1,
    learning_offset=10.,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Store and print the n_top_words highest-weighted n-grams of every topic.
tf_feature_names = vectorizer.get_feature_names()
topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

print("Log Likelihood: ", lda_model.score(data_vectorized))
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Build the pyLDAvis view; the trailing bare expression renders it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
vis

Topic #0:
company stock | share company | share company stock | own share company | own share company stock | own share | company stock value | share company stock value | stock value | company stock worth
Topic #1:
research note | rating research note | rating research | hold rating | sell rating | move average | rating hold | neutral rating | buy rating | price target
Topic #2:
purchase additional | purchase additional share | share last | additional share last | additional share last quarter | share last quarter | last quarter | purchase additional share last | own share | additional share
Topic #3:
buy rating | average price | day move | day move average | move average | move average price | day move average price | price target | hold rating | target price
Topic #4:
purchase additional | purchase additional share | share last | share last quarter | additional share last | additional share last quarter | last quarter | own share | purchase additional share last | additional share
T

when min_df and max_df = [0.1,0.9]

In [28]:
# Experiment: min_df = 0.1, max_df = 0.9 (max_iter = 10, max_features = 100).
n_top_words = 10

# Count 2- to 4-grams over the lemmatized articles.
vectorizer = CountVectorizer(
    min_df=0.1,
    max_df=0.9,
    max_features=100,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=1,
    learning_offset=10.,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Store and print the n_top_words highest-weighted n-grams of every topic.
tf_feature_names = vectorizer.get_feature_names()
topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

print("Log Likelihood: ", lda_model.score(data_vectorized))
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Build the pyLDAvis view; the trailing bare expression renders it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
vis

Topic #0:
company stock | share company | share company stock | own share company | own share company stock | own share | company stock value | share company stock value | stock value | company stock worth
Topic #1:
research note | rating research note | rating research | hold rating | sell rating | move average | rating hold | neutral rating | buy rating | price target
Topic #2:
purchase additional | purchase additional share | share last | additional share last | additional share last quarter | share last quarter | last quarter | purchase additional share last | own share | additional share
Topic #3:
buy rating | average price | day move | day move average | move average | move average price | day move average price | price target | hold rating | target price
Topic #4:
purchase additional | purchase additional share | share last | share last quarter | additional share last | additional share last quarter | last quarter | own share | purchase additional share last | additional share
T

when min_df and max_df = [0.15,0.85]

In [29]:
# Experiment: min_df = 0.15, max_df = 0.85 (max_iter = 10, max_features = 100).
n_top_words = 10

# Count 2- to 4-grams over the lemmatized articles.
vectorizer = CountVectorizer(
    min_df=0.15,
    max_df=0.85,
    max_features=100,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=1,
    learning_offset=10.,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Store and print the n_top_words highest-weighted n-grams of every topic.
tf_feature_names = vectorizer.get_feature_names()
topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

print("Log Likelihood: ", lda_model.score(data_vectorized))
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Build the pyLDAvis view; the trailing bare expression renders it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
vis

Topic #0:
company stock | own share | last quarter | share last | additional share last | share last quarter | additional share last quarter | share company | additional share | purchase additional
Topic #1:
research note | rating research note | rating research | hold rating | sell rating | neutral rating | rating hold | rating buy | buy rate | research analyst
Topic #2:
company stock | share company | simple move | simple move average | day simple move | day simple | day simple move average | price target | share company stock | sell share
Topic #3:
price target | rating share | hold rating | day move | day move average | research report | sell rating | target price | neutral rating | report may
Topic #4:
share last | share last quarter | additional share last | additional share last quarter | last quarter | own share | additional share | stock worth | provider stock | acquire additional
Topic #5:
target price | price share | target price share | rating issue | average price | rating

when min_df and max_df = [0.2,0.8]

In [30]:
# Experiment: min_df = 0.2, max_df = 0.8 (max_iter = 10, max_features = 100).
n_top_words = 10

# Count 2- to 4-grams over the lemmatized articles.
vectorizer = CountVectorizer(
    min_df=0.2,
    max_df=0.8,
    max_features=100,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=1,
    learning_offset=10.,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Store and print the n_top_words highest-weighted n-grams of every topic.
tf_feature_names = vectorizer.get_feature_names()
topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

print("Log Likelihood: ", lda_model.score(data_vectorized))
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Build the pyLDAvis view; the trailing bare expression renders it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
vis

Topic #0:
share last | additional share last | share last quarter | additional share last quarter | last quarter | own share | additional share | purchase additional share last | stock worth | purchase additional
Topic #1:
simple move | simple move average | day simple | day simple move | day simple move average | research note | sell rating | rating research | neutral rating | price objective
Topic #2:
quarter value | earning share | target price | fourth quarter | position share | institutional investor | revenue quarter | hedge fund | first quarter | revenue quarter compare
Topic #3:
average price | move average price | day move average price | day move | day move average | research note | rating research | rating research note | sell rating | neutral rating
Topic #4:
research report | rating research report | rating research | purchase additional | purchase additional share | additional share | report may | own share | stock value | target price
Topic #5:
target price | day move | 

From the results shown above, it seems that min_df and max_df = [0.05,0.95] or [0.1,0.9] work best.
I will use [0.05,0.95] for the following analysis.

### run the best model

Based on the previous analysis, the best parameters are:

max_iter = 10, max_features = 100, min_df = 0.05, max_df = 0.95.

In [36]:
# Build the term-frequency matrix for the final model and report its density.
vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=0.05,
    max_features=100,
    ngram_range=(2, 4),
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

# Percentage of document-term cells holding a positive count.
data_dense = data_vectorized.todense()
positive_cells = (data_dense > 0).sum()
print("Sparsicity: ", (positive_cells/data_dense.size)*100, "%")

Sparsicity:  49.55900996076064 %


In [37]:
# Fit the final LDA model with the selected parameters and keep the
# document-topic matrix returned by fit_transform.
lda_params = {
    'n_components': 10,
    'max_iter': 10,
    'learning_method': 'online',
    'random_state': 1,
    'learning_offset': 10.,
}
lda_model = LatentDirichletAllocation(**lda_params)
lda_output = lda_model.fit_transform(data_vectorized)

In [32]:
# Store and print the n_top_words highest-weighted n-grams of every topic.
# n_top_words is expected to be set by an earlier cell.
tf_feature_names = vectorizer.get_feature_names()

topics = {}
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topics[topic_idx] = top_terms
    print("Topic #%d:" % topic_idx)
    print(" | ".join(top_terms))

Topic #0:
sell share | additional share last | share company | additional share last quarter | hedge fund | move average day | additional share | price target | stock sell rating | price objective
Topic #1:
share last | share last quarter | day move average | day move average price | day simple | ratio quick | research report | rating buy | move average | own share
Topic #2:
purchase additional share last | debt equity | stock value | equity ratio | neutral rating | fourth quarter | research analyst | first quarter | earning share | research note
Topic #3:
average day | last year | day move | company stock worth | current ratio | ratio quick | rating buy | rating company | research report | move average
Topic #4:
ratio quick ratio | rating hold | rating buy | price objective | price share | additional share | ratio current | move average day | stock sell | stock value
Topic #5:
stock value | company stock worth | current ratio | own share company stock | rating report | research report

In [38]:
# Visualization
import pyLDAvis
# Switch pyLDAvis to notebook display, then prepare the view for the
# current lda_model, document-term matrix and vectorizer.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)
# Bare expression so the notebook shows the prepared view as the cell output.
vis

In [39]:
# Fit-quality figures reported by the model on the training matrix.
log_likelihood = lda_model.score(data_vectorized)
perplexity = lda_model.perplexity(data_vectorized)
print("Log Likelihood: ", log_likelihood)
print("Perplexity: ", perplexity)

Log Likelihood:  -1270638.0628126084
Perplexity:  79.49438354849791


# Applying LDA model (50 pts)

Pick 10 random articles,  run them through the trained LDA model, and show which topic cluster each falls into. Print out the cluster IDs and keywords.

#### pick 10 random articles and print out cluster IDs and keywords

In [52]:
# pick 10 random articles
import random

# random.sample draws without replacement, so the same article can never be
# picked twice. The min() guard keeps the call valid when fewer than 10
# articles are loaded.
indices = random.sample(range(len(feeds)), min(10, len(feeds)))
print(indices)

[303, 649, 246, 3285, 1556, 3058, 3231, 2254, 2879, 364]


In [66]:
# calculate results for all articles

import pandas as pd
import numpy as np

# Run the articles through the already-trained model. transform() only infers
# document-topic weights, whereas fit_transform() would retrain lda_model first.
lda_output = lda_model.transform(data_vectorized)

# Label columns and rows from the matrix shape so the table always matches it.
topicnames = ["Topic" + str(i) for i in range(lda_output.shape[1])]
docnames = ["Doc" + str(i) for i in range(lda_output.shape[0])]
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Take the dominant topic from the unrounded weights: rounding to two decimals
# can create artificial ties that argmax would resolve to the lower topic id.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [79]:
# Dominant topic id and keywords for each of the picked articles.
for index in indices:
    topic_id = dominant_topic[index]
    print("For doc #", index)
    print("Dominant topic : Topic #", topic_id)
    print("Keywords", topics[topic_id])

For doc # 303
Dominant topic : Topic # 2
Keywords ['purchase additional share last', 'debt equity', 'stock value', 'equity ratio', 'neutral rating', 'fourth quarter', 'research analyst', 'first quarter', 'earning share', 'research note']
For doc # 649
Dominant topic : Topic # 2
Keywords ['purchase additional share last', 'debt equity', 'stock value', 'equity ratio', 'neutral rating', 'fourth quarter', 'research analyst', 'first quarter', 'earning share', 'research note']
For doc # 246
Dominant topic : Topic # 7
Keywords ['move average day', 'additional share', 'share company stock', 'also recently', 'stock sell rating', 'purchase additional', 'purchase additional share', 'stock sell', 'acquire additional', 'acquire additional share']
For doc # 3285
Dominant topic : Topic # 0
Keywords ['sell share', 'additional share last', 'share company', 'additional share last quarter', 'hedge fund', 'move average day', 'additional share', 'price target', 'stock sell rating', 'price objective']
For d

#### FYI, other analysis

In [81]:
# FYI, document-topic weights for the first 15 articles (head(15)), styled
# cell by cell.
# NOTE(review): color_green and make_bold are not defined in the visible part
# of this notebook - confirm they are defined in an earlier cell.
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
# Bare expression so the notebook renders the styled table.
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.03,0.03,0.03,0.03,0.03,0.03,0.7,0.03,0.03,0.03,6
Doc1,0.34,0.05,0.0,0.18,0.0,0.0,0.0,0.43,0.0,0.0,7
Doc2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc3,0.14,0.31,0.0,0.23,0.0,0.0,0.0,0.0,0.32,0.0,8
Doc4,0.23,0.0,0.0,0.0,0.0,0.47,0.0,0.0,0.29,0.0,5
Doc5,0.48,0.03,0.13,0.2,0.0,0.0,0.09,0.0,0.07,0.0,0
Doc6,0.0,0.06,0.74,0.0,0.0,0.0,0.19,0.0,0.0,0.0,2
Doc7,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc8,0.0,0.0,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0,3
Doc9,0.0,0.0,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0,3


In [87]:
# FYI, number of documents per dominant topic, ordered by topic id.
topic_counts = df_document_topic['dominant_topic'].value_counts()
df_topic_distribution = topic_counts.reset_index(name="Numbers of Documents")
df_topic_distribution.columns = ['Topic #', 'Numbers of Documents']
df_topic_distribution = df_topic_distribution.sort_values('Topic #')
df_topic_distribution

Unnamed: 0,Topic #,Numbers of Documents
1,0,665
8,1,24
5,2,308
0,3,687
7,4,180
4,5,344
6,6,269
2,7,438
3,8,398
