In [4]:
import pandas as pd
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Plotly based imports for visualization
from plotly import tools
import chart_studio.plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [5]:
# Load the 'en_core_web_lg' spaCy model into `nlp`.
# (The trailing line-continuation backslash that used to follow this call was removed.)
nlp = spacy.load('en_core_web_lg')

In [6]:
# Stop words: spaCy's STOP_WORDS collection materialised as a list.
stopwords = [*STOP_WORDS]
# Punctuation: the standard-library string of ASCII punctuation characters.
# Note this is a plain str, so `x in punctuations` is a substring test.
punctuations = string.punctuation

In [2]:
# Load the corpus: one headline per line of the text file.
#
# The file is read directly instead of via `pd.read_csv(..., delimiter="\n")`:
# recent pandas releases reject "\n" as a delimiter, and CSV parsing could also
# turn some lines into non-string values (numbers, NaN) that the tokenizer
# cannot handle. Lines are taken verbatim (no CSV quote processing); blank
# lines are skipped, as read_csv does by default.
with open("Data/1million-abcnews.txt", encoding="utf-8") as headline_file:
    news = pd.DataFrame(
        {"headline": [line.rstrip("\r\n") for line in headline_file if line.strip()]}
    )

In [7]:
# Module-level spaCy object created from the English class directly (separate
# from the 'en_core_web_lg' model held in `nlp`). The tokenizer functions in
# this notebook call it as `parser(text)`.
parser = English()
def spacy_tokenizer(sentence):
    """Normalise a sentence into a space-separated string of filtered lemmas.

    The text is tokenised with the module-level ``parser``. Each token
    contributes its lemma, lower-cased and stripped of surrounding whitespace;
    when the lemma is the ``"-PRON-"`` placeholder, the token's own lower-cased
    text is used instead. Entries found in ``stopwords`` or in ``punctuations``
    are discarded.

    Returns:
        The surviving entries joined by single spaces.
    """
    kept = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            normalised = token.lower_
        else:
            normalised = token.lemma_.lower().strip()
        # Same filter as before, expressed as a guard: skip stop words first,
        # then anything that tests as contained in `punctuations`.
        if normalised in stopwords or normalised in punctuations:
            continue
        kept.append(normalised)
    return " ".join(kept)

In [8]:
# Must run before the next line: tqdm.pandas() is what provides the
# `progress_apply` method (apply with a progress bar) used below.
tqdm.pandas()
# Run spacy_tokenizer over every headline and store the result in a new column.
news["processed_headline"] = news["headline"].progress_apply(spacy_tokenizer)

100%|██████████| 1048575/1048575 [04:22<00:00, 3997.53it/s]


**max_df**: When building the vocabulary, ignore terms whose document frequency is strictly higher than the given threshold (corpus-specific stop words). If a float, the parameter represents a proportion of documents; if an integer, an absolute count. This parameter is ignored if `vocabulary` is not None.

**min_df**: When building the vocabulary, ignore terms whose document frequency is strictly lower than the given threshold. This value is also called the cut-off in the literature. If a float, the parameter represents a proportion of documents; if an integer, an absolute count. This parameter is ignored if `vocabulary` is not None.


In [18]:
# Creating a vectorizer: bag-of-words counts over the pre-processed headlines.
#
# token_pattern is written as a raw string so that `\-` reaches the regex
# engine unchanged instead of being an invalid string escape sequence; the
# pattern itself is the same as before.
vectorizer = CountVectorizer(
    min_df=50,
    max_df=0.85,
    stop_words='english',
    lowercase=True,
    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
)
data_vectorized = vectorizer.fit_transform(news["processed_headline"])

In [19]:
# Number of topics/components requested from every model fitted in this notebook.
NUM_TOPICS = 10

In [20]:
# Latent Dirichlet Allocation model.
# random_state pins the estimator's random number generation so that a
# Restart & Run All reproduces the same topics.
lda = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=42,
    verbose=True,
)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [24]:
# Non-Negative Matrix Factorization Model
# random_state is fixed so repeated runs of the notebook give the same factors.
nmf = NMF(n_components=NUM_TOPICS, random_state=42)
data_nmf = nmf.fit_transform(data_vectorized)

In [25]:
# Latent Semantic Indexing Model using Truncated SVD
# random_state is fixed so repeated runs of the notebook give the same components.
lsi = TruncatedSVD(n_components=NUM_TOPICS, random_state=42)
data_lsi = lsi.fit_transform(data_vectorized)

In [26]:
# Function for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Args:
        model: Fitted estimator exposing ``components_``; it is iterated row
            by row (one row per topic) and each row must support
            ``argsort()`` and integer indexing.
        vectorizer: Fitted vectorizer providing the vocabulary through
            ``get_feature_names_out()`` (preferred when present) or the older
            ``get_feature_names()``. Feature names are looked up by column
            index of ``model.components_``.
        top_n: Number of terms printed per topic.

    For each topic this prints ``"Topic <idx>:"`` followed by a list of
    ``(term, weight)`` tuples ordered from highest to lowest weight.
    """
    # Resolve the vocabulary once up front; previously the feature-name list
    # was rebuilt for every single printed term.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards for the top_n largest weights.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

In [27]:
# Keywords for topics clustered by Latent Dirichlet Allocation
# (selected_topics prints its default of 10 terms per topic).
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('man', 32050.137441299772), ('charge', 18034.773859411325), ('year', 12168.12137011313), ('fund', 11405.37221585575), ('murder', 10639.563734796167), ('south', 9648.184206311382), ('jail', 9444.934594192498), ('miss', 9095.446218042498), ('drug', 8019.281463761053), ('abc', 6823.949231997917)]
Topic 1:
[('death', 12371.578096388821), ('north', 7799.2812463958135), ('cup', 7460.833315465655), ('talk', 6839.509678751047), ('state', 6754.310044033408), ('work', 6718.11869368343), ('group', 6467.389700819862), ('west', 6467.268501436112), ('budget', 6391.265291373421), ('centre', 6159.715464213164)]
Topic 2:
[('police', 31034.997969795335), ('new', 28553.48931952139), ('interview', 17686.46528152312), ('country', 9991.223627820693), ('world', 9968.715514043048), ('health', 9436.416391632032), ('china', 7717.813375711208), ('final', 7618.584794993318), ('minister', 7424.353069321739), ('lead', 6664.689549236477)]
Topic 3:
[('open', 9853.33339225375), ('coast', 9288.935

In [28]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
print("NMF Model:")
selected_topics(nmf, vectorizer)

NMF Model:
Topic 0:
[('police', 16.594438590534114), ('probe', 1.151099053137687), ('investigate', 1.1392721764804492), ('miss', 0.8508213549128046), ('search', 0.8394822801487126), ('death', 0.7903120314561721), ('officer', 0.7070281172965218), ('arrest', 0.6432358791618549), ('hunt', 0.6381443827287592), ('seek', 0.6214558019779424)]
Topic 1:
[('man', 11.803113659030124), ('jail', 1.2463883558025866), ('miss', 0.7219108967689252), ('murder', 0.6908579768183267), ('arrest', 0.6130967813620966), ('die', 0.5716040807802953), ('stab', 0.5429155669754465), ('shoot', 0.5420063817590117), ('guilty', 0.47266036137034595), ('attack', 0.4303777376225856)]
Topic 2:
[('new', 13.731262957855519), ('year', 0.645410524090025), ('zealand', 0.5624803226472831), ('law', 0.5464051697085981), ('open', 0.4305260753432229), ('set', 0.3281551231927001), ('home', 0.32622775328498355), ('deal', 0.29230644633804204), ('australia', 0.29202470945866577), ('centre', 0.28941043665474425)]
Topic 3:
[('plan', 13.08

In [29]:
# Keywords for topics clustered by Latent Semantic Indexing (Truncated SVD)
print("LSI Model:")
selected_topics(lsi, vectorizer)

LSI Model:
Topic 0:
[('police', 0.6700128488637318), ('man', 0.5639319053969398), ('charge', 0.2329244771549109), ('court', 0.12083436586701707), ('new', 0.1092550250071349), ('murder', 0.10088065904514518), ('face', 0.08536223616693446), ('crash', 0.08277568191496402), ('death', 0.08275517631618368), ('car', 0.07236307008792865)]
Topic 1:
[('man', 0.641619440324082), ('charge', 0.24473566072076136), ('court', 0.13678762218108226), ('face', 0.09423298541570324), ('murder', 0.08249289886566952), ('jail', 0.0724344184641359), ('accuse', 0.0443073291987775), ('die', 0.03964827212269456), ('guilty', 0.035253228788953095), ('child', 0.034334456205930286)]
Topic 2:
[('new', 0.8651772357446955), ('plan', 0.24702117208782848), ('council', 0.16129760858401895), ('govt', 0.11802272937844062), ('fund', 0.06698080797027213), ('urge', 0.06286015666143288), ('water', 0.05710725167792753), ('year', 0.05012589269209477), ('health', 0.04770243261024411), ('law', 0.046190858147348184)]
Topic 3:
[('plan'

In [30]:
# Build the pyLDAvis dashboard for the fitted LDA model from the same
# document-term matrix and vectorizer it was trained with.
# mds='tsne' is forwarded to prepare(); presumably it selects t-SNE for the
# topic layout - confirm against the pyLDAvis documentation.
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
# Last expression of the cell, so the notebook renders it.
dash

In [31]:
def spacy_bigram_tokenizer(phrase) -> str:
    """Build a space-joined string of "<non-noun> <noun>" pairs from a phrase.

    The phrase is tokenised with the module-level ``parser``. While walking
    the tokens, every token whose ``pos_`` is not ``"NOUN"`` is collected, and
    the most recent ``"NOUN"`` token's text is remembered. After each token,
    one pair is emitted for every non-noun collected so far, combined with the
    currently remembered noun (an empty string until a noun has been seen).

    NOTE(review): the pairing loop runs once per token, so earlier pairs are
    emitted again on every later token - confirm whether it was meant to run
    only once, after the token loop.
    NOTE(review): POS tags are read from ``parser`` (created via ``English()``)
    rather than from the loaded ``nlp`` model; if that object does no POS
    tagging, ``pos_`` would never equal "NOUN" - confirm.
    NOTE(review): this function is not called anywhere in the visible notebook.

    Returns:
        All emitted pairs joined by single spaces.
    """
    doc = parser(phrase) # create spacy object
    token_not_noun = []      # texts of every non-noun token seen so far
    notnoun_noun_list = []   # emitted "<non-noun> <noun>" strings
    noun = ""                # text of the latest noun token; "" until one is seen

    for item in doc:
        if item.pos_ != "NOUN": # separate nouns and not nouns
            token_not_noun.append(item.text)
        if item.pos_ == "NOUN":
            noun = item.text
        
        # Runs for every token: pair all collected non-nouns with the current noun.
        for notnoun in token_not_noun:
            notnoun_noun_list.append(notnoun + " " + noun)

    return " ".join([i for i in notnoun_noun_list])

In [32]:
# Second vectorizer over the same pre-processed headlines, this time with
# ngram_range=(1,2) so that both single words and two-word sequences become
# features. Uses CountVectorizer's own n-gram support; spacy_bigram_tokenizer
# is not involved here.
bivectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, ngram_range=(1,2))
bigram_vectorized = bivectorizer.fit_transform(news["processed_headline"])

In [None]:
# LDA fitted on the unigram+bigram count matrix.
# random_state pins the estimator's random number generation so that a
# Restart & Run All reproduces the same topics.
bi_lda = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=42,
    verbose=True,
)
data_bi_lda = bi_lda.fit_transform(bigram_vectorized)

In [None]:
# Keywords for topics found by the LDA model trained on unigram+bigram counts;
# bivectorizer supplies the matching vocabulary.
print("Bi-LDA Model:")
selected_topics(bi_lda, bivectorizer)

In [None]:
# Score an unseen phrase with the fitted LDA model: pre-process it with the
# same tokenizer used for the corpus, vectorize it with the fitted vectorizer,
# then transform. The input is a one-element list, so [0] takes its only row.
text = spacy_tokenizer("clients acceptance of strategic imperatives")
print (text)
x = lda.transform(vectorizer.transform([text]))[0]
print(x)

In [None]:
# Second unseen phrase scored with the fitted LDA model: pre-process,
# vectorize with the fitted vectorizer, transform, and take the single
# resulting row with [0].
text = spacy_tokenizer("new competitors")
print (text)
x = lda.transform(vectorizer.transform([text]))[0]
print(x)