In [2]:
# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

# Plotly based imports for visualization
from plotly import tools
#import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Topic modelling 

This is an exploratory analysis of topic modelling using spaCy and scikit-learn.

# What is topic-modelling?

> In machine learning and natural language processing, a topic model is a type of statistical model for discovering the abstract "topics" that occur in a collection of documents. Topic modeling is a frequently used text-mining tool for discovery of hidden semantic structures in a text body. Intuitively, given that a document is about a particular topic, one would expect particular words to appear in the document more or less frequently: "dog" and "bone" will appear more often in documents about dogs, "cat" and "meow" will appear in documents about cats, and "the" and "is" will appear equally in both. A document typically concerns multiple topics in different proportions; thus, in a document that is 10% about cats and 90% about dogs, there would probably be about 9 times more dog words than cat words. 

> The "topics" produced by topic modeling techniques are clusters of similar words. A topic model captures this intuition in a mathematical framework, which allows examining a set of documents and discovering, based on the statistics of the words in each, what the topics might be and what each document's balance of topics is. It involves various techniques of dimensionality reduction(mostly non-linear) and unsupervised learning like LDA, SVD, autoencoders etc.

> Source: [Wikipedia](https://en.wikipedia.org/wiki/Topic_model)

It can help with the following:
* discovering the hidden themes in the collection.
* classifying the documents into the discovered themes.
* using the classification to organize/summarize/search the documents.


In [3]:
# Load the data set; the "title" column is the text that gets tokenized in a later cell.
data = pd.read_csv("../test_data.csv")

# Load the medium English spaCy pipeline.
# NOTE(review): `nlp` is not referenced by any other visible cell (the tokenizer
# below uses `parser`) — confirm it is still needed before paying the load cost.
nlp = spacy.load('en_core_web_md')

# Punctuation characters and stop words to drop during tokenization.
# `punctuations` is a single string, so `token in punctuations` is a substring test.
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

# Parser for submissions: a bare English language object used by spacy_tokenizer.
parser = English()
def spacy_tokenizer(sentence):
    """Normalize ``sentence`` into a space-separated string of kept tokens.

    Each token produced by the module-level ``parser`` is replaced by its
    lowercased, stripped lemma; tokens whose lemma is the literal ``"-PRON-"``
    placeholder use their lowercased surface form instead. Tokens found in
    ``stopwords`` or in ``punctuations`` are discarded.

    Returns the surviving tokens joined by single spaces.
    """
    kept = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            normalized = token.lower_
        else:
            normalized = token.lemma_.lower().strip()
        # `punctuations` is a plain string, so this is a substring test; it
        # therefore also rejects empty strings left behind by strip().
        if normalized in stopwords or normalized in punctuations:
            continue
        kept.append(normalized)
    return " ".join(kept)


> “Lemmatisation (or lemmatization) in linguistics, is the process of grouping together the different inflected forms of a word so they can be analysed as a single item.”



In [4]:
# Enable tqdm's pandas integration so that `progress_apply` is available below.
tqdm.pandas()
# Tokenize every entry of the "title" column, showing a progress bar; `subs`
# holds one cleaned, space-joined string per row.
subs = data["title"].progress_apply(spacy_tokenizer)


100%|██████████| 686/686 [00:00<00:00, 3514.50it/s]


# Feature Extraction 

In order to use textual data for predictive modeling, the text must first be split into individual words (tokens), optionally dropping unwanted ones such as stop words – this process is called tokenization. These tokens then need to be encoded as integers or floating-point values for use as inputs to machine learning algorithms. This process is called feature extraction (or vectorization).

CountVectorizer is used to convert a collection of text documents to a vector of term/token counts

<details>
    <summary>CountVectorizer Details</summary>
        <ul>
        <li> min_df: When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold (very rare terms). </li>
        <li> max_df: When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). </li>
        <li> stop_words: Words to ignore. </li>
        <li> lowercase: Convert all characters to lowercase before tokenizing. </li>
        <li> token_pattern: Regular expression denoting what constitutes a “token”. </li>
        </ul>
</details>

In [5]:
# Creating a vectorizer.
# Keep terms that occur in at least 5 documents and in at most 90% of them.
# The token pattern is a raw string so that `\-` reaches the regex engine
# unchanged instead of being parsed as an (invalid) Python string escape.
# The pattern itself is unchanged: tokens are runs of 3+ letters/hyphens.
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')

# Learn the vocabulary from the cleaned titles and build the count matrix.
data_vectorized = vectorizer.fit_transform(subs)

# Different Methods

I am going to look at three different methods: LDA, NMF and LSI-SVD.

## Latent Dirichlet Allocation Model
> Each document can be described by a distribution of topics and each topic can be described by a distribution of words

<details>
    <summary>LDA Details</summary>
    <h3> Latent Dirichlet Allocation with online variational Bayes algorithm </h3>
    <ul>
    <li> n_components: Number of topics. </li>
    <li> max_iter: The maximum number of iterations. </li>
    <li> learning_method: Method used to update the topic-word matrix (<code>components_</code>). </li>
    </ul>
</details>

In [6]:
# Number of topics shared by the LDA, NMF and LSI models below.
NumTopics = 10
# Define LDA.
# random_state is fixed so that the fitted topics are the same on every
# Restart & Run All; without it each run produces different topics.
lda = LatentDirichletAllocation(n_components=NumTopics, max_iter=10, learning_method='online', verbose=True, random_state=42)

# Fit LDA and keep the per-document topic weights.
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


## Non-Negative matrix factorization 
Imagine if you wanted to decompose a term-document matrix, where each column represented a document, and each element in the document represented the weight of a certain word (the weight might be the raw count or the tf-idf weighted count or some other encoding scheme; those details are not important here).

What happens when we decompose this into two matrices? Imagine if the documents came from news articles. The word "eat" would be likely to appear in food-related articles, and therefore co-occur with words like "tasty" and "food". Therefore, these words would probably be grouped together into a "food" component vector, and each article would have a certain weight of the "food" topic.

Therefore, an NMF decomposition of the term-document matrix would yield components that could be considered "topics", and decompose each document into a weighted sum of topics. This is called topic modeling and is an important application of NMF.

Note that this interpretation would not be possible with other decomposition methods. We cannot interpret what it means to have a "negative" weight of the food topic. This is another example where the underlying components (topics) and their weights should be non-negative.

Another interesting property of NMF is that it naturally produces sparse representations. This makes sense in the case of topic modeling: documents generally do not contain a large number of topics.

In [7]:
# Non-Negative Matrix Factorization Model.
# random_state is fixed so the factorization is reproducible across runs.
nmf = NMF(n_components=NumTopics, random_state=42)
data_nmf = nmf.fit_transform(data_vectorized)

# Latent Semantic Indexing Model using Truncated SVD

> Latent Semantic Indexing (LSI) [Deerwester et al] tries to overcome the
problems of lexical matching by using statistically derived conceptual indices
instead of individual words for retrieval. LSI assumes that there is some
underlying or latent structure in word usage that is partially obscured by
variability in word choice. A truncated singular value decomposition (SVD) is
used to estimate the structure in word usage across documents. Retrieval is then
performed using the database of singular values and vectors obtained from the
truncated SVD. Performance data shows that these statistically derived vectors are
more robust indicators of meaning than individual terms. 

> Latent Semantic Indexing is a technique that projects queries and documents into
a space with “latent” semantic dimensions.
In the latent semantic space, a query and a document can have high cosine
similarity even if they do not share any terms - as long as their terms are
semantically similar in a sense to be described later. We can look at LSI as a
similarity metric that is an alternative to word overlap measures like tf.idf.
The latent semantic space that we project into has fewer dimensions than the
original space (which has as many dimensions as terms). LSI is thus a method for
dimensionality reduction. A dimensionality reduction technique takes a set of
objects that exist in a high-dimensional space and represents them in a low-dimensional space, often in a two-dimensional or three-dimensional space for the
purpose of visualization.
Latent semantic indexing is the application of a particular mathematical
technique, called Singular Value Decomposition or SVD, to a word-by-document
matrix. SVD (and hence LSI) is a least-squares method. The projection into the
latent semantic space is chosen such that the representations in the original space
are changed as little as possible when measured by the sum of the squares of the
differences. 

> -- <cite>Rosario. B. [Latent Semantic Indexing: An Overview](https://www.cse.msu.edu/~cse960/Papers/LSI/LSI.pdf) </cite>


In [8]:
# Latent Semantic Indexing Model using Truncated SVD.
# random_state is fixed because the default solver is randomized; this keeps
# the components identical between runs.
lsi = TruncatedSVD(n_components=NumTopics, random_state=42)
data_lsi = lsi.fit_transform(data_vectorized)

In [9]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms for every topic of ``model``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``components_``; each row is treated as one
        topic holding a weight per vocabulary term.
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` (newer
        scikit-learn) or ``get_feature_names()`` (older releases).
    top_n : int, default 10
        Number of terms printed per topic, in descending weight order.

    Returns
    -------
    None
        For each topic a ``Topic <idx>:`` line is printed, followed by a list
        of ``(term, weight)`` tuples.
    """
    # Resolve the vocabulary once instead of once per printed term, and prefer
    # the current accessor; get_feature_names() no longer exists in recent
    # scikit-learn releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk the last `top_n` indices backwards.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print("Topic %d:" % (idx))
        # Cast to built-in str/float so the printed tuples look the same
        # whatever NumPy scalar repr is in effect.
        print([(str(feature_names[i]), float(topic[i])) for i in top_indices])

In [10]:
# Keywords for topics clustered by Latent Dirichlet Allocation
# (top terms per topic, taken from lda.components_).
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('know', 9.786762052304141), ('day', 8.68957117267419), ('think', 8.500672207931789), ('good', 8.412240736059823), ('area', 7.122788658094914), ('state', 7.077361499636743), ('thread', 5.210995898034292), ('real', 5.146456991191833), ('hair', 4.35515226308807), ('thing', 4.248092811387372)]
Topic 1:
[('new', 22.64334194842441), ('today', 14.189993951773763), ('cases', 9.4905425039299), ('season', 6.026694894146353), ('chinese', 4.591961388840565), ('coronavirus', 4.042957331970942), ('year', 2.074395338828453), ('like', 1.9317663771987144), ('state', 1.7876166069617303), ('corona', 1.1857620719471111)]
Topic 2:
[('friends', 10.933857809026174), ('going', 8.837096365479693), ('way', 8.445376712804213), ('google', 5.908777835697299), ('mask', 4.769234891315201), ('hours', 4.332160501321969), ('work', 4.25260348491225), ('school', 2.792083750276947), ('day', 1.6793749452608968), ('like', 1.1444135703710463)]
Topic 3:
[('people', 21.17153252101002), ('covid-', 19.41398

In [11]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
print("NMF Model:")
selected_topics(nmf, vectorizer)

NMF Model:
Topic 0:
[('coronavirus', 1.1336080305803033), ('help', 1.1199209104887107), ('end', 0.8843060178051088), ('messiah', 0.868322933846821), ('proxy', 0.868322933846821), ('buddha', 0.868322933846821), ('ncov', 0.868322933846821), ('savior', 0.868322933846821), ('war', 0.868322933846821), ('need', 0.03787592941769745)]
Topic 1:
[('people', 2.856426989074114), ('covid', 0.41693528322531437), ('right', 0.38792605159497323), ('feel', 0.3414572463114982), ('job', 0.17006820714189916), ('live', 0.16995003863177263), ('masks', 0.14943887123465444), ('black', 0.14437620124965708), ('let', 0.14206228075815439), ('things', 0.14081095327660212)]
Topic 2:
[('new', 3.267390038865004), ('cases', 0.7568590300158673), ('covid', 0.47995278138834974), ('coronavirus', 0.3959271752108882), ('area', 0.33313844379507335), ('state', 0.2824960112697608), ('looking', 0.26723934749713163), ('live', 0.24985153266101007), ('chinese', 0.1585185910917167), ('friends', 0.1530436581644465)]
Topic 3:
[('time'

In [12]:
# Keywords for topics clustered by Latent Semantic Indexing (truncated SVD)
print("LSI Model:")
selected_topics(lsi, vectorizer)

LSI Model:
Topic 0:
[('coronavirus', 0.4192004906943542), ('help', 0.39543246418687805), ('end', 0.3105242372962239), ('ncov', 0.30489958301171094), ('messiah', 0.30489958301171094), ('proxy', 0.30489958301171094), ('savior', 0.30489958301171094), ('war', 0.30489958301171094), ('buddha', 0.30489958301171094), ('people', 0.061841441070258796)]
Topic 1:
[('people', 0.7618666108319347), ('know', 0.3191375143951639), ('like', 0.2537965973843211), ('new', 0.25164855118301455), ('covid', 0.15577984033206252), ('time', 0.15270084935675501), ('got', 0.13796437836125564), ('feel', 0.13292701685099478), ('right', 0.10558486578747142), ('looking', 0.10524596905744742)]
Topic 2:
[('new', 0.39131514947274704), ('time', 0.3182285571600684), ('got', 0.3065525054755035), ('covid-', 0.2695373279029546), ('old', 0.2668691410500394), ('year', 0.2510113298577992), ('corona', 0.22176196232226936), ('cases', 0.1917460694214443), ('dad', 0.1745779350377186), ('day', 0.15762625436999972)]
Topic 3:
[('time', 0

In [13]:
# Transforming an individual sentence:
# clean it with the same tokenizer used for the corpus, vectorize it with the
# already-fitted vocabulary, then ask the fitted LDA model for its topic weights.
text = spacy_tokenizer("Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.")
# A one-element list is passed in, so [0] selects the single resulting row.
x = lda.transform(vectorizer.transform([text]))[0]
print(x)

[0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.55 0.05]


# Visualizing LDA results with pyLDAvis

In [14]:
# Turn on inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Build the interactive dashboard from the fitted LDA model, the count matrix
# and the vectorizer that produced it.
# NOTE(review): mds='tsne' presumably selects t-SNE for laying out the topics
# in 2-D — confirm against the pyLDAvis documentation.
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
# Bare last expression so the notebook displays the dashboard.
dash

## How to interpret this graph?
1. Topics on the left while their respective keywords are on the right.
2. Larger topics are more frequent, and the closer two topics are, the more similar they are.
3. Selection of keywords is based on their frequency and discriminancy.

**Hover over the topics on the left to get information about their keywords on the right.**

# Visualizing LSI(SVD) scatterplot
We will visualize the data projected onto 2 LSI components to see the similarity between keywords, which is indicated by the distance between markers.

In [15]:
# Project the vocabulary terms into 2-D.
# The scatter plot labels every point with a feature name, so the projection
# needs one row per term. `data_vectorized` comes from CountVectorizer and has
# one row per document, hence the transpose: fitting on the un-transposed
# matrix would yield one point per document and the term labels would not
# correspond to the points.
# random_state is fixed so the layout is reproducible between runs.
svd_2d = TruncatedSVD(n_components=2, random_state=42)
data_2d = svd_2d.fit_transform(data_vectorized.T)

In [16]:
# Resolve the vocabulary once and reuse it for both label fields.
# get_feature_names() no longer exists in recent scikit-learn releases, so
# prefer get_feature_names_out() when the vectorizer provides it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# NOTE(review): the labels are vocabulary terms, so `data_2d` is expected to
# hold one row per term — confirm against the cell that builds it.
trace = go.Scattergl(
    x = data_2d[:,0],
    y = data_2d[:,1],
    mode = 'markers',
    marker = dict(
        color = '#FFBAD2',
        line = dict(width = 1)
    ),
    text = feature_names,
    hovertext = feature_names,
    hoverinfo = 'text'
)
# Use a dedicated name for the trace list: rebinding `data` here would replace
# the DataFrame loaded at the top of the notebook and break re-running the
# tokenization cell afterwards.
scatter_traces = [trace]
iplot(scatter_traces, filename='scatter-mode')