# Libraries and dataset import

In [19]:
%matplotlib inline
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
warnings.simplefilter("ignore", DeprecationWarning)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import pickle

from sklearn.model_selection import GridSearchCV

In [20]:
# Interactive visualisation for evaluation of LDA model 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [21]:
amazon2017_string=pd.read_pickle('../amazon2017_string.pkl')

In [22]:
samsung=amazon2017_string[amazon2017_string.brand == 'Samsung']

# VECTORS :Count vectorizer & Tf-IDF Vectorizer

Bag-of-Words Model We cannot work with text directly when using machine learning algorithms.
Instead, we need to convert the text to numbers. = > CountVectorizer

The CountVectorizer provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.
Create an instance of the CountVectorizer class.
Call the fit() function in order to learn a vocabulary from one or more documents.
Call the transform() function on one or more documents as needed to encode each as a vector.

In [23]:
# Before we can apply LDA or NMF, we need to create vocabulary of all the words in our data, 
# vectorized matrix of the vocabulary.

# COUNTVECTORIZER
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, strip_accents='ascii', stop_words='english')
doc_term_matrix_cv = count_vectorizer.fit_transform(samsung.text_processed) # vocabulary encoded into vectors with countvectorizer

# IF-IDF VECTORIZER
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, strip_accents='ascii', stop_words="english")
matrix_tfidfvect = tfidf_vectorizer.fit_transform(samsung.text_processed)

In [24]:
# Function to print the vocabulary based on vectorizer method
def print_vocabulary(vectorizer_method):
    list_vocab=list(vectorizer_method.vocabulary_.items())
    
    list_vocab.sort(reverse=True) #sort in reverse order
    print('Descending order is',list_vocab[:20])
    
def print_info(matrix_entry):
    shape_matrix=matrix_entry.shape # print total number of features
    arr_matrix=matrix_entry.toarray() # array representation of the matrix
    return(print(shape_matrix), print(arr_matrix))

In [25]:
print_vocabulary(count_vectorizer)
print()
print_vocabulary(tfidf_vectorizer)

Descending order is [('zte', 10555), ('zooming', 10554), ('zoomed', 10553), ('zoom', 10552), ('zones', 10551), ('zone', 10550), ('zona', 10549), ('zombie', 10548), ('zmax', 10547), ('zizo', 10546), ('zippier', 10545), ('zipcode', 10544), ('zip', 10543), ('zilch', 10542), ('zerolemon', 10541), ('zero', 10540), ('zenfone', 10539), ('zen', 10538), ('zbfatima', 10537), ('yrs', 10536)]

Descending order is [('zte', 10555), ('zooming', 10554), ('zoomed', 10553), ('zoom', 10552), ('zones', 10551), ('zone', 10550), ('zona', 10549), ('zombie', 10548), ('zmax', 10547), ('zizo', 10546), ('zippier', 10545), ('zipcode', 10544), ('zip', 10543), ('zilch', 10542), ('zerolemon', 10541), ('zero', 10540), ('zenfone', 10539), ('zen', 10538), ('zbfatima', 10537), ('yrs', 10536)]


In [26]:
print_info(doc_term_matrix_cv)
print()
print_info(matrix_tfidfvect)

(29505, 10556)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

(29505, 10556)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


(None, None)

In [27]:
# Materialize the sparse data
pd.DataFrame(doc_term_matrix_cv.todense(), columns=count_vectorizer.get_feature_names()).tail(2)

Unnamed: 0,aaaaa,aback,abajo,abandon,abandoning,abd,abierto,abilities,ability,abit,...,zizo,zmax,zombie,zona,zone,zones,zoom,zoomed,zooming,zte
29503,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29504,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Materialize the sparse data
pd.DataFrame(matrix_tfidfvect.todense(), columns=tfidf_vectorizer.get_feature_names()).tail(2)

Unnamed: 0,aaaaa,aback,abajo,abandon,abandoning,abd,abierto,abilities,ability,abit,...,zizo,zmax,zombie,zona,zone,zones,zoom,zoomed,zooming,zte
29503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Check the Sparsicity
Sparsicity is the percentage of non-zero datapoints in the document-word matrix, that is data_vectorized.

Since most cells in this matrix will be zero, the question is what percentage of cells contain non-zero values.

In [29]:
# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity - percentage of cells containing non-zero values:", ((doc_term_matrix_cv.todense() > 0).sum()/doc_term_matrix_cv.todense().size)*100, "%")

Sparsicity - percentage of cells containing non-zero values: 0.14354379149358376 %


In [30]:
# https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/

# Building the Topic Model : Latent Dirichlet Allocation (LDA)  &  Non-Negative Matrix Factorization (NMF)

We have everything required to train the LDA model. In addition to the corpus and dictionary, need to provide the number of topics as well.
tried n of topics from 3 to 10, then after GridSerachCV - found out 5-6

Unsupervised machine learning. In this project only LDA and NMF models are used. There are however other existing ways (not studied here)

- LSA matrix decomposition
    - Latent semantic analysis

- Probabilistic inference
    - pLSA probabilistic LSA
    - LDA

In [31]:
number_components = 5
no_top_words = 20
# Function to print the topics
def display_topics(model, feature_names, no_top_words):    
    for topic_idx, topic in enumerate(model.components_):        
        print("Topic %d:" % (topic_idx))       
        print (",".join([feature_names[i]                       
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

## Both models on CV

In [32]:
model_LDA_cv = LatentDirichletAllocation(n_components=number_components)
model_LDA_cv.fit_transform(doc_term_matrix_cv)

array([[0.0290496 , 0.02891718, 0.12401535, 0.02873973, 0.78927814],
       [0.26897001, 0.06731604, 0.52766002, 0.06730662, 0.0687473 ],
       [0.04059865, 0.04085262, 0.83820562, 0.04029031, 0.0400528 ],
       ...,
       [0.02522593, 0.02515986, 0.89893111, 0.02542949, 0.02525361],
       [0.02881289, 0.02878882, 0.88484204, 0.02858289, 0.02897336],
       [0.54766502, 0.42365121, 0.00957158, 0.00955348, 0.00955872]])

In [33]:
model_NMF_cv = NMF(n_components=number_components)
model_NMF_cv.fit_transform(doc_term_matrix_cv)

array([[0.        , 0.        , 0.        , 0.12213476, 0.        ],
       [0.0006574 , 0.00366784, 0.        , 0.00460132, 0.00704501],
       [0.        , 0.01843895, 0.0082014 , 0.00117792, 0.01674442],
       ...,
       [0.03143396, 0.07610508, 0.        , 0.12125006, 0.        ],
       [0.        , 0.1595353 , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.17469433]])

In [34]:
print('COUNT VECTORIZER. TWO MODELS')
t0 = time.time()
print('LDA MODEL TOPICS')
display_topics(model_LDA_cv, count_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

print()

print('NMF MODEL TOPICS')
display_topics(model_NMF_cv, count_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

COUNT VECTORIZER. TWO MODELS
LDA MODEL TOPICS
Topic 0:
new,sim,card,unlocked,came,charger,like,box,att,brand,works,verizon,great,good,samsung,condition,refurbished,used,tmobile,got
Topic 1:
work,buy,samsung,verizon,return,bought,unlocked,working,dont,months,seller,use,got,att,time,star,amazon,product,service,new
Topic 2:
great,good,stars,works,love,new,price,like,product,excellent,condition,perfect,happy,far,nice,buy,awesome,fast,quality,looks
Topic 3:
samsung,apps,android,iphone,gb,use,app,excelente,storage,telefono,bixby,apple,google,memory,fingerprint,galaxy,sd,bueno,buen,using
Topic 4:
battery,screen,life,good,camera,like,note,use,samsung,great,really,dont,im,better,time,ive,galaxy,fast,day,charge
Done in 0.039s.

NMF MODEL TOPICS
Topic 0:
samsung,note,screen,galaxy,camera,use,best,really,love,time,better,case,device,gb,buy,pen,fast,im,features,edge
Topic 1:
great,works,price,love,stars,condition,fast,product,far,camera,quality,deal,came,value,buy,perfect,looks,awesome,easy,excelle

## Both model on Tf-idf vectorizer

In [43]:
model_LDA_tf= LatentDirichletAllocation(n_components=number_components)
model_LDA_tf.fit(matrix_tfidfvect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [44]:
model_NMF_tf = NMF(n_components=number_components)
model_NMF_tf.fit_transform(matrix_tfidfvect)

array([[0.        , 0.        , 0.07232935, 0.        , 0.        ],
       [0.        , 0.12786186, 0.        , 0.00521772, 0.        ],
       [0.01989308, 0.001946  , 0.00816752, 0.02174253, 0.        ],
       ...,
       [0.02792951, 0.        , 0.10040305, 0.00850954, 0.00240272],
       [0.05990107, 0.        , 0.        , 0.00355847, 0.00077252],
       [0.        , 0.        , 0.        , 0.03956967, 0.        ]])

In [45]:
print('TF-IDF VECTORIZER. TWO MODELS')
t0 = time.time()
print('LDA MODEL')
display_topics(model_LDA_tf, tfidf_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

print()

print('NMF MODEL')
display_topics(model_NMF_tf, tfidf_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

TF-IDF VECTORIZER. TWO MODELS
LDA MODEL
Topic 0:
excelente,stars,loves,nice,ok,bueno,awesome,gift,telefono,buen,bien,producto,battery,great,husband,life,perfecto,wife,celular,llego
Topic 1:
battery,screen,great,new,good,samsung,like,charger,buy,dont,charge,use,life,got,months,im,note,work,bought,time
Topic 2:
great,stars,love,good,works,new,like,excellent,perfect,product,price,condition,happy,far,brand,looks,expected,best,described,nice
Topic 3:
unlocked,work,verizon,star,att,sim,locked,tmobile,network,card,sprint,carrier,return,use,compatible,buy,service,activate,mobile,unlock
Topic 4:
good,easy,use,stars,great,thanks,value,nice,quality,liked,love,mom,personal,product,price,far,bad,money,set,timely
Done in 0.040s.

NMF MODEL
Topic 0:
great,works,price,product,condition,value,buy,awesome,deal,quality,fast,far,purchase,camera,service,described,problems,loves,shipping,issues
Topic 1:
stars,excellent,nice,perfect,ok,thanks,works,expected,excelente,product,happy,condition,advertised,like,w

## LDA vizualisation on both count and tfidf vectorizer

In [41]:
pyLDAvis.sklearn.prepare(model_LDA_tf, matrix_tfidfvect, tfidf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [47]:
pyLDAvis.sklearn.prepare(model_LDA_cv, doc_term_matrix_cv, count_vectorizer)
%time

Wall time: 0 ns


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# GridSearchCV. Find out number of topics for LDA (tfIDF vectorized matrix is used)

In [64]:
from sklearn.model_selection import GridSearchCV
# Define Search Param
search_params = {'n_components': [5, 7, 10, 15], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(matrix_tfidfvect)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='batch',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                   

In [116]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(matrix_tfidfvect))

Best Model's Params:  {'learning_decay': 0.7, 'n_components': 5}
Best Log Likelihood Score:  -297422.38015523285
Model Perplexity:  4924.579152343306


Estimation of number of topics based on "Samsung" dataset

- 2, 3, 5, 10 components: the minimum of 2 is "recommended" by GridSearchCV
search_params = {'n_components': [2, 3, 5, 10], 'learning_decay': [.5, .7, .9]}                                              
Best Model's Params:  {'learning_decay': 0.7, 'n_components': 2}
Best Log Likelihood Score:  -277760.92158545385
Model Perplexity:  3345.039938342581

- 3, 5 or 9 components: Here 3 topics - also minimum
search_params = {'n_components': [3, 5, 9], 'learning_decay': [.5, .7, .9]}
Best Model's Params:  {'learning_decay': 0.7, 'n_components': 3}
Best Log Likelihood Score:  -283679.74330586474
Model Perplexity:  3960.19078360036

- 5, 7, 10, 15 components: 5 topics - also minimum number given in the param list
search_params = {'n_components': [5, 7, 10, 15], 'learning_decay': [.5, .7, .9]}
Best Model's Params:  {'learning_decay': 0.7, 'n_components': 5}
Best Log Likelihood Score:  -297422.38015523285
Model Perplexity:  4924.579152343306

In [122]:
# will use number of topics around 5 or 6