# Libraries and dataset import

In [1]:
%matplotlib inline
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
warnings.simplefilter("ignore", DeprecationWarning)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import pickle

from sklearn.model_selection import GridSearchCV

In [2]:
# Interactive visualisation for evaluation of LDA model 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

  from collections import Iterable


In [3]:
amazon2017_string=pd.read_pickle('../amazon2017_string.pkl')

In [4]:
samsung=amazon2017_string[amazon2017_string.brand == 'Samsung']

# VECTORS :Count vectorizer & Tf-IDF Vectorizer

In [5]:
# Before we can apply LDA or NMF, we need to create vocabulary of all the words in our data, 
# vectorized matrix of the vocabulary.

# COUNTVECTORIZER
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, strip_accents='ascii', stop_words='english')
doc_term_matrix_cv = count_vectorizer.fit_transform(samsung.text_processed) # vocabulary encoded into vectors with countvectorizer

# IF-IDF VECTORIZER
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, strip_accents='ascii', stop_words="english")
matrix_tfidfvect = tfidf_vectorizer.fit_transform(samsung.text_processed)

# 7 topics. Latent Dirichlet Allocation (LDA)  &  Non-Negative Matrix Factorization (NMF)

In [14]:
# n_samples = 2000
# n_features = 1000
# number_components = 10
number_components = 7
no_top_words = 15
# Function to print the topics
def display_topics(model, feature_names, no_top_words):    
    for topic_idx, topic in enumerate(model.components_):        
        print("Topic %d:" % (topic_idx))       
        print (",".join([feature_names[i]                       
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

## Both models on CV

In [15]:
model_LDA_cv = LatentDirichletAllocation(n_components=number_components)
model_LDA_cv.fit_transform(doc_term_matrix_cv)

array([[0.18993663, 0.02049952, 0.22760358, ..., 0.020443  , 0.02046634,
        0.02047408],
       [0.04776648, 0.04791709, 0.38084259, ..., 0.04772336, 0.04785701,
        0.04772892],
       [0.38521896, 0.02861692, 0.02892449, ..., 0.02866111, 0.4713956 ,
        0.0285968 ],
       ...,
       [0.01786616, 0.01789993, 0.61921396, ..., 0.01788484, 0.01795354,
        0.017866  ],
       [0.02042167, 0.02041765, 0.02046694, ..., 0.02043612, 0.87736962,
        0.0204197 ],
       [0.89031631, 0.0756186 , 0.00680423, ..., 0.00681322, 0.00682665,
        0.00681026]])

In [16]:
model_NMF_cv = NMF(n_components=number_components)
model_NMF_cv.fit_transform(doc_term_matrix_cv)

array([[0.        , 0.        , 0.        , ..., 0.        , 0.19476587,
        0.        ],
       [0.00392603, 0.00368796, 0.        , ..., 0.00424943, 0.00494404,
        0.00221419],
       [0.00524245, 0.01835557, 0.0077136 , ..., 0.01777005, 0.00913892,
        0.        ],
       ...,
       [0.        , 0.07430582, 0.        , ..., 0.        , 0.20283528,
        0.        ],
       [0.        , 0.16424528, 0.        , ..., 0.00090371, 0.        ,
        0.00296772],
       [0.00489404, 0.        , 0.        , ..., 0.3054975 , 0.        ,
        0.        ]])

In [17]:
print('COUNT VECTORIZER. TWO MODELS')
t0 = time.time()
print('LDA MODEL TOPICS')
display_topics(model_LDA_cv, count_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

print()

print('NMF MODEL TOPICS')
display_topics(model_NMF_cv, count_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

COUNT VECTORIZER. TWO MODELS
LDA MODEL TOPICS
Topic 0:
unlocked,sim,att,verizon,work,card,tmobile,use,network,sprint,locked,carrier,service,good,said
Topic 1:
samsung,apps,like,card,dont,app,note,use,android,galaxy,gb,sd,im,google,new
Topic 2:
stars,good,great,product,love,excellent,price,nice,quality,excelente,telefono,bueno,ok,best,buen
Topic 3:
battery,great,good,life,camera,screen,love,note,use,like,samsung,best,fast,price,better
Topic 4:
battery,buy,working,charge,samsung,months,new,charger,amazon,bought,bad,money,dont,seller,refurbished
Topic 5:
great,new,works,like,condition,came,brand,happy,good,looks,perfect,far,love,price,refurbished
Topic 6:
screen,work,case,got,glass,broken,dropped,protector,defective,like,months,water,broke,doesnt,return
Done in 0.031s.

NMF MODEL TOPICS
Topic 0:
new,like,brand,looks,refurbished,came,works,box,im,charger,happy,buy,seller,condition,used
Topic 1:
great,works,price,love,condition,stars,fast,product,far,camera,quality,deal,came,value,buy
Topic

## Both model on Tf-idf vectorizer

In [18]:
model_LDA_tf= LatentDirichletAllocation(n_components=number_components)
model_LDA_tf.fit(matrix_tfidfvect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [19]:
model_NMF_tf = NMF(n_components=number_components)
model_NMF_tf.fit_transform(matrix_tfidfvect)

array([[0.        , 0.        , 0.07294379, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.13052225, 0.        , ..., 0.        , 0.00298958,
        0.        ],
       [0.00723251, 0.00030306, 0.00637404, ..., 0.        , 0.        ,
        0.09528956],
       ...,
       [0.02742242, 0.        , 0.10124036, ..., 0.00224488, 0.00200725,
        0.0037327 ],
       [0.05846962, 0.        , 0.        , ..., 0.00052221, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.01446004,
        0.        ]])

In [20]:
print('TF-IDF VECTORIZER. TWO MODELS')
t0 = time.time()
print('LDA MODEL')
display_topics(model_LDA_tf, tfidf_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

print()

print('NMF MODEL')
display_topics(model_NMF_tf, tfidf_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

TF-IDF VECTORIZER. TWO MODELS
LDA MODEL
Topic 0:
stars,great,good,love,works,excellent,product,price,nice,like,perfect,new,awesome,condition,far
Topic 1:
broken,reliable,defective,color,expectations,buy,enjoying,durable,money,penny,worth,screen,piece,crap,worst
Topic 2:
new,great,like,brand,good,works,came,battery,charger,looks,screen,refurbished,scratches,condition,price
Topic 3:
battery,samsung,great,camera,life,good,screen,use,note,galaxy,like,apps,love,dont,better
Topic 4:
excelente,bueno,telefono,buen,bien,producto,excelent,stars,perfecto,celular,llego,sim,funciona,card,precio
Topic 5:
unlocked,work,verizon,att,sim,buy,working,return,locked,battery,use,dont,card,star,tmobile
Topic 6:
star,loves,gift,screen,bought,wife,husband,broke,great,bad,cracked,glass,work,personal,good
Done in 0.023s.

NMF MODEL
Topic 0:
great,price,product,value,condition,buy,deal,awesome,quality,fast,purchase,camera,far,service,happy
Topic 1:
stars,excellent,nice,perfect,ok,thanks,expected,excelente,product

## LDA vizualisation on both count and tfidf vectorizer

In [21]:
pyLDAvis.sklearn.prepare(model_LDA_tf, matrix_tfidfvect, tfidf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
pyLDAvis.sklearn.prepare(model_LDA_cv, doc_term_matrix_cv, count_vectorizer)
%time