# Libraries and dataset import

In [1]:
%matplotlib inline
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
warnings.simplefilter("ignore", DeprecationWarning)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import pickle

from sklearn.model_selection import GridSearchCV

In [2]:
# Interactive visualisation for evaluation of LDA model 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

  from collections import Iterable


In [3]:
amazon2017_string=pd.read_pickle('../amazon2017_string.pkl')

In [4]:
samsung=amazon2017_string[amazon2017_string.brand == 'Samsung']

# VECTORS :Count vectorizer & Tf-IDF Vectorizer

In [5]:
# Before we can apply LDA or NMF, we need to create vocabulary of all the words in our data, 
# vectorized matrix of the vocabulary.

# COUNTVECTORIZER
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, strip_accents='ascii', stop_words='english')
doc_term_matrix_cv = count_vectorizer.fit_transform(samsung.text_processed) # vocabulary encoded into vectors with countvectorizer

# IF-IDF VECTORIZER
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, strip_accents='ascii', stop_words="english")
matrix_tfidfvect = tfidf_vectorizer.fit_transform(samsung.text_processed)

# 10 topics. Latent Dirichlet Allocation (LDA)  &  Non-Negative Matrix Factorization (NMF)

In [6]:
# n_samples = 2000
# n_features = 1000
# number_components = 10
number_components = 10
no_top_words = 15
# Function to print the topics
def display_topics(model, feature_names, no_top_words):    
    for topic_idx, topic in enumerate(model.components_):        
        print("Topic %d:" % (topic_idx))       
        print (",".join([feature_names[i]                       
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

## Both models on CV

In [7]:
model_LDA_cv = LatentDirichletAllocation(n_components=number_components)
model_LDA_cv.fit_transform(doc_term_matrix_cv)

array([[0.01428849, 0.34702411, 0.37347068, ..., 0.01428576, 0.01428705,
        0.01428912],
       [0.03333946, 0.37663701, 0.03333775, ..., 0.03333379, 0.03333788,
        0.03334806],
       [0.0200008 , 0.02000516, 0.02000108, ..., 0.02000049, 0.02000118,
        0.81996725],
       ...,
       [0.01250198, 0.44803598, 0.01250048, ..., 0.0125001 , 0.01250042,
        0.45195627],
       [0.01428661, 0.01428759, 0.0142869 , ..., 0.01428638, 0.01428666,
        0.87141564],
       [0.00476243, 0.0047629 , 0.00476225, ..., 0.00476276, 0.00476236,
        0.0047622 ]])

In [8]:
model_NMF_cv = NMF(n_components=number_components)
model_NMF_cv.fit_transform(doc_term_matrix_cv)

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00354241, 0.003577  , 0.        , ..., 0.0006637 , 0.        ,
        0.00730012],
       [0.00524717, 0.01833741, 0.00916735, ..., 0.        , 0.        ,
        0.00726601],
       ...,
       [0.        , 0.07418932, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.16331676, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00634581, 0.        , 0.        , ..., 0.        , 0.        ,
        0.30837072]])

In [9]:
print('COUNT VECTORIZER. TWO MODELS')
t0 = time.time()
print('LDA MODEL TOPICS')
display_topics(model_LDA_cv, count_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

print()

print('NMF MODEL TOPICS')
display_topics(model_NMF_cv, count_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

COUNT VECTORIZER. TWO MODELS
LDA MODEL TOPICS
Topic 0:
samsung,note,camera,screen,like,use,battery,apps,good,better,galaxy,iphone,dont,android,fingerprint
Topic 1:
good,stars,condition,perfect,works,excellent,product,great,like,far,happy,came,expected,new,battery
Topic 2:
screen,battery,life,refurbished,ive,poor,note,issues,protector,bad,new,got,used,glass,scratches
Topic 3:
new,like,works,great,brand,looks,refurbished,came,sim,card,got,far,verizon,purchase,old
Topic 4:
unlocked,att,verizon,work,network,locked,tmobile,carrier,sprint,use,compatible,excelente,version,unlock,sim
Topic 5:
screen,samsung,months,working,buy,worked,ok,warranty,stopped,year,broken,time,broke,dropped,month
Topic 6:
sim,card,box,service,seller,charger,samsung,received,came,customer,new,didnt,amazon,device,come
Topic 7:
work,star,charger,gift,didnt,doesnt,got,bought,return,money,stolen,returned,nice,sent,refund
Topic 8:
battery,dont,charge,buy,money,im,bought,time,bad,use,wont,work,got,like,months
Topic 9:
great,

## Both model on Tf-idf vectorizer

In [10]:
model_LDA_tf= LatentDirichletAllocation(n_components=number_components)
model_LDA_tf.fit(matrix_tfidfvect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [11]:
model_NMF_tf = NMF(n_components=number_components)
model_NMF_tf.fit_transform(matrix_tfidfvect)

array([[0.00000000e+00, 0.00000000e+00, 7.34517515e-02, ...,
        0.00000000e+00, 0.00000000e+00, 2.00565076e-03],
       [0.00000000e+00, 1.34446115e-01, 1.31971459e-04, ...,
        0.00000000e+00, 0.00000000e+00, 4.37058819e-03],
       [2.53583480e-03, 0.00000000e+00, 2.40487923e-03, ...,
        5.80132711e-02, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.67634494e-02, 1.74863715e-04, 1.02792926e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.52455706e-02, 0.00000000e+00, 0.00000000e+00, ...,
        1.59704655e-03, 9.49738483e-04, 7.79192075e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [12]:
print('TF-IDF VECTORIZER. TWO MODELS')
t0 = time.time()
print('LDA MODEL')
display_topics(model_LDA_tf, tfidf_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

print()

print('NMF MODEL')
display_topics(model_NMF_tf, tfidf_vectorizer.get_feature_names(), no_top_words)
print("Done in %0.3fs." % (time.time() - t0))

TF-IDF VECTORIZER. TWO MODELS
LDA MODEL
Topic 0:
screen,great,battery,good,like,samsung,new,camera,life,note,use,love,price,ok,works
Topic 1:
star,work,broken,charger,bad,stars,pen,didnt,broke,unlocked,brother,buy,screen,returned,came
Topic 2:
excelente,bueno,telefono,buen,bien,producto,perfecto,celular,llego,funciona,precio,buena,nuevo,bateria,camara
Topic 3:
best,great,love,easy,good,use,samsung,stars,ive,fantastic,new,galaxy,price,like,size
Topic 4:
new,like,brand,looks,works,great,stars,perfectly,refurbished,scratches,condition,looked,perfect,excelente,came
Topic 5:
loves,gift,great,wife,bought,husband,love,loved,stars,reliable,daughter,mom,good,works,friend
Topic 6:
battery,charge,months,life,buy,working,screen,dont,stopped,work,good,hot,charger,star,bad
Topic 7:
excellent,perfect,stars,thanks,expectations,condition,metro,box,great,yes,durable,pcs,met,headphones,came
Topic 8:
stars,great,good,love,works,product,price,nice,awesome,far,condition,excellent,like,new,happy
Topic 9:
unl

## LDA vizualisation on both count and tfidf vectorizer

In [13]:
pyLDAvis.sklearn.prepare(model_LDA_tf, matrix_tfidfvect, tfidf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [17]:
pyLDAvis.sklearn.prepare(model_LDA_cv, doc_term_matrix_cv, count_vectorizer)
%time

Wall time: 0 ns


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
