# Import package for data processing and LDA model.

In [1]:
import os, csv, nltk, lda
import pandas as pd
import numpy as np
from nltk.tokenize import PunktSentenceTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

from nltk.tokenize import PunktSentenceTokenizer,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Prepare packages to exclude punctuations, stopwords, alphabets and digits. And lemmatizing the words.

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/hungting-
[nltk_data]     yi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/hungting-
[nltk_data]     yi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hungting-
[nltk_data]     yi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
lda_df_verified1 = pd.read_csv("verified_reviews.csv",encoding='utf8')
lda_df_verified1

Unnamed: 0,verified_id,reviews
0,1,camera full featured entry level seasoned pro...
1,2,back hay day film owned canon cameras tell ma...
2,3,given camera yet developed posted photos came...
3,4,rebel easy use automatic modes especially one...
4,5,much research beat deal felt like huge risk b...
5,6,camera takes fantastic pictures compared poin...
6,7,purchased canon rebel best buy days ago antic...
7,8,wanted good photographing birds lake scenery ...
8,9,great camera exactly described always loved t...
9,10,camera exactly looking want get photography t...


In [4]:
lda_df_verified1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3459 entries, 0 to 3458
Data columns (total 2 columns):
verified_id    3459 non-null int64
reviews        3458 non-null object
dtypes: int64(1), object(1)
memory usage: 54.1+ KB


In [5]:
lda_df_verified1=lda_df_verified1.dropna()

In [6]:
restaurant_name = "verified_id"  #input('provide the column name for restaurant names: ')
restaurant_review = "reviews"  #input('provide the column name for restaurant reviews: ')
ntopics= input('Provide the number of latent topics to be estimated: ')


word_tokenizer=RegexpTokenizer(r'\w+')
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_nltk=set(stopwords.words('english'))


def tokenize_text(version_desc):
    lowercase=version_desc.lower()
    text = wordnet_lemmatizer.lemmatize(lowercase)
    tokens = word_tokenizer.tokenize(text)
    return tokens

vec_words = CountVectorizer(tokenizer=tokenize_text,stop_words=stopwords_nltk,decode_error='ignore')
total_features_words = vec_words.fit_transform(lda_df_verified1["reviews"])

print(total_features_words.shape)

model = lda.LDA(n_topics=int(ntopics), n_iter=500, random_state=1)
model.fit(total_features_words)




topic_word = model.topic_word_
doc_topic=model.doc_topic_
doc_topic=pd.DataFrame(doc_topic)
lda_df_verified1=lda_df_verified1.join(doc_topic)
restaurant=pd.DataFrame()

for i in range(int(ntopics)):
    topic="topic_"+str(i)
    restaurant[topic]=lda_df_verified1.groupby([restaurant_name])[i].mean()

restaurant=restaurant.reset_index()
topics=pd.DataFrame(topic_word)
topics.columns=vec_words.get_feature_names()
topics1=topics.transpose()
print ("Topics word distribution written in file topic_word_dist_re3.xlsx ")
topics1.to_excel("topic_word_dist_re3.xlsx")
restaurant.to_excel("bestbuy_topic_dist_re3.xlsx",index=False)
print ("BestBuy topic distribution written in file bestbuy_topic_dist_re3.xlsx ")

Provide the number of latent topics to be estimated: 3


INFO:lda:n_documents: 3458
INFO:lda:vocab_size: 2717
INFO:lda:n_words: 40858
INFO:lda:n_topics: 3
INFO:lda:n_iter: 500


(3458, 2717)


  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -320323
INFO:lda:<10> log likelihood: -274925
INFO:lda:<20> log likelihood: -272567
INFO:lda:<30> log likelihood: -271110
INFO:lda:<40> log likelihood: -269941
INFO:lda:<50> log likelihood: -268680
INFO:lda:<60> log likelihood: -268442
INFO:lda:<70> log likelihood: -267886
INFO:lda:<80> log likelihood: -267335
INFO:lda:<90> log likelihood: -267384
INFO:lda:<100> log likelihood: -267007
INFO:lda:<110> log likelihood: -266805
INFO:lda:<120> log likelihood: -266513
INFO:lda:<130> log likelihood: -266456
INFO:lda:<140> log likelihood: -266322
INFO:lda:<150> log likelihood: -265984
INFO:lda:<160> log likelihood: -265763
INFO:lda:<170> log likelihood: -265823
INFO:lda:<180> log likelihood: -265749
INFO:lda:<190> log likelihood: -265903
INFO:lda:<200> log likelihood: -265637
INFO:lda:<210> log likelihood: -265708
INFO:lda:<220> log likelihood: -265291
INFO:lda:<230> log likelihood: -265579
INFO:lda:<240> log 

Topics word distribution written in file topic_word_dist_re3.xlsx 
BestBuy topic distribution written in file bestbuy_topic_dist_re3.xlsx 


In [7]:
lda_df_unverified1 = pd.read_csv("unverified_reviews.csv",encoding='utf8')
lda_df_unverified1

Unnamed: 0,unverified_id,reviews
0,1,great starter camera kit comes everything nee...
1,2,wanted upgrade without paying entire life sav...
2,3,first hate using quality long point shoot cam...
3,4,first camera totally amazed resolution pictur...
4,5,money ever spent personally camera excited so...
5,6,wanting years wanting spend much money needin...
6,7,purchased camera used lot far used say amazin...
7,8,owning using sold determined would get anothe...
8,9,camera easy use take amazing pictures feature...
9,10,wife loves camera got bells wants get profess...


In [8]:
lda_df_unverified1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 2 columns):
unverified_id    31 non-null int64
reviews          31 non-null object
dtypes: int64(1), object(1)
memory usage: 576.0+ bytes


In [9]:
lda_df_unverified1=lda_df_unverified1.dropna()

In [10]:
restaurant_name = "unverified_id"  #input('provide the column name for restaurant names: ')
restaurant_review = "reviews"  #input('provide the column name for restaurant reviews: ')
ntopics= input('Provide the number of latent topics to be estimated: ')


word_tokenizer=RegexpTokenizer(r'\w+')
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_nltk=set(stopwords.words('english'))


def tokenize_text(version_desc):
    lowercase=version_desc.lower()
    text = wordnet_lemmatizer.lemmatize(lowercase)
    tokens = word_tokenizer.tokenize(text)
    return tokens

vec_words = CountVectorizer(tokenizer=tokenize_text,stop_words=stopwords_nltk,decode_error='ignore')
total_features_words = vec_words.fit_transform(lda_df_unverified1["reviews"])

print(total_features_words.shape)

model = lda.LDA(n_topics=int(ntopics), n_iter=500, random_state=1)
model.fit(total_features_words)




topic_word = model.topic_word_
doc_topic=model.doc_topic_
doc_topic=pd.DataFrame(doc_topic)
lda_df_unverified1=lda_df_unverified1.join(doc_topic)
restaurant=pd.DataFrame()

for i in range(int(ntopics)):
    topic="topic_"+str(i)
    restaurant[topic]=lda_df_unverified1.groupby([restaurant_name])[i].mean()

restaurant=restaurant.reset_index()
topics=pd.DataFrame(topic_word)
topics.columns=vec_words.get_feature_names()
topics1=topics.transpose()
print ("Topics word distribution written in file untopic_word_dist_re2.xlsx ")
topics1.to_excel("untopic_word_dist_re2.xlsx")
restaurant.to_excel("unbestbuy_topic_dist_re2.xlsx",index=False)
print ("BestBuy topic distribution written in file unbestbuy_topic_dist_re2.xlsx ")

Provide the number of latent topics to be estimated: 2


INFO:lda:n_documents: 31
INFO:lda:vocab_size: 390
INFO:lda:n_words: 806
INFO:lda:n_topics: 2
INFO:lda:n_iter: 500
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -6737
INFO:lda:<10> log likelihood: -6052
INFO:lda:<20> log likelihood: -5992


(31, 390)


INFO:lda:<30> log likelihood: -6000
INFO:lda:<40> log likelihood: -5983
INFO:lda:<50> log likelihood: -5982
INFO:lda:<60> log likelihood: -5975
INFO:lda:<70> log likelihood: -5980
INFO:lda:<80> log likelihood: -5979
INFO:lda:<90> log likelihood: -5962
INFO:lda:<100> log likelihood: -5956
INFO:lda:<110> log likelihood: -5975
INFO:lda:<120> log likelihood: -5960
INFO:lda:<130> log likelihood: -5956
INFO:lda:<140> log likelihood: -5946
INFO:lda:<150> log likelihood: -5936
INFO:lda:<160> log likelihood: -5954
INFO:lda:<170> log likelihood: -5949
INFO:lda:<180> log likelihood: -5969
INFO:lda:<190> log likelihood: -5942
INFO:lda:<200> log likelihood: -5940
INFO:lda:<210> log likelihood: -5960
INFO:lda:<220> log likelihood: -5921
INFO:lda:<230> log likelihood: -5917
INFO:lda:<240> log likelihood: -5929
INFO:lda:<250> log likelihood: -5918
INFO:lda:<260> log likelihood: -5910
INFO:lda:<270> log likelihood: -5917
INFO:lda:<280> log likelihood: -5911
INFO:lda:<290> log likelihood: -5913
INFO:lda

Topics word distribution written in file untopic_word_dist_re2.xlsx 
BestBuy topic distribution written in file unbestbuy_topic_dist_re2.xlsx 
