In [3]:
import json
import re
import numpy as np
import pandas as pd
from pprint import pprint
from nltk.corpus import stopwords

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
! python -m spacy download en

# Removing Stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords



## Read json files and convert it to csv file

In [4]:
csv_file = open('review_extracted.csv', 'w')
csv_file.write('review_id,user_id,business_id,stars,text,date\n')
with open('yelp_dataset/review.json', 'r') as review_json:
    line = review_json.readline()
    while line:
        json_object = json.loads(line)
        one_entry = str()
        one_entry = json_object['review_id'] + "," +\
                    json_object['user_id'] + "," +\
                    json_object['business_id'] + "," +\
                    str(json_object['stars']) + "," +\
                    '\"'+ json_object['text'].replace("\"", " ") + '\"' +"," +\
                    json_object['date'] + "\n"
        csv_file.write(one_entry)
#         print("review_id: ", json_object["review_id"], " user_id: ", json_object["user_id"], " business_id: ", json_object["business_id"], " stars: ", json_object["stars"], " date: ", json_object["date"]);
        line = review_json.readline()
csv_file.close()

In [5]:
df = pd.read_csv('review_extracted.csv')
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,text,date
0,Q1sbwvVQXV2734tPgoKj4Q,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,Total bill for this horrible service? Over $8G...,2013-05-07 04:34:36
1,GJXCdrto3ASJOqKeVWPi6Q,yXQM5uF2jS6es16SJzNHfg,NZnhc2sEQy3RmzKTZnqtwQ,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,2017-01-14 21:30:33
2,2TzJjDVDEuAW6MR5Vuc1ug,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0,I have to say that this office really has it t...,2016-11-09 20:09:03
3,yi0R0Ugj_xUx_Nek0-_Qig,dacAIZ6fTM6mqwW5uxkskg,ikCg8xy5JIg_NGPx-MSIDA,5.0,Went in for a lunch. Steak sandwich was delici...,2018-01-09 20:56:38
4,11a8sVPMUFtaC7_ABRkmtw,ssoyf2_x0EQMed6fgHeMyQ,b1b1eb3uo-w561D0ZfCEiQ,1.0,Today was my second out of three sessions I ha...,2018-01-30 23:07:38


## Dataset Sampling

In [8]:
print(len(df))
df_sampled = df.sample(frac=0.01,random_state=1)
print(len(df_sampled))

6685900
66859


# Remove Stopwords

In [11]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 'use']) # you can add more words into it to filter out other words
df_text_list = df_sampled['text'].tolist()
print(len(df_text_list))

## Remove keywords from the list

In [14]:
# Remove new line characters
data = [re.sub('\s+', ' ', str(sent)) for sent in df_text_list]
data = [re.sub("\'", "", sent) for sent in data]

In [16]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc

In [17]:
# Tokenize words and clean up text
data_words = list(sent_to_words(data))

In [42]:
print (data_words[0])
print (df_text_list[0])

['love', 'this', 'location', 'and', 'have', 'been', 'customer', 'for', 'well', 'over', 'year', 'have', 'always', 'had', 'pleasant', 'experience', 'and', 'the', 'staff', 'are', 'always', 'very', 'friendly']
I love this location and have been a customer for well over a year!  I have always had a pleasant experience and the staff are always very friendly.


## 9 Creating Bigram and Trigram Models

In [18]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])



['love', 'this', 'location', 'and', 'have', 'been', 'customer', 'for', 'well', 'over', 'year', 'have', 'always', 'had', 'pleasant', 'experience', 'and', 'the', 'staff', 'are', 'always', 'very', 'friendly']


In [50]:
bigram_mod[data_words[0]]
data_words[0]

['love',
 'this',
 'location',
 'and',
 'have',
 'been',
 'customer',
 'for',
 'well',
 'over',
 'year',
 'have',
 'always',
 'had',
 'pleasant',
 'experience',
 'and',
 'the',
 'staff',
 'are',
 'always',
 'very',
 'friendly']

## 10 Remove Stopwords, Make Bigrams and Lemmatize

In [22]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [29]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [53]:
data_lemmatized[0]

['love',
 'location',
 'customer',
 'well',
 'year',
 'always',
 'pleasant',
 'experience',
 'staff',
 'always',
 'friendly']

## 11 Create the Dictionary and Corpus needed Topic Modeling

In [30]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [65]:
id2word[28]

'order'

In [74]:
print(id2word.doc2bow(["food", "food", "food"]))
print(texts[1])

[(20, 3)]
['think', 'food', 'authentic', 'korean', 'food', 'everything', 'taste', 'fast', 'food', 'version', 'korean', 'food', 'food', 'complain', 'service', 'horrible', 'staff', 'member', 'seem', 'obedient', 'first', 'order', 'food', 'come', 'super', 'slowly', 'staff', 'member', 'seem', 'forget', 'ask', 'think', 'bad', 'korean_bbq', 'restaurant', 'ever', 'because', 'service', 'horrid', 'quality', 'food']


In [77]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:2]]
corpus[0]

[(0, 2),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1)]

## 12. Building the Topic Model

In [33]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

## Save the model

In [87]:
from gensim.test.utils import datapath
temp_file = datapath("lda_1_percent")
lda_model.save(temp_file)`

In [88]:
temp_file

'/home/ubuntu/anaconda3/lib/python3.7/site-packages/gensim/test/test_data/lda_1_percent'

## 13. View the topics in LDA model

In [78]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.119*"clean" + 0.022*"continue" + 0.022*"class" + 0.016*"incredibly" + '
  '0.014*"hop" + 0.014*"boba" + 0.013*"pour" + 0.012*"credit" + 0.012*"spa" + '
  '0.012*"tax"'),
 (1,
  '0.075*"room" + 0.035*"hotel" + 0.029*"kid" + 0.026*"parking" + 0.020*"area" '
  '+ 0.016*"pool" + 0.014*"plan" + 0.014*"brunch" + 0.013*"park" + '
  '0.013*"bathroom"'),
 (2,
  '0.174*"good" + 0.126*"food" + 0.043*"restaurant" + 0.043*"eat" + '
  '0.040*"price" + 0.030*"pretty" + 0.022*"burger" + 0.022*"fresh" + '
  '0.020*"everything" + 0.018*"menu"'),
 (3,
  '0.093*"tell" + 0.082*"call" + 0.036*"pay" + 0.030*"charge" + 0.030*"money" '
  '+ 0.025*"later" + 0.023*"receive" + 0.021*"issue" + 0.019*"rude" + '
  '0.018*"fix"'),
 (4,
  '0.088*"car" + 0.072*"favorite" + 0.047*"taco" + 0.031*"pork" + '
  '0.025*"mexican" + 0.021*"burrito" + 0.019*"authentic" + 0.018*"unique" + '
  '0.018*"salsa" + 0.018*"bean"'),
 (5,
  '0.100*"s" + 0.052*"beer" + 0.043*"selection" + 0.032*"cool" + 0.031*"that" '
  '+ 0.030

In [84]:
doc_lda[1][2]

[(7, [(14, 2.0)]),
 (10, [(7, 1.0)]),
 (11, [(4, 0.99999994)]),
 (12, [(7, 1.0000001)]),
 (13, [(15, 1.0)]),
 (14, [(7, 0.9851072)]),
 (15, [(15, 1.0)]),
 (16, [(2, 0.3455167), (7, 0.6544832)]),
 (17, [(2, 0.9516349), (12, 0.048365176)]),
 (18, [(2, 0.8698356), (14, 0.13016436)]),
 (19, [(7, 1.0)]),
 (20, [(2, 6.9999995)]),
 (21, [(11, 1.0)]),
 (22, [(3, 0.99999994)]),
 (23, [(15, 1.0)]),
 (24, [(15, 2.0)]),
 (25, [(15, 0.99999994)]),
 (26, [(15, 2.0)]),
 (27, [(15, 0.9755508)]),
 (28, [(13, 0.9999999)]),
 (29, [(11, 1.0)]),
 (30, [(2, 1.0)]),
 (31, [(7, 2.0)]),
 (32, [(14, 1.99427)]),
 (33, [(15, 1.0)]),
 (34, [(2, 1.0)]),
 (35, [(2, 0.87831247), (6, 0.031960063), (16, 0.08972739)]),
 (36, [(7, 2.0)]),
 (37, [(15, 1.0)])]

In [43]:
lda_model

<gensim.models.ldamodel.LdaModel at 0x7f2f7122e5c0>

## Testing on the unseen data

In [99]:
df_sampled2 = df.sample(frac=0.01,random_state=2)
df_sampled2.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,text,date
3857412,7oeO1IBLpqxmd9aNcBNMTA,m-LlVyTuMIWmUUIQxHItfg,4EaqFG4v0A8nPEkbQwi6Dw,2.0,This is the 2nd time I have gone to Meineke fo...,2016-04-07 01:18:20
6442705,3ECqq0ZjU37K5zq75e78kg,3L5VEHgrphZkz1APeVdVDg,fL-b760btOaGa85OJ9ut3w,5.0,I haven't had BBQ this good in years. We did ...,2015-03-29 16:46:29
962501,dAC0P9RZkdD6EcoiCT2GmQ,l-B4axUNxQ9FVVxeQm8t2A,RZWx7pIsINH6nVqW7ys9cg,4.0,Very good down-home cooking. I had the fried t...,2014-06-03 22:32:46


In [98]:
df_sampled.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,text,date
4634629,xw37uw9b2wFwVRLXSCtgNg,tIau2r-ox1LhC38wXX6oew,IUoj0ORP1VAV9xx4dmHP9g,5.0,I love this location and have been a customer ...,2018-09-07 20:41:56
4349124,jKxK47dz6D-ZpKqmd-oTeA,q0lV9MoRYDhyoLAQ9X91iA,hihud--QRriCYZw1zZvW4g,1.0,I think that the food is not at all authentic ...,2018-01-03 03:57:16
6056428,boaG5qtsdwR3KBwQbvnfAg,kUE8PMxx2_HTT32T16Z8Aw,zT3IxSfolhu4qU0tV0pTjg,2.0,If you schedule your appointment late in the a...,2018-06-27 01:40:52


In [103]:
def getReultsOnUnseenData(unseen_docs, lda_model):
    corpus = [id2word.doc2bow(text) for text in texts]
    lda_model.update(corpus)
    return lda_model[corpus]

In [101]:
def textPreprocessing(texts):
    data = [re.sub('\s+', ' ', str(sent)) for sent in texts]
    data = [re.sub("\'", "", sent) for sent in data]
    data_words = list(sent_to_words(data))
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    
    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    
    return data_lemmatized

In [102]:
unseen_docs = textPreprocessing(df_sampled2['text'].tolist())



In [None]:
unseen_docs_res_distri = getReultsOnUnseenData(unseen_docs=unseen_docs, lda_model=lda_model)