# Yelp Review
## Business Problem


In [1]:
# Step 1 Read in data
# Step 2 Preprocess text data
# Step 3 Word Embedding
# Step 4 Deep Learning

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
stop = stopwords.words('english')
import re
# importing keras packages
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences


from keras.layers import Flatten
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.models import Sequential #### required layer in our LSTM network
from keras.layers import Dense #### required layer in our LSTM network
 #### required layer in our LSTM network
from keras.layers.embeddings import Embedding #### required layer in our LSTM network
from keras.preprocessing import sequence #### Packaged preprocessing step in Keras
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [3]:
yelp=pd.read_csv('all_data20180608.csv')

In [4]:
# PreProcessing
#step 1 lower case
#step 2 punctuation
#step 3 stop word
#step 4 common word removal
#step 5 rare word removal
#step 6 token
#step 7 stemming
#step 8 lemma

In [5]:
#step 1
yelp['lower'] = yelp.text.apply(lambda x: " ".join(x.lower() for x in x.split()))
yelp.lower.head()

0    my friend gabi, i love your cute parisian inte...
1     had a good waiter, all the staff were very cool.
2    my only regret is not catching the name of our...
3    lotus of siam did not disappoint, the service ...
4    his name is carlos if you ever want to request...
Name: lower, dtype: object

In [6]:
#step 2
from nltk.tokenize import RegexpTokenizer
reg_tok = RegexpTokenizer(r'\w+')#+ is one or more
yelp['no_punc'] = yelp['lower'].apply(lambda x: ' '.join(reg_tok.tokenize(x)))
yelp.no_punc.head()

0    my friend gabi i love your cute parisian inter...
1       had a good waiter all the staff were very cool
2    my only regret is not catching the name of our...
3    lotus of siam did not disappoint the service w...
4    his name is carlos if you ever want to request...
Name: no_punc, dtype: object

In [7]:
#step 3
yelp['no_stop'] = yelp['no_punc'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
yelp.no_stop.head()

0    friend gabi love cute parisian interior dim li...
1                               good waiter staff cool
2    regret catching name server best experienced f...
3        lotus siam disappoint service great attentive
4          name carlos ever want request service great
Name: no_stop, dtype: object

In [8]:
freq = pd.Series(' '.join(yelp['no_stop']).split()).value_counts()[:20]#combining all rows and then splitting and converitign and value count
freq
#looking at these, we actually want to keep them so no need to carry out this step

food          3504
good          1925
buffet        1556
service       1554
great         1365
place         1111
vegas          882
like           764
restaurant     656
one            642
get            641
best           635
really         625
quality        611
price          596
would          552
go             539
time           539
selection      470
better         463
dtype: int64

In [9]:
#step 4
rare = pd.Series(' '.join(yelp['no_stop']).split()).value_counts()[-600:]


In [10]:
#step 5
rare = list(rare.index)
yelp['no_rare'] = yelp['no_stop'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))


In [11]:
# just did for note
from textblob import TextBlob
# not really doing that for tutorial, this is just demo of it
yelp['no_stop'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    friend gave love cut parisian interior dim lig...
1                               good waiter staff cool
2    regret catching name server best experienced f...
3         lots siam disappoint service great attentive
4           name carlo ever want request service great
Name: no_stop, dtype: object

In [12]:
#step 6
from nltk.tokenize.treebank import TreebankWordTokenizer
_word_tokenize = TreebankWordTokenizer()
yelp['token'] = yelp['no_rare'].apply(lambda x: ' '.join(_word_tokenize.tokenize(x)))
yelp.token.head()

0    friend gabi love cute parisian interior dim li...
1                               good waiter staff cool
2    regret catching name server best experienced f...
3        lotus siam disappoint service great attentive
4          name carlos ever want request service great
Name: token, dtype: object

In [13]:
#step 7
from nltk.stem.snowball import SnowballStemmer
st = SnowballStemmer("english")
yelp['stemed']=yelp['token'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
yelp.stemed.head()

0    friend gabi love cute parisian interior dim li...
1                               good waiter staff cool
2    regret catch name server best experienc far tr...
3            lotus siam disappoint servic great attent
4            name carlo ever want request servic great
Name: stemed, dtype: object

In [14]:
#step 8
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
yelp['lemma']=yelp['stemed'].apply(lambda x: " ".join([wordnet_lemmatizer.lemmatize(word) for word in x.split()]))
yelp.lemma.head(20)

0     friend gabi love cute parisian interior dim li...
1                                good waiter staff cool
2     regret catch name server best experienc far tr...
3             lotus siam disappoint servic great attent
4             name carlo ever want request servic great
5                               room beauti server good
6     servic quick price ok get pretti darn good san...
7                                 good servic good food
8     say locat decor lotus siam never life find bet...
9                              servic snappi food tasti
10    came month ago food ok initi encount cashier g...
11                       hostess waitress friend attent
12                     shout boy wesley host cool peopl
13                            waitress awesom help ball
14     servic great busi afternoon outdoor set look day
15    arriv 3pm weekday prompt seat busi patio time ...
16    happi help take mani pictur request alway kept...
17    item order mon ami gabi oyster du jour 15 

In [15]:
# Prepping the Word Embedding by getting dictionary length and max sentence length

yelp.lemma.str.len().max()

607

In [16]:
from collections import Counter

count=Counter(" ".join(yelp.lemma).split(" ")).items()
# print(sorted(count))

In [17]:
#length of dictionary
len(count)


5257

In [18]:
#longest sentence
print(max(yelp.lemma, key=len))

like singl littl dish put tast portion deep fri broccoli chees casserol surpris favorit american plate love littl bucket tater tot waffl fri mini fri basket piec fri chicken sweet potato fri brisket nice rub outsid like option bbq sauc red velvet whoopi pie soft point flavor authent rendit dessert varieti cupcak cooki bread pud uniqu gelato flavor made order crepe sugar free dessert ton choos midlight good amount empti spot item look good guess popular ran lowlight shrimp cold one tast bit fishi hot one head overlook spici fri fish excit dish great probabl sit meat dri item great buffet other mediocr


In [None]:
# embeddings = tf.Variable(
#     tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
# embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [31]:

# define documents

# define class labels
encoder = LabelEncoder()
encoder.fit(yelp.category)
encoded_Y = encoder.transform(yelp.category)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

# integer encode the documents
vocab_size = 6000
encoded_docs = [one_hot(d, vocab_size) for d in yelp.lemma]
#print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 130
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# define the model
model = Sequential()

model.add(Embedding(vocab_size, 32, input_length=max_length))
model.add(Flatten())
model.add(Dense(12, activation='sigmoid'))
#model.add(Dropout(0.3))
#model.add(Dense(12, activation='sigmoid'))
#model.add(Dropout(0.3))
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
X_train, X_test, y_train, y_test = train_test_split(padded_docs,dummy_y,test_size=0.2)
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, batch_size=64)
# evaluate the model



[[3260 1433 4121 ...    0    0    0]
 [5512 5919 5765 ...    0    0    0]
 [2608 3658 1933 ...    0    0    0]
 ...
 [4361 3918 5720 ...    0    0    0]
 [4361 3918 5720 ...    0    0    0]
 [4361 3918 5720 ...    0    0    0]]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 130, 32)           192000    
_________________________________________________________________
flatten_5 (Flatten)          (None, 4160)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 12)                49932     
Total params: 241,932
Trainable params: 241,932
Non-trainable params: 0
_________________________________________________________________
None
Train on 10041 samples, validate on 2511 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 1

<keras.callbacks.History at 0x1f5ee9dee10>

In [34]:
embedding_vecor_length = 32
model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_vecor_length,input_length=max_length))
from keras.layers import LSTM
model2.add(LSTM(100))
model2.add(Dense(12, activation='sigmoid'))
model2.add(Dense(12, activation='sigmoid'))
model2.add(Dense(12, activation='sigmoid'))
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model2.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 130, 32)           192000    
_________________________________________________________________
flatten_5 (Flatten)          (None, 4160)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 12)                49932     
Total params: 241,932
Trainable params: 241,932
Non-trainable params: 0
_________________________________________________________________
None
Train on 10041 samples, validate on 2511 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1f5f3793fd0>

In [33]:
from keras.layers import Dropout
embedding_vecor_length = 32
model3 = Sequential()
model3.add(Embedding(vocab_size, embedding_vecor_length,input_length=max_length))

model3.add(Dense(12, activation='sigmoid')) 
model3.add(Dropout(0.3))
model3.add(Dense(12, activation='sigmoid'))
model3.add(Dropout(0.3))
model3.add(Dense(12, activation='sigmoid'))
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model2.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 130, 32)           192000    
_________________________________________________________________
flatten_5 (Flatten)          (None, 4160)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 12)                49932     
Total params: 241,932
Trainable params: 241,932
Non-trainable params: 0
_________________________________________________________________
None
Train on 10041 samples, validate on 2511 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1f5eeb63240>