In [1]:
# Step 1 Read in data
# Step 2 Preprocess text data
# Step 3 Word Embedding
# Step 4 Deep Learning

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
stop = stopwords.words('english')
import re

In [3]:
yelp=pd.read_csv('all_data20180608.csv')

In [4]:
# PreProcessing
#step 1 lower case
#step 2 punctuation
#step 3 stop word
#step 4 common word removal
#step 5 rare word removal
#step 6 token
#step 7 stemming
#step 8 lemma

In [5]:
#step 1
yelp['lower'] = yelp.text.apply(lambda x: " ".join(x.lower() for x in x.split()))
yelp.lower.head()

0    my friend gabi, i love your cute parisian inte...
1     had a good waiter, all the staff were very cool.
2    my only regret is not catching the name of our...
3    lotus of siam did not disappoint, the service ...
4    his name is carlos if you ever want to request...
Name: lower, dtype: object

In [6]:
#step 2
from nltk.tokenize import RegexpTokenizer
reg_tok = RegexpTokenizer(r'\w+')#+ is one or more
yelp['no_punc'] = yelp['lower'].apply(lambda x: ' '.join(reg_tok.tokenize(x)))
yelp.no_punc.head()

0    my friend gabi i love your cute parisian inter...
1       had a good waiter all the staff were very cool
2    my only regret is not catching the name of our...
3    lotus of siam did not disappoint the service w...
4    his name is carlos if you ever want to request...
Name: no_punc, dtype: object

In [7]:
#step 3
yelp['no_stop'] = yelp['no_punc'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
yelp.no_stop.head()

0    friend gabi love cute parisian interior dim li...
1                               good waiter staff cool
2    regret catching name server best experienced f...
3        lotus siam disappoint service great attentive
4          name carlos ever want request service great
Name: no_stop, dtype: object

In [8]:
freq = pd.Series(' '.join(yelp['no_stop']).split()).value_counts()[:20]#combining all rows and then splitting and converitign and value count
freq
#looking at these, we actually want to keep them so no need to carry out this step

food          3504
good          1925
buffet        1556
service       1554
great         1365
place         1111
vegas          882
like           764
restaurant     656
one            642
get            641
best           635
really         625
quality        611
price          596
would          552
go             539
time           539
selection      470
better         463
dtype: int64

In [9]:
#step 4
rare = pd.Series(' '.join(yelp['no_stop']).split()).value_counts()[-600:]
rare


skirt            1
hennessy         1
makino           1
ads              1
chin             1
grumbled         1
munchies         1
student          1
hurry            1
wipe             1
sickly           1
gals             1
greet            1
12am             1
asadero          1
watchout         1
elotes           1
pomodori         1
heartland        1
mystery          1
arent            1
hummer           1
yakitori         1
inexpressive     1
winners          1
frittes          1
furnishings      1
tolerant         1
3xs              1
hushed           1
                ..
aesthetic        1
chicago          1
snobbish         1
bouillabaisse    1
hotdogs          1
boulliabase      1
unknown          1
bordering        1
stewed           1
evan             1
cheery           1
ranges           1
recommends       1
mobile           1
article          1
hashbrowns       1
stacks           1
quest            1
mare             1
cardio           1
flans            1
breeze      

In [10]:
#step 5
rare = list(rare.index)
yelp['no_rare'] = yelp['no_stop'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))


In [11]:
# just did for note
from textblob import TextBlob
# not really doing that for tutorial, this is just demo of it
yelp['no_stop'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    friend gave love cut parisian interior dim lig...
1                               good waiter staff cool
2    regret catching name server best experienced f...
3         lots siam disappoint service great attentive
4           name carlo ever want request service great
Name: no_stop, dtype: object

In [12]:
#step 6
from nltk.tokenize.treebank import TreebankWordTokenizer
_word_tokenize = TreebankWordTokenizer()
yelp['token'] = yelp['no_rare'].apply(lambda x: ' '.join(_word_tokenize.tokenize(x)))
yelp.token.head()

0    friend gabi love cute parisian interior dim li...
1                               good waiter staff cool
2    regret catching name server best experienced f...
3        lotus siam disappoint service great attentive
4          name carlos ever want request service great
Name: token, dtype: object

In [13]:
#step 7
from nltk.stem.snowball import SnowballStemmer
st = SnowballStemmer("english")
yelp['stemed']=yelp['token'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
yelp.stemed.head()

0    friend gabi love cute parisian interior dim li...
1                               good waiter staff cool
2    regret catch name server best experienc far tr...
3            lotus siam disappoint servic great attent
4            name carlo ever want request servic great
Name: stemed, dtype: object

In [14]:
#step 8
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
yelp['lemma']=yelp['stemed'].apply(lambda x: " ".join([wordnet_lemmatizer.lemmatize(word) for word in x.split()]))
yelp.lemma.head(20)

0     friend gabi love cute parisian interior dim li...
1                                good waiter staff cool
2     regret catch name server best experienc far tr...
3             lotus siam disappoint servic great attent
4             name carlo ever want request servic great
5                               room beauti server good
6     servic quick price ok get pretti darn good san...
7                                 good servic good food
8     say locat decor lotus siam never life find bet...
9                              servic snappi food tasti
10    came month ago food ok initi encount cashier g...
11                       hostess waitress friend attent
12                     shout boy wesley host cool peopl
13                            waitress awesom help ball
14     servic great busi afternoon outdoor set look day
15    arriv 3pm weekday prompt seat busi patio time ...
16    happi help take mani pictur request alway kept...
17    item order mon ami gabi oyster du jour 15 

In [15]:
# Prepping the Word Embedding by getting dictionary length and max sentence length

yelp.lemma.str.len().max()

610

In [16]:
from collections import Counter

count=Counter(" ".join(yelp.lemma).split(" ")).items()
print(sorted(count))

[('', 4), ('0', 10), ('00', 33), ('000', 6), ('00am', 1), ('00pm', 2), ('03', 1), ('05', 1), ('06', 3), ('07', 2), ('1', 98), ('10', 95), ('100', 22), ('1000', 8), ('102', 2), ('104', 1), ('10am', 1), ('10lbs', 1), ('10min', 1), ('10x', 1), ('11', 21), ('110', 5), ('112', 1), ('115', 1), ('11am', 4), ('12', 23), ('120', 1), ('12pm', 1), ('13', 23), ('14', 10), ('140', 3), ('15', 79), ('150', 1), ('1500', 1), ('15pm', 2), ('16', 13), ('164', 1), ('17', 9), ('1762', 2), ('18', 22), ('19', 7), ('1st', 7), ('2', 186), ('20', 69), ('200', 9), ('2002', 2), ('2008', 1), ('2010', 3), ('2011', 1), ('2012', 1), ('2016', 2), ('20min', 3), ('21', 7), ('22', 19), ('23', 2), ('23th', 1), ('23usd', 1), ('24', 45), ('24hrs', 6), ('25', 42), ('2500', 1), ('27', 13), ('270', 4), ('28', 4), ('29', 8), ('2nd', 14), ('2nds', 2), ('2pm', 3), ('3', 182), ('30', 83), ('300', 6), ('30am', 2), ('30ish', 2), ('30pm', 6), ('33', 5), ('330', 2), ('330pm', 1), ('34', 5), ('35', 28), ('35min', 1), ('36', 3), ('37', 

In [17]:
#length of dictionary
len(count)


5248

In [19]:
#longest sentence
print(max(yelp.lemma, key=len))

like singl littl dish put tast portion deep fri broccoli chees casserol surpris favorit american plate love littl tater tot waffl fri mini fri basket piec fri chicken sweet potato fri brisket nice rub outsid like option bbq sauc red velvet whoopi pie soft point flavor authent rendit dessert varieti cupcak cooki bread pud uniqu gelato flavor made order crepe sugar free dessert ton choos midlight good amount empti spot item look good guess popular ran lowlight shrimp cold one tast bit fishi hot one head overlook spici fri fish excit dish great probabl sit meat inconsist dri item great buffet other mediocr


In [None]:
# print(model)

In [None]:
# words = list(model.wv.vocab)
# print(words)

In [None]:
# batch_size = 128
# embedding_size = 128  # Dimension of the embedding vector.
# skip_window = 1       # How many words to consider left and right.
# num_skips = 2   

In [None]:
# import tensorflow as tf
# import numpy as np
# valid_size = 16     # Random set of words to evaluate similarity on.
# valid_window = 100  # Only pick dev samples in the head of the distribution.
# valid_examples = np.random.choice(valid_window, valid_size, replace=False)
# train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
# train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
# valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [None]:
# embeddings = tf.Variable(
#     tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
# embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [20]:
# importing keras packages
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
# define documents

# define class labels
encoder = LabelEncoder()
encoder.fit(yelp.category)
encoded_Y = encoder.transform(yelp.category)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

# integer encode the documents
vocab_size = 5248
encoded_docs = [one_hot(d, vocab_size) for d in yelp.leamma]
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 102
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# define the model
model = Sequential()

model.add(Embedding(vocab_size, 102, input_length=max_length))
model.add(Flatten())
model.add(Dense(12, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
X_train, X_test, y_train, y_test = train_test_split(padded_docs,dummy_y,test_size=0.1)
model.fit(X_train, y_train, epochs=100, verbose=0)
# evaluate the model
y_pred=model.predict(X_test)


Using TensorFlow backend.


[[1097, 3668, 3285, 621, 4255, 4914, 4350, 756, 1365, 3869, 1535, 4981], [5064, 2021, 3668, 2777], [4706, 2980, 2826, 4850, 3338, 2686, 4195, 508, 3540], [179, 3679, 1074, 3469, 907, 3790], [2826, 2358, 3634, 4584, 1867, 3469, 907], [1443, 3732, 2662, 5064], [3469, 145, 607, 1373, 5024, 1442, 2041, 5064, 2159, 4934, 3083, 4786, 3890, 3540], [5064, 3469, 5064, 4981], [1090, 3315, 276, 179, 3679, 3730, 726, 5051, 1726, 3469, 884], [3469, 835, 4981, 358], [3073, 561, 2277, 4981, 1373, 550, 4631, 1441, 5064, 5172, 3979, 153, 5050, 1401, 3205, 4616, 611, 661, 2742, 3757, 2295, 3109, 4810, 561, 4749, 4986, 1058, 3216, 1472, 714, 2154, 1058, 2084, 3291, 4284, 4352, 3900, 3464, 2093, 1095, 4376, 5131, 3291, 2654, 3403, 3936], [4669, 3285, 2694, 3790], [2730, 4921, 2881, 3934, 2777, 3587], [3285, 2886, 509, 3439], [3469, 907, 2084, 1354, 3271, 4278, 3774, 1106], [148, 1349, 449, 363, 2876, 2084, 5042, 3890, 3186, 4132, 3406, 1106, 1676, 505, 4308, 1364], [3987, 1810, 3705, 4284, 2178, 172, 1407

[[1097 3668 3285 ...,    0    0    0]
 [5064 2021 3668 ...,    0    0    0]
 [4706 2980 2826 ...,    0    0    0]
 ..., 
 [2070 2769 1443 ...,    0    0    0]
 [2070 2769 1443 ...,    0    0    0]
 [2070 2769 1443 ...,    0    0    0]]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 102, 102)          535296    
_________________________________________________________________
flatten_1 (Flatten)          (None, 10404)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 12)                124860    
Total params: 660,156
Trainable params: 660,156
Non-trainable params: 0
_________________________________________________________________
None


In [34]:
# from sklearn.metrics import confusion_matrix
# conf_mat = confusion_matrix(y_test, y_pred)
# fig, ax = plt.subplots(figsize=(10,10))
# sns.heatmap(conf_mat, annot=True, fmt='d',
#             xticklabels=category_id_df.category.values, yticklabels=category_id_df.category.values)
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()
loss, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %f' % (accuracy*100))
print('loss: %f' % (loss))

Accuracy: 89.304671
loss: 0.554010


ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets

4.20661977543


[ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]


[  9.99861598e-01   5.95271667e-05   2.80780732e-05   9.18890578e-07
   1.08242202e-04   1.06071318e-04   1.73750359e-05   2.78542866e-05
   1.91618547e-05   4.65609482e-05   3.16380829e-05   9.03122236e-06]
