# Yelp Project - NLP


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('last_2yr_restaurant_reviews.csv')

In [3]:
df.head(2)

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,type,useful,user_id,count
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2015-06-26,0,nCqdz-NW64KazpxqnDr0sQ,1,I mainly went for the ceasar salad prepared ta...,review,0,0XVzm4kVIAaH4eQAxWbhvw,1
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2015-06-29,0,iwx6s6yQxc7yjS7NFANZig,4,Nice atmosphere and wonderful service. I had t...,review,0,2aeNFntqY2QDZLADNo8iQQ,1


### Define the text of the review as feature variable

In [4]:
documents = df['text'].values

In [21]:
documents.dtype, documents.shape

(dtype('O'), (347619,))

In [22]:
documents[:3]

array(['I mainly went for the ceasar salad prepared tableside.  I ate in the bar, the bartender was very nice and helpful.  I got the grilled cheese with tomato soup.  Grilled cheese was very good but the soup was nothing special.  Now the salad that i read one reviewer said the best in vegas, which is the only reason i came.  Knowing that they put anchovies in it when they prepare tableside, i was going to tell them to hold off on that once they get started.  So as im waiting for them to come up and make it, they bring it already prepared.  What is that?  The whole point of getting it is to watch it being done and see that its made fresh.  So obviously the anchovies were already in it, and since i explained i didnt want them, they made another.   I was told its a fire hazard to prepare it in the bar area so they made it on the side when i wasnt looking.  The few bites i took werent that good.  So i watch them make the 2nd salad in the hallway.  Needless to say, it was totally flavorle

### Define target variable

#### Perfect (5 stars) and imperfect (1-4 stars) rating

In [23]:
df['favorable'] = (df['stars'] > 4)

In [24]:
target = df['favorable'].values

In [25]:
target[:10]

array([False, False, False, False, False,  True,  True,  True,  True,
       False])

In [26]:
target.mean()

0.46076595352958266

In [27]:
documents.shape, target.shape

((347619,), (347619,))

## Create training dataset and test dataset

In [28]:
from sklearn.cross_validation import train_test_split

In [29]:
documents_train, documents_test, target_train, target_test = train_test_split(
    documents, target, test_size=0.8, random_state=42)

In [30]:
documents_train

array(["Lived in jersey moved to Las Vegas 21 years later and set out to find Nevada's best pizzeria. If you find yourself on the same journey as us look no further than PayLess Pizza and Ribs #2. Biting into a slice of Vinny's pizza instantly brought me back to my younger days eating pizza on the beach of Seaside Heights, NJ and is the closest thing to East Coast Pizza that you can get without jumping on a plane to the east coast.",
       'First off, I have eaten Texas-Oklahoma "Real Deal" barbecue for 30+ years.\n\nBarbecue to different people across the USA undoubtably means different expectations.\n\nI like meats fresh off the smoker grill, meats with a smoke ring and ribs with meat that melts in your mouth,standing alone on the house seasonings. \nThis means an ON-SITE smoker/pit grill.\n\nIf barbecue is served heavily sauced, something is being hidden. Usually dry overcooked or re-heated leftovers.\n\nReviews show heavily sauced product here.\nWe never got to the point of seeing

## NLP representation

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [33]:
# Train the model with your training data
vectors_train = vectorizer.fit_transform(documents_train).toarray()

In [38]:
vectors_train[1,1]

0.0

In [35]:
# Get the vocab of your tfidf
words = vectorizer.get_feature_names()

In [36]:
words

['00',
 '000',
 '00pm',
 '10',
 '100',
 '10am',
 '10pm',
 '11',
 '110',
 '11am',
 '11pm',
 '12',
 '120',
 '13',
 '14',
 '15',
 '150',
 '16',
 '17',
 '18',
 '19',
 '1am',
 '1pm',
 '1st',
 '20',
 '200',
 '2014',
 '2015',
 '2016',
 '21',
 '215',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '2am',
 '2nd',
 '2pm',
 '30',
 '300',
 '30am',
 '30pm',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '3am',
 '3pm',
 '3rd',
 '40',
 '42',
 '45',
 '45pm',
 '48',
 '49',
 '4am',
 '4pm',
 '4th',
 '50',
 '500',
 '52',
 '55',
 '5pm',
 '5th',
 '60',
 '65',
 '6pm',
 '70',
 '75',
 '7pm',
 '80',
 '800',
 '85',
 '8pm',
 '90',
 '95',
 '99',
 '9pm',
 'ability',
 'able',
 'absolute',
 'absolutely',
 'abundant',
 'ac',
 'acai',
 'accent',
 'accept',
 'acceptable',
 'accepted',
 'access',
 'accessible',
 'accident',
 'accidentally',
 'accommodate',
 'accommodated',
 'accommodating',
 'accompanied',
 'accompany',
 'accompanying',
 'according',
 'accordingly',
 'account',
 'accurate',
 'ac

In [19]:
vectors_train.shape

(69523, 5000)

In [20]:
vectors_test = vectorizer.transform(documents_test).toarray()

## Similar review search engine

In [21]:
import numpy as np

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    return [labels[i] for i in np.argsort(lst)[:n]]

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
# Draw an arbitrary review from test (unseen in training) documents
some_random_number = 42
search_query = documents_test[some_random_number]
search_queries = [search_query]  # Need to be put into a list-like format
print search_query
print search_queries

Very tasty food! Atmosphere on the patio is amazing. Service was fantastic. Nothing we requested was forgotten. More bread, done! More wine, done! Food was delivered exactly as ordered. We were very impressed with our first visit to Salute. Have to give a shout out to our server, Kevin. Thanks for being so  good at your job! ;)
['Very tasty food! Atmosphere on the patio is amazing. Service was fantastic. Nothing we requested was forgotten. More bread, done! More wine, done! Food was delivered exactly as ordered. We were very impressed with our first visit to Salute. Have to give a shout out to our server, Kevin. Thanks for being so  good at your job! ;)']


In [24]:
# Transform the drawn review(s) to vector(s)
vector_search_queries = vectorizer.transform(search_queries).toarray()

In [25]:
vector_search_queries.shape

(1, 5000)

In [26]:
# Calculate the similarity score(s) between vector(s) and training vectors
similarity_scores = cosine_similarity(vector_search_queries, vectors_train)

In [27]:
n = 5
returned_reviews = get_top_values(similarity_scores[0], n, documents_train)

In [28]:
print 'Our search query:'
print search_queries[0]

Our search query:
Very tasty food! Atmosphere on the patio is amazing. Service was fantastic. Nothing we requested was forgotten. More bread, done! More wine, done! Food was delivered exactly as ordered. We were very impressed with our first visit to Salute. Have to give a shout out to our server, Kevin. Thanks for being so  good at your job! ;)


In [29]:
print 'Most %s similar reviews:' % n
for i, review in enumerate(returned_reviews):
    print '#%s:' % i
    print review

Most 5 similar reviews:
#0:
Have a very good food, and Kevin is a very nice person also in side service is very good too.
#1:
Great service, food, and atmosphere. Shout out to Mark. Thanks for making our special night great.
#2:
You must go to Kevin haha he's a funny guy and great bartender. He def made my night lol thanks Kevin
#3:
Server Kevin was awesome. I love his service. He's been so friendly all the time during the dinner. Definitely will come back again.
#4:
Another wine dinner at Flemings. Incredible food and wine pairings. Outstanding, knowledgable staff. Kevin, the wine manager is amazing!


## Classifying positive/negative review

#### Naive-Bayes Classifier

In [30]:
# Build a Naive-Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
model_nb = MultinomialNB()
model_nb.fit(vectors_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
# Get score for training set
model_nb.score(vectors_train, target_train)

0.80951627518950564

In [32]:
# Get score for test set
model_nb.score(vectors_test, target_test)

0.80082417582417587

#### Logistic Regression Classifier

In [33]:
# Build a Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
model_lrc = LogisticRegression()
model_lrc.fit(vectors_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
# Get score for training set
model_lrc.score(vectors_train, target_train)

0.84265638709491819

In [35]:
# Get score for test set
model_lrc.score(vectors_test, target_test)

0.82185648121512

#### Key features(words) make the positive prediction

In [36]:
n = 20
get_top_values(model_lrc.coef_[0], n, words)

[u'amazing',
 u'best',
 u'awesome',
 u'perfect',
 u'thank',
 u'delicious',
 u'highly',
 u'fantastic',
 u'great',
 u'incredible',
 u'phenomenal',
 u'heaven',
 u'favorite',
 u'love',
 u'gem',
 u'wow',
 u'notch',
 u'excellent',
 u'soooo',
 u'outstanding']

#### Key features(words) make the negative prediction

In [37]:
n = 20
get_bottom_values(model_lrc.coef_[0], n, words)

[u'worst',
 u'ok',
 u'rude',
 u'horrible',
 u'bland',
 u'slow',
 u'terrible',
 u'disappointing',
 u'okay',
 u'mediocre',
 u'average',
 u'overpriced',
 u'decent',
 u'poor',
 u'lacking',
 u'dry',
 u'awful',
 u'meh',
 u'lacked',
 u'reason']

#### Random Forest Classifier

In [38]:
# Build a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
model_rfc = RandomForestClassifier(max_depth=None, n_estimators=5, min_samples_leaf=10)
model_rfc.fit(vectors_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [39]:
# Get score for training set
model_rfc.score(vectors_train, target_train)

0.81098341556031817

In [40]:
# Get score for test set
model_rfc.score(vectors_test, target_test)

0.76468917208446008

#### Features (words) are important by inspecting the RFC model

In [41]:
n = 20
get_top_values(model_rfc.feature_importances_, n, words)

[u'amazing',
 u'best',
 u'great',
 u'friendly',
 u'awesome',
 u'delicious',
 u'ok',
 u'vegas',
 u'love',
 u'horrible',
 u'rude',
 u'good',
 u'favorite',
 u'okay',
 u'disappointing',
 u'excellent',
 u'wasn',
 u'terrible',
 u'wonderful',
 u'las']