## Review Similarity & Sentiment Analysis

In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('last_2_years_restaurant_reviews.csv')
df.head(2)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,name,categories,avg_stars
0,--9e1ONYQuAa-CB_Rrw7Tw,1,2016-05-17,0,0Qc1THNHSapDL7cv-ZzW5g,5,What can I say.. Wowzers! Probably one of the ...,0,4LxKRRIikhr65GfPDW626w,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0
1,--9e1ONYQuAa-CB_Rrw7Tw,0,2017-01-20,0,L8lo5SKXfZRlbn1bpPiC9w,5,Went here for guys weekend. Unbelievable. Ravi...,0,nT8zgjoc-PbdBoQsFEXFLw,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0


#### Define Documents as the text of the reviews

In [4]:
documents = df['text'].values

In [5]:
documents.dtype, documents.shape

(dtype('O'), (400119,))

#### Defined favorable reviews as those reviews have five stars

In [6]:
df['favorable'] = (df['stars'] > 4)

In [7]:
target = df['favorable'].values

In [8]:
# Mean and standard deviation of "target"
target.mean(), target.std()

(0.46623379544585486, 0.49885854050021805)

#### Create a train test split

In [9]:
from sklearn.cross_validation import train_test_split



In [10]:
documents_train, documents_test, target_train, target_test = train_test_split( documents, target, test_size = 0.8, random_state = 42)

#### Get NLP representation of documents

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 5000)
vectors_train = vectorizer.fit_transform(documents_train).toarray()
words = vectorizer.get_feature_names()

In [16]:
vectors_train.shape

(80023, 5000)

In [17]:
vectors_test = vectorizer.transform(documents_test).toarray()

### 1. Similar Review Search

In [25]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
def get_top_values(lst, n, labels):
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]

In [20]:
def get_bottom_values(lst, n, labels):
    return [labels[i] for i in np.argsort(lst)[:n]]

In [30]:
# Draw an arbitrary review from test (unseen in training) documents
query = [documents_test[65]]
print(query)

["Food wasn't overpriced, but took a while to get them out. Ordered pho and teriyaki chicken. Pho was ok, but could use more spices. Had thought they would use dark meat for the teriyakichicken but they used white and I did not come out looking like the photo. Waitress was great though!"]


In [31]:
# Transform the drawn review to vector
query_vect = vectorizer.transform(query).toarray()

In [32]:
# Calculate the similarity score between vector and training vectors
similarity = cosine_similarity(query_vect, vectors_train)

In [33]:
# Find top 5 similar reviews
n = 5
top5_related = get_top_values(similarity[0], n, documents_train)

In [34]:
print('Our search query:')
print(query[0])

Our search query:
Food wasn't overpriced, but took a while to get them out. Ordered pho and teriyaki chicken. Pho was ok, but could use more spices. Had thought they would use dark meat for the teriyakichicken but they used white and I did not come out looking like the photo. Waitress was great though!


In [35]:
print('Most %s similar reviews:' % n)
for i, review in enumerate(top5_related):
    print('# %s:' % str(i+1))
    print(review)

Most 5 similar reviews:
# 1:
We came here because my Korean friend said it was very good. I think I am going to have rethink my notion of listening to him, or any Korean for that matter. So far, I have been less than impressed with both of my Korean friends telling me about a really good Pho place. Too West of Vietnam I guess. Maybe it's safer to ask them about Siberian food. That was cold. Doh. Ok, onwards to the pho.

The service is very friendly, the place is very clean as well. Yes, if you think you've stepped into an American restaurant, you are not mistaken. It really is a Vietnamese pho place. The pho however is ok. The broth is clean but it's not as tasty as I want it. Not Pho-ey enough. Man, you got to reek of pho in your hair, skin, and clothes, like you do when you leave a great pho place, to tell that it must have been a good pho place. Come on, you know it's true. Stereotype my bott.

But, if you don't want to drive down to Chinatown, then I guess this is ok and you just w

### 2. Review Sentiment Analysis

#### Naive Bayes Classifier

In [36]:
# Build a Naive-Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
model_nb = MultinomialNB()
model_nb.fit(vectors_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [37]:
# Get score for training dataset
model_nb.score(vectors_train, target_train)

0.81077940092223488

In [38]:
# Get score for testing dataset
model_nb.score(vectors_test, target_test)

0.80386196641007701

#### Logistic Regression Classifier

In [40]:
# Build a Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
model_lrc = LogisticRegression()
model_lrc.fit(vectors_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
# Get score for training set
model_lrc.score(vectors_train, target_train)

0.84514452095022685

In [42]:
# Get score for test set
model_lrc.score(vectors_test, target_test)

0.82624275217434773

#### Words that predict positive sentiment

In [43]:
n = 20
get_top_values(model_lrc.coef_[0], n, words)

['amazing',
 'best',
 'awesome',
 'incredible',
 'great',
 'phenomenal',
 'delicious',
 'thank',
 'perfect',
 'excellent',
 'fantastic',
 'love',
 'wonderful',
 'outstanding',
 'bomb',
 'gem',
 'favorite',
 'heaven',
 'perfection',
 'notch']

#### Words that predict negative sentiment

In [44]:
n = 20
get_bottom_values(model_lrc.coef_[0], n, words)

['worst',
 'horrible',
 'ok',
 'disappointing',
 'slow',
 'mediocre',
 'bland',
 'terrible',
 'okay',
 'rude',
 'decent',
 'poor',
 'dry',
 'unfortunately',
 'average',
 'overpriced',
 'awful',
 'lacking',
 'overall',
 'worse']

#### Random Forest Classifier

In [45]:
from sklearn.ensemble import RandomForestClassifier
model_rfc = RandomForestClassifier(max_depth = None, n_estimators = 5, min_samples_leaf = 10)
model_rfc.fit(vectors_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
# Get score for training set
model_rfc.score(vectors_train, target_train)

0.81457830873623838

In [47]:
# Get score for test set
model_rfc.score(vectors_test, target_test)

0.77307432770168949

#### Important words for classification from RF model

In [48]:
n = 20
get_top_values(model_rfc.feature_importances_, n, words)

['amazing',
 'best',
 'great',
 'delicious',
 'wasn',
 'awesome',
 'rude',
 'ok',
 'friendly',
 'worst',
 'order',
 'favorite',
 'minutes',
 'love',
 'horrible',
 'bland',
 'decent',
 'excellent',
 'like',
 'pretty']