Note:
    
    this notebook should be run in Vertex AI WorkBench 

In [5]:
# The following two lines are only necessary to run once.
# Comment out otherwise for speed-up.
from google.cloud.bigquery import Client, QueryJobConfig
client = Client()

query = """SELECT * FROM `teaching-project-380417.yelp002.yelp_ml_prep_001` """
job = client.query(query)
df = job.to_dataframe()

In [4]:
df

Unnamed: 0,user_useful,user_review_count,user_funny,user_fans,state,bus_stars,review_count,WiFi,review_text,stars
0,4,12,0,0,TX,2.0,9,,Get on the mailing list! They send monthly dis...,5.0
1,170,144,40,4,SD,4.5,42,u'free',Ask for Rob,5.0
2,687,77,578,10,MT,5.0,6,u'no',Ask any staff member about what they do and ho...,5.0
3,0,1,0,0,AB,2.5,70,u'free',"If coming on a Saturday, be prepared to wait, ...",1.0
4,122,152,43,11,AB,4.5,6,,Check the mindbody app or groupon for promotio...,4.0
...,...,...,...,...,...,...,...,...,...,...
995,0,1,0,0,AB,3.5,60,u'no',$$$$! but somtimes its worth it,5.0
996,169,156,71,10,AB,4.5,6,u'free',Make sure you arrive a bit early for your appo...,5.0
997,499,186,259,12,AB,3.0,10,u'free',Enjoy the free Wi-Fi that actually works!,4.0
998,5067,1578,1469,159,AB,3.5,52,'free',100% vegan,2.0


In [7]:
documents = df['review_text'].values
print(documents.shape, documents.dtype)

(406315,) object


In [8]:
documents

array(['Different menu and later hours from other location',
       "Watch for low hanging duct work if you're on the tall side",
       "Entrance has changed to Calgary Trail and not accessible from Gateway Blvd. Lots of construction still happening around there making it super hard to hear order being repeated and I'm guessing the same for those taking orders.",
       ..., 'Great new tap growler selection',
       'They offer a military discount',
       'They now close at 8!! Still super fantastic!'], dtype=object)

In [None]:
# perfect (5 stars) and imperfect (1-4 stars) rating

In [9]:
df['target'] = df['stars'] == 5
target = df['target'].values
target[:5]

array([False, False, False,  True, False])

In [10]:
df.target.mean()

0.5160306658626928

In [12]:
# create training dataset and test dataset
from sklearn.model_selection import train_test_split
documents_train, documents_test, target_train, target_test = train_test_split(
    documents, target, test_size = 0.7, random_state = 42)

In [14]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: nltk
Successfully installed nltk-3.8.1


In [15]:
# NLP representation of the documents
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
# Create TfidfVectorizer, and name it vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'), max_features=2000)

# Train the model with training data
vectors_train = tfidf_vectorizer.fit_transform(documents_train).toarray()

# Get the vocab of tfidf
wordspitts = tfidf_vectorizer.get_feature_names()

# Use the trained model to transform test data
vectors_test = tfidf_vectorizer.transform(documents_test).toarray()



In [17]:
wordspitts

['00',
 '10',
 '100',
 '10am',
 '10pm',
 '11',
 '11am',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1st',
 '20',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '21',
 '22',
 '24',
 '25',
 '27',
 '2nd',
 '2pm',
 '30',
 '30pm',
 '35',
 '3pm',
 '3rd',
 '40',
 '45',
 '4pm',
 '4th',
 '50',
 '5pm',
 '60',
 '6pm',
 '75',
 '7pm',
 '8pm',
 '90',
 '95',
 '99',
 '9pm',
 'able',
 'absolute',
 'absolutely',
 'ac',
 'accept',
 'access',
 'accommodating',
 'across',
 'actual',
 'actually',
 'add',
 'added',
 'addition',
 'additional',
 'address',
 'adult',
 'advance',
 'advantage',
 'affordable',
 'afraid',
 'afternoon',
 'ago',
 'ahead',
 'air',
 'airport',
 'al',
 'alcohol',
 'ale',
 'allow',
 'allowed',
 'almond',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'alternative',
 'although',
 'always',
 'amazing',
 'amazingly',
 'ambiance',
 'ambience',
 'american',
 'amount',
 'anniversary',
 'another',
 'answer',
 'anymore',
 'anyone',
 'anything',
 'anytime',
 '

In [18]:
# need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    return [labels[i] for i in np.argsort(lst)[:n]]

In [19]:
# use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# Draw an arbitrary review from test (unseen in training) documents
random_number = 20
search_query = [documents_test[random_number]]
print(search_query)

['Knowledge is power!']


In [22]:
# Transform the drawn resview(s) to vector(s)
vector_search_query = tfidf_vectorizer.transform(search_query).toarray()

# Calculate the similarity score(s) between vector(s) and training vectors
similarity_scores = cosine_similarity(vector_search_query, vectors_train)

In [24]:
vectors_train.shape, vectors_test.shape , similarity_scores.shape 


((121894, 2000), (284421, 2000), (1, 121894))

In [37]:
# Let's find top 10 similar reviews
import numpy as np 
n = 10
top_similar_reviews = get_top_values(similarity_scores[0], n, documents_train)

In [33]:
top_similar_reviews

['Blackout... No power',
 'Alles inordnung...power !',
 'Power lunch!',
 'Power (Happy) Hour with @Sierra @Carla @Zarna',
 'Power breakfast sandwich.',
 "It's raining again. If the power goes out - we good!",
 'Beautiful place even with the power plant.',
 "Watch out for the for me the bouncer, he's on a major power trip.",
 'Breakfast power sandwich....a must try',
 'This location has a decent selection of power tools.',
 'There are not enough power outlets!',
 'You get a free power boost with every purchase!',
 'Only place with power. Nice people good food jiffy service. Happy',
 'Free WiFi & power plugs.  Bathrooms',
 'Under the bar ledge is a power outlet and purse hook at every other stool. Drink up, charge up!',
 'Power dining 100%.  Suit up, prep your contract, bring your client.  Make a killing.',
 'Check the phone booths in the terminals for free power outlets.',
 'Flower Power on tap, come join us!',
 'I can literally kill the spinach power salad in record time, it is that go

In [34]:
print('Our search query:')
print(search_query[0]) 

Our search query:
Knowledge is power!


In [38]:
print('query')
print(search_query)  

print('\n\nMost %s similar reviews:' % n)
for i, review in enumerate(top_similar_reviews):
    print('#%s:' % i)
    print(review)

query
['Knowledge is power!']


Most 10 similar reviews:
#0:
Blackout... No power
#1:
Alles inordnung...power !
#2:
Power lunch!
#3:
Power (Happy) Hour with @Sierra @Carla @Zarna
#4:
Power breakfast sandwich.
#5:
It's raining again. If the power goes out - we good!
#6:
Beautiful place even with the power plant.
#7:
Watch out for the for me the bouncer, he's on a major power trip.
#8:
Breakfast power sandwich....a must try
#9:
This location has a decent selection of power tools.


In [42]:
 # Classifying positive/negative review
# Naive-Bayes Classifier
# Build a Naive-Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB()

model_nb.fit(vectors_train, target_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
# Get score for training set
model_nb.score(vectors_train, target_train)

0.6514594647808752

In [43]:

# Get score for test set
model_nb.score(vectors_test, target_test)

0.6405609993636194

In [44]:



# Logistic Regression Classifier
target_train.shape
(14742,)
# Build a Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression

model_lrc = LogisticRegression()
model_lrc.fit(vectors_train, target_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(multi_class='ovr', n_jobs=1, solver='liblinear')

In [45]:
# Get score for training set
model_lrc.score(vectors_train, target_train)

0.6678343478760235

In [46]:

# Get score for test set
model_lrc.score(vectors_test, target_test)

0.6520791362100549

In [47]:
# Q: What are the key features(words) that make the positive prediction?
# Let's find it out by ranking
n = 50
get_top_values(model_lrc.coef_[0], n, wordspitts)

['charro',
 'deme',
 'seca',
 'best',
 'amazing',
 'bri',
 'maggie',
 'heaven',
 'favorite',
 'professional',
 'highly',
 'mocha',
 'ftw',
 'jose',
 'regret',
 'delicious',
 'loved',
 'incredible',
 'awesome',
 'love',
 'honest',
 'reasonably',
 'disappoint',
 'cuts',
 'die',
 'wonderful',
 'advance',
 'everything',
 'great',
 'excellent',
 'fabulous',
 'fantastic',
 'perfection',
 'unbelievable',
 'phenomenal',
 'tucson',
 'fav',
 'beautiful',
 'boise',
 'killer',
 'unique',
 'notch',
 'knowledgeable',
 'perfect',
 'products',
 'row',
 'brazilian',
 'drip',
 'exceptional',
 '100']

In [48]:
# Q: What are the key features(words) that make the negative prediction?
# Let's find it out by ranking
n = 30
get_bottom_values(model_lrc.coef_[0], n, wordspitts)

['worst',
 'horrible',
 'rude',
 'terrible',
 'mediocre',
 'overpriced',
 'bland',
 'poor',
 'disgusting',
 'sucks',
 'worse',
 'slow',
 'somewhere',
 'average',
 'waste',
 'awful',
 'ok',
 'gross',
 'okay',
 'bother',
 'decent',
 'poisoning',
 'unprofessional',
 'meh',
 'managers',
 'however',
 'ugh',
 'told',
 'avoid',
 'expensive']

In [49]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
model_rfc = RandomForestClassifier(max_depth = 25, n_estimators = 200, random_state = 42)

model_rfc.fit(vectors_train, target_train)

RandomForestClassifier(max_depth=25, n_estimators=200, random_state=42)

In [50]:
# Get score for training set
model_rfc.score(vectors_train, target_train)

0.6552168277355735

In [51]:
# Get score for test set
model_rfc.score(vectors_test, target_test)

0.6364016721690733

In [None]:
model_rfc

# Q: What do you see from the training score and the test score?
The trainng score and the test score are comparable, seems no negative impact of overfitting.

And the scores fo Random forest are slightly better than Logistic regression. If the data size is larger, the advantage fo Random forest may be more obvious.



In [52]:
# Q: Can you tell what features (words) are important by inspecting the RFC model?
n = 50
get_top_values(model_rfc.feature_importances_, n, wordspitts)

['best',
 'amazing',
 'great',
 'love',
 'awesome',
 'delicious',
 'horrible',
 'rude',
 'worst',
 'food',
 'excellent',
 'terrible',
 'favorite',
 'slow',
 'place',
 'ok',
 'friendly',
 'highly',
 'service',
 'bad',
 'money',
 'poor',
 'always',
 'overpriced',
 'staff',
 'avoid',
 'everything',
 'ever',
 'good',
 'town',
 'mediocre',
 'wonderful',
 'fantastic',
 'awful',
 'try',
 'loved',
 'sucks',
 'somewhere',
 'recommend',
 'waste',
 'decent',
 'average',
 'fresh',
 'go',
 'minutes',
 'atmosphere',
 'must',
 'hour',
 'bland',
 'expensive']