In [1]:
# installs.
!python -m spacy download en_core_web_lg # largest english model from spacy

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.4MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180945 sha256=f9b4e439ac8ae940af6b40dd00d143302947991aa2969ac6bf5cad0f594630d8
  Stored in directory: /tmp/pip-ephem-wheel-cache-s0zz9tfz/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [1]:
# imports.
import pandas as pd
import re
import spacy 
from spacy.tokenizer import Tokenizer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# read in the Yelp json file 
yelp = pd.read_json('https://raw.githubusercontent.com/CVanchieri/LambdaSchool-Sprints/master/Sprints/DS/Unit4/Sprint11_CharlesVanchieri/review_sample.json?token=AHYSXEC7KWZRBP6A5YGAUOTABNTOM', lines=True)
yelp = yelp[['business_id', 'review_id', 'text', 'cool', 'funny', 'useful', 'stars']]
print(yelp.shape)
yelp.head()

(10000, 7)


Unnamed: 0,business_id,review_id,text,cool,funny,useful,stars
0,nDuEqIyRc8YKS1q1fX0CZg,eZs2tpEJtXPwawvHnHZIgQ,"BEWARE!!! FAKE, FAKE, FAKE....We also own a sm...",1,0,10,1
1,eMYeEapscbKNqUDCx705hg,DoQDWJsNbU0KL1O29l_Xug,Came here for lunch Togo. Service was quick. S...,0,0,0,4
2,6Q7-wkCPc1KF75jZLOTcMw,DDOdGU7zh56yQHmUnL1idQ,I've been to Vegas dozens of times and had nev...,1,1,2,3
3,k3zrItO4l9hwfLRwHBDc9w,LfTMUWnfGFMOfOIyJcwLVA,We went here on a night where they closed off ...,3,4,5,1
4,6hpfRwGlOzbNv7k5eP9rsQ,zJSUdI7bJ8PNJAg4lnl_Gg,"3.5 to 4 stars\n\nNot bad for the price, $12.9...",1,0,5,4


In [3]:
yelp['text'] = yelp['text'].apply(lambda x: re.sub(r'[^a-zA-Z ^0-9]', '', x)) # keep only letters and numbers.
yelp['text'] = yelp['text'].apply(lambda x: re.sub(r'(x.[0-9])', '', x)) # remove any special characters. 
yelp['text'] = yelp['text'].replace('/', ' ') # remove additional white spaces.
yelp['text'] = yelp['text'].apply(lambda x: re.sub('  ', ' ', x)) # convert  the text to lowercase.
yelp['text'] = yelp['text'].apply(lambda x: x.lower())

In [4]:
yelp.text[4] # example reviews text 

'35 to 4 starsnot bad for the price 1299 for lunch seniors get 15 off pay at the front before eating there are hot food salads noodle bar dessert fruits fried varieties and soupseating in the middle section is a bit too tight but the booths on the side look more spaciousi think the lunch noodle bar would have more variety such as different types of greens vermicelli noodles frozen tofu etc the tomato and laksa broth were both pretty good'

In [5]:
df = yelp.copy() # copy the dataframe.

In [6]:
nlp = spacy.load("en_core_web_lg") # set the nlp and load the "en_core_web_lg" file.
tokenizer = Tokenizer(nlp.vocab) # use to tokenizer on the nlp.vocab
STOP_WORDS = nlp.Defaults.stop_words # use the nlp default stop words

In [7]:
### tokenizer pipeline, remove stop words, blank words, and lemmatize ###
tokens = [] # create a list for the tokens 
for doc in tokenizer.pipe(df['text'], batch_size=500): # run the df['text] data through the tokenizer in batch sizes, loop through each doc
    doc_tokens = [] # create a list for the doc tokens
    for token in doc: # loop through each token in the doc
        if (token.lemma_ not in STOP_WORDS) & (token.text != ' '): # if the token.lemma_ is not in the stop words and token.text is not blank space
            doc_tokens.append(token.lemma_) # add the token.lemma_ to the doc tokens list 
    tokens.append(doc_tokens) # add the doc tokens list to the tokens list 
# set the new tokens in the data frame.
df['tokens'] = tokens # add a new new tokens list to a new column 
df['tokens']

0       [beware, fake, fake, fakewe, small, business, ...
1       [come, lunch, togo, service, quick, staff, fri...
2       [ive, vega, dozen, time, step, foot, circus, c...
3       [night, close, street, party, actually, group,...
4       [35, 4, starsnot, bad, price, 1299, lunch, sen...
                              ...                        
9995    [family, hungry, subway, open, 24, hour, guy, ...
9996    [wife, come, couple, friend, sever, excite, po...
9997    [food, okay, brag, food, hot, item, tasty, hor...
9998    [today, visit, great, love, enjoy, town, squar...
9999    [absolute, wrong, place, stay, 43, year, life,...
Name: tokens, Length: 10000, dtype: object

In [8]:
df['tokens'][4] # example list of tokens

['35',
 '4',
 'starsnot',
 'bad',
 'price',
 '1299',
 'lunch',
 'senior',
 '15',
 'pay',
 'eat',
 'hot',
 'food',
 'salad',
 'noodle',
 'bar',
 'dessert',
 'fruit',
 'fry',
 'variety',
 'soupseating',
 'middle',
 'section',
 'bite',
 'tight',
 'booth',
 'look',
 'spaciousi',
 'think',
 'lunch',
 'noodle',
 'bar',
 'variety',
 'different',
 'type',
 'green',
 'vermicelli',
 'noodle',
 'freeze',
 'tofu',
 'etc',
 'tomato',
 'laksa',
 'broth',
 'pretty',
 'good']

## NearestNeighbors model

In [9]:
vects = [nlp(doc).vector for doc in df['text']] # create vectors from the text

In [10]:
nn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree') # set the nearest neighbors using balltree
nn.fit(vects) # fit the nn on the vects

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [11]:
# create the fake review
created_review = """
The indian food was magnificent! We will come back.
"""
created_review_vect = nlp(created_review).vector # create a vector for the review

In [12]:
most_similiar = nn.kneighbors([created_review_vect]) # use nn model on the created review

In [13]:
yelp.iloc[most_similiar[1][0]]['text'] # look at the similar reviews text

1580    we made a reservation via internet but got no ...
1896    we tried this place for the first time on sund...
7256    this is an uber cool event we bought our ticke...
5456    this past sunday 72918 my family an i were in ...
2810    the food here is excellent this is by far our ...
1181    this place was insane 26 for an intense seafoo...
8719    a huge shout out to the cafe325 recommend to a...
1799    overused quotes throughout the nightomg this i...
8859    absolutely the best greek food ive ever eaten ...
7623    after dinner we went to go see wanted at storm...
Name: text, dtype: object

## Star reviews

In [14]:
vect = TfidfVectorizer(stop_words=STOP_WORDS) # set the vector with TfidfVectorizer
rfc = RandomForestClassifier() # set the random forest classifier

In [15]:
### create the pipeline ###
pipe = Pipeline([
                 ('vect', vect), # vectorizer
                 ('clf', rfc) # classifier           
                ])

In [16]:
# set the parameters
parameters = {
    'vect__max_df': ( 0.5, 0.75, 1.0, 1.25, 1.50),
    'vect__min_df': (.01, .03, .05, .07, .09)
    }  
grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=1) # use GRidSearchCV on the pipe and parameters
grid_search.fit(df['text'], df['stars']) # fit the data on the grid_search

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed: 14.2min finished
  'stop_words.' % sorted(inconsistent))


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

In [17]:
grid_search.best_score_ # show the grid search best score 

0.5827

In [18]:
created_review = [created_review] # set the review to a list 
pred = grid_search.predict(created_review) # use grid search predict on the review 
created_review_stars = pd.DataFrame({'text': created_review, 'stars':pred}) # craete dataframe with reivew and stars
created_review_stars['stars'] = created_review_stars['stars'].astype('int64') # change the data type to int
created_review_stars['text'] = created_review_stars.text.replace('\n':'')
pd.set_option('display.max_colwidth',1000)
created_review_stars.head()

Unnamed: 0,text,stars
0,\nThe indian food was magnificent! We will come back.\n,5
