# Predicting Health Inspection Violaitons from Yelp Reviews and Business Attributes

## 1. Environment Setup

In [1]:
import psycopg2 as psy
import pandas as pd
import re
import numpy as np
import ast
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
import scipy.sparse
import sklearn.pipeline as pipe

## 2. Data Preparation

Connect to database and download initial datasets. These datasets are materialized views of the Yelp business data converted from JSON format, and the Toronto Inspection dataset imported from CSV. All attributes have been normalized to remove leading spaces, JSON tags, and unreadable characters.

In [2]:
#set up connection to our DB
conn = psy.connect(database="sterndsyelp", 
                        user="mvsternds", 
                        password="nyustern123!", 
                        host="sterndsyelp.cawzspvmqd5q.us-east-1.rds.amazonaws.com", 
                        port="5432"
                       )
#open cursor and check our tables in the DB
cur = conn.cursor()

In [3]:
#get Yelp checkin data
# cur.execute("SELECT * FROM public.toronto_checkins")
# checkins = pd.DataFrame(cur.fetchall())
#get Yelp review text
cur.execute("SELECT * FROM public.toronto_reviews")
reviews = pd.DataFrame(cur.fetchall())

In [4]:
reviews.columns = ['bizID','reviewID','userID','type','stars','text','useful','funny','cool','date']
#get total reviews per biz
rev = reviews['bizID'].value_counts()
rev_counts = pd.DataFrame(rev).reset_index()
rev_counts.columns = ['bizID','all_review_count']

In [5]:
# #not using this - can delete

# checkins.columns = ['bizID','type','datetime']
# #get total checkins per biz
# chks  = checkins['bizID'].value_counts()
# chk_counts = pd.DataFrame(chks).reset_index()
# chk_counts.columns = ['bizID','checkin_counts']

### Join Yelp Review Data with Inspection Dataset

#### Levenshtein Distance (in-database) 
This option joins the yelp restaurant informaiton to each inspection record where:
 * The [Levenshtein distance](https://xlinux.nist.gov/dads/HTML/Levenshtein.html) of the restaurant name from the two datasets is <3
 * The distance of the address from each dataset is <4
 * The date of the review is greater than the prior inspection date
 * The date of the review is less than or equal to inspeciton date on the record
 
Whitespace at the beginning and end of the name and address in each dataset is trimmed, and the strings are converted to uppercase before matching. The mathcing thresholds can be adjusted to increase potential for matching, or decrease false matches.

In [6]:
# The materialized view of the restaurant, inspection, and review data is "toronto_all"
cur.execute("SELECT * FROM public.toronto_all_2 where review_date is not null and attributes is not null" )
obs = pd.DataFrame(cur.fetchall())
obs.head()
obs.columns=['bizID','name','address','postal_code','neighborhood','lat','long','categories','attributes','is_open','review_cnt','hours','stars','setablishment_id','establishment_name','establishment_address','inspection_date','last_inspection','count_minor','count_sig','count_crucial','count_na','count_crucial_signficant','review_id','user_id','review_stars','review_text','useful','funny','cool','review_dt']
obs.head()

Unnamed: 0,bizID,name,address,postal_code,neighborhood,lat,long,categories,attributes,is_open,...,count_na,count_crucial_signficant,review_id,user_id,review_stars,review_text,useful,funny,cool,review_dt
0,0o9imYHyCGVvkR0NyJDCoQ,LIPSTICKDYNAMITE,992QUEENSTREETW,M6J 1H1,Ossington Strip,43.6444125,-79.4183518,"['Vegetarian', 'Cocktail Bars', 'Nightlife', '...","['Alcohol: full_bar', ""Ambience: {'romantic': ...",1,...,0,0,k1YwiLwOuvdd-mRfqTjOuQ,YKU5vy0dWFP1MLexzzVLQg,3,"""Good food, lovely service. A little loud for ...",0,0,0,2015-08-15
1,0o9imYHyCGVvkR0NyJDCoQ,LIPSTICKDYNAMITE,992QUEENSTREETW,M6J 1H1,Ossington Strip,43.6444125,-79.4183518,"['Vegetarian', 'Cocktail Bars', 'Nightlife', '...","['Alcohol: full_bar', ""Ambience: {'romantic': ...",1,...,0,1,AZyLvCxyN5DnhyY2JboEgA,fSYQ_oQ0wNGk2nMJLvNZJA,1,"""Ok so imagine that is Halloween all year arou...",0,0,0,2016-12-19
2,0o9imYHyCGVvkR0NyJDCoQ,LIPSTICKDYNAMITE,992QUEENSTREETW,M6J 1H1,Ossington Strip,43.6444125,-79.4183518,"['Vegetarian', 'Cocktail Bars', 'Nightlife', '...","['Alcohol: full_bar', ""Ambience: {'romantic': ...",1,...,0,0,0Y3ksUrPhSjdFwgYXgXPlw,2MA4jXpayOxqzul0e6yPDg,4,"""Great little cheap dive bar with a dimly lit ...",1,0,0,2016-06-15
3,0o9imYHyCGVvkR0NyJDCoQ,LIPSTICKDYNAMITE,992QUEENSTREETW,M6J 1H1,Ossington Strip,43.6444125,-79.4183518,"['Vegetarian', 'Cocktail Bars', 'Nightlife', '...","['Alcohol: full_bar', ""Ambience: {'romantic': ...",1,...,0,0,dN3IVxSXlUC8kwfCYt155w,BKF78sXmNHReyj3YTGfHtQ,4,A wonderful wonderful neighborhood bar with a ...,0,0,0,2016-08-15
4,0ORDYMDX027EhfsoFOkNcQ,BRITISHSTYLEFISHCHIPS,73COXWELLAVENUE,M4L 3B1,Upper Beach,43.6686238,-79.3172737,"['Restaurants', 'Fish & Chips']","['Alcohol: none', ""Ambience: {'romantic': Fals...",1,...,0,0,pgEUxcu0fGb7jlsC3h7zhg,VP7_Jz3wdklXvbfOHiBLLA,5,Love this place. They have the best pub style ...,0,0,0,2016-02-07


In [7]:
#create a primary key of restaurant ID and each unique inspection date for that restaurant
obs['bizID-dt'] = obs['bizID'] + "-" + obs['inspection_date'].map(str)

In [8]:
#get counts of in scope reviews for each inspeciton date of a given restaurant
in_scope_rev = obs['bizID-dt'].value_counts()
in_scope_reviews = pd.DataFrame(in_scope_rev).reset_index()
in_scope_reviews.columns = ['bizID-dt','count_reviews_in_scope']
in_scope_reviews.head()

Unnamed: 0,bizID-dt,count_reviews_in_scope
0,RwRNR4z3kY-4OsFqigY5sw-2015-11-03,248
1,jc3p5SFyt9qrrMXt6E13ig-2016-12-21,161
2,RwRNR4z3kY-4OsFqigY5sw-2016-03-15,153
3,h_4dPV9M9aYaBliH1Eoeeg-2016-05-12,141
4,trKyIRyjKqVSZmcU0AnICQ-2016-04-11,114


In [9]:
#get dummies for star rating column
obs = pd.concat([obs, pd.get_dummies(obs['review_stars'], prefix='stars')], axis=1)
obs.head()

Unnamed: 0,bizID,name,address,postal_code,neighborhood,lat,long,categories,attributes,is_open,...,useful,funny,cool,review_dt,bizID-dt,stars_1,stars_2,stars_3,stars_4,stars_5
0,0o9imYHyCGVvkR0NyJDCoQ,LIPSTICKDYNAMITE,992QUEENSTREETW,M6J 1H1,Ossington Strip,43.6444125,-79.4183518,"['Vegetarian', 'Cocktail Bars', 'Nightlife', '...","['Alcohol: full_bar', ""Ambience: {'romantic': ...",1,...,0,0,0,2015-08-15,0o9imYHyCGVvkR0NyJDCoQ-2016-03-11,0,0,1,0,0
1,0o9imYHyCGVvkR0NyJDCoQ,LIPSTICKDYNAMITE,992QUEENSTREETW,M6J 1H1,Ossington Strip,43.6444125,-79.4183518,"['Vegetarian', 'Cocktail Bars', 'Nightlife', '...","['Alcohol: full_bar', ""Ambience: {'romantic': ...",1,...,0,0,0,2016-12-19,0o9imYHyCGVvkR0NyJDCoQ-2017-01-30,1,0,0,0,0
2,0o9imYHyCGVvkR0NyJDCoQ,LIPSTICKDYNAMITE,992QUEENSTREETW,M6J 1H1,Ossington Strip,43.6444125,-79.4183518,"['Vegetarian', 'Cocktail Bars', 'Nightlife', '...","['Alcohol: full_bar', ""Ambience: {'romantic': ...",1,...,1,0,0,2016-06-15,0o9imYHyCGVvkR0NyJDCoQ-2016-09-12,0,0,0,1,0
3,0o9imYHyCGVvkR0NyJDCoQ,LIPSTICKDYNAMITE,992QUEENSTREETW,M6J 1H1,Ossington Strip,43.6444125,-79.4183518,"['Vegetarian', 'Cocktail Bars', 'Nightlife', '...","['Alcohol: full_bar', ""Ambience: {'romantic': ...",1,...,0,0,0,2016-08-15,0o9imYHyCGVvkR0NyJDCoQ-2016-09-12,0,0,0,1,0
4,0ORDYMDX027EhfsoFOkNcQ,BRITISHSTYLEFISHCHIPS,73COXWELLAVENUE,M4L 3B1,Upper Beach,43.6686238,-79.3172737,"['Restaurants', 'Fish & Chips']","['Alcohol: none', ""Ambience: {'romantic': Fals...",1,...,0,0,0,2016-02-07,0ORDYMDX027EhfsoFOkNcQ-2016-05-10,0,0,0,0,1


In [10]:
#Get counts of each star rating for a given restaurant
stars = obs.groupby('bizID-dt')[['stars_1', 'stars_2','stars_3','stars_4','stars_5']].sum().reset_index()
stars.head()

Unnamed: 0,bizID-dt,stars_1,stars_2,stars_3,stars_4,stars_5
0,-2TBP3ZGu7M-FmfoNJvbrQ-2016-09-07,0,1,0,2,0
1,-2TBP3ZGu7M-FmfoNJvbrQ-2017-01-18,1,0,1,3,1
2,-76didnxGiiMO80BjSpYsQ-2015-09-01,1,1,0,0,0
3,-76didnxGiiMO80BjSpYsQ-2016-03-01,1,4,0,1,0
4,-76didnxGiiMO80BjSpYsQ-2016-09-29,2,3,0,1,1


In [11]:
#merge review text based on the business-inspection date primary key
combined_revs = obs.groupby('bizID-dt')['review_text'].apply(' '.join).reset_index()

In [12]:
#get a unique count of the users that reviewed the restaurant after the last inspection 
#and before the current inspection
users = obs.groupby('bizID-dt')['user_id'].count().reset_index()
users.columns = ['bizID-dt','count_unique_users']

In [35]:
sub = obs[['bizID-dt','bizID',
           'name','postal_code',
           'lat','long','categories',
           'attributes','is_open',
           'count_crucial_signficant','stars']]
sub = sub.drop_duplicates()
sub.head()

Unnamed: 0,bizID-dt,bizID,name,postal_code,lat,long,categories,attributes,is_open,count_crucial_signficant,stars
0,0o9imYHyCGVvkR0NyJDCoQ-2016-03-11,0o9imYHyCGVvkR0NyJDCoQ,LIPSTICKDYNAMITE,M6J 1H1,43.6444125,-79.4183518,"['Vegetarian', 'Cocktail Bars', 'Nightlife', '...","['Alcohol: full_bar', ""Ambience: {'romantic': ...",1,0,4.0
1,0o9imYHyCGVvkR0NyJDCoQ-2017-01-30,0o9imYHyCGVvkR0NyJDCoQ,LIPSTICKDYNAMITE,M6J 1H1,43.6444125,-79.4183518,"['Vegetarian', 'Cocktail Bars', 'Nightlife', '...","['Alcohol: full_bar', ""Ambience: {'romantic': ...",1,1,4.0
2,0o9imYHyCGVvkR0NyJDCoQ-2016-09-12,0o9imYHyCGVvkR0NyJDCoQ,LIPSTICKDYNAMITE,M6J 1H1,43.6444125,-79.4183518,"['Vegetarian', 'Cocktail Bars', 'Nightlife', '...","['Alcohol: full_bar', ""Ambience: {'romantic': ...",1,0,4.0
4,0ORDYMDX027EhfsoFOkNcQ-2016-05-10,0ORDYMDX027EhfsoFOkNcQ,BRITISHSTYLEFISHCHIPS,M4L 3B1,43.6686238,-79.3172737,"['Restaurants', 'Fish & Chips']","['Alcohol: none', ""Ambience: {'romantic': Fals...",1,0,4.0
8,0ORDYMDX027EhfsoFOkNcQ-2016-12-22,0ORDYMDX027EhfsoFOkNcQ,BRITISHSTYLEFISHCHIPS,M4L 3B1,43.6686238,-79.3172737,"['Restaurants', 'Fish & Chips']","['Alcohol: none', ""Ambience: {'romantic': Fals...",1,0,4.0


In [36]:
#merge in all data into one df
df1 = pd.merge(sub,stars,on='bizID-dt', how='left')
df2 = pd.merge(df1,combined_revs,on='bizID-dt', how='left')
df3 = pd.merge(df2,rev_counts,on='bizID', how='left')
df4 = pd.merge(df3,in_scope_reviews,on='bizID-dt', how='left')
df5 = pd.merge(df4,users,on='bizID-dt', how='left')


### Parsing the "Categories" and "Attributes" columns into unique features
The next cell parses the 'Categories' and 'Attributes' columns,which are nested dicitonaries of different attributesof each restaurant. The categroy column captures descriptive features such as the types of cuisine served, and wheter or not the restaurant is a bar. The atttributes column captures features such as the 'ambiance' of the restaurant, parking, noise level, and other unique features.

In [37]:
t = []
for i in range(len(df5['categories'])):
    x = ast.literal_eval(df5['categories'][i])
    t.append(x)
    
cats = pd.DataFrame(t)
cats_df = pd.get_dummies(cats, prefix='Category')
cats_df = cats_df.groupby(cats_df.columns, axis=1).sum()


atts_df = pd.DataFrame()
for x in range(len(df5['attributes'])):
    list_yelp = ast.literal_eval(df5['attributes'][x])

    attribute_list = []
    attribute_name = []

    for i in list_yelp:

        name = i.split(":")[0]
        values = i.split( name+": " )[1].replace('{','').replace('}','')

        if len(values.split(":")) > 1: 
        
            for j in values.split(","):
                name_j = name + "_" + (j.split(":")[0].strip().replace("'",''))
                attribute_name.append( name_j )
                attribute_list.append (j.split(":")[1])
        else:
            attribute_name.append( name )
            attribute_list.append ( values )
        
    dataframe = pd.DataFrame(attribute_list).transpose()
    dataframe.columns = attribute_name
    atts_df = atts_df.append(dataframe)
    
atts_df = atts_df.reset_index().drop('index', 1)

df = pd.concat([df5, cats_df, atts_df], axis=1)


Next, we create dummy variables for all of the attribute and category features that were generated in the previous step. We want to capture if the attribute is true, false, or nor applicable for the restaurant.

In [38]:
#dummy-ize all categorical and boolean variables
df=pd.get_dummies(df, columns=[
 'Alcohol',
 'Ambience_casual',
 'Ambience_classy',
 'Ambience_hipster',
 'Ambience_intimate',
 'Ambience_romantic',
 'Ambience_touristy',
 'Ambience_trendy',
 'Ambience_upscale',
 'BikeParking',
 'BusinessAcceptsCreditCards',
 'BusinessParking_garage',
 'BusinessParking_lot',
 'BusinessParking_street',
 'BusinessParking_valet',
 'BusinessParking_validated',
 'Caters',
 'GoodForKids',
 'GoodForMeal_breakfast',
 'GoodForMeal_brunch',
 'GoodForMeal_dessert',
 'GoodForMeal_dinner',
 'GoodForMeal_latenight',
 'GoodForMeal_lunch',
 'HasTV',
 'NoiseLevel',
 'OutdoorSeating',
 'RestaurantsAttire',
 'RestaurantsDelivery',
 'RestaurantsGoodForGroups',
 'RestaurantsPriceRange2',
 'RestaurantsReservations',
 'RestaurantsTableService',
 'RestaurantsTakeOut',
 'WiFi', 
 'is_open',
'BestNights_friday',
 'BestNights_monday',
 'BestNights_saturday',
 'BestNights_sunday',
 'BestNights_thursday',
 'BestNights_tuesday',
 'BestNights_wednesday',
     'ByAppointmentOnly',
 'CoatCheck',
 'DogsAllowed',
 'DriveThru',
 'GoodForDancing',
 'HappyHour',
 'Music_background_music',
 'Music_dj',
 'Music_jukebox',
 'Music_karaoke',
 'Music_live',
 'Music_no_music',
 'Music_video',
 'Open24Hours',
 'RestaurantsCounterService',
 'Smoking',
 'WheelchairAccessible'
  ]
  , dummy_na=True)


In [50]:
df['stars']=pd.to_numeric(df['stars'])


In [51]:
#useful code to view all columns of df
pd.set_option('display.max_columns', None)

In [40]:
#remove all of the extraneous features
#backup=df
del df['categories']
del df['attributes']
del df['name']
del df['bizID']
del df['bizID-dt']
del df['lat']
del df['long']
del df['postal_code']
#Make a True/False target variable to label each inspection event with wether or not a critical violaton was found
df['count_crucial_signficant']= (df['count_crucial_signficant']>0)*1

### Text feature extraction, stop word removal, and lemmatization

In [52]:
df['count_crucial_signficant']= (df['count_crucial_signficant']>0)*1
from nltk.corpus import stopwords
import nltk
import string
stop = stopwords.words('english')
df['review_text']=df['review_text'].str.lower()
df['review_text'] = df['review_text'].apply(lambda x: '  '.join([word for word in x.split() if word not in (stop)]))
ps = nltk.stem.WordNetLemmatizer()
df['review_text']=df["review_text"].apply(lambda x:[ps.lemmatize(y,pos='v') for y in x.split()])
df['review_text']=df['review_text'].apply(lambda x: ',  '.join(x))
translator = str.maketrans('', '', string.punctuation)
df['review_text'] = df['review_text'].str.translate(translator)

## 3. Modeling and Text Vectorization Comparisons
First, we import all necessary packages. Ny comparing the results of several vectorizing methods as well as several moeling methods, we can determine which combination yield best performance.

In [53]:
df_test = df[df['count_reviews_in_scope'] >0]
X_ntext = df_test.drop(['review_text','count_crucial_signficant'], axis=1)

from sklearn.feature_extraction.text import TfidfVectorizer
#separate the non text features so that we only transform the text
X_text = df_test['review_text']
X_ntext = df_test.drop(['review_text','count_crucial_signficant'], axis=1)
# set up a few different vectorizers to test
vectorizer_a = CountVectorizer(binary=True, ngram_range=(1, 2))
vectorizer_b = pipe.Pipeline([
    ('vectorizer', HashingVectorizer(ngram_range=(1,4), non_negative=True)),
    ('tfidf', TfidfTransformer()),
])
bigram_vectorizer = TfidfVectorizer(ngram_range = (1,3))


Basic Linear Regression; no CV

In [54]:
#vectorize using count vectorizer and 3-gram features and create train/test split
X1 = scipy.sparse.hstack((bigram_vectorizer.fit_transform(df_test.review_text), X_ntext))
Y1 = df_test['count_crucial_signficant']
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, train_size=.75)

model = LogisticRegression()
model.fit(X_train1, Y_train1)
print ("Area under the ROC curve on test data = %.3f" % metrics.roc_auc_score(model.predict(X_test1), Y_test1))

Area under the ROC curve on test data = 0.556


In [55]:
#vectorize using hash vectorixer and tfidf and create train/test split
X2 = scipy.sparse.hstack((vectorizer_b.fit_transform(df.review_text), X_ntext))
Y2 = df['count_crucial_signficant']
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, train_size=.75)

model = LogisticRegression()
model.fit(X_train2, Y_train2)
print ("Area under the ROC curve on test data = %.3f" % metrics.roc_auc_score(model.predict(X_test2), Y_test2))

Area under the ROC curve on test data = 0.555


Basic Decision Tree, no CV

In [None]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train1, Y_train1)
print ("Area under the ROC curve on test data = %.3f" % metrics.roc_auc_score(model.predict(X_test1), Y_test1))