# Predicting Health Inspection Violaitons from Yelp Reviews and Business Attributes

## 1. Environment Setup

In [1]:
import psycopg2 as psy
import pandas as pd
import re
import numpy as np
import ast
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
import scipy.sparse
import sklearn.pipeline as pipe

## 2. Data Preparation

Connect to database and download initial datasets. These datasets are materialized views of the Yelp business data converted from JSON format, and the Toronto Inspection dataset imported from CSV. All attributes have been normalized to remove leading spaces, JSON tags, and unreadable characters.

In [27]:
#set up connection to our DB
conn = psy.connect(database="sterndsyelp", 
                        user="mvsternds", 
                        password="nyustern123!", 
                        host="sterndsyelp.cawzspvmqd5q.us-east-1.rds.amazonaws.com", 
                        port="5432"
                       )
#open cursor and check our tables in the DB
cur = conn.cursor()

In [3]:
#get Yelp checkin data
# cur.execute("SELECT * FROM public.toronto_checkins")
# checkins = pd.DataFrame(cur.fetchall())
#get Yelp review text
cur.execute("SELECT * FROM public.toronto_reviews")
reviews = pd.DataFrame(cur.fetchall())

In [4]:
reviews.columns = ['bizID','reviewID','userID','type','stars','text','useful','funny','cool','date']
#get total reviews per biz
rev = reviews['bizID'].value_counts()
rev_counts = pd.DataFrame(rev).reset_index()
rev_counts.columns = ['bizID','all_review_count']

In [5]:
# #not using this - can delete

# checkins.columns = ['bizID','type','datetime']
# #get total checkins per biz
# chks  = checkins['bizID'].value_counts()
# chk_counts = pd.DataFrame(chks).reset_index()
# chk_counts.columns = ['bizID','checkin_counts']

### Join Yelp Review Data with Inspection Dataset

#### Levenshtein Distance (in-database) 
This option joins the yelp restaurant informaiton to each inspection record where:
 * The [Levenshtein distance](https://xlinux.nist.gov/dads/HTML/Levenshtein.html) of the restaurant name from the two datasets is <3
 * The distance of the address from each dataset is <4
 * The date of the review is greater than the prior inspection date
 * The date of the review is less than or equal to inspeciton date on the record
 
Whitespace at the beginning and end of the name and address in each dataset is trimmed, and the strings are converted to uppercase before matching. The mathcing thresholds can be adjusted to increase potential for matching, or decrease false matches.

In [59]:
# The materialized view of the restaurant, inspection, and review data is "toronto_all"
cur.execute("SELECT * FROM public.toronto_all_3 where review_date is not null and attributes is not null" )
obs = pd.DataFrame(cur.fetchall())
obs.head()
obs.columns=['bizID','name','address','postal_code','neighborhood','lat','long','categories','attributes','is_open','review_cnt','hours','stars','setablishment_id','establishment_name','establishment_address','inspection_date','last_inspection','count_minor','count_sig','count_crucial','count_na','count_crucial_signficant','review_id','user_id','review_stars','review_text','useful','funny','cool','review_dt']
obs.head()

Unnamed: 0,bizID,name,address,postal_code,neighborhood,lat,long,categories,attributes,is_open,review_cnt,hours,stars,setablishment_id,establishment_name,establishment_address,inspection_date,last_inspection,count_minor,count_sig,count_crucial,count_na,count_crucial_signficant,review_id,user_id,review_stars,review_text,useful,funny,cool,review_dt
0,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,4186FINCHAVENUEE,M1S 5C9,Scarborough,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,48,"['Monday 12:0-2:0', 'Tuesday 12:0-2:0', 'Wedne...",3.5,10406996,YANGSBBQRESTAURANT,4186FINCHAVENUEE,2015-11-18,2015-06-30,0,0,2,0,2,dy4lv5ur08x_fCKMf2yJGA,YvzoOqfBProz9SveFwJYOw,3,Late night Northern Chinese BBQ restaurant in ...,0,0,0,2015-10-20
1,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,4186FINCHAVENUEE,M1S 5C9,Scarborough,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,48,"['Monday 12:0-2:0', 'Tuesday 12:0-2:0', 'Wedne...",3.5,10406996,YANGSBBQRESTAURANT,4186FINCHAVENUEE,2016-11-28,2016-06-06,4,1,0,0,1,rby1MutMX_kanEFJ30P3hg,blCeqedLmBNssP7oft_hAg,4,"""We stumbled on this restaurant and weren't su...",0,0,0,2016-10-01
2,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,4186FINCHAVENUEE,M1S 5C9,Scarborough,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,48,"['Monday 12:0-2:0', 'Tuesday 12:0-2:0', 'Wedne...",3.5,10406996,YANGSBBQRESTAURANT,4186FINCHAVENUEE,2016-06-02,2016-04-19,6,4,0,0,4,OIt9UjEW9KBZ6XL5B5kkGw,wGQJjtcG1-NsxslGv4bNjw,2,"""My first impression of this place was that it...",0,0,0,2016-05-15
3,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,4186FINCHAVENUEE,M1S 5C9,Scarborough,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,48,"['Monday 12:0-2:0', 'Tuesday 12:0-2:0', 'Wedne...",3.5,10406996,YANGSBBQRESTAURANT,4186FINCHAVENUEE,2016-04-19,2015-11-20,3,0,0,0,0,TMHqEjNQbtkbsmmxqfXGAg,7ymTU1NY3NXM0PnT_Z0Qhg,4,"""Went here with some friends and had the Sichu...",0,0,0,2016-01-29
4,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,4186FINCHAVENUEE,M1S 5C9,Scarborough,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,48,"['Monday 12:0-2:0', 'Tuesday 12:0-2:0', 'Wedne...",3.5,10406996,YANGSBBQRESTAURANT,4186FINCHAVENUEE,2016-04-19,2015-11-20,3,0,0,0,0,sodiJQbduh11MUG7xPR0Iw,Hk7n0zKhcrRwkvqgb_MQ3Q,4,"""Came here for lunch on a Sunday around 1.30pm...",0,0,0,2016-01-11


In [60]:
#create a primary key of restaurant ID and each unique inspection date for that restaurant
obs['bizID-dt'] = obs['bizID'] + "-" + obs['inspection_date'].map(str)

In [61]:
#get counts of in scope reviews for each inspeciton date of a given restaurant
in_scope_rev = obs['bizID-dt'].value_counts()
in_scope_reviews = pd.DataFrame(in_scope_rev).reset_index()
in_scope_reviews.columns = ['bizID-dt','count_reviews_in_scope']
in_scope_reviews.head()

Unnamed: 0,bizID-dt,count_reviews_in_scope
0,RwRNR4z3kY-4OsFqigY5sw-2015-11-03,233
1,jc3p5SFyt9qrrMXt6E13ig-2016-12-21,156
2,RwRNR4z3kY-4OsFqigY5sw-2016-03-15,148
3,h_4dPV9M9aYaBliH1Eoeeg-2016-05-12,138
4,trKyIRyjKqVSZmcU0AnICQ-2016-04-11,112


In [62]:
#get dummies for star rating column
obs = pd.concat([obs, pd.get_dummies(obs['review_stars'], prefix='stars')], axis=1)
obs.head()

Unnamed: 0,bizID,name,address,postal_code,neighborhood,lat,long,categories,attributes,is_open,review_cnt,hours,stars,setablishment_id,establishment_name,establishment_address,inspection_date,last_inspection,count_minor,count_sig,count_crucial,count_na,count_crucial_signficant,review_id,user_id,review_stars,review_text,useful,funny,cool,review_dt,bizID-dt,stars_1,stars_2,stars_3,stars_4,stars_5
0,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,4186FINCHAVENUEE,M1S 5C9,Scarborough,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,48,"['Monday 12:0-2:0', 'Tuesday 12:0-2:0', 'Wedne...",3.5,10406996,YANGSBBQRESTAURANT,4186FINCHAVENUEE,2015-11-18,2015-06-30,0,0,2,0,2,dy4lv5ur08x_fCKMf2yJGA,YvzoOqfBProz9SveFwJYOw,3,Late night Northern Chinese BBQ restaurant in ...,0,0,0,2015-10-20,0Yh6U06nGLjAMwCw6l9-DA-2015-11-18,0,0,1,0,0
1,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,4186FINCHAVENUEE,M1S 5C9,Scarborough,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,48,"['Monday 12:0-2:0', 'Tuesday 12:0-2:0', 'Wedne...",3.5,10406996,YANGSBBQRESTAURANT,4186FINCHAVENUEE,2016-11-28,2016-06-06,4,1,0,0,1,rby1MutMX_kanEFJ30P3hg,blCeqedLmBNssP7oft_hAg,4,"""We stumbled on this restaurant and weren't su...",0,0,0,2016-10-01,0Yh6U06nGLjAMwCw6l9-DA-2016-11-28,0,0,0,1,0
2,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,4186FINCHAVENUEE,M1S 5C9,Scarborough,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,48,"['Monday 12:0-2:0', 'Tuesday 12:0-2:0', 'Wedne...",3.5,10406996,YANGSBBQRESTAURANT,4186FINCHAVENUEE,2016-06-02,2016-04-19,6,4,0,0,4,OIt9UjEW9KBZ6XL5B5kkGw,wGQJjtcG1-NsxslGv4bNjw,2,"""My first impression of this place was that it...",0,0,0,2016-05-15,0Yh6U06nGLjAMwCw6l9-DA-2016-06-02,0,1,0,0,0
3,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,4186FINCHAVENUEE,M1S 5C9,Scarborough,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,48,"['Monday 12:0-2:0', 'Tuesday 12:0-2:0', 'Wedne...",3.5,10406996,YANGSBBQRESTAURANT,4186FINCHAVENUEE,2016-04-19,2015-11-20,3,0,0,0,0,TMHqEjNQbtkbsmmxqfXGAg,7ymTU1NY3NXM0PnT_Z0Qhg,4,"""Went here with some friends and had the Sichu...",0,0,0,2016-01-29,0Yh6U06nGLjAMwCw6l9-DA-2016-04-19,0,0,0,1,0
4,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,4186FINCHAVENUEE,M1S 5C9,Scarborough,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,48,"['Monday 12:0-2:0', 'Tuesday 12:0-2:0', 'Wedne...",3.5,10406996,YANGSBBQRESTAURANT,4186FINCHAVENUEE,2016-04-19,2015-11-20,3,0,0,0,0,sodiJQbduh11MUG7xPR0Iw,Hk7n0zKhcrRwkvqgb_MQ3Q,4,"""Came here for lunch on a Sunday around 1.30pm...",0,0,0,2016-01-11,0Yh6U06nGLjAMwCw6l9-DA-2016-04-19,0,0,0,1,0


In [63]:
#Get counts of each star rating for a given restaurant
stars = obs.groupby('bizID-dt')[['stars_1', 'stars_2','stars_3','stars_4','stars_5']].sum().reset_index()
stars.head()

Unnamed: 0,bizID-dt,stars_1,stars_2,stars_3,stars_4,stars_5
0,-2TBP3ZGu7M-FmfoNJvbrQ-2016-09-07,0,1,0,2,0
1,-2TBP3ZGu7M-FmfoNJvbrQ-2017-01-18,0,0,1,3,1
2,-76didnxGiiMO80BjSpYsQ-2015-09-01,1,1,0,0,0
3,-76didnxGiiMO80BjSpYsQ-2016-03-01,1,4,0,1,0
4,-76didnxGiiMO80BjSpYsQ-2016-09-29,2,3,0,1,1


In [64]:
#merge review text based on the business-inspection date primary key
combined_revs = obs.groupby('bizID-dt')['review_text'].apply(' '.join).reset_index()

In [65]:
#get a unique count of the users that reviewed the restaurant after the last inspection 
#and before the current inspection
users = obs.groupby('bizID-dt')['user_id'].count().reset_index()
users.columns = ['bizID-dt','count_unique_users']

In [66]:
sub = obs[['bizID-dt','bizID',
           'name','postal_code',
           'lat','long','categories',
           'attributes','is_open',
           'count_crucial_signficant','stars']]
sub = sub.drop_duplicates()
sub.head()

Unnamed: 0,bizID-dt,bizID,name,postal_code,lat,long,categories,attributes,is_open,count_crucial_signficant,stars
0,0Yh6U06nGLjAMwCw6l9-DA-2015-11-18,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,M1S 5C9,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,2,3.5
1,0Yh6U06nGLjAMwCw6l9-DA-2016-11-28,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,M1S 5C9,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,1,3.5
2,0Yh6U06nGLjAMwCw6l9-DA-2016-06-02,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,M1S 5C9,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,4,3.5
3,0Yh6U06nGLjAMwCw6l9-DA-2016-04-19,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,M1S 5C9,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,0,3.5
7,0Yh6U06nGLjAMwCw6l9-DA-2017-01-23,0Yh6U06nGLjAMwCw6l9-DA,YANGSBBQRESTAURANT,M1S 5C9,43.8042008789,-79.2884373665,"['Restaurants', 'Barbeque']","['Alcohol: beer_and_wine', ""Ambience: {'romant...",1,1,3.5


In [67]:
#merge in all data into one df
df1 = pd.merge(sub,stars,on='bizID-dt', how='left')
df2 = pd.merge(df1,combined_revs,on='bizID-dt', how='left')
df3 = pd.merge(df2,rev_counts,on='bizID', how='left')
df4 = pd.merge(df3,in_scope_reviews,on='bizID-dt', how='left')
df5 = pd.merge(df4,users,on='bizID-dt', how='left')


### Parsing the "Categories" and "Attributes" columns into unique features
The next cell parses the 'Categories' and 'Attributes' columns,which are nested dicitonaries of different attributesof each restaurant. The categroy column captures descriptive features such as the types of cuisine served, and wheter or not the restaurant is a bar. The atttributes column captures features such as the 'ambiance' of the restaurant, parking, noise level, and other unique features.

In [68]:
t = []
for i in range(len(df5['categories'])):
    x = ast.literal_eval(df5['categories'][i])
    t.append(x)
    
cats = pd.DataFrame(t)
cats_df = pd.get_dummies(cats, prefix='Category')
cats_df = cats_df.groupby(cats_df.columns, axis=1).sum()


atts_df = pd.DataFrame()
for x in range(len(df5['attributes'])):
    list_yelp = ast.literal_eval(df5['attributes'][x])

    attribute_list = []
    attribute_name = []

    for i in list_yelp:

        name = i.split(":")[0]
        values = i.split( name+": " )[1].replace('{','').replace('}','')

        if len(values.split(":")) > 1: 
        
            for j in values.split(","):
                name_j = name + "_" + (j.split(":")[0].strip().replace("'",''))
                attribute_name.append( name_j )
                attribute_list.append (j.split(":")[1])
        else:
            attribute_name.append( name )
            attribute_list.append ( values )
        
    dataframe = pd.DataFrame(attribute_list).transpose()
    dataframe.columns = attribute_name
    atts_df = atts_df.append(dataframe)
    
atts_df = atts_df.reset_index().drop('index', 1)

df = pd.concat([df5, cats_df, atts_df], axis=1)


Next, we create dummy variables for all of the attribute and category features that were generated in the previous step. We want to capture if the attribute is true, false, or nor applicable for the restaurant.

In [69]:
#dummy-ize all categorical and boolean variables
df=pd.get_dummies(df, columns=[
 'Alcohol',
 'Ambience_casual',
 'Ambience_classy',
 'Ambience_hipster',
 'Ambience_intimate',
 'Ambience_romantic',
 'Ambience_touristy',
 'Ambience_trendy',
 'Ambience_upscale',
 'BikeParking',
 'BusinessAcceptsCreditCards',
 'BusinessParking_garage',
 'BusinessParking_lot',
 'BusinessParking_street',
 'BusinessParking_valet',
 'BusinessParking_validated',
 'Caters',
 'GoodForKids',
 'GoodForMeal_breakfast',
 'GoodForMeal_brunch',
 'GoodForMeal_dessert',
 'GoodForMeal_dinner',
 'GoodForMeal_latenight',
 'GoodForMeal_lunch',
 'HasTV',
 'NoiseLevel',
 'OutdoorSeating',
 'RestaurantsAttire',
 'RestaurantsDelivery',
 'RestaurantsGoodForGroups',
 'RestaurantsPriceRange2',
 'RestaurantsReservations',
 'RestaurantsTableService',
 'RestaurantsTakeOut',
 'WiFi', 
 'is_open',
'BestNights_friday',
 'BestNights_monday',
 'BestNights_saturday',
 'BestNights_sunday',
 'BestNights_thursday',
 'BestNights_tuesday',
 'BestNights_wednesday',
     'ByAppointmentOnly',
 'CoatCheck',
 'DogsAllowed',
 'DriveThru',
 'GoodForDancing',
 'HappyHour',
 'Music_background_music',
 'Music_dj',
 'Music_jukebox',
 'Music_karaoke',
 'Music_live',
 'Music_no_music',
 'Music_video',
 'Open24Hours',
 'RestaurantsCounterService',
 'Smoking',
 'WheelchairAccessible'
  ]
  , dummy_na=True)


In [70]:
#useful code to view all columns of df
pd.set_option('display.max_columns', None)

In [71]:
#remove all of the extraneous features
#backup=df
del df['categories']
del df['attributes']
del df['name']
del df['bizID']
del df['bizID-dt']
del df['lat']
del df['long']
del df['postal_code']
#Make a True/False target variable to label each inspection event with wether or not a critical violaton was found
df['count_crucial_signficant']= (df['count_crucial_signficant']>0)*1

### Text feature extraction, stop word removal, and lemmatization

In [72]:
df['count_crucial_signficant']= (df['count_crucial_signficant']>0)*1
from nltk.corpus import stopwords
import nltk
import string
stop = stopwords.words('english')
df['review_text']=df['review_text'].str.lower()
df['review_text'] = df['review_text'].apply(lambda x: '  '.join([word for word in x.split() if word not in (stop)]))
ps = nltk.stem.WordNetLemmatizer()
df['review_text']=df["review_text"].apply(lambda x:[ps.lemmatize(y,pos='v') for y in x.split()])
df['review_text']=df['review_text'].apply(lambda x: ',  '.join(x))
translator = str.maketrans('', '', string.punctuation)
df['review_text'] = df['review_text'].str.translate(translator)

## 3. Modeling and Text Vectorization
First, we import all necessary packages. Ny comparing the results of several vectorizing methods as well as several moeling methods, we can determine which combination yield best performance.

In [73]:
df['stars'] = df.stars.astype(float)
df

Unnamed: 0,count_crucial_signficant,stars,stars_1,stars_2,stars_3,stars_4,stars_5,review_text,all_review_count,count_reviews_in_scope,count_unique_users,Category_Active Life,Category_Afghan,Category_African,Category_American (New),Category_American (Traditional),Category_Antiques,Category_Arabian,Category_Arcades,Category_Argentine,Category_Art Classes,Category_Art Galleries,Category_Art Schools,Category_Arts & Crafts,Category_Arts & Entertainment,Category_Asian Fusion,Category_Australian,Category_Austrian,Category_Automotive,Category_Bagels,Category_Bakeries,Category_Bangladeshi,Category_Barbeque,Category_Bars,Category_Beauty & Spas,Category_Beer,Category_Beer Bar,Category_Belgian,Category_Bistros,Category_Books,Category_Brasseries,Category_Brazilian,Category_Breakfast & Brunch,Category_Breweries,Category_British,Category_Bubble Tea,Category_Buffets,Category_Burgers,Category_Butcher,Category_Cabaret,Category_Cafes,Category_Cajun/Creole,Category_Canadian (New),Category_Candy Stores,Category_Cards & Stationery,Category_Caribbean,Category_Casinos,Category_Caterers,Category_Cheese Shops,Category_Cheesesteaks,Category_Chicken Shop,Category_Chicken Wings,Category_Chinese,Category_Chocolatiers & Shops,Category_Cocktail Bars,Category_Coffee & Tea,Category_Coffee Roasteries,Category_Colombian,Category_Comedy Clubs,Category_Comfort Food,Category_Comic Books,Category_Convenience Stores,Category_Cooking Schools,Category_Creperies,Category_Cuban,Category_Cupcakes,Category_Dance Clubs,Category_Delicatessen,Category_Delis,Category_Department Stores,Category_Desserts,Category_Dim Sum,Category_Diners,Category_Dive Bars,Category_Do-It-Yourself Food,Category_Donairs,Category_Donuts,Category_Drugstores,Category_Education,Category_Egyptian,Category_Ethic Grocery,Category_Ethiopian,Category_Ethnic Food,Category_Ethnic Grocery,Category_Event Planning & Services,Category_Falafel,Category_Farmers Market,Category_Fashion,Category_Fast Food,Category_Filipino,Category_Fish & Chips,Category_Florists,Category_Flowers & Gifts,Category_Food,Category_Food Court,Category_Food Delivery Services,Category_Food Stands,Category_Food Trucks,Category_French,Category_Fruits & Veggies,Category_Gas & Service Stations,Category_Gastropubs,Category_German,Category_Gift Shops,Category_Gluten-Free,Category_Greek,Category_Grocery,Category_Hakka,Category_Halal,Category_Hawaiian,Category_Health Markets,Category_Himalayan/Nepalese,Category_Hookah Bars,Category_Hot Dogs,Category_Hot Pot,Category_Hotels,Category_Hotels & Travel,Category_Hungarian,Category_IT Services & Computer Repair,Category_Ice Cream & Frozen Yogurt,Category_Imported Food,Category_Indian,Category_Indonesian,Category_International,Category_International Grocery,Category_Internet Cafes,Category_Irish,Category_Italian,Category_Japanese,Category_Jazz & Blues,Category_Juice Bars & Smoothies,Category_Karaoke,Category_Kebab,Category_Korean,Category_Kosher,Category_Laotian,Category_Latin American,Category_Lebanese,Category_Live/Raw Food,Category_Local Flavor,Category_Local Services,Category_Lounges,Category_Macarons,Category_Mags,Category_Malaysian,Category_Meat Shops,Category_Mediterranean,Category_Mexican,Category_Middle Eastern,Category_Mobile Phone Repair,Category_Modern European,Category_Moroccan,Category_Mosques,Category_Music & Video,Category_Music Venues,Category_Nicaraguan,Category_Nightlife,Category_Noodles,Category_Organic Stores,Category_Pakistani,Category_Party & Event Planning,Category_Pasta Shops,Category_Patisserie/Cake Shop,Category_Persian/Iranian,Category_Peruvian,Category_Pizza,Category_Playgrounds,Category_Polish,Category_Pool Halls,Category_Pop-up Shops,Category_Portuguese,Category_Poutineries,Category_Pretzels,Category_Pubs,Category_Ramen,Category_Recreation Centers,Category_Religious Organizations,Category_Restaurants,Category_Russian,Category_Salad,Category_Salvadoran,Category_Sandwiches,Category_Scandinavian,Category_Scottish,Category_Seafood,Category_Seafood Markets,Category_Shopping,Category_Slovakian,Category_Smokehouse,Category_Social Clubs,Category_Soul Food,Category_Soup,Category_South African,Category_Southern,Category_Spanish,Category_Speakeasies,Category_Special Education,Category_Specialty Food,Category_Specialty Schools,Category_Sports Bars,Category_Sri Lankan,Category_Steakhouses,Category_Street Vendors,Category_Sushi Bars,Category_Szechuan,Category_Taiwanese,Category_Tapas Bars,Category_Tapas/Small Plates,Category_Tattoo,Category_Tea Rooms,Category_Tex-Mex,Category_Thai,Category_Themed Cafes,Category_Tires,Category_Turkish,Category_Ukrainian,Category_Vegan,Category_Vegetarian,Category_Venezuelan,Category_Venues & Event Spaces,Category_Videos & Video Game Rental,Category_Vietnamese,Category_Vinyl Records,Category_Waffles,Category_Wedding Planning,Category_Whiskey Bars,Category_Wholesale Stores,Category_Wigs,Category_Wine & Spirits,Category_Wine Bars,Category_Wineries,Alcohol_beer_and_wine,Alcohol_full_bar,Alcohol_none,Alcohol_nan,Ambience_casual_ False,Ambience_casual_ True,Ambience_casual_nan,Ambience_classy_ False,Ambience_classy_ True,Ambience_classy_nan,Ambience_hipster_ False,Ambience_hipster_ True,Ambience_hipster_nan,Ambience_intimate_ False,Ambience_intimate_ True,Ambience_intimate_nan,Ambience_romantic_ False,Ambience_romantic_ True,Ambience_romantic_nan,Ambience_touristy_ False,Ambience_touristy_ True,Ambience_touristy_nan,Ambience_trendy_ False,Ambience_trendy_ True,Ambience_trendy_nan,Ambience_upscale_ False,Ambience_upscale_ True,Ambience_upscale_nan,BikeParking_False,BikeParking_True,BikeParking_nan,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_True,BusinessAcceptsCreditCards_nan,BusinessParking_garage_ False,BusinessParking_garage_ True,BusinessParking_garage_nan,BusinessParking_lot_ False,BusinessParking_lot_ True,BusinessParking_lot_nan,BusinessParking_street_ False,BusinessParking_street_ True,BusinessParking_street_nan,BusinessParking_valet_ False,BusinessParking_valet_ True,BusinessParking_valet_nan,BusinessParking_validated_ False,BusinessParking_validated_ True,BusinessParking_validated_nan,Caters_False,Caters_True,Caters_nan,GoodForKids_False,GoodForKids_True,GoodForKids_nan,GoodForMeal_breakfast_ False,GoodForMeal_breakfast_ True,GoodForMeal_breakfast_nan,GoodForMeal_brunch_ False,GoodForMeal_brunch_ True,GoodForMeal_brunch_nan,GoodForMeal_dessert_ False,GoodForMeal_dessert_ True,GoodForMeal_dessert_nan,GoodForMeal_dinner_ False,GoodForMeal_dinner_ True,GoodForMeal_dinner_nan,GoodForMeal_latenight_ False,GoodForMeal_latenight_ True,GoodForMeal_latenight_nan,GoodForMeal_lunch_ False,GoodForMeal_lunch_ True,GoodForMeal_lunch_nan,HasTV_False,HasTV_True,HasTV_nan,NoiseLevel_average,NoiseLevel_loud,NoiseLevel_quiet,NoiseLevel_very_loud,NoiseLevel_nan,OutdoorSeating_False,OutdoorSeating_True,OutdoorSeating_nan,RestaurantsAttire_casual,RestaurantsAttire_dressy,RestaurantsAttire_formal,RestaurantsAttire_nan,RestaurantsDelivery_False,RestaurantsDelivery_True,RestaurantsDelivery_nan,RestaurantsGoodForGroups_False,RestaurantsGoodForGroups_True,RestaurantsGoodForGroups_nan,RestaurantsPriceRange2_1,RestaurantsPriceRange2_2,RestaurantsPriceRange2_3,RestaurantsPriceRange2_4,RestaurantsPriceRange2_nan,RestaurantsReservations_False,RestaurantsReservations_True,RestaurantsReservations_nan,RestaurantsTableService_False,RestaurantsTableService_True,RestaurantsTableService_nan,RestaurantsTakeOut_False,RestaurantsTakeOut_True,RestaurantsTakeOut_nan,WiFi_free,WiFi_no,WiFi_paid,WiFi_nan,is_open_0,is_open_1,is_open_nan,BestNights_friday_ False,BestNights_friday_ True,BestNights_friday_nan,BestNights_monday_ False,BestNights_monday_ True,BestNights_monday_nan,BestNights_saturday_ False,BestNights_saturday_ True,BestNights_saturday_nan,BestNights_sunday_ False,BestNights_sunday_ True,BestNights_sunday_nan,BestNights_thursday_ False,BestNights_thursday_ True,BestNights_thursday_nan,BestNights_tuesday_ False,BestNights_tuesday_ True,BestNights_tuesday_nan,BestNights_wednesday_ False,BestNights_wednesday_ True,BestNights_wednesday_nan,ByAppointmentOnly_False,ByAppointmentOnly_nan,CoatCheck_False,CoatCheck_True,CoatCheck_nan,DogsAllowed_False,DogsAllowed_True,DogsAllowed_nan,DriveThru_False,DriveThru_True,DriveThru_nan,GoodForDancing_False,GoodForDancing_True,GoodForDancing_nan,HappyHour_False,HappyHour_True,HappyHour_nan,Music_background_music_ False,Music_background_music_ True,Music_background_music_nan,Music_dj_ False,Music_dj_ True,Music_dj_nan,Music_jukebox_ False,Music_jukebox_ True,Music_jukebox_nan,Music_karaoke_ False,Music_karaoke_ True,Music_karaoke_nan,Music_live_ False,Music_live_ True,Music_live_nan,Music_no_music_ False,Music_no_music_nan,Music_video_ False,Music_video_ True,Music_video_nan,Open24Hours_True,Open24Hours_nan,RestaurantsCounterService_True,RestaurantsCounterService_nan,Smoking_no,Smoking_outdoor,Smoking_yes,Smoking_nan,WheelchairAccessible_False,WheelchairAccessible_True,WheelchairAccessible_nan
0,1,3.5,0,0,1,3,0,late night northern chinese bbq restauran...,48,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
1,1,3.5,0,0,2,6,1,we stumble restaurant werent sure expect ...,48,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
2,1,3.5,0,1,0,0,1,my first impression place really crowd b...,48,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
3,0,3.5,0,1,3,5,0,went friends sichuan fish hot pot jellyf...,48,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
4,1,3.5,0,1,1,3,1,food alright service complete garbage tho...,48,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
5,0,3.5,0,0,2,0,0,altona kabob small restaurant locate smal...,33,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
6,0,3.5,0,0,1,0,2,work late hours job fill up good food ...,33,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
7,0,3.5,0,0,2,0,0,im say kebab bad cause its not theres ...,33,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
8,1,3.5,0,2,0,0,0,the resto work leisurely pace line rush ...,33,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
9,0,4.0,0,1,1,2,2,fav coffee west end muffins delicious ak...,52,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,1,0


In [74]:
df_test = df[df['count_reviews_in_scope'] >0]
X_ntext = df_test.drop(['review_text','count_crucial_signficant'], axis=1)

from sklearn.feature_extraction.text import TfidfVectorizer
#separate the non text features so that we only transform the text
X_text = df_test['review_text']
X_ntext = df_test.drop(['review_text','count_crucial_signficant'], axis=1)

# set up a few different vectorizers to test
count_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
hashing_vectorizer = pipe.Pipeline([
    ('vectorizer', HashingVectorizer(ngram_range=(1,4), non_negative=True)),
    ('tfidf', TfidfTransformer()),
])
bigram_vectorizer = TfidfVectorizer(ngram_range = (1,3))


In [75]:
df_test

Unnamed: 0,count_crucial_signficant,stars,stars_1,stars_2,stars_3,stars_4,stars_5,review_text,all_review_count,count_reviews_in_scope,count_unique_users,Category_Active Life,Category_Afghan,Category_African,Category_American (New),Category_American (Traditional),Category_Antiques,Category_Arabian,Category_Arcades,Category_Argentine,Category_Art Classes,Category_Art Galleries,Category_Art Schools,Category_Arts & Crafts,Category_Arts & Entertainment,Category_Asian Fusion,Category_Australian,Category_Austrian,Category_Automotive,Category_Bagels,Category_Bakeries,Category_Bangladeshi,Category_Barbeque,Category_Bars,Category_Beauty & Spas,Category_Beer,Category_Beer Bar,Category_Belgian,Category_Bistros,Category_Books,Category_Brasseries,Category_Brazilian,Category_Breakfast & Brunch,Category_Breweries,Category_British,Category_Bubble Tea,Category_Buffets,Category_Burgers,Category_Butcher,Category_Cabaret,Category_Cafes,Category_Cajun/Creole,Category_Canadian (New),Category_Candy Stores,Category_Cards & Stationery,Category_Caribbean,Category_Casinos,Category_Caterers,Category_Cheese Shops,Category_Cheesesteaks,Category_Chicken Shop,Category_Chicken Wings,Category_Chinese,Category_Chocolatiers & Shops,Category_Cocktail Bars,Category_Coffee & Tea,Category_Coffee Roasteries,Category_Colombian,Category_Comedy Clubs,Category_Comfort Food,Category_Comic Books,Category_Convenience Stores,Category_Cooking Schools,Category_Creperies,Category_Cuban,Category_Cupcakes,Category_Dance Clubs,Category_Delicatessen,Category_Delis,Category_Department Stores,Category_Desserts,Category_Dim Sum,Category_Diners,Category_Dive Bars,Category_Do-It-Yourself Food,Category_Donairs,Category_Donuts,Category_Drugstores,Category_Education,Category_Egyptian,Category_Ethic Grocery,Category_Ethiopian,Category_Ethnic Food,Category_Ethnic Grocery,Category_Event Planning & Services,Category_Falafel,Category_Farmers Market,Category_Fashion,Category_Fast Food,Category_Filipino,Category_Fish & Chips,Category_Florists,Category_Flowers & Gifts,Category_Food,Category_Food Court,Category_Food Delivery Services,Category_Food Stands,Category_Food Trucks,Category_French,Category_Fruits & Veggies,Category_Gas & Service Stations,Category_Gastropubs,Category_German,Category_Gift Shops,Category_Gluten-Free,Category_Greek,Category_Grocery,Category_Hakka,Category_Halal,Category_Hawaiian,Category_Health Markets,Category_Himalayan/Nepalese,Category_Hookah Bars,Category_Hot Dogs,Category_Hot Pot,Category_Hotels,Category_Hotels & Travel,Category_Hungarian,Category_IT Services & Computer Repair,Category_Ice Cream & Frozen Yogurt,Category_Imported Food,Category_Indian,Category_Indonesian,Category_International,Category_International Grocery,Category_Internet Cafes,Category_Irish,Category_Italian,Category_Japanese,Category_Jazz & Blues,Category_Juice Bars & Smoothies,Category_Karaoke,Category_Kebab,Category_Korean,Category_Kosher,Category_Laotian,Category_Latin American,Category_Lebanese,Category_Live/Raw Food,Category_Local Flavor,Category_Local Services,Category_Lounges,Category_Macarons,Category_Mags,Category_Malaysian,Category_Meat Shops,Category_Mediterranean,Category_Mexican,Category_Middle Eastern,Category_Mobile Phone Repair,Category_Modern European,Category_Moroccan,Category_Mosques,Category_Music & Video,Category_Music Venues,Category_Nicaraguan,Category_Nightlife,Category_Noodles,Category_Organic Stores,Category_Pakistani,Category_Party & Event Planning,Category_Pasta Shops,Category_Patisserie/Cake Shop,Category_Persian/Iranian,Category_Peruvian,Category_Pizza,Category_Playgrounds,Category_Polish,Category_Pool Halls,Category_Pop-up Shops,Category_Portuguese,Category_Poutineries,Category_Pretzels,Category_Pubs,Category_Ramen,Category_Recreation Centers,Category_Religious Organizations,Category_Restaurants,Category_Russian,Category_Salad,Category_Salvadoran,Category_Sandwiches,Category_Scandinavian,Category_Scottish,Category_Seafood,Category_Seafood Markets,Category_Shopping,Category_Slovakian,Category_Smokehouse,Category_Social Clubs,Category_Soul Food,Category_Soup,Category_South African,Category_Southern,Category_Spanish,Category_Speakeasies,Category_Special Education,Category_Specialty Food,Category_Specialty Schools,Category_Sports Bars,Category_Sri Lankan,Category_Steakhouses,Category_Street Vendors,Category_Sushi Bars,Category_Szechuan,Category_Taiwanese,Category_Tapas Bars,Category_Tapas/Small Plates,Category_Tattoo,Category_Tea Rooms,Category_Tex-Mex,Category_Thai,Category_Themed Cafes,Category_Tires,Category_Turkish,Category_Ukrainian,Category_Vegan,Category_Vegetarian,Category_Venezuelan,Category_Venues & Event Spaces,Category_Videos & Video Game Rental,Category_Vietnamese,Category_Vinyl Records,Category_Waffles,Category_Wedding Planning,Category_Whiskey Bars,Category_Wholesale Stores,Category_Wigs,Category_Wine & Spirits,Category_Wine Bars,Category_Wineries,Alcohol_beer_and_wine,Alcohol_full_bar,Alcohol_none,Alcohol_nan,Ambience_casual_ False,Ambience_casual_ True,Ambience_casual_nan,Ambience_classy_ False,Ambience_classy_ True,Ambience_classy_nan,Ambience_hipster_ False,Ambience_hipster_ True,Ambience_hipster_nan,Ambience_intimate_ False,Ambience_intimate_ True,Ambience_intimate_nan,Ambience_romantic_ False,Ambience_romantic_ True,Ambience_romantic_nan,Ambience_touristy_ False,Ambience_touristy_ True,Ambience_touristy_nan,Ambience_trendy_ False,Ambience_trendy_ True,Ambience_trendy_nan,Ambience_upscale_ False,Ambience_upscale_ True,Ambience_upscale_nan,BikeParking_False,BikeParking_True,BikeParking_nan,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_True,BusinessAcceptsCreditCards_nan,BusinessParking_garage_ False,BusinessParking_garage_ True,BusinessParking_garage_nan,BusinessParking_lot_ False,BusinessParking_lot_ True,BusinessParking_lot_nan,BusinessParking_street_ False,BusinessParking_street_ True,BusinessParking_street_nan,BusinessParking_valet_ False,BusinessParking_valet_ True,BusinessParking_valet_nan,BusinessParking_validated_ False,BusinessParking_validated_ True,BusinessParking_validated_nan,Caters_False,Caters_True,Caters_nan,GoodForKids_False,GoodForKids_True,GoodForKids_nan,GoodForMeal_breakfast_ False,GoodForMeal_breakfast_ True,GoodForMeal_breakfast_nan,GoodForMeal_brunch_ False,GoodForMeal_brunch_ True,GoodForMeal_brunch_nan,GoodForMeal_dessert_ False,GoodForMeal_dessert_ True,GoodForMeal_dessert_nan,GoodForMeal_dinner_ False,GoodForMeal_dinner_ True,GoodForMeal_dinner_nan,GoodForMeal_latenight_ False,GoodForMeal_latenight_ True,GoodForMeal_latenight_nan,GoodForMeal_lunch_ False,GoodForMeal_lunch_ True,GoodForMeal_lunch_nan,HasTV_False,HasTV_True,HasTV_nan,NoiseLevel_average,NoiseLevel_loud,NoiseLevel_quiet,NoiseLevel_very_loud,NoiseLevel_nan,OutdoorSeating_False,OutdoorSeating_True,OutdoorSeating_nan,RestaurantsAttire_casual,RestaurantsAttire_dressy,RestaurantsAttire_formal,RestaurantsAttire_nan,RestaurantsDelivery_False,RestaurantsDelivery_True,RestaurantsDelivery_nan,RestaurantsGoodForGroups_False,RestaurantsGoodForGroups_True,RestaurantsGoodForGroups_nan,RestaurantsPriceRange2_1,RestaurantsPriceRange2_2,RestaurantsPriceRange2_3,RestaurantsPriceRange2_4,RestaurantsPriceRange2_nan,RestaurantsReservations_False,RestaurantsReservations_True,RestaurantsReservations_nan,RestaurantsTableService_False,RestaurantsTableService_True,RestaurantsTableService_nan,RestaurantsTakeOut_False,RestaurantsTakeOut_True,RestaurantsTakeOut_nan,WiFi_free,WiFi_no,WiFi_paid,WiFi_nan,is_open_0,is_open_1,is_open_nan,BestNights_friday_ False,BestNights_friday_ True,BestNights_friday_nan,BestNights_monday_ False,BestNights_monday_ True,BestNights_monday_nan,BestNights_saturday_ False,BestNights_saturday_ True,BestNights_saturday_nan,BestNights_sunday_ False,BestNights_sunday_ True,BestNights_sunday_nan,BestNights_thursday_ False,BestNights_thursday_ True,BestNights_thursday_nan,BestNights_tuesday_ False,BestNights_tuesday_ True,BestNights_tuesday_nan,BestNights_wednesday_ False,BestNights_wednesday_ True,BestNights_wednesday_nan,ByAppointmentOnly_False,ByAppointmentOnly_nan,CoatCheck_False,CoatCheck_True,CoatCheck_nan,DogsAllowed_False,DogsAllowed_True,DogsAllowed_nan,DriveThru_False,DriveThru_True,DriveThru_nan,GoodForDancing_False,GoodForDancing_True,GoodForDancing_nan,HappyHour_False,HappyHour_True,HappyHour_nan,Music_background_music_ False,Music_background_music_ True,Music_background_music_nan,Music_dj_ False,Music_dj_ True,Music_dj_nan,Music_jukebox_ False,Music_jukebox_ True,Music_jukebox_nan,Music_karaoke_ False,Music_karaoke_ True,Music_karaoke_nan,Music_live_ False,Music_live_ True,Music_live_nan,Music_no_music_ False,Music_no_music_nan,Music_video_ False,Music_video_ True,Music_video_nan,Open24Hours_True,Open24Hours_nan,RestaurantsCounterService_True,RestaurantsCounterService_nan,Smoking_no,Smoking_outdoor,Smoking_yes,Smoking_nan,WheelchairAccessible_False,WheelchairAccessible_True,WheelchairAccessible_nan
0,1,3.5,0,0,1,3,0,late night northern chinese bbq restauran...,48,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
1,1,3.5,0,0,2,6,1,we stumble restaurant werent sure expect ...,48,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
2,1,3.5,0,1,0,0,1,my first impression place really crowd b...,48,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
3,0,3.5,0,1,3,5,0,went friends sichuan fish hot pot jellyf...,48,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
4,1,3.5,0,1,1,3,1,food alright service complete garbage tho...,48,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
5,0,3.5,0,0,2,0,0,altona kabob small restaurant locate smal...,33,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
6,0,3.5,0,0,1,0,2,work late hours job fill up good food ...,33,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
7,0,3.5,0,0,2,0,0,im say kebab bad cause its not theres ...,33,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
8,1,3.5,0,2,0,0,0,the resto work leisurely pace line rush ...,33,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1
9,0,4.0,0,1,1,2,2,fav coffee west end muffins delicious ak...,52,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,1,0


Basic Linear Regression 10-fold CV , TfidfVectorizer(ngram_range = (1,3))

In [None]:
#vectorize using count vectorizer and 3-gram features and create train/test split
from sklearn.model_selection import cross_val_score

X1 = scipy.sparse.hstack((bigram_vectorizer.fit_transform(df_test.review_text), X_ntext))
Y1 = df_test['count_crucial_signficant']
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, train_size=.75)

model = LogisticRegression()
model.fit(X_train1, Y_train1)
print ("Area under the ROC curve on test data = %.3f" % 
       np.mean(cross_val_score(model, X_test1, Y_test1, scoring="roc_auc", cv=10)))

In [77]:
#vectorize using hash vectorixer and tfidf and create train/test split
X2 = scipy.sparse.hstack((vectorizer_b.fit_transform(df.review_text), X_ntext))
Y2 = df['count_crucial_signficant']
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, train_size=.75)

model = LogisticRegression()
model.fit(X_train2, Y_train2)
print ("Area under the ROC curve on test data = %.3f" % metrics.roc_auc_score(model.predict(X_test2), Y_test2))

Area under the ROC curve on test data = 0.544


Basic Decision Tree, no CV

In [None]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train1, Y_train1)
print ("Area under the ROC curve on test data = %.3f" % metrics.roc_auc_score(model.predict(X_test1), Y_test1))

Area under the ROC curve on test data = 0.530


In [None]:
from sklearn.neural_network import BernoulliRBM

X1 = scipy.sparse.hstack((bigram_vectorizer.fit_transform(df_test.review_text), X_ntext))
Y1 = df_test['count_crucial_signficant']
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, train_size=.75)

model = BernoulliRBM()
model.learning_rate = 0.06
model.n_iter = 20
# More components tend to give better prediction performance, but larger
# fitting time
model.n_components = 100

model.fit(X_train1, Y_train1)
print ("Area under the ROC curve on test data = %.3f" % metrics.roc_auc_score(model.predict(X_test1), Y_test1))