In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict
import heapq
import json
import math
import time

import imageio
import loading
import tqdm

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn import svm

In [5]:
"""DEFINE CONSTANTS HERE"""

DATA_PATH           = '../data'
VISUALIZATION_PATH  = '../visualization'

MISSING_VALUE       = '<NONE>'       # Used for the 'neighborhood' and 'city' attributes.
DEFAULT_NEI_P       = 0.2            # Default percentile of neighborhoods to keep.
DEFAULT_CITY_P      = 0.1            # Default percentile of cities to keep.
DEFAULT_ATT_P       = 1.0            # Default percentile of attributes to keep.
DEFAULT_CAT_P       = 0.5            # Default percentile of categories to keep.
DEFAULT_HRS_P       = 1.0            # Default percentile of hours to keep.

TIME_GRANULARITY    = 1              # Granularity (ticks/hr) of time calculations, factor of 60. 

SLICE_BY            = ['Restaurants']

In [6]:
"""Loads the json file of the given dataset name."""
def load(name):
  start = time.time()
  data = loading.read_df_from_json('%s/%s.json' % (DATA_PATH, name))
  print 'time to load \'%s\': %.3fs' % (name, time.time() - start)
  return data

In [7]:
"""Cleans the business dataset."""
def clean_business(business):
  print 'Replacing %s with %s.' % (u'Montréal', u'Montreal')
  business['city'].replace(to_replace=u'Montréal', 
                           value=u'Montreal',
                           inplace=True)

In [8]:
"""Plots all businesses on the world map for visualization purposes."""
def plot_business(business):
  points = business[['latitude', 'longitude']]

  img = imageio.imread(VISUALIZATION_PATH + '/raw_map.jpg').astype('int64')
  img = img / 4               # Dim map.
  img = img[8:-8,8:-8,:]      # Clip borders.
  H, W, _ = img.shape
  
  scalar = 10                 # Amount to add to each channel.
  delta = np.zeros((H, W), dtype='int64')
  
  def get_xy(latitude, longitude):
    x = (W - 1) * (180.0 + longitude) / 360.0
    y = (H - 1) * (90.0 - latitude) / 180.0
    return int(x), int(y)

  for row in points.itertuples():
    latitude, longitude = row.latitude, row.longitude
    if not math.isnan(latitude) and not math.isnan(longitude):
      x, y = get_xy(latitude, longitude)
      delta[y,x] += scalar

  img += np.expand_dims(delta, axis=-1).repeat(3, axis=-1)
  img = img.clip(0, 255).astype('uint8')
  
  return img

In [9]:
"""Returns the count_dict as a sorted list."""
def to_list(count_dict):
  return sorted([(k, count_dict[k]) for k in count_dict], key=lambda v: v[1])

In [10]:
"""Converts a dict of counts (key, int) into a list of top features.

   Takes either top N (int) features, or top PERCENTILE (float) by occurrence.

   Example usage:
     top_features(count_dict, 0.1)                  # Returns top 10% of elements.
     top_features(count_dict, top_n=5)              # Returns top 5 elements.
"""
def top_features(count_dict, percentile=None, n=None, verbose=True):
  if n is None:
    if percentile is None:
      raise Exception
    n = int(percentile * len(count_dict))

  l = heapq.nlargest(n, count_dict, key=lambda k: count_dict[k])
  
  if verbose:
    percentage = 0.0 if not len(count_dict) else 100.0 * n / len(count_dict)
    params = (n, len(count_dict), percentage, 0 if not l else count_dict[l[-1]])
    print 'Took %d elements out of %d (%2.1f%%). Cutoff was >= %d.' % params
    
  return l

In [11]:
"""Methods for converting a tuple from itertuples() to a feature list."""

# Return neighborhood concatenated with city, or MISSING_VALUE if empty. 
def get_neighborhood(tup):
  assert type(tup.neighborhood) is unicode
  return [tup.neighborhood + '/' + tup.city if tup.neighborhood else MISSING_VALUE]

# Return city, or MISSING_VALUE if empty.
def get_city(tup):
  assert type(tup.city) is unicode
  return [tup.city if tup.city else MISSING_VALUE]

# Recursively process attributes dict to get indicators for all attributes.
def get_attributes(tup):
  def _recurse(attributes, prefix):
    assert type(attributes) is dict
    l = []
    for k, v in attributes.items():
      if type(v) is bool:
        l.append(prefix + '/' + k)
      elif type(v) is unicode:
        l.append(prefix + '/' + k + '/' + v)
      elif type(v) is int:
        l.append(prefix + '/' + k + '/' + str(v))
      elif type(v) is dict:
        l += _recurse(attributes[k], prefix=k)
      else:
        assert False  # Invalid type in attributes.
    return l
  return _recurse(tup.attributes, prefix='')

# Return categories.
def get_categories(tup):
  assert type(tup.categories) is list
  return tup.categories

"""Helper methods for get_hours(), which determines which ticks of time the business is open.
   Each tick of time corresponds to an index in [0, _max_ticks()).

   In _time_to_dt_index, ROUND_UP determines what happens when the time falls
   in between ticks. By default, the time is rounded up.
"""
# An ordering of the days of the week, and a map from str --> index
_day_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
_day_index = {d:i for i, d in enumerate(_day_order)}

# Maximum number of ticks
def _max_ticks():
  return 7 * 24 * TIME_GRANULARITY

# Given day of the week and time, returns the corresponding tick in [0, _max_ticks()).
def _time_to_dt_index(day, time, round_up=True):
  [hour, minutes] = time.split(':')
  hour_index = int(hour) * TIME_GRANULARITY
  min_index  = int(minutes) / (60 / TIME_GRANULARITY)
  if round_up and int(minutes) % (60 / TIME_GRANULARITY) > 0:
    min_index += 1
  return _day_index[day] * 24 * TIME_GRANULARITY + hour_index + min_index

# Cache the string corresponding to each tick of time.
_timestr_cache = ['%s/%02d:%02d' % (_day_order[day], hour, min_index * 60 / TIME_GRANULARITY)
                  for day in range(0, 7)
                  for hour in range(0, 24)
                  for min_index in range(0, TIME_GRANULARITY)]

def get_hours(tup):
  assert type(tup.hours) is dict
  l = []
  for day, hours in tup.hours.items():
    open_time, close_time = hours.split('-')
    open_index = _time_to_dt_index(day, open_time)
    close_index = _time_to_dt_index(day, close_time)

    assert 0 <= open_index and close_index <= _max_ticks()
    
    # Handle the case where close_index is for the following day.
    if close_index <= open_index:
      close_index += 24 * TIME_GRANULARITY
    
    # Append the slice of the _time_str_cache, handling wrap-around appropriately.
    l += _timestr_cache[open_index:min(close_index, _max_ticks())]
    if close_index > _max_ticks():
      l += _timestr_cache[0:close_index - _max_ticks()]
      
  return l

# Function to retrieve all features of a given row tuple.
all_fns = [get_neighborhood, get_city, get_attributes, get_categories, get_hours]
def get_all_features(tup):
  all_features = set([])
  for fn in all_fns:
    all_features |= set(fn(tup))
  return all_features

# Returns the value to regress on for the row tuple.
def get_target(tup):
  return float(tup.stars)

In [12]:
"""Get the features that we will use for 
   neighborhoods, cities, attributes, categories.

   By default, take:
     TOP 20% OF neighborhoods
     TOP 10% OF cities
         ALL OF attributes
     TOP 50% OF categories
         ALL OF hours
   
   Returns a length 6 tuple:
     (nei_set, city_set, att_set, cat_set, hours_set, debug_vals)
"""
def get_feature_sets(business, slice_by=[],
                     percentiles=[DEFAULT_NEI_P, DEFAULT_CITY_P, DEFAULT_ATT_P, 
                                  DEFAULT_CAT_P, DEFAULT_HRS_P]):
  def has_required_features(all_features):
    all_features_set = set([])
    for features in all_features:
      all_features_set |= set(features)
    for f in slice_by:
      if f not in all_features_set:
        return False
    return True
  
  fn_counts_percentile = zip(all_fns, [defaultdict(int) for _ in range(len(all_fns))], percentiles)

  for tup in tqdm.tqdm(business.itertuples()):
    all_features = [fn(tup) for fn, _, _ in fn_counts_percentile]
    if not has_required_features(all_features):
      continue
    for features, (_, counts, _) in zip(all_features, fn_counts_percentile):
      for f in features:
        counts[f] += 1

  debug_val = [(fn.__name__, to_list(counts)) for fn, counts, _ in fn_counts_percentile]
  
  return [top_features(counts, percentile) for fn, counts, percentile in fn_counts_percentile], debug_val

In [13]:
"""Gets a mapping from feature name to feature index and vice versa."""
def get_feature_maps(feature_sets, start_index=0):  
  # Assert that there are no overlapping names.
  union = set([])
  for s in feature_sets:
    union |= set(s)
  assert len(union) == sum([len(s) for s in feature_sets])

  name_to_index, index_to_name = {}, {}
  for i, feature in enumerate(union):
    name_to_index[feature] = start_index + i
    index_to_name[start_index + i] = feature

  return name_to_index, index_to_name

In [14]:
def slice_by_feature(x, y, name_to_index, feature_name):
  if feature_name not in name_to_index:
    print 'Feature %s not found.' % feature_name
    return x, y
  feature_index = name_to_index[feature_name]
  indices = np.where(x[:,feature_index] == 1)
  return x[indices], y[indices]

def slice_by_features(x, y, name_to_index, slice_by):
  for feature_name in slice_by:
    x, y = slice_by_feature(x, y, name_to_index, feature_name)
  return x, y

In [15]:
"""Creates a list of data points for multivariate linear regression."""
def get_training_data(business, name_to_index, slice_by=[]):
  x = np.zeros((business.shape[0], len(name_to_index)), dtype='float32')
  y = np.zeros(business.shape[0], dtype='float32')
  
  for i, tup in tqdm.tqdm(enumerate(business.itertuples())):
    all_features = get_all_features(tup)
    for f in all_features:
      if f in name_to_index:
        x[i,name_to_index[f]] = 1
    y[i] = get_target(tup)
  
  return slice_by_features(x, y, name_to_index, slice_by)

Run business regression.

NOT USABLE:
- business_id
- name

PROBABLY NOT USABLE:
- latitude
- longitude
- postal code
- address
- is_open
- review_count
- state

REGRESS ON:
- stars  (1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5)

FEATURES:
- neighborhood/city  --> indicators (~20%)
- city               --> indicators (~10%)
- attributes         --> indicators (process types differently, each has a separate indicator)
- categories         --> indicators (~50%)
- hours              --> indicators for every hour/half hour/quarter of the hour

In [16]:
business = load('business')b
clean_business(business)

time to load 'business': 20.628s
Replacing Montréal with Montreal.


In [17]:
# img = plot_business(business)
# plt.imshow(img)
# plt.show()
# imageio.imsave(VISUALIZATION_PATH + '/business_map.jpg', img)

In [18]:
SLICE_BY = ['Restaurants']

feature_sets, debug_val = get_feature_sets(business, slice_by=SLICE_BY, 
                                           percentiles=[0, 0, 0.7, 0.15, 0.0])

Took 0 elements out of 550 (0.0%). Cutoff was >= 0.
Took 0 elements out of 734 (0.0%). Cutoff was >= 0.
Took 70 elements out of 100 (70.0%). Cutoff was >= 752.
Took 97 elements out of 651 (14.9%). Cutoff was >= 175.
Took 0 elements out of 168 (0.0%). Cutoff was >= 0.


In [19]:
feature_maps = get_feature_maps(feature_sets)
name_to_index, index_to_name = feature_maps  

In [20]:
x, y = get_training_data(business, name_to_index, slice_by=SLICE_BY)

                                          

In [25]:
print 'Slicing by:', SLICE_BY
print 'Samples in slice:', len(x)
print

reg = linear_model.Ridge(alpha=0.5)
reg.fit(x, y)

print 'Intercept =', reg.intercept_
print

N = 20
coefs = [(index_to_name[i], c) for i, c in enumerate(reg.coef_)]
for n, c in heapq.nlargest(N, coefs, key=lambda v: v[1]):
  print n, c
print '...'
for n, c in reversed(heapq.nsmallest(N, coefs, key=lambda v: v[1])):
  print n, c
print

y_hat = reg.predict(x)
print np.sqrt(mean_squared_error(y, y_hat))

Slicing by: ['Restaurants']
Samples in slice: 51613

Intercept = 3.36518456985

Food Trucks 0.453103035284
Vegan 0.307068254562
Latin American 0.306034198123
Caribbean 0.296301275254
French 0.27946296579
Hotels & Travel 0.260133183633
Modern European 0.251097020068
Food Stands 0.250909546574
Specialty Food 0.231032568397
German 0.223879627584
Swabian 0.221521238943
Chicken Shop 0.220045399246
Fish & Chips 0.217996759451
Portuguese 0.214262340238
Hawaiian 0.212582334901
Dive Bars 0.195085614846
Hot Dogs 0.174344622817
Cafes 0.171148198841
Soul Food 0.159251981817
/BusinessAcceptsBitcoin 0.154675084625
...
/RestaurantsAttire/casual -0.073986050238
Imported Food -0.0841129490088
American (Traditional) -0.0880086412048
Pizza -0.0886453161762
Venues & Event Spaces -0.124112273937
/BusinessAcceptsCreditCards -0.125439466803
Sports Bars -0.127505819715
/NoiseLevel/loud -0.128328313941
Chinese -0.130034773375
Ethnic Food -0.130089626883
Pakistani -0.163330543992
Burgers -0.243899187156
Tex-Mex

In [22]:
"""Splits list of data points 80/20 for training versus test."""
def training_test_split(x, y):
    indices = np.random.permutation(x.shape[0])
    cutoff = int(.8 * x.shape[0])
    training_idx, test_idx = indices[:cutoff], indices[cutoff+1:]
    x_train, x_test = x[training_idx,:], x[test_idx,:]
    y_train, y_test = y[training_idx], y[test_idx]
    return x_train, y_train, x_test, y_test

In [27]:
x_train, y_train, x_test, y_test = training_test_split(x, y)
print 'Slicing by:', SLICE_BY
print 'Samples in training set:', len(x_train)
print
reg = linear_model.Ridge(alpha=0.7)
reg.fit(x_train, y_train)
y_hat = reg.predict(x_test)
print "test MSE", np.sqrt(mean_squared_error(y_test, y_hat))
print

print 'Slicing by:', SLICE_BY
print 'Samples in slice:', len(x_train)
print

y_mean = np.mean(y_train)
print "means test MSE", np.sqrt(mean_squared_error(np.repeat(y_mean, len(y_test)), y_test))

Slicing by: ['Restaurants']
Samples in training set: 41290

test MSE 0.70817920643

Slicing by: ['Restaurants']
Samples in slice: 41290

means test MSE 0.785963


In [30]:
def index_submaps(name_array):
    sub_name_to_ind = {}
    sub_ind_to_name = {}
    ind = 0
    for n in name_array:
        sub_name_to_ind[n] = ind
        sub_ind_to_name[ind] = n
        ind += 1
    return sub_name_to_ind, sub_ind_to_name

cultural_cols = ['Latin American', 'Caribbean', 'Modern European', 'German', 'Portuguese', 'Hawaiian', 
                'American (Traditional)', 'Chinese', 'Pakistani', 'Tex-Mex']

In [31]:
sub_cult_to_ind, sub_ind_to_cult = index_submaps(cultural_cols)
x_cult, y_cult = get_training_data(business, sub_cult_to_ind, slice_by=SLICE_BY)

Feature Restaurants not found.            


In [32]:
x_train, y_train, x_test, y_test = training_test_split(x_cult, y_cult)
print 'Slicing by:', SLICE_BY
print 'Samples in slice:', len(x_cult)
print

reg = linear_model.Ridge(alpha=.5)
reg.fit(x_cult, y_cult)

print 'Intercept =', reg.intercept_
print

N = 20
coefs = [(sub_ind_to_cult[i], c) for i, c in enumerate(reg.coef_)]
for n, c in heapq.nlargest(N, coefs, key=lambda v: v[1]):
  print n, c
print
    
y_hat = reg.predict(x_cult)
print "MSE cultural", np.sqrt(mean_squared_error(y_cult, y_hat))
print

x_train, y_train, x_test, y_test = training_test_split(x_cult, y_cult)
print 'Slicing by:', SLICE_BY
print 'Samples in training set:', len(x_train)
print
reg = linear_model.Ridge(alpha=0.7)
reg.fit(x_train, y_train)
y_hat = reg.predict(x_test)
print "test MSE cultural", np.sqrt(mean_squared_error(y_test, y_hat))

Slicing by: ['Restaurants']
Samples in slice: 156639

Intercept = 3.66857220381

Modern European 0.300015062522
Latin American 0.217323481932
Caribbean 0.169281552524
Hawaiian 0.166152837305
German 0.145412203815
Portuguese 0.0523951887081
Pakistani -0.186747522438
American (Traditional) -0.336699668781
Chinese -0.346919122557
Tex-Mex -0.66409477007

MSE cultural 0.972964587125

Slicing by: ['Restaurants']
Samples in training set: 125311

test MSE cultural 0.973345461503


In [66]:
def get_rows_with(name_to_index, col, x, y):
    ind = name_to_index[col]
    print "index from name ", ind
    x_sub = []
    y_sub = []
    for i in range(len(x)):
        if (x[i, ind] == 1):
            x_sub.append(x[i])
            y_sub.append(y[i])
    return np.array(x_sub), np.array(y_sub)

In [67]:
x_latin, y_latin = get_rows_with(name_to_index, 'Latin American', x, y)
x_train, y_train, x_test, y_test = training_test_split(x_latin, y_latin)
print 'Slicing by:', SLICE_BY
print 'Samples in training set:', len(x_train)
print
reg = linear_model.Ridge(alpha=1)
reg.fit(x_train, y_train)

print 'Intercept =', reg.intercept_
print

N = 20
coefs = [(index_to_name[i], c) for i, c in enumerate(reg.coef_)]
for n, c in heapq.nlargest(N, coefs, key=lambda v: v[1]):
  print n, c
print
print '...'
for n, c in reversed(heapq.nsmallest(N, coefs, key=lambda v: v[1])):
  print n, c
print

    
y_hat = reg.predict(x_test)
print "test RMSE latin american", np.sqrt(mean_squared_error(y_test, y_hat))

index from name  13
Slicing by: ['Restaurants']
Samples in training set: 375

Intercept = 3.7904676204

Ambience/hipster 0.605289672437
/RestaurantsGoodForGroups 0.554916160494
Vegan 0.403040577323
Beer Bar 0.351060379974
/RestaurantsPriceRange2/1 0.346367577859
Food Stands 0.315131006487
Tapas Bars 0.303033153525
Ice Cream & Frozen Yogurt 0.302801784568
Cocktail Bars 0.28816900886
Dive Bars 0.286319047982
/RestaurantsPriceRange2/2 0.283055523317
/BYOBCorkage/no 0.277194944111
Chicken Shop 0.271103139385
Arts & Entertainment 0.271003048754
Breweries 0.269218503615
Food Delivery Services 0.268104876649
/BikeParking 0.264128881308
Food 0.25874456464
/RestaurantsPriceRange2/3 0.253058114509
French 0.251770540044

...
/BusinessAcceptsCreditCards -0.205559107343
/RestaurantsAttire/casual -0.208075967246
/Alcohol/full_bar -0.221551494099
Burgers -0.223228439864
/RestaurantsAttire/dressy -0.231310925496
/RestaurantsTableService -0.233733126839
Wine & Spirits -0.23453235213
Beer -0.23453235213

In [72]:
x_part, y_part = get_rows_with(name_to_index, 'Pakistani', x, y)
x_train, y_train, x_test, y_test = training_test_split(x_part, y_part)
print 'Slicing by:', SLICE_BY
print 'Samples in training set:', len(x_train)
print
reg = linear_model.Ridge(alpha=1)
reg.fit(x_train, y_train)

print 'Intercept =', reg.intercept_
print

N = 20
coefs = [(index_to_name[i], c) for i, c in enumerate(reg.coef_)]
for n, c in heapq.nlargest(N, coefs, key=lambda v: v[1]):
  print n, c
print
print '...'
for n, c in reversed(heapq.nsmallest(N, coefs, key=lambda v: v[1])):
  print n, c
print
category_rmse = get_test_RMSE(x, y, get_categories)
print "RMSE for training on category only ", category_rmse
    
y_hat = reg.predict(x_test)
print "test RMSE pakistani", np.sqrt(mean_squared_error(y_test, y_hat))

index from name  1
Slicing by: ['Restaurants']
Samples in training set: 279

Intercept = 3.49853853663

Desserts 0.775382548588
Shopping 0.516835207415
Ice Cream & Frozen Yogurt 0.512691965255
/OutdoorSeating 0.464077381114
Food Trucks 0.389515046461
Cocktail Bars 0.354636676856
/Smoking/outdoor 0.346515666603
Ambience/divey 0.324161571307
Grocery 0.311283961649
Caribbean 0.299661254319
Caterers 0.272486351485
/RestaurantsTakeOut 0.257592318417
Breakfast & Brunch 0.244587318955
Asian Fusion 0.235784639357
Wine Bars 0.235100621206
Nightlife 0.227976914596
Vegan 0.227790810864
Halal 0.202855195362
/BusinessAcceptsBitcoin 0.199701996113
/BikeParking 0.154012233499

...
Vegetarian -0.179681703066
Buffets -0.180260284383
Bakeries -0.186386235326
Food Stands -0.19586311029
Coffee & Tea -0.251799647995
Thai -0.255390555707
/DriveThru -0.261202726505
/RestaurantsPriceRange2/2 -0.263833473511
/RestaurantsPriceRange2/3 -0.275503291972
Arts & Entertainment -0.288931108978
/Smoking/no -0.292771457