What follows is a breakdown on what I have done to build our model, from the yelp dataset, through data cleaning, feature generation, and classifying...

In [1]:
import numpy as np
import pandas as pd
import datetime
from dateutil.rrule import rrule, MONTHLY
import matplotlib.pyplot as plt
from collections import Counter, OrderedDict
from IPython.display import clear_output, display
import sklearn as skl
%matplotlib inline
pd.options.display.max_rows = 1000

In [2]:
def to_datetime(review_df):
    return review_df.assign(datetime = pd.to_datetime(review_df['date'])).drop(labels=['date'], axis = 'columns').set_index('datetime').sort_index()


Step 1: Building the Dataset

WARNING Don't try to run this on your laptop! The datasets I'm importing are LARGE

In [3]:
#if you want to rebuild the everything, set this to true, but it will take long time much wait
rebuild_dset = False
state = 'AZ'

In [4]:
#load review data and business data
if(rebuild_dset == True):
    reviews = pd.read_json('dataset/yelp_academic_dataset_review.json', lines=True)
    businesses = pd.read_json('dataset/yelp_academic_dataset_business.json', lines=True)
    

In [5]:
#join the two datasets, dropping duplicate/useless labels
if(rebuild_dset == True):
    reviews_businesses = reviews.merge(businesses, on='business_id')
    state_reviews_businesses = reviews_businesses[reviews_businesses['state']==state]
    state_badname  = state_reviews_businesses[['date', 'business_id','user_id', 'name', 'cool', 'funny','useful','review_id', 'stars_x', 'categories','review_count', 'stars_y']]
    state_reviews = state_badname.rename(index=str, columns={'stars_x':'stars_review','stars_y':'stars_avg'})
    state_reviews.to_csv('b2data/' + state + '_data_dirty.csv')
else:
    state_reviews = pd.read_csv('b2data/' + state + '_data_dirty.csv')

FileNotFoundError: File b'b2data/AZ_data_dirty.csv' does not exist

In [6]:
#build our inclusive feature list, courtesy of Christina
if(rebuild_dset == True):
    incl_list = ['Asian Fusion','Buffets','Caribbean','Chinese','Trinidadian','Vegetarian','Afghan','African','Arabian','Argentine','Armenian','Australian','Austrian','Bangladeshi','Basque','Bavarian','Brasseries','British','Bulgarian','Burmese','Cambodian','Cantonese','Chilean','Colombian','Cuban','Czech','Czech/Slovakian','Dominican','Donairs','Eastern European','Egyptian','Ethiopian','Falafel','Filipino','Greek','Hainan','Haitian','Hakka','Halal','Hawaiian','Honduran','Hot Pot','Iberian','Indonesian','International','Irish','Irish Pub','Izakaya','Japanese Curry','Kebab','Kosher','Laotian','Lebanese','Malaysian','Mauritius','Mediterranean','Modern European','Mongolian','Moroccan','New Mexican Cuisine','Noodles','Northern German','Pan Asian','Peruvian','Polish','Puerto Rican','Ramen','Russian','Salvadoran','Scandinavian','Scottish','Senegalese','Shanghainese','Sicilian','Singaporean','Slovakian','Soul Food','South African','Spanish','Sri Lankan','Swiss Food','Syrian','Szechuan','Teppanyaki','Traditional Norwegian','Turkish','Tuscan','Ukrainian','Uzbek','Venezuelan','Acai Bowls','American (New)','American (Traditional)','Bagels','Baguettes','Bakeries','Barbeque','Bars','Bartenders','Beach Bars','Beer','Beer Bar','Beer Garden','Beer Gardens','Beer Hall','Beer Tours','Belgian','Bistros','Brazilian','Breakfast & Brunch','Breweries','Brewpubs','Bubble Tea','Burgers','Butcher','Cafes','Cafeteria','Cajun/Creole','Canadian (New)','Candy Stores','Caterers','Champagne Bars','Cheese Shops','Cheesesteaks','Chicken Shop','Chicken Wings','Chocolatiers & Shops','Churros','Cideries','Cocktail Bars','Coffee & Tea','Coffee Roasteries','Coffeeshops','Comfort Food','Creperies','Cupcakes','Custom Cakes','Dance Clubs','Delicatessen','Delis','Desserts','Dim Sum','Diners','Distilleries','Dive Bars','Do-It-Yourself Food','Donuts','Drive-Thru Bars','Empanadas','Ethical Grocery','Ethnic Food','Ethnic Grocery','Fast Food','Fish & Chips','Fishmonger','Fondue','Food','Food Court','Food Stands','Food Trucks','French','Fruits & Veggies','Gastropubs','Gelato','German','Gluten-Free','Honey','Hong Kong Style Cafe','Hot Dogs','Hotel bar','Hungarian','Ice Cream & Frozen Yogurt','Imported Food','Indian','International Grocery','Italian','Japanese','Japanese Sweets','Juice Bars & Smoothies','Karaoke','Kombucha','Korean','Latin American','Live/Raw Food','Local Flavor','Lounges','Macarons','Mexican','Middle Eastern','Milkshake Bars','Minho','Olive Oil','Organic Stores','Pakistani','Palatine','Pasta Shops','Patisserie/Cake Shop','Persian/Iranian','Pita','Pizza','Poke','Pop-Up Restaurants','Popcorn Shops','Poutineries','Pretzels','Pub Food','Pubs','Restaurants','Rotisserie Chicken','Salad','Sandwiches','Seafood','Serbo Croatian','Shaved Ice','Shaved Snow','Smokehouse','Soba','Soup','Southern','Speakeasies','Specialty Food','Sports Bars','Steakhouses','Street Vendors','Sugar Shacks','Supper Clubs','Sushi Bars','Tacos','Taiwanese','Tapas Bars','Tapas/Small Plates','Tea Rooms','Tempura','Thai','Themed Cafes','Tiki Bars','Tonkatsu','Udon','Vegan','Vietnamese','Waffles','Whiskey Bars','Wine & Spirits','Wine Bars','Wraps']
    res_filter = (state_reviews['categories'].str.contains('There is absolutely no way this string is contained'))
    for i, word in enumerate(incl_list):
        clear_output(wait=True)
        print('{:2.2f}% \t'.format(100*i/len(incl_list))+word)
        res_filter = (res_filter | state_reviews['categories'].str.contains(word))
    res_filter.to_csv('b2data/res_filter.csv')
else:
    res_filter = pd.Series.from_csv('b2data/res_filter.csv')

In [7]:
#dropnans and apply filter
if(rebuild_dset == True):
    state_reviews_nona = state_reviews.dropna()
    state_reviews_res = state_reviews_nona[res_filter]
    state_reviews_res.to_csv('b2data/' + state + '_data_clean.csv')
else:
    state_reviews_res = pd.read_csv('b2data/' + state +'_data_clean.csv')

Step 2: Building features

In [8]:
#again, this will take a long time if true
rebuild_features = True

In [9]:
#convert indexes to datetime and sort all reviews by business id, year, and month
state_reviews_dt = to_datetime(state_reviews_res)
state_businesses_grouped = state_reviews_dt.groupby([state_reviews_dt['business_id'], state_reviews_dt.index.year.rename('year'), state_reviews_dt.index.month.rename('month')])

target - monthly growth feature construction

In [11]:
#count reviews per month
rev_per_month = state_businesses_grouped['name'].count()
#change of growth into next month
growth_per_month = -rev_per_month.diff(periods=-1).fillna(0)

#define a mapping function to get growth for each row given business id and date:
def get_growth(bid, timestamp):
    date = datetime.date(timestamp.year, timestamp.month, 1)
    growth = growth_per_month[bid][date.year][date.month]
    print(date)
    return growth

if rebuild_features == True:
    state_reviews_mg = state_reviews_dt.apply(lambda row: get_growth(row['business_id'], row.name), axis=1)
    clear_output(wait=False)
    state_reviews_mg_df = state_reviews_mg.to_frame()
    state_reviews_mg_df.columns = ['monthly_growth']
    state_reviews_mg_df.to_csv('b2data/' + state+'_monthly__growth.csv')
else:
    state_reviews_mg = to_datetime(pd.read_csv('b2data/' + state+'_monthly__growth.csv')).drop(['Unnamed: 0'], axis=1)


feature: stars growth(from previous month)

In [12]:
#computes average star rating in reviews per month (and business id)
avg_stars_per_month = state_businesses_grouped['stars_review'].mean()

def get_star_growth(bid, timestamp):
    date = datetime.date(timestamp.year, timestamp.month, 1)
    #compares to LAST month to prevent leakage
    growth = avg_stars_per_month[bid].diff().fillna(0)[date.year][date.month]
    print(date)
    return growth

if rebuild_features == True:
    state_reviews_sg = state_reviews_dt.apply(lambda row: get_star_growth(row['business_id'], row.name), axis=1)
    clear_output(wait=False)
    state_reviews_sg_df = state_reviews_sg.to_frame()
    state_reviews_sg_df.columns = ['star_growth']
    state_reviews_sg_df.to_csv('b2data/' + state + 'star_growth.csv')
else:
    state_reviews_sg = to_datetime(pd.read_csv('b2data/' + state + 'star_growth.csv').rename(index=str, columns={'datetime':'date'}))


In [None]:
def get_percent_reviews(bid, timestamp):
    date = datetime.date(timestamp.year, timestamp.month, 1)
    revs = rev_per_month[bid][date.year][date.month]
    r_count = rev_per_month[bid].cumsum()[date.year][date.month]
    print(timestamp)
    rev_percent = revs/r_count
    return rev_percent

if rebuild_features == True:
    state_reviews_percent = state_reviews_dt.apply(lambda row: get_percent_reviews(row['business_id'], row.name), axis=1)
    clear_output(wait=False)
    state_reviews_percent_df = state_reviews_percent.to_frame()
    state_reviews_percent_df.columns = ['percent_reviews']
    state_reviews_percent_df.to_csv('b2data/' + state + 'percent_reviews.csv')
else:
    state_reviews_percent = to_datetime(pd.read_csv('b2data/' + state + 'percent_reviews.csv').rename(index=str, columns={'datetime':'date'}))

2005-02-02 00:00:00
2005-03-08 00:00:00
2005-03-09 00:00:00
2005-03-29 00:00:00
2005-04-08 00:00:00
2005-04-08 00:00:00
2005-04-15 00:00:00
2005-04-15 00:00:00
2005-04-15 00:00:00
2005-04-19 00:00:00
2005-05-02 00:00:00
2005-05-03 00:00:00
2005-05-17 00:00:00
2005-05-20 00:00:00
2005-07-03 00:00:00
2005-07-08 00:00:00
2005-07-10 00:00:00
2005-07-10 00:00:00
2005-07-10 00:00:00
2005-07-11 00:00:00
2005-07-12 00:00:00
2005-07-18 00:00:00
2005-07-18 00:00:00
2005-07-18 00:00:00
2005-07-18 00:00:00
2005-07-18 00:00:00
2005-07-18 00:00:00
2005-07-18 00:00:00
2005-07-18 00:00:00
2005-07-18 00:00:00
2005-07-18 00:00:00
2005-08-20 00:00:00
2005-08-24 00:00:00
2005-09-19 00:00:00
2005-10-05 00:00:00
2005-10-07 00:00:00
2005-10-07 00:00:00
2005-10-09 00:00:00
2005-10-09 00:00:00
2005-10-09 00:00:00
2005-10-09 00:00:00
2005-10-09 00:00:00
2005-10-12 00:00:00
2005-10-12 00:00:00
2005-10-13 00:00:00
2005-10-13 00:00:00
2005-11-07 00:00:00
2005-11-11 00:00:00
2005-11-11 00:00:00
2005-11-11 00:00:00


2006-06-25 00:00:00
2006-06-25 00:00:00
2006-06-25 00:00:00
2006-06-25 00:00:00
2006-06-25 00:00:00
2006-06-25 00:00:00
2006-06-25 00:00:00
2006-06-25 00:00:00
2006-06-25 00:00:00
2006-06-25 00:00:00
2006-06-28 00:00:00
2006-06-29 00:00:00
2006-06-29 00:00:00
2006-06-29 00:00:00
2006-06-29 00:00:00
2006-06-29 00:00:00
2006-06-29 00:00:00
2006-06-30 00:00:00
2006-06-30 00:00:00
2006-06-30 00:00:00
2006-06-30 00:00:00
2006-07-02 00:00:00
2006-07-02 00:00:00
2006-07-02 00:00:00
2006-07-02 00:00:00
2006-07-03 00:00:00
2006-07-03 00:00:00
2006-07-04 00:00:00
2006-07-06 00:00:00
2006-07-08 00:00:00
2006-07-08 00:00:00
2006-07-08 00:00:00
2006-07-08 00:00:00
2006-07-08 00:00:00
2006-07-08 00:00:00
2006-07-08 00:00:00
2006-07-09 00:00:00
2006-07-10 00:00:00
2006-07-11 00:00:00
2006-07-11 00:00:00
2006-07-14 00:00:00
2006-07-14 00:00:00
2006-07-14 00:00:00
2006-07-14 00:00:00
2006-07-15 00:00:00
2006-07-18 00:00:00
2006-07-18 00:00:00
2006-07-18 00:00:00
2006-07-19 00:00:00
2006-07-20 00:00:00


2006-10-07 00:00:00
2006-10-07 00:00:00
2006-10-07 00:00:00
2006-10-07 00:00:00
2006-10-08 00:00:00
2006-10-08 00:00:00
2006-10-08 00:00:00
2006-10-08 00:00:00
2006-10-09 00:00:00
2006-10-09 00:00:00
2006-10-09 00:00:00
2006-10-09 00:00:00
2006-10-09 00:00:00
2006-10-09 00:00:00
2006-10-10 00:00:00
2006-10-10 00:00:00
2006-10-10 00:00:00
2006-10-10 00:00:00
2006-10-10 00:00:00
2006-10-10 00:00:00
2006-10-11 00:00:00
2006-10-12 00:00:00
2006-10-12 00:00:00
2006-10-12 00:00:00
2006-10-12 00:00:00
2006-10-12 00:00:00
2006-10-12 00:00:00
2006-10-12 00:00:00
2006-10-12 00:00:00
2006-10-12 00:00:00
2006-10-12 00:00:00
2006-10-13 00:00:00
2006-10-13 00:00:00
2006-10-13 00:00:00
2006-10-13 00:00:00
2006-10-13 00:00:00
2006-10-13 00:00:00
2006-10-13 00:00:00
2006-10-13 00:00:00
2006-10-13 00:00:00
2006-10-13 00:00:00
2006-10-14 00:00:00
2006-10-14 00:00:00
2006-10-14 00:00:00
2006-10-14 00:00:00
2006-10-15 00:00:00
2006-10-15 00:00:00
2006-10-16 00:00:00
2006-10-16 00:00:00
2006-10-17 00:00:00


2007-01-21 00:00:00
2007-01-21 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-22 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-23 00:00:00
2007-01-24 00:00:00
2007-01-24 00:00:00
2007-01-24 00:00:00
2007-01-24 00:00:00
2007-01-24 00:00:00
2007-01-24 00:00:00
2007-01-24 00:00:00


2007-02-21 00:00:00
2007-02-21 00:00:00
2007-02-21 00:00:00
2007-02-21 00:00:00
2007-02-21 00:00:00
2007-02-21 00:00:00
2007-02-21 00:00:00
2007-02-21 00:00:00
2007-02-21 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-22 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00
2007-02-23 00:00:00


2007-03-14 00:00:00
2007-03-14 00:00:00
2007-03-14 00:00:00
2007-03-14 00:00:00
2007-03-14 00:00:00
2007-03-14 00:00:00
2007-03-14 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-15 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00
2007-03-16 00:00:00


2007-04-10 00:00:00
2007-04-10 00:00:00
2007-04-10 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-11 00:00:00
2007-04-12 00:00:00
2007-04-12 00:00:00
2007-04-12 00:00:00
2007-04-12 00:00:00
2007-04-12 00:00:00
2007-04-13 00:00:00
2007-04-13 00:00:00
2007-04-13 00:00:00
2007-04-13 00:00:00
2007-04-13 00:00:00
2007-04-14 00:00:00
2007-04-14 00:00:00
2007-04-14 00:00:00
2007-04-15 00:00:00
2007-04-15 00:00:00
2007-04-15 00:00:00
2007-04-15 00:00:00
2007-04-15 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00
2007-04-16 00:00:00


2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-29 00:00:00
2007-05-30 00:00:00
2007-05-30 00:00:00
2007-05-30 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-05-31 00:00:00
2007-06-01 00:00:00
2007-06-01 00:00:00
2007-06-01 00:00:00
2007-06-01 00:00:00
2007-06-01 00:00:00
2007-06-01 00:00:00
2007-06-01 00:00:00
2007-06-01 00:00:00
2007-06-01 00:00:00
2007-06-01 00:00:00
2007-06-01 00:00:00
2007-06-01 00:00:00


2007-07-02 00:00:00
2007-07-02 00:00:00
2007-07-02 00:00:00
2007-07-02 00:00:00
2007-07-02 00:00:00
2007-07-02 00:00:00
2007-07-02 00:00:00
2007-07-02 00:00:00
2007-07-02 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-03 00:00:00
2007-07-04 00:00:00
2007-07-04 00:00:00
2007-07-04 00:00:00
2007-07-04 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-05 00:00:00
2007-07-06 00:00:00
2007-07-06 00:00:00
2007-07-06 00:00:00
2007-07-06 00:00:00
2007-07-06 00:00:00


2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-29 00:00:00
2007-07-30 00:00:00
2007-07-30 00:00:00
2007-07-30 00:00:00
2007-07-30 00:00:00
2007-07-30 00:00:00
2007-07-30 00:00:00
2007-07-30 00:00:00
2007-07-30 00:00:00
2007-07-30 00:00:00
2007-07-30 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00
2007-07-31 00:00:00


In [None]:
feature: trendiness

In [None]:
#functions

#return a list of categories for a specific business (takes either a single row or full dataframe witha businessid)
def get_categories_business(business_df, business_id = None):
    if business_id!=None:
        categories = business_df[business_df['business_id'] == business_id].iloc[0]['categories']
    else:
        categories = business_df['categories']
    listed_cats = categories.split(', ')
    return listed_cats

#finds the N most popular categories for a given year and month
def top_categories_date(reviews, year, month, N=15, stopwords = []):
    if type(reviews.index[0]) == pd._libs.tslib.Timestamp:
        review_dt = reviews
    else:
        review_dt = to_datetime(reviews)
    time_filter = ((AZ_reviews_datetimed.index.year == year) & (AZ_reviews_datetimed.index.month == month))
    time_reviews = review_dt[time_filter]
    flat_list = get_categories(time_reviews)
    counted_categories = Counter(flat_list)
    sorted_categories = list(OrderedDict(values.most_common()).keys())
    for word in stopwords:
        if(word in sorted_categories):
            sorted_categories.remove(word)
    return sorted_categories[:N]

#computes number of categories in commmon with a business and a given month
def trendiness_measure(review_df, year, month, business_id, N = 15):
    business_cats = get_categories_business(review_df, business_id = business_id)
    trendiness_cats = top_categories_date(review_df, year, month, N=N)
    return len(set(business_cats) & set(trendiness_cats))

In [None]:
#build monthly trendiness array
if rebuild_features == True:
    months = []
    for key in state_businesses_grouped.groups.keys():
        months.append(datetime.date(key[0],key[1], 1))

    top_categories = pd.DataFrame(index = months, columns = {"top_categories":''})
    for i, date in enumerate(months):
        print('{:2.4f}% complete\t'.format(i*100/len(months)) + str(date))
        top_categories.loc[date]['top_categories'] = ', '.join(top_categories_date(state_reviews_dt, date.year, date.month, N=100))
    top_categories.to_csv('b2data/' + state + 'monthly_trends.csv')
else   
    top_categories = pd.read_csv('b2data/' + state + 'monthly_trends.csv')

In [None]:
def trendiness_measure_apply(categories, timestamp):
    if type(categories)==float:
        return 0
    business_cats = categories.split(', ')
    date = datetime.date(timestamp.year, timestamp.month, 1)
    trendiness_cats = top_categories.loc[date]['top_categories'].split(', ')
    result = len(set(business_cats) & set(trendiness_cats))
    clear_output(wait=True)
    display(timestamp)
    return result

if rebuild_features == True:
    state_trendiness = state_reviews_dt.apply(lambda row: trendiness_measure_apply(row['categories'], row.name), axis=1)
    state_trendiness.to_csv('b2data/' + state + 'trendiness.csv')
else:
    state_trendiness = pd.read_csv('b2data/' + state + 'trendiness.csv')


Step 3: Modeling

In [None]:
#make a big dataframe:
state_reviews_dt['trendiness'] = state_trendiness
state_reviews_dt['monthly_growth'] = state_reviews_mg
state_reviews_dt['percent_reviews'] = state_reviews_percent
state_reviews_dt['star_growth'] = state_reviews_sg


In [None]:
#set target threshold
state_reviews_dt['high_growth'] = (state_reviews_dt['monthly_growth']>3)

In [None]:
#train/test split
from sklearn.model_selection import train_test_split
X = state_reviews_dt[['trendiness', 'percent_reviews', 'star_growth']]
Y = state_reviews_dt['high_growth']
Xtr, Xte, Ytr, Yte = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

print("Training Decision Tree...")
dtc = DecisionTreeClassifier()
dtc.fit(Xtr, Ytr)

print("Training Logistic Regression...")
lrc = LogisticRegression()
lrc.fit(Xtr, Ytr)

print("Training Bernolli Naive Bayes...")
nbc = BernoulliNB()
nbc.fit(Xtr, Ytr)

print("Decision Tree mean accuracy: {:2.4f})".format(dtc.score(Xte,Yte)))
print("Log-Reg mean accuracy: {:2.4f})".format(lrc.score(Xte,Yte)))
print("Naïve Bayes mean accuracy: {:2.4f})".format(nbc.score(Xte,Yte)))
print('Base accuracy: {:2.4f}'.format(1-Yte.sum()/Yte.count()))

In [None]:
from sklearn.metrics import roc_curve, auc

models = [dtc, lrc, nbc]
models_l = ['dtc', 'lrc', 'nbc']

for i, model in enumerate(models):
    predict = model.predict_proba(Xte)
    fpr, tpr, thres = roc_curve(Yte, predict[:,1])
    auc = skl.metrics.auc(fpr, tpr)

    plt.plot(fpr, tpr, label=(models_l[i] + ' AUC: {:2.4f}'.format(auc)))

plt.plot([[0,0],[1,1]], 'k--', label='', alpha=.75)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()