In [73]:
import numpy as np
import pandas as pd
import datetime
from dateutil.rrule import rrule, MONTHLY
import matplotlib.pyplot as plt
from collections import Counter, OrderedDict
from IPython.display import clear_output, display

In [2]:
#load ye olde data
AZ_reviews = pd.read_csv('AZ_review.csv')

In [3]:
#assign datetime
AZ_reviews_datetimed = AZ_reviews.assign(datetime = pd.to_datetime(AZ_reviews['date'])).drop(labels=['date'], axis = 'columns').set_index('datetime').sort_index()

In [4]:
#sort by month
AZ_reviews_groupby_date = AZ_reviews_datetimed.groupby([AZ_reviews_datetimed.index.year.rename('year'), AZ_reviews_datetimed.index.month.rename('month')])

In [5]:
#filter out only results with a given year and month
year = 2007
month = 8
time_filter = ((AZ_reviews_datetimed.index.year == year) & (AZ_reviews_datetimed.index.month == month))

aug2016 = AZ_reviews_datetimed[time_filter]

#split the strings in this new dataframe
aug2016_list = aug2016['categories'].str.split(', ').dropna()

In [6]:
#flatten categories into one looooong list
flat_list = [item for sublist in aug2016_list.values for item in sublist]

In [7]:
#count elements in the list and take the 15 most popular elements
values = Counter(flat_list)
sorted_values = list(OrderedDict(values.most_common()).keys())
sorted_values[:15]

['Restaurants',
 'Nightlife',
 'Bars',
 'Food',
 'American (Traditional)',
 'Sandwiches',
 'American (New)',
 'Shopping',
 'Mexican',
 'Pizza',
 'Arts & Entertainment',
 'Breakfast & Brunch',
 'Event Planning & Services',
 'Burgers',
 'Italian']

In [8]:
#Testing removel of stopwords
val = ['Restraunts','Nightlife']
test_sorted_values = sorted_values
for v in val:
    if(v in test_sorted_values):
        test_sorted_values.remove(v)
print(test_sorted_values[:15])

['Restaurants', 'Bars', 'Food', 'American (Traditional)', 'Sandwiches', 'American (New)', 'Shopping', 'Mexican', 'Pizza', 'Arts & Entertainment', 'Breakfast & Brunch', 'Event Planning & Services', 'Burgers', 'Italian', 'Lounges']


In [24]:
#defining useful functions that we we're going to use

#convert a dataframe with 'date' element to be indexed along datetime objects
def to_datetime(review_df):
    return review_df.assign(datetime = pd.to_datetime(AZ_reviews['date'])).drop(labels=['date'], axis = 'columns').set_index('datetime').sort_index()

#return a list of categories for some dataset (all rows)
def get_categories(review_df):
    listed_cats = review_df['categories'].str.split(', ').dropna()
    flat_list = [item for sublist in listed_cats.values for item in sublist]
    return flat_list

#return a list of categories for a specific business (takes either a single row or full dataframe witha businessid)
def get_categories_business(business_df, business_id = None):
    if business_id!=None:
        categories = business_df[business_df['business_id'] == business_id].iloc[0]['categories']
    else:
        categories = business_df['categories']
    listed_cats = categories.split(', ')
    return listed_cats

#finds the N most popular categories for a given year and month
def top_categories_date(reviews, year, month, N=15, stopwords = []):
    if type(reviews.index[0]) == pd._libs.tslib.Timestamp:
        review_dt = reviews
    else:
        review_dt = to_datetime(reviews)
    time_filter = ((AZ_reviews_datetimed.index.year == year) & (AZ_reviews_datetimed.index.month == month))
    time_reviews = review_dt[time_filter]
    flat_list = get_categories(time_reviews)
    counted_categories = Counter(flat_list)
    sorted_categories = list(OrderedDict(values.most_common()).keys())
    for word in stopwords:
        if(word in sorted_categories):
            sorted_categories.remove(word)
    return sorted_categories[:N]

#computes number of categories in commmon with a business and a given month
def trendiness_measure(review_df, year, month, business_id, N = 15):
    business_cats = get_categories_business(review_df, business_id = business_id)
    trendiness_cats = top_categories_date(review_df, year, month, N=N)
    return len(set(business_cats) & set(trendiness_cats))



In [10]:
%timeit top_categories_date(AZ_reviews_datetimed, 2014, 6)

187 ms ± 1.06 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
months = []
for key in AZ_reviews_groupby_date.groups.keys():
    months.append(datetime.date(key[0],key[1], 1))

In [12]:
#compute the top 100 categories for each month
top_categories = pd.DataFrame(index = months, columns = {"top_categories":''})
for i, date in enumerate(months):
    print('{:2.4f}% complete\t'.format(i*100/len(months)) + str(date))
    top_categories.loc[date]['top_categories'] = ', '.join(top_categories_date(AZ_reviews_datetimed, date.year, date.month, N=100))
path = 'trends.csv'
top_categories.to_csv(path)

0.0000% complete	2004-12-01
0.6135% complete	2005-02-01
1.2270% complete	2005-03-01
1.8405% complete	2005-04-01
2.4540% complete	2005-05-01
3.0675% complete	2005-06-01
3.6810% complete	2005-07-01
4.2945% complete	2005-08-01
4.9080% complete	2005-09-01
5.5215% complete	2005-10-01
6.1350% complete	2005-11-01
6.7485% complete	2005-12-01
7.3620% complete	2006-01-01
7.9755% complete	2006-02-01
8.5890% complete	2006-03-01
9.2025% complete	2006-04-01
9.8160% complete	2006-05-01
10.4294% complete	2006-06-01
11.0429% complete	2006-07-01
11.6564% complete	2006-08-01
12.2699% complete	2006-09-01
12.8834% complete	2006-10-01
13.4969% complete	2006-11-01
14.1104% complete	2006-12-01
14.7239% complete	2007-01-01
15.3374% complete	2007-02-01
15.9509% complete	2007-03-01
16.5644% complete	2007-04-01
17.1779% complete	2007-05-01
17.7914% complete	2007-06-01
18.4049% complete	2007-07-01
19.0184% complete	2007-08-01
19.6319% complete	2007-09-01
20.2454% complete	2007-10-01
20.8589% complete	2007-11-01
21

In [25]:
bid = 'tQAGMU_RNemsAs13HquhjQ'
bcats = get_categories_business(AZ_reviews, business_id = bid)
tcats = top_categories_date(AZ_reviews_datetimed, 2015, 6, N=100)

len(set(tcats)&set(bcats))

1

In [76]:
#restructured trendiness measure to work nicely in a loop using the top_categories dataframe
def trendiness_measure_apply(categories, timestamp):
    if type(categories)==float:
        return 0
    business_cats = categories.split(', ')
    date = datetime.date(timestamp.year, timestamp.month, 1)
    trendiness_cats = top_categories.loc[date]['top_categories'].split(', ')
    result = len(set(business_cats) & set(trendiness_cats))
    clear_output(wait=True)
    display(timestamp)
    return result

In [52]:
time = AZ_reviews_datetimed.iloc[0].name
date = datetime.date(time.year, time.month,1)
print(date)
print(top_categories.index[0])
print(date == top_categories.index[0])
print(top_categories.loc[date])

2004-12-01
2004-12-01
True
top_categories    Restaurants, Nightlife, Bars, Food, American (...
Name: 2004-12-01, dtype: object


In [None]:
#add trendiness measure to each review given business id and time
AZ_trendiness = AZ_reviews_datetimed.apply(lambda row: trendiness_measure_apply(row['categories'], row.name), axis=1)

In [None]:
#save results
AZ_reviews_datetimed['trendiness'] = AZ_trendiness
AZ_reviews_datetimed['trendiness'].describe()
path_final = 'AZ_review_with_trendiness.csv'
AZ_reviews_datetimed.to_csv(path_final)