In [1]:
%matplotlib inline
import matplotlib
import seaborn as sns
sns.set()
matplotlib.rcParams['figure.dpi'] = 144

In [2]:
from static_grader import grader

In [3]:
import gzip
import ujson as json
import pprint

import pandas as pd
import numpy as np

# PS Miniproject


## Introduction


The objective of this miniproject is to apply probability and statistics to yelp data for businesses primarily in AZ and NV. We will study the data to find meaningful patterns in the ratings and data for these businesses.


## Metric


Your answers will be assessed based on how well you apply these standard statistical techniques.


## Download and parse the incoming data


The data are [here](s3://dataincubator-course/mldata/yelp_train_academic_dataset_business.json.gz).
Notice that each row of the file is a json blurb.  You can read it with Python.

In [4]:
!mkdir -p data/
!aws s3 cp s3://dataincubator-course/mldata/yelp_train_academic_dataset_business.json.gz data

download: s3://dataincubator-course/mldata/yelp_train_academic_dataset_business.json.gz to data/yelp_train_academic_dataset_business.json.gz


In [5]:
with gzip.open("data/yelp_train_academic_dataset_business.json.gz") as fin:
    data = [json.loads(line) for line in fin]
df = pd.DataFrame(data)
df.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,{u'By Appointment Only': True},vcNAWiLM4dR7D2nwwJ7nCA,"[Doctors, Health & Medical]",Phoenix,"4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...","{u'Thursday': {u'close': u'17:00', u'open': u'...",33.499313,-111.983758,"Eric Goldberg, MD",[],True,7,3.5,AZ,business
1,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",JwUE5GmEO-sH1FuwJgKBlQ,[Restaurants],De Forest,"6162 US Highway 51\nDe Forest, WI 53532",{},43.238893,-89.335844,Pine Cone Restaurant,[],True,26,4.0,WI,business
2,"{u'Take-out': True, u'Outdoor Seating': False,...",uGykseHzyS5xAMWoN6YUqA,"[American (Traditional), Restaurants]",De Forest,"505 W North St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'06...",43.252267,-89.353437,Deforest Family Restaurant,[],True,16,4.0,WI,business
3,"{u'Take-out': True, u'Accepts Credit Cards': T...",LRKJF43s9-3jG9Lgx4zODg,"[Food, Ice Cream & Frozen Yogurt, Fast Food, R...",De Forest,"4910 County Rd V\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'10...",43.251045,-89.374983,Culver's,[],True,7,4.5,WI,business
4,"{u'Take-out': True, u'Has TV': False, u'Outdoo...",RgDg-k9S5YD_BaxMckifkg,"[Chinese, Restaurants]",De Forest,"631 S Main St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'11...",43.240875,-89.343722,Chang Jiang Chinese Kitchen,[],True,3,4.0,WI,business


## City z-score

In [6]:
#def more_than5(df):
#    if df['business_id'].sum()>4:
#        return df
#more_than5(ah)

In [7]:
city_group = df.groupby('city')
len(city_group)

167

In [8]:
city_group.stars.count().head()

city
Ahwatukee           8
Anthem             55
Apache Junction    80
Arcadia             1
Atlanta             1
Name: stars, dtype: int64

In [9]:
city_groups = city_group.business_id.nunique()
city_groups.head()

city
Ahwatukee           8
Anthem             55
Apache Junction    80
Arcadia             1
Atlanta             1
Name: business_id, dtype: int64

In [10]:
busi_boolean = city_group.business_id.nunique() > 4
busi_boolean.head()

city
Ahwatukee           True
Anthem              True
Apache Junction     True
Arcadia            False
Atlanta            False
Name: business_id, dtype: bool

In [11]:
df_boolean = busi_boolean.to_frame()
df_boolean.head()

Unnamed: 0_level_0,business_id
city,Unnamed: 1_level_1
Ahwatukee,True
Anthem,True
Apache Junction,True
Arcadia,False
Atlanta,False


In [12]:
df_popular_cities = df.set_index('city').loc[busi_boolean]
len(df_popular_cities)

37797

In [13]:
df_popular_cities = df.set_index('city').loc[busi_boolean]
len(df_popular_cities)

37797

In [14]:
type(df_popular_cities)

pandas.core.frame.DataFrame

In [15]:
#def get_5busi_city(GroupByObject):
#    if GroupByObject.business_id.unique() > 4:
#        return GroupByObject
#    else:
#        pass

In [16]:
#city_group = df_popular_cities.groupby('city')

In [17]:
#city_5 = city_group.apply(get_5busi_city)

In [18]:
#city_5[city_5['business_id']<=5]

Do certain cities have better amenities than others?  We can test this by comparing the average score or rating of businesses in one city vs. the average across all businesses in the data set.  We will also want to use statistics to help determine if this difference is statistically significant.

Compute the z-score for average ratings for every city.

**Note**:
1. Only look at cities with at least five businesses, both for the city averages and when computing the average across all cities.
2. Estimate the population's standard deviation and mean.
3. Estimate each city's mean and use the population statistics to compute the z score.
4. Do not do any cleaning of the city names.  (You will get Las Vegas twice.  That's okay.)

In [19]:
total_std = df_popular_cities.stars.std(ddof=0)
total_std

0.88905436664907611

In [20]:
city_std_err = total_std / np.sqrt(city_group.stars.count())
city_std_err.head()

city
Ahwatukee          0.314328
Anthem             0.119880
Apache Junction    0.099399
Arcadia            0.889054
Atlanta            0.889054
Name: stars, dtype: float64

In [21]:
total_mean = df_popular_cities.stars.mean()
total_mean

3.6726062915046169

In [22]:
city_gro = df_popular_cities.groupby('city')
city_mean = city_gro['stars'].mean()
city_mean.head()

city
Ahwatukee          3.687500
Anthem             3.781818
Apache Junction    3.637500
Avondale           3.538627
Boulder City       4.136364
Name: stars, dtype: float64

In [23]:
city_z = (city_mean - total_mean) / city_std_err
city_z.head()

city
Ahwatukee          0.047383
Anthem             0.911010
Apache Junction   -0.353184
Arcadia                 NaN
Atlanta                 NaN
Name: stars, dtype: float64

In [24]:
df_city_z = pd.DataFrame(city_z).reset_index()
df_city_z.head()

Unnamed: 0,city,stars
0,Ahwatukee,0.047383
1,Anthem,0.91101
2,Apache Junction,-0.353184
3,Arcadia,
4,Atlanta,


In [25]:
def city_z_score():
    #return [('Ahwatukee', 0.047382042549430063)] * 70
    alist = []
    alist = [ (i[0],i[1])
          for i in df_city_z.itertuples(index=False)]
    return alist
grader.score('ps__city_z_score', city_z_score)

[(u'Ahwatukee', 0.047382669358208063), (u'Anthem', 0.91100959187573716), (u'Apache Junction', -0.35318449438866156), (u'Arcadia', nan), (u'Atlanta', nan), (u'Avondale', -2.3003217405756531), (u'Black Canyon City', nan), (u'Bonnyrigg', nan), (u'Boulder City', 2.4466611274394734), (u'Buckeye', -2.5035752877810342), (u'C Las Vegas', nan), (u'Cambridge', 1.3264522703445301), (u'Carefree', 1.4596358029225591), (u'Casa Grande', -1.6299874135344514), (u'Cave Creek', 3.0846624289022677), (u'Centennial Hills', nan), (u'Central City Village', nan), (u'Central Henderson', nan), (u'Chandler', -0.21683034896274497), (u'Chandler-Gilbert', nan), (u'City of Edinburgh', nan), (u'Clark County', nan), (u'Coolidge', -0.74796439569439954), (u'Cottage Grove', -1.5197633109207875), (u'Cramond', nan), (u'Dalkeith', nan), (u'Dane', nan), (u'De Forest', 0.21323228615356482), (u'DeForest', 1.0749414447548939), (u'Deforest', nan), (u'Eagan', nan), (u'Edinburgh', 7.08965791495209), (u'El Mirage', 0.168765293096689

**Question**: Which cities have the most statistically high ratings?  Do you notice a pattern?

## "Good for Kids" confidence interval

Which cities are the most child friendly?  Let's estimate the 2-sigma confidence interval for fraction of venues which are 'Good for Kids' in each city.

**Note**:
1. Ignore any businesses that don't have the 'Good for Kids' attribute specified. **IMPORTANT**: These are the businesses for which you have no information and they should be filtered out. All businesses that have the 'Good for Kids' attribute, whether that be True or False, should be included.
2. Only look at cities with at least five businesses (satisfying the above condition).
3. In this simplified schema, a venue is either 'Good for Kids' or not.  What kind of random variable does this correspond with?
4. Notice that some cities have an unrealistic confidence interval.  Can you apply the "Rule of Three" to generate more realistic confidence intervals in this case?

In [26]:
df.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,{u'By Appointment Only': True},vcNAWiLM4dR7D2nwwJ7nCA,"[Doctors, Health & Medical]",Phoenix,"4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...","{u'Thursday': {u'close': u'17:00', u'open': u'...",33.499313,-111.983758,"Eric Goldberg, MD",[],True,7,3.5,AZ,business
1,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",JwUE5GmEO-sH1FuwJgKBlQ,[Restaurants],De Forest,"6162 US Highway 51\nDe Forest, WI 53532",{},43.238893,-89.335844,Pine Cone Restaurant,[],True,26,4.0,WI,business
2,"{u'Take-out': True, u'Outdoor Seating': False,...",uGykseHzyS5xAMWoN6YUqA,"[American (Traditional), Restaurants]",De Forest,"505 W North St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'06...",43.252267,-89.353437,Deforest Family Restaurant,[],True,16,4.0,WI,business
3,"{u'Take-out': True, u'Accepts Credit Cards': T...",LRKJF43s9-3jG9Lgx4zODg,"[Food, Ice Cream & Frozen Yogurt, Fast Food, R...",De Forest,"4910 County Rd V\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'10...",43.251045,-89.374983,Culver's,[],True,7,4.5,WI,business
4,"{u'Take-out': True, u'Has TV': False, u'Outdoo...",RgDg-k9S5YD_BaxMckifkg,"[Chinese, Restaurants]",De Forest,"631 S Main St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'11...",43.240875,-89.343722,Chang Jiang Chinese Kitchen,[],True,3,4.0,WI,business


In [27]:
df['attributes'].head()

0                       {u'By Appointment Only': True}
1    {u'Take-out': True, u'Price Range': 1, u'Outdo...
2    {u'Take-out': True, u'Outdoor Seating': False,...
3    {u'Take-out': True, u'Accepts Credit Cards': T...
4    {u'Take-out': True, u'Has TV': False, u'Outdoo...
Name: attributes, dtype: object

In [28]:
def check_key(key_lis):
    if 'Good for Kids' in key_lis:
        return True
    else:
        return False

In [29]:
#df['attributes'][1]#{'Good For Kids'}
#val = [d.get('Good For Kids') for d in df['attributes']]
#val_fil = [x for x in val if x is not None]

In [30]:
key_ls = []
for i in range(len(df)):
    key_lis = []
    key_lis = list(df['attributes'][i].viewkeys())
    key_ls.append(check_key(key_lis))

In [31]:
df['attributes'].iloc[2]["Good for Kids"]#.items()#.get('Good For Kids')

True

In [32]:
key_ls[:10]

[False, True, True, False, False, False, False, False, False, True]

In [33]:
df['GFK'] = key_ls
df_gfk = df[df['GFK'] == True]
len(df_gfk)

15124

In [34]:
df_gfk.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type,GFK
1,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",JwUE5GmEO-sH1FuwJgKBlQ,[Restaurants],De Forest,"6162 US Highway 51\nDe Forest, WI 53532",{},43.238893,-89.335844,Pine Cone Restaurant,[],True,26,4.0,WI,business,True
2,"{u'Take-out': True, u'Outdoor Seating': False,...",uGykseHzyS5xAMWoN6YUqA,"[American (Traditional), Restaurants]",De Forest,"505 W North St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'06...",43.252267,-89.353437,Deforest Family Restaurant,[],True,16,4.0,WI,business,True
9,"{u'Alcohol': u'full_bar', u'Noise Level': u'lo...",_wZTYYL7cutanzAnJUTGMA,"[Bars, American (Traditional), Nightlife, Loun...",Mc Farland,"4506 Larson Beach Rd\nMc Farland, WI 53558",{},43.017701,-89.303789,Beach House Restaurant & Lounge,[],True,31,3.5,WI,business,True
11,{u'Good for Kids': True},1tkeiIa-daD8LbM6mHm_9A,"[Active Life, Bowling]",Mc Farland,"4711 Farwell St\nMc Farland, WI 53558","{u'Monday': {u'close': u'02:00', u'open': u'11...",43.013156,-89.302164,Spartan Bowl,[],True,3,1.5,WI,business,True
12,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",zOc8lbjViUZajbY7M0aUCQ,"[Pizza, Restaurants]",Mc Farland,"5813 Main St\nMc Farland, WI 53558","{u'Tuesday': {u'close': u'22:00', u'open': u'1...",43.014164,-89.288567,Spartan Pizza,[],True,4,3.5,WI,business,True


In [35]:
val = []
for i in range(len(df_gfk)):
    val.append(df_gfk['attributes'].iloc[i]["Good for Kids"])
len(val)

15124

In [36]:
val[:20]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 False]

In [37]:
df_gfk['GFK_val'] = val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
df_gfk.head(2)

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type,GFK,GFK_val
1,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",JwUE5GmEO-sH1FuwJgKBlQ,[Restaurants],De Forest,"6162 US Highway 51\nDe Forest, WI 53532",{},43.238893,-89.335844,Pine Cone Restaurant,[],True,26,4.0,WI,business,True,True
2,"{u'Take-out': True, u'Outdoor Seating': False,...",uGykseHzyS5xAMWoN6YUqA,"[American (Traditional), Restaurants]",De Forest,"505 W North St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'06...",43.252267,-89.353437,Deforest Family Restaurant,[],True,16,4.0,WI,business,True,True


In [39]:
city_group2 = df_gfk.groupby('city')
city_boolean = city_group2.business_id.nunique() > 4
city_boolean.head() #115 rows

city
Ahwatukee          False
Anthem              True
Apache Junction     True
Arcadia            False
Atlanta            False
Name: business_id, dtype: bool

In [40]:
df_gfk_5 = df_gfk.set_index('city').loc[city_boolean]

In [41]:
df_gfk_busi = df_gfk_5.groupby('city')

In [42]:
len(df_gfk_busi)

51

In [43]:
#df_city = df.groupby('city')
#df_city.get_group('Anthem')

In [44]:
df_ph = df_gfk_busi.get_group('Anthem')
df_ph['GFK'].sum()

21

In [45]:
df_ph.head(10)

Unnamed: 0_level_0,attributes,business_id,categories,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type,GFK,GFK_val
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Anthem,"{u'Take-out': True, u'Accepts Credit Cards': T...",v1ocqAm5soWTNPttLQhSZw,"[Chinese, Restaurants]","3434 W Anthem Way\nSte 118\nAnthem, AZ 85086",{},33.864429,-112.135305,Shanghai Club,[],True,50,3.5,AZ,business,True,True
Anthem,"{u'Take-out': True, u'Accepts Credit Cards': T...",wyOMi6P9E3C8Fk1IxlcLJA,"[Pizza, Restaurants]","42211 N 41st Dr\nAnthem, AZ 85086","{u'Monday': {u'close': u'21:00', u'open': u'10...",33.867395,-112.147858,Rays Pizza,[],True,19,3.5,AZ,business,True,True
Anthem,"{u'Take-out': True, u'Accepts Credit Cards': T...",Vh_xXB42CYfhEdelpDb1Ow,"[Pizza, Restaurants]","3632 W Anthem Way\nAnthem, AZ 85086","{u'Monday': {u'close': u'22:00', u'open': u'11...",33.865974,-112.136601,Pizza Hut,[],True,6,2.5,AZ,business,True,True
Anthem,"{u'Good For': {u'dessert': False, u'latenight'...",sowtJ0oFaa3a579A7Pmadw,[Restaurants],"3655 West Anthem Way\nAnthem, AZ 85086",{},33.864913,-112.138854,Pinata Nueva,[],False,3,2.5,AZ,business,True,True
Anthem,{u'Good for Kids': True},d523RJhT7ap6GblcVwCb7Q,"[Active Life, Fitness & Instruction]","41130 N Freedom Way\nAnthem, AZ 85086",{},33.85856,-112.13049,Anthem Community Center,[],True,4,4.5,AZ,business,True,True
Anthem,"{u'Take-out': True, u'Accepts Credit Cards': T...",koCfyexzjjF3pZrzijlL3g,"[Breakfast & Brunch, American (New), Restaurants]","4211 W Anthem Way\nAnthem, AZ 85086",{},33.871067,-112.150695,Denny's Anthem,[],True,5,3.5,AZ,business,True,True
Anthem,"{u'Delivery': True, u'Take-out': True, u'Accep...",mt7scTEjBOKKgWamSBcf_Q,"[Fast Food, Mexican, Tex-Mex, Restaurants]","3659 W. Anthem Way\nAnthem, AZ 85086","{u'Monday': {u'close': u'23:00', u'open': u'07...",33.86411,-112.1359,Taco Bell,[],True,5,2.0,AZ,business,True,True
Anthem,"{u'Alcohol': u'none', u'Noise Level': u'quiet'...",238vCjwa6kQCV7ukJI6d8A,"[Greek, Mediterranean, Restaurants]","3655 W Anthem Way\nAnthem, AZ 85086",{},33.864913,-112.138854,Mr Gyros II,[],True,23,4.0,AZ,business,True,True
Anthem,"{u'Noise Level': u'average', u'Takes Reservati...",i8DkHnUxJwPBd7e50FHUpw,"[Mexican, Restaurants]","4115 W Anthem Way\nAnthem, AZ 85086",{},33.870521,-112.148602,Del Taco,[],True,8,2.5,AZ,business,True,True
Anthem,"{u'Take-out': True, u'Accepts Credit Cards': T...",zwFpOl3BNBazrdXynuQb-A,"[Sandwiches, Restaurants]","4250 W Anthem Way\nAnthem, AZ 85086",{},33.864028,-112.127884,Charley's Grilled Subs,[],True,5,4.0,AZ,business,True,True


In [46]:
len(df_ph)

21

In [47]:
df_ph['GFK'].mean()

1.0

In [48]:
str(df_ph.index.values[0])

'Anthem'

In [49]:
list_t = []

In [50]:

def tup(df_gfk):
    if df_gfk['GFK'].sum() == len(df_gfk):
        return list_t.append((str(df_gfk.index.values[0]),(1.0 - 3.0/len(df_gfk),1)))
    else:
        p = df_gfk['GFK'].mean()
        s = np.sqrt(p*(1-p)/len(df_gfk))
        return list_t.append((str(df_gfk.index.values[0]),(p - 2.0*s,p + 2.0*s)))
    

In [51]:
df_gfk_busi.apply(tup)

In [52]:
len(list_t)

52

In [53]:
new_list = list_t[1:]

In [54]:
6.0/7 

0.8571428571428571

In [55]:
len(df_ph)

21

In [56]:
def good_for_kids_ci():
    #return [('Anthem', (0.85714285714285721, 1.0))] * 51
    return new_list

grader.score('ps__good_for_kids_ci', good_for_kids_ci)

Your score:  0.303921568627


**Question**: For which cities do you need to apply the Rule of Three?  Is there there something similar about these cities in the data?

## Category reviews confidence interval

Some categories may be more popular than others.  Compute the 3-sigma confidence interval for the average number of reviews for businesses in each category.

**Note**:
1. Category is actually a list of categories that apply to the business.  Let's just set a business's category to be the first one in this list.  Ignore businesses that do not have categories defined.
2. Only consider categories that have at least 40 businesses.
3. Assume the data for all the businesses was collected over the same time period. What probability distribution might we assume for the count of reviews during that time?  Instead of estimation the standard deviation using the standard deviation of the sample, use the properties of the assumed probability distribution to estimate the standard deviation.

In [57]:
df.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type,GFK
0,{u'By Appointment Only': True},vcNAWiLM4dR7D2nwwJ7nCA,"[Doctors, Health & Medical]",Phoenix,"4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...","{u'Thursday': {u'close': u'17:00', u'open': u'...",33.499313,-111.983758,"Eric Goldberg, MD",[],True,7,3.5,AZ,business,False
1,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",JwUE5GmEO-sH1FuwJgKBlQ,[Restaurants],De Forest,"6162 US Highway 51\nDe Forest, WI 53532",{},43.238893,-89.335844,Pine Cone Restaurant,[],True,26,4.0,WI,business,True
2,"{u'Take-out': True, u'Outdoor Seating': False,...",uGykseHzyS5xAMWoN6YUqA,"[American (Traditional), Restaurants]",De Forest,"505 W North St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'06...",43.252267,-89.353437,Deforest Family Restaurant,[],True,16,4.0,WI,business,True
3,"{u'Take-out': True, u'Accepts Credit Cards': T...",LRKJF43s9-3jG9Lgx4zODg,"[Food, Ice Cream & Frozen Yogurt, Fast Food, R...",De Forest,"4910 County Rd V\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'10...",43.251045,-89.374983,Culver's,[],True,7,4.5,WI,business,False
4,"{u'Take-out': True, u'Has TV': False, u'Outdoo...",RgDg-k9S5YD_BaxMckifkg,"[Chinese, Restaurants]",De Forest,"631 S Main St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'11...",43.240875,-89.343722,Chang Jiang Chinese Kitchen,[],True,3,4.0,WI,business,False


In [58]:
df['categories'].head()

0                          [Doctors, Health & Medical]
1                                        [Restaurants]
2                [American (Traditional), Restaurants]
3    [Food, Ice Cream & Frozen Yogurt, Fast Food, R...
4                               [Chinese, Restaurants]
Name: categories, dtype: object

In [59]:
df['category'] = df['categories'].apply(lambda x: np.nan if len(x) == 0 else x[0])

In [60]:
df.head(1)

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type,GFK,category
0,{u'By Appointment Only': True},vcNAWiLM4dR7D2nwwJ7nCA,"[Doctors, Health & Medical]",Phoenix,"4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...","{u'Thursday': {u'close': u'17:00', u'open': u'...",33.499313,-111.983758,"Eric Goldberg, MD",[],True,7,3.5,AZ,business,False,Doctors


In [61]:
df_400 = df.groupby('category').filter(lambda x: len(x) >= 40)

In [62]:
df_400.shape

(35068, 17)

In [63]:
grp_cat = df_400.groupby('category')
len(grp_cat)

119

In [64]:
def ci(df):
    lambd = df['review_count'].mean()
    cont = df['review_count'].count()
    std = np.sqrt(lambd/cont)
    c_interval = (lambd - 3*std,lambd + 3*std)
    return c_interval

In [65]:
city_review = grp_cat.apply(ci)
type(city_review)

pandas.core.series.Series

In [66]:
df_city_review = pd.DataFrame(city_review).reset_index()
df_city_review.head()

Unnamed: 0,category,0
0,Active Life,"(16.7885523932, 17.4317296753)"
1,Adult Entertainment,"(60.6685146178, 66.2629922315)"
2,American (New),"(67.1246006995, 69.6701361426)"
3,American (Traditional),"(49.8929126957, 51.6396415614)"
4,Arts & Crafts,"(8.12875285363, 9.49477655813)"


In [67]:
#category_reviews_ci_tuples = ...

In [68]:
def category_reviews_ci():
    #return [('Active Life', (16.788552393175845, 17.431729675326505))] * 119
    alist = []
    alist = [ (i[0],i[1])
    for i in df_city_review.itertuples(index=False)]
    return alist

grader.score('ps__category_reviews_ci', category_reviews_ci)

Your score:  1.0


**Questions**:
1. What categories of businesses tend to have fewer reviews?
2. What categories of businesses tend to have more reviews?

## Stars by popularity z-score

Are more popular venues more likely to be highly rated?  Given the large variation in reporting amongst categories, we know that popularity depends on the category.

Separate the venues into those that have (strictly) more reviews than the median for their category, and those that have the same number or fewer.  For each category, compute the average number of stars for both those businesses with more than and less than or equal to the median number of reviews.  Report the z score of the difference of those means.

**Note**:
1. Again, category of a business will be defined as the first category and you should ignore businesses that do not have categories.
2. Likewise, only consider categories with at least 40 businesses.

In [88]:
df.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type,GFK,category
0,{u'By Appointment Only': True},vcNAWiLM4dR7D2nwwJ7nCA,"[Doctors, Health & Medical]",Phoenix,"4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...","{u'Thursday': {u'close': u'17:00', u'open': u'...",33.499313,-111.983758,"Eric Goldberg, MD",[],True,7,3.5,AZ,business,False,Doctors
1,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",JwUE5GmEO-sH1FuwJgKBlQ,[Restaurants],De Forest,"6162 US Highway 51\nDe Forest, WI 53532",{},43.238893,-89.335844,Pine Cone Restaurant,[],True,26,4.0,WI,business,True,Restaurants
2,"{u'Take-out': True, u'Outdoor Seating': False,...",uGykseHzyS5xAMWoN6YUqA,"[American (Traditional), Restaurants]",De Forest,"505 W North St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'06...",43.252267,-89.353437,Deforest Family Restaurant,[],True,16,4.0,WI,business,True,American (Traditional)
3,"{u'Take-out': True, u'Accepts Credit Cards': T...",LRKJF43s9-3jG9Lgx4zODg,"[Food, Ice Cream & Frozen Yogurt, Fast Food, R...",De Forest,"4910 County Rd V\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'10...",43.251045,-89.374983,Culver's,[],True,7,4.5,WI,business,False,Food
4,"{u'Take-out': True, u'Has TV': False, u'Outdoo...",RgDg-k9S5YD_BaxMckifkg,"[Chinese, Restaurants]",De Forest,"631 S Main St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'11...",43.240875,-89.343722,Chang Jiang Chinese Kitchen,[],True,3,4.0,WI,business,False,Chinese


In [116]:
df_fi = df.loc[:,['category','review_count','stars']]

In [119]:
df_sorted = df_fi.sort_values(['category','review_count','stars'])

In [98]:
grp_cat = df.groupby('category')['type'].count()
grp_df = grp_cat.to_frame()
grp_df = grp_df[grp_df['type'] >= 40]
grp_df = grp_df.reset_index()

In [99]:
median_cat = df.groupby('category').median()
median_cat.rename(columns={'review_count':'median'}, inplace = True)
median_cat.dtypes
#median_cat[['median','stars','length']] = median_cat[['median','stars','length']].astype(int)

latitude     float64
longitude    float64
open         float64
median       float64
stars        float64
GFK          float64
dtype: object

In [113]:
median_cat = median_cat.reset_index()
median_cat = median_cat[['category','median']]
median_cat = pd.merge(grp_df,median_cat,on ='category',how='left')
median_cat = median_cat[['category','median']]
median_cat.head()

Unnamed: 0,category,median
0,Active Life,8.0
1,Adult Entertainment,13.0
2,American (New),18.0
3,American (Traditional),20.0
4,Arts & Crafts,6.0


In [81]:
sorted_category[['category','review_count','stars']]

NameError: name 'sorted_category' is not defined

In [120]:
df_sorted

Unnamed: 0,category,review_count,stars
13289,Accessories,9,3.0
5821,Accountants,3,2.5
34951,Accountants,3,2.5
7548,Accountants,3,3.5
22733,Accountants,3,3.5
24909,Accountants,3,5.0
26925,Accountants,3,5.0
27611,Accountants,3,5.0
31005,Accountants,3,5.0
234,Accountants,4,2.0


In [121]:
df_sorted = pd.merge(median_cat,df_sorted,on = 'category', how ='left')
df_sorted['above_median'] = df_sorted['review_count'] > df_sorted['median']
df_sorted.head()

Unnamed: 0,category,median,review_count,stars,above_median
0,Active Life,8.0,3,1.0,False
1,Active Life,8.0,3,1.0,False
2,Active Life,8.0,3,1.5,False
3,Active Life,8.0,3,1.5,False
4,Active Life,8.0,3,1.5,False


In [122]:
sorted_cat = df_sorted.groupby('category').count()
sorted_cat = sorted_cat.reset_index()
sorted_cat = sorted_cat[['category','review_count']]
sorted_cat.columns = ['category','count']
sorted_cat.head()

Unnamed: 0,category,count
0,Active Life,1489
1,Adult Entertainment,73
2,American (New),380
3,American (Traditional),599
4,Arts & Crafts,170


In [124]:
above_median = df_sorted[df_sorted['above_median'] == True]
above_median[['category','review_count','median','stars']]
above_median_n = above_median.groupby('category')['review_count'].count()
grp_above = above_median.groupby('category')['stars'].mean()
grp_above.head()

category
Active Life               3.965706
Adult Entertainment       3.388889
American (New)            3.668478
American (Traditional)    3.314189
Arts & Crafts             3.595238
Name: stars, dtype: float64

In [131]:
var_1 = above_median.groupby('category')['stars'].var(ddof=1)
var_1.head()

category
Active Life               0.500196
Adult Entertainment       0.315873
American (New)            0.345777
American (Traditional)    0.334849
Arts & Crafts             0.737808
Name: stars, dtype: float64

In [135]:
n1 = above_median.groupby('category')['stars'].count()
n1.head()

category
Active Life               729
Adult Entertainment        36
American (New)            184
American (Traditional)    296
Arts & Crafts              84
Name: stars, dtype: int64

In [143]:
s1 = var_1/n1
s1.head()

category
Active Life               0.000686
Adult Entertainment       0.008774
American (New)            0.001879
American (Traditional)    0.001131
Arts & Crafts             0.008783
Name: stars, dtype: float64

In [125]:
below_median = df_sorted[df_sorted['above_median'] == False]
below_median[['category','review_count','median','stars']]
below_median_n = below_median.groupby('category')['review_count'].count()
grp_below = below_median.groupby('category')['stars'].mean()
grp_below.head()

category
Active Life               4.163158
Adult Entertainment       3.270270
American (New)            3.349490
American (Traditional)    3.181518
Arts & Crafts             3.947674
Name: stars, dtype: float64

In [132]:
var_2 = below_median.groupby('category')['stars'].var(ddof=1)
var_2.head()

category
Active Life               0.718404
Adult Entertainment       0.549925
American (New)            0.629795
American (Traditional)    0.548068
Arts & Crafts             0.794289
Name: stars, dtype: float64

In [136]:
n2 = below_median.groupby('category')['stars'].count()
n2.head()

category
Active Life               760
Adult Entertainment        37
American (New)            196
American (Traditional)    303
Arts & Crafts              86
Name: stars, dtype: int64

In [144]:
s2 = var_2/n2
s2.head()

category
Active Life               0.000945
Adult Entertainment       0.014863
American (New)            0.003213
American (Traditional)    0.001809
Arts & Crafts             0.009236
Name: stars, dtype: float64

In [137]:
#df_sorted.groupby('category').count()

In [140]:
sample_mean = grp_above - grp_below
sample_mean.head()

category
Active Life              -0.197451
Adult Entertainment       0.118619
American (New)            0.318988
American (Traditional)    0.132671
Arts & Crafts            -0.352436
Name: stars, dtype: float64

In [146]:
z_score = sample_mean/((s1+s2)**0.5)
z_score.head()

category
Active Life              -4.888538
Adult Entertainment       0.771535
American (New)            4.470037
American (Traditional)    2.446801
Arts & Crafts            -2.625495
Name: stars, dtype: float64

In [None]:

#popularity_stars_z_score_tuples = ...

In [170]:
df_z = z_score.to_frame()

In [171]:
def popularity_stars_z_score():
    #return [('Active Life', -4.8885384947587749)] * 119
    alist = []
    alist = [ (str(i[0]),i[1])
    for i in df_z.itertuples(index=True)]
    return alist
grader.score('ps__popularity_stars_z_score', popularity_stars_z_score)

Your score:  1.0


**Questions**:
1. What categories benefit from having more reviewers?
2. What categories are hurt by having more reviewers?
3. Why did we choose to separate each category by the median number of reviews rather than the mean number of reviews?
4. What types of categories have the most disagreement?  Use the variance as a proxy.  How would you compute the 2-sigma confidence interval?

*Copyright &copy; 2017 The Data Incubator.  All rights reserved.*