In [1]:
import sys, math, os
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt    
import numpy as np
from collections import defaultdict
from collections import Counter
import itertools

sys.path.insert(0, '../../src/data/')
import utils

%matplotlib inline
%load_ext autoreload  
%autoreload 2  

path_raw = "../../data/raw/beer_reviews"

Quick way of getting the notebook name, see [here](https://stackoverflow.com/a/23619544/1153897) for source; notebook name will be in python variable `notebook`.

In [2]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "notebook = " + "'"+thename+"'";
kernel.execute(command);

<IPython.core.display.Javascript object>

# Introduction

The goal of this notebook is to try and answer the question
> If you had to pick 3 beers to recommend using only this data, which would you pick?

# Initial thoughts

This question is quite open ended, best beer might be interpreted in the following ways:
- best overall beers in entire dataset
- best beers per `beer_type`
- best can be rated by any or a combination of the beer attributes: `review_overall`, `review_aroma`, `review_appearance`, `review_tast`, `review_palate`, `beer_abv`

**NOTE**: analysis [1.0_initial_look](1.0_initial_look.ipynb) identified various attributes with missing data that may impact this analysis:
- `beer_abv` not available for all beers
- `review_profilename` not available for each review -> it's uncertain what happend with this data - consider excluding?

In [3]:
# LOAD DATA
# we assume the file we're after is a
# single .csv in path_raw
for file in os.listdir(path_raw):
    file = os.path.join(path_raw, file)
    if os.path.isfile(file) and '.csv' in file: 
        dat_raw = pd.read_csv(file, encoding='utf-8') # NOTE: force utf-8 encoding because some beer_styles have accents in them
        
dat_raw.head()

# DF of beer metadata
beer_meta_cols = ['brewery_id', 'brewery_name','beer_style', 'beer_name','beer_abv']
beer_dat = dat_raw.groupby('beer_beerid').agg('first')[beer_meta_cols]

<hr>

In [4]:
print 'Num reviews without a profilename: ', dat_raw[dat_raw.review_profilename.isnull()].shape[0]
print 'Num reviews with a profilename: ', dat_raw[dat_raw.review_profilename.notnull()].shape[0]

Num reviews without a profilename:  348
Num reviews with a profilename:  1586266


<mark>Given that those reviews without a profilename represent a small fraction of the total reviews, we will exclude these reviews from the analysis.</mark>

In [5]:
# create new copy of data without reviews with missing profilename
dat = dat_raw[dat_raw.review_profilename.notnull()].copy()

For now, we will only consider the `review_overall` score in order to gauge a beer; we do this because it is assumed this metric is some sort of aggregating metric over all the other beer attribute scores and thus gives a good, generalized metric for "goodness".

In [6]:
# naively check average review_overall score for each beer
score_cols = ['review_overall','review_aroma','review_appearance','review_taste','beer_abv']
avg_scores = dat.groupby('beer_beerid').mean()[score_cols]
avg_scores.columns = ['avg ' + l for l in score_cols]
avg_scores.head()

counts = dat.groupby('beer_beerid').count()[['brewery_id']]
counts.columns = ['number of reviews']

perfect = dat[dat.review_overall==5].groupby('beer_beerid').count()[['brewery_id']]
perfect.columns = ['counts with top score']

avg_scores = avg_scores.merge(counts, left_index=True, right_index=True)
avg_scores = avg_scores.merge(perfect, left_index=True, right_index=True)

avg_scores.sort_values('avg review_overall', ascending=False).head(10)

Unnamed: 0_level_0,avg review_overall,avg review_aroma,avg review_appearance,avg review_taste,avg beer_abv,number of reviews,counts with top score
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
77303,5.0,4.5,4.5,4.5,5.8,1,1
63470,5.0,3.5,3.0,4.5,,1,1
63829,5.0,3.5,4.0,3.5,8.5,1,1
63746,5.0,3.5,4.0,4.0,,1,1
63735,5.0,4.0,4.0,4.5,5.4,1,1
42392,5.0,4.75,4.0,4.75,5.2,2,2
16043,5.0,3.5,4.0,5.0,7.0,1,1
42501,5.0,4.0,3.5,4.5,,1,1
63692,5.0,4.0,4.5,4.0,,1,1
63673,5.0,4.5,4.0,4.5,7.5,1,1


This method won't work because beers aren't equally reviewed.

In [7]:
# check whether any reviewers are trolls (e.g. always scoring the same thing)
reviewer_scores = dat.groupby('review_profilename').agg(['mean','median','count','std'])[['review_overall']]
reviewer_scores.columns = ['mean review_overall','median review_overall', 'count reviews', 'std review_overall']

min_reviews_troll = 3 # a reviewer must have had more than this many reviews in order to be considered a troll
min_std_troll = 0 # reviewers with a std(review_overall) greater than this value are considered NOT trolls
trolls = reviewer_scores[(reviewer_scores['count reviews'] > min_reviews_troll) & (reviewer_scores['std review_overall']<=min_std_troll)]

trolls.to_csv('../../data/interim/trolls.csv') # write to file for later use

print '%s trolls found' %trolls.shape[0]

50 trolls found


In [8]:
# get the top n beers for each reviewer 
# then find intersection of all of those
best_beers_dict = defaultdict(list) # {profile_name: [list of beers rated 5]}
n = 3 # number of top beers to return per reviewer
min_num_reviews = 1 # minimum number of reviews a reviewer must have done in order to consider their opinion

for name, df in dat.groupby('review_profilename'):
    if df.shape[0] > min_num_reviews and name not in trolls.index:
        best_beers_dict[name] = utils.get_highest_rated_beers(df, n=n)

In [9]:
# calculate the number of times each beer was identified as a best beer
all_best_beers = list(itertools.chain.from_iterable(best_beers_dict.values()))
best_beers_counts = Counter(all_best_beers)

best_beers = pd.DataFrame(best_beers_counts, index=[0]).transpose()
best_beers.columns = ['counts as best beer']
best_beers.index.name = 'beer_beerid'

# add column for average overall score
best_beers = best_beers.merge(avg_scores, 
                              left_index=True, 
                              right_index=True)

# normalize a few of the scores to reviews
best_beers['norm counts as best beer'] = best_beers['counts as best beer'] / best_beers['number of reviews']
best_beers['weight counts as best beer'] = best_beers['counts as best beer'] * best_beers['number of reviews'] / dat.shape[0]
best_beers['norm counts with top score'] = best_beers['counts with top score'] / best_beers['number of reviews']


#best_beers.sort_values('counts as best beer', ascending=False).head(50).sort_values('counts with top score', ascending=False)

In [10]:
top50best_beers.sort_values('counts as best beer', ascending=False).head(50)

NameError: name 'top50best_beers' is not defined

In [None]:
best_n = 20
x = best_beers.index.tolist()[:best_n]
tmp = dat[dat.beer_beerid.isin(x)]

plt.figure(figsize=(17,4))
ax = sns.violinplot(x="beer_beerid", y="review_overall", data=tmp, order=x, cut=0)
plt.suptitle('Distribution of "review_overall" for the highest rated beers')