In [25]:
import sys, math, os
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt    
import numpy as np
import statsmodels.formula.api as sm
import scipy.stats
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import operator

sys.path.insert(0, '../../src/data/')
import utils

%matplotlib inline
%load_ext autoreload  
%autoreload 2  

path_raw = "../../data/raw/beer_reviews"
random = 42

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


A quick way of getting the notebook name (see [here](https://stackoverflow.com/a/23619544/1153897) for the source); after the next cell runs, the name is available in the Python variable `notebook`.

In [2]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "notebook = " + "'"+thename+"'";
kernel.execute(command);

<IPython.core.display.Javascript object>

# Introduction

The goal of this notebook is to try to answer the question:
> If I enjoy a beer's aroma and appearance, which beer style should I try?


In [3]:
# LOAD DATA
# this file only available if analysis 3.0 is run
# (checked first so that we fail before the expensive read of the raw data)
assert os.path.isfile('../../data/interim/trolls.csv'), "The trolls file doesn't exist yet, please run through analysis 3.0 first!"
trolls = pd.read_csv('../../data/interim/trolls.csv')

# we assume the file we're after is a
# single .csv in path_raw
csv_paths = [os.path.join(path_raw, name) for name in os.listdir(path_raw)]
csv_paths = [p for p in csv_paths if os.path.isfile(p) and '.csv' in p]
# without this check a missing file would only surface later as a NameError on dat_raw
assert csv_paths, "No .csv file found in %s" % path_raw
# if several files match, the last one listed wins; only that one is read
# NOTE: force utf-8 encoding because some beer_styles have accents in them
dat_raw = pd.read_csv(csv_paths[-1], encoding='utf-8')

# Series.isin() needs the troll *names*: handing it the DataFrame itself makes it
# iterate over the column labels, so no reviewer would ever be filtered out.
# NOTE(review): assumes trolls.csv stores the names in a 'review_profilename'
# column; if that column is absent every cell of the file is treated as a
# candidate name - confirm against the output of analysis 3.0.
if 'review_profilename' in trolls.columns:
    troll_names = trolls['review_profilename']
else:
    troll_names = trolls.values.ravel()

# create new copy of data
# 1. without reviews with missing profilename
# 2. without trolls
has_profilename = dat_raw.review_profilename.notnull()
is_troll = dat_raw.review_profilename.isin(troll_names)
dat = dat_raw[has_profilename & ~is_troll].copy()

<hr>

In [28]:
# for each beer style, get some descriptive stats for the factors of interest
factors = ['review_aroma', 'review_appearance']
label = ['beer_style']
factor_stats = dat[factors + label].groupby('beer_style').agg(['mean','median','std','count'])

# naively sort by mean score and grab top10
# (the columns are a MultiIndex after .agg(), hence the (factor, statistic) tuple keys)
top10_by_aroma = factor_stats.sort_values(('review_aroma','mean'), ascending=False).head(10)
top10_by_appearance = factor_stats.sort_values(('review_appearance','mean'), ascending=False).head(10)

# show just overlap
# NOTE: use Index.intersection() explicitly; `index & index` as a set operation
# is deprecated in pandas and later versions treat `&` as an element-wise AND.
top10_overlap = top10_by_appearance.index.intersection(top10_by_aroma.index)
# single parenthesised argument, so this prints the same on Python 2 and Python 3
print('Overlap between top10 lists for both aroma and appearance: \n- %s' % '\n- '.join(top10_overlap))

Overlap between top10 lists for both aroma and appearance: 
- American Double / Imperial Stout
- Quadrupel (Quad)
- American Double / Imperial IPA
- Russian Imperial Stout


In [None]:
# Small hyper-parameter grid for a random forest that predicts beer_style from
# the review factors.
# 'sqrt' replaces 'auto': for RandomForestClassifier the two mean the same
# thing, but 'auto' is deprecated and removed in newer scikit-learn releases.
# NOTE(review): with only two factors both max_features candidates appear to
# resolve to a single feature per split, so this axis of the grid is likely
# redundant - confirm and drop or extend it.
param = {'n_estimators':[10], 'max_features':['sqrt',1/3.0], 'max_depth':[None, 10]}
mdl = RandomForestClassifier(n_jobs=-1, random_state=random, oob_score=True)

# NOTE(review): n_jobs=-1 is requested on both the forest and the search
clf = GridSearchCV(mdl, param, n_jobs=-1)
# np.ravel flattens the single-column label frame into the 1-D target sklearn expects
clf.fit(dat[factors], np.ravel(dat[label]))
# formatted string gives the same "params score" output on Python 2 and Python 3
print('%s %s' % (clf.best_params_, clf.best_score_))