In [3]:
import sys, math, os
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt    
import numpy as np
import statsmodels.formula.api as sm
import scipy.stats
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

sys.path.insert(0, '../../src/data/')
import utils

%matplotlib inline
%load_ext autoreload  
%autoreload 2  

path_raw = "../../data/raw/beer_reviews"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Quick way of getting the notebook name, see [here](https://stackoverflow.com/a/23619544/1153897) for source; notebook name will be in python variable `notebook`.

In [4]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "notebook = " + "'"+thename+"'";
kernel.execute(command);

<IPython.core.display.Javascript object>

# Introduction

The goal of this notebook is to try and answer the question
> Which of the factors (aroma, taste, appearance, palette) are most important in determining the overall quality of a beer?

Another way of wording the questions is:
> Which of the beer score factors (aroma, taste, appearance, palette) explain the most variance in the attribute `review_overall`.

**NOTE:** we learned several things in analysis [3.0_recommend_3_beers](3.0_recommend_3_beers.ipynb) that impact this analysis:
- we can remove the reviews that had no associated `review_profilename`
- there were numerous "troll" reviewers which should be removed from the dataset (trolls written to *../../data/interim/trolls.csv*)

In [5]:
# LOAD DATA
# we assume the file we're after is a
# single .csv in path_raw
for file in os.listdir(path_raw):
    file = os.path.join(path_raw, file)
    if os.path.isfile(file) and '.csv' in file: 
        dat_raw = pd.read_csv(file, encoding='utf-8') # NOTE: force utf-8 encoding because some beer_styles have accents in them
        
# this file only available if analysis 3.0 is run
trolls = pd.read_csv('../../data/interim/trolls.csv')

# create new copy of data
# 1. without reviews with missing profilename
# 2. without trolls
dat = dat_raw[(dat_raw.review_profilename.notnull()) & (~dat_raw.review_profilename.isin(trolls))].copy()

<hr>

In [6]:
factors = ['review_aroma','review_appearance','review_palate','review_taste']
all_scores_factors = factors + ['review_overall']
melt = pd.melt(dat, id_vars=['beer_beerid',
                      'review_overall'], value_vars=factors, var_name='factor', value_name='score')


for factor,df in melt.groupby('factor'):
    x = df.review_overall.tolist()
    y = df.score.tolist()
    
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)
    print 'factor: %s, r^2: %.2f, p: %.4f, std err: %.4f' %(factor, r_value**2, p_value, std_err)

factor: review_appearance, r^2: 0.25, p: 0.0000, std err: 0.0006
factor: review_aroma, r^2: 0.38, p: 0.0000, std err: 0.0006
factor: review_palate, r^2: 0.49, p: 0.0000, std err: 0.0005
factor: review_taste, r^2: 0.62, p: 0.0000, std err: 0.0005


The factor `review_taste` is most highly correlated with `review_overall` having an r^2 of 0.62.

In [7]:
y = 'review_overall'
x = "+".join(factors)
formula = '%s ~ %s' % (y, x)
reg_results = sm.ols(formula, data=dat[factors+[y]]).fit().summary()
print(reg_results)

                            OLS Regression Results                            
Dep. Variable:         review_overall   R-squared:                       0.658
Model:                            OLS   Adj. R-squared:                  0.658
Method:                 Least Squares   F-statistic:                 7.633e+05
Date:                Mon, 19 Jun 2017   Prob (F-statistic):               0.00
Time:                        19:35:09   Log-Likelihood:            -8.7980e+05
No. Observations:             1586266   AIC:                         1.760e+06
Df Residuals:                 1586261   BIC:                         1.760e+06
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------
Intercept             0.4395      0.00

The multivariate model including all the beer factor scores has an r^2 value of 0.658, a value which isn't much higher than the model based on just using the `review_taste` as the only explanatory variable. This tells us that the 'review_taste' factor is the most important factor in predicting `review_overall`.

Let's try and train some different models to see if any other factors serve as good predictors.

In [8]:
# split our data into three sets
# training (60%)
# cv (20%)
# test (20%)
random = 42
dat_train, dat_cv, dat_test = utils.split_data(dat[all_scores_factors], random=random)




In [11]:
mdl = linear_model.LinearRegression(n_jobs=-1)
mdl.fit(dat_train[factors], dat_train['review_overall'])

print 'r^2 on test: %0.3f' %mdl.score(dat_test[factors], dat_test['review_overall'])

r^2 on test: 0.657


Values agree well with previous findings

In [13]:
param = {'n_estimators':[10,100]}
mdl = RandomForestRegressor(n_jobs=-1, random_state=42)

clf = GridSearchCV(mdl, param)
clf.fit(dat_train[factors], dat_train['review_overall'])
clf.best_params_
# print 'r^2 on test: %0.3f' %mdl.score(dat_test[factors], dat_test['review_overall'])
# mdl.feature_importances_

{'n_estimators': 100}