## model-describer ErrorViz and SensitivityViz Analysis on Wine Quality Data

Goals of tutorial include:
* [Importing wine quality dataset](#wine_quality)
* [Handling categorical features](#categorical)
* [Build model](#model)
* [Deploying regression ErrorViz graphics](#wbox_error)
* [Deploying classification ErrorViz graphics](#error_classification)
* [Deploying regression SensitivityViz graphics](#wbox_sensitivity)
* [SensitivityViz Classification](#sensitivityviz_classification)

In [None]:
import pandas as pd
import requests
import io
import numpy as np
from sklearn.ensemble import RandomForestRegressor

from mdesc.utils import utils as wb_utils
from mdesc.eval import ErrorViz, SensitivityViz
import mdesc

In [2]:
# display the version string of the imported mdesc package
mdesc.__version__

'0.1.2.2'

### Import wine quality dataset <a id='wine_quality'></a>
Perform basic exploratory data analysis to better understand what types of columns are available.

In [3]:
# load the wine quality data; the path is relative to the current working directory
wine_csv = 'NotebookData/wine.csv'
df = pd.read_csv(wine_csv)

In [4]:
# preview the first rows of the dataframe
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Type
0,low,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,low,6,White
1,low,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,low,6,White
2,medium,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,low,6,White
3,low,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,low,6,White
4,low,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,low,6,White


In [5]:
df.dtypes # most of our columns are numeric; the exceptions (object dtype) are 'fixed acidity', 'alcohol' and 'Type'

fixed acidity            object
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                  object
quality                   int64
Type                     object
dtype: object

In [6]:
# row counts per alcohol level: most of the data sits in the low/medium groups
(
    df
    .groupby('alcohol')['fixed acidity']
    .count()
)

alcohol
high       356
low       3295
medium    2846
Name: fixed acidity, dtype: int64

In [7]:
# row counts per wine type: white wine makes up most of the data
(
    df
    .groupby('Type')['fixed acidity']
    .count()
)

Type
Red      1599
White    4898
Name: fixed acidity, dtype: int64

### Handling categorical data <a id='categorical'></a>

We can rely on pandas to convert our string/category columns into dummy variables to be used in our models.

In [8]:
# name of the dependent (target) column
ydepend = 'quality'

# modelling dataframe: every column except the target, with the
# string/category columns expanded into dummy (indicator) variables
model_df = pd.get_dummies(df.drop(ydepend, axis=1))

### Build model <a id='model'></a>

In [9]:
# fit a random forest regressor on the dummy-encoded predictors
# NOTE: no random_state is passed, so the fitted forest can differ between runs
modelobj = RandomForestRegressor()

modelobj.fit(model_df, df[ydepend])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### Create ErrorViz Regression <a id='wbox_error'></a>

In [10]:
# keepfeaturelist: the subset of columns we want to focus on in the output
keepfeaturelist = [
    'fixed acidity',
    'quality',
    'alcohol',
    'sulphates',
    'volatile acidity',
    'residual sugar',
    'free sulfur dioxide',
    'Type',
]

# columns used to group the results
groupbyvars = ['alcohol', 'Type']

# instantiate ErrorViz
EV = ErrorViz(
    modelobj=modelobj,                # sklearn model object
    model_df=model_df,                # modelling dataframe
    ydepend=ydepend,                  # dependent variable
    cat_df=df,                        # original dataframe
    keepfeaturelist=keepfeaturelist,  # trim final output to these variables
    groupbyvars=groupbyvars,          # grouping variables
    autoformat_types=True,            # auto convert categorical dtypes to objects
    round_num=4,                      # number of digits to round outputs
    verbose=None,                     # debug
)

In [None]:
# run ErrorViz and write the HTML output to output_path
EV.run(
    output_path='winequality_example.html',
    output_type='html',
    output_df=False,
    progbar=True,
)

In [None]:
# save to a location other than the one originally passed as output_path
# NOTE(review): _save is underscore-prefixed (non-public by convention) —
# confirm whether mdesc exposes a public method for this
EV._save('outputs/winequality_example.html')

### ErrorViz Classification <a id='error_classification'></a>

Basic example of a pipeline for ErrorViz in the classification context.

In [None]:
# let's turn our wine quality dataset into a binary classification task:
# if the quality rank is greater than 5, convert to 1, otherwise 0

# work on a deep copy so the regression dataframe `df` stays untouched
classdf = df.copy(deep=True)
# vectorized comparison instead of a row-wise apply/lambda; the boolean mask is
# cast to int64 so the column holds 0/1 integers
classdf[ydepend] = (classdf[ydepend] > 5).astype('int64')
# number of rows in each of the two classes
classdf.groupby(ydepend)['citric acid'].count()

In [None]:
# random forest classifier, fitted on the same predictors as the regressor
# but against the binarized quality column; random_state is fixed at 0
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0)
clf.fit(model_df, classdf[ydepend])

In [None]:
# ErrorViz for the classifier, grouped by wine Type only
# NOTE: this rebinds EV, replacing the regression ErrorViz instance
EV = ErrorViz(
    clf,
    model_df=model_df,
    ydepend=ydepend,
    cat_df=classdf,
    groupbyvars=['Type'],
    keepfeaturelist=None,
    aggregate_func=np.nanmedian,
    autoformat_types=True,
    verbose=None,
)

EV.run(
    output_type='html',
    output_path='outputs/error_viz_classification.html',
)

### SensitivityViz Analysis <a id='wbox_sensitivity'></a>

In [11]:
# SensitivityViz is instantiated in much the same way as ErrorViz
SV = SensitivityViz(
    modelobj=modelobj,                # model object
    model_df=model_df,                # df used to build model
    ydepend=ydepend,                  # dependent variable
    cat_df=df,                        # unadjusted dataframe
    keepfeaturelist=keepfeaturelist,  # variables to keep/render in output
    groupbyvars=groupbyvars,          # grouping variables for detailed analysis
    std_num=1,                        # how many standard deviations to tweak synthetic data
    verbose=None,                     # debugging
)

# run the analysis and write the HTML output
SV.run(
    output_type='html',
    output_path='outputs/winequality_sensitivity.html',
    output_df=False,
    progbar=True,
)

Percent Complete: 100.0%

### SensitivityViz Classification <a id='sensitivityviz_classification'></a>

Same usage as before, just with the classification model and dataset.

In [None]:
# SensitivityViz for the classifier, grouped by wine Type only
WB = SensitivityViz(
    clf,
    model_df=model_df,
    ydepend=ydepend,
    cat_df=classdf,
    groupbyvars=['Type'],
    keepfeaturelist=None,
    aggregate_func=np.nanmedian,
    std_num=2,
    autoformat_types=True,
    verbose=None,
)

# run the analysis and write the HTML output
WB.run(
    output_path='outputs/sensitivity_classification.html',
    output_type='html',
    output_df=False,
    progbar=True,
)