# Visualizer for all SciPy distributions

The awesome ``scipy.stats`` subpackage holds a lot of continuous and discrete distributions that you might never have heard of. To quickly familiarize oneself with an unknown distribution, plotting and *experiencing* it helps a lot. This visualizer tries to make this as easy and comfortable as possible. The tool is based on [Bokeh](http://bokeh.pydata.org/) and [ipywidgets](http://ipywidgets.readthedocs.org/).

**TL;DR**: Just run all cells and be stunned!

In [1]:
from collections import OrderedDict, defaultdict
from enum import Enum

import numpy as np
from scipy import stats

from bokeh.io import output_notebook, show, push_notebook
from bokeh.plotting import figure
from bokeh.io import show

from ipywidgets import widgets, interact, interactive
from IPython.display import display 

import warnings
warnings.simplefilter('ignore', DeprecationWarning)

In [2]:
# Tell Bokeh to render its output inline in this notebook.
output_notebook()

We start with a little introspection to get lists of all continuous and discrete distributions in ``scipy.stats``. In order to do so, we use an ``Enum`` to define the two types of distributions that exist in the world of mathematics, i.e. *continuous* and *discrete* distributions. Based on a given type, we determine all objects in ``scipy.stats`` that are instances of either ``stats.rv_continuous`` or ``stats.rv_discrete`` and create an ordered dictionary with the distribution's name as key and the distribution object as value.

In [3]:
class DistType(Enum):
    """The two kinds of distribution handled by this tool."""
    continuous = 0
    discrete = 1


# Maps each member's name to the member itself, in definition order; these are
# the options offered by the type selector widget.
dist_types = OrderedDict((member.name, member) for member in DistType)

def get_dict_of_dists(dist_type):
    """Collect the distributions of one type exposed by ``scipy.stats``.

    Every attribute of the ``stats`` module is inspected in sorted name order
    and kept when it is an instance of ``stats.rv_continuous`` (for
    ``DistType.continuous``) or of ``stats.rv_discrete`` (for any other value).

    Returns an ``OrderedDict`` that maps the ``name`` attribute of each
    distribution to the distribution object.
    """
    baseclass = stats.rv_continuous if dist_type is DistType.continuous else stats.rv_discrete
    found = OrderedDict()
    for attr in sorted(dir(stats)):
        candidate = getattr(stats, attr)
        if isinstance(candidate, baseclass):
            found[candidate.name] = candidate
    return found

# Module-level lookups (distribution name -> distribution object) that the
# helper functions and widgets defined later in this notebook read from.
dist_continuous = get_dict_of_dists(DistType.continuous)
dist_discrete = get_dict_of_dists(DistType.discrete)
# Report how many distributions of each type were found.
print('number of continuous distributions:', len(dist_continuous))
print('number of discrete distributions:  ', len(dist_discrete))

number of continuous distributions: 89
number of discrete distributions:   13


Since a lot of distributions need additional shape parameters, we use a nested ``defaultdict`` to define shape parameters as we go. For an undefined distribution, ``DEFAULT_SHAPES`` will return ``1.0`` for all shape parameters.

The ``DEFAULT_SHAPES`` dictionary is not exhaustive, meaning that a lot of sane parameters still need to be configured.

In [4]:
def make_default_shape_dict():
    """Create the nested mapping ``distribution name -> shape name -> value``.

    Looking up an unknown distribution yields a mapping in which every shape
    parameter defaults to ``1.0``.

    Each unknown distribution gets its *own* inner mapping. Sharing a single
    inner ``defaultdict`` between all of them would let a value stored (or a
    key created by a lookup) for one distribution leak into every other one.
    """
    return defaultdict(lambda: defaultdict(lambda: 1.0))

DEFAULT_SHAPES = make_default_shape_dict()

# continuous
DEFAULT_SHAPES.update([
    ('alpha', {'a': 1.3}),
    ('beta', {'a': 1.5, 'b': 2.}),
])

# discrete
DEFAULT_SHAPES.update([
    ('bernoulli', {'p': 0.7}),
    ('binom', {'n': 10, 'p': 0.7}),
    ('logser', {'p': 0.3}),
    ('zipf', {'a': 2}),
    ('randint', {'low': 0, 'high': 10}),
    ('nbinom', {'n': 10, 'p': 0.6}),
    ('hypergeom', {'n': 3, 'M': 10, 'N': 7}),
    ('geom', {'p': 0.6}),
])

Every project needs some purely auxiliary functions that help to keep the real program logic shorter and much more comprehensible. We define them in advance and all of them should be pretty much self-explanatory. Eventually, we have functions to:
* flatten a list of lists,
* calculate the support of a distribution,
* create patches, i.e. bars, from (x, y) data points,
* determine the shape parameters of a distribution,
* check if a distribution has shape parameters,
* determine the options for a distribution selector widget,
* determine the options for a function selector widget.

In [16]:
def flatten(lst):
    """Return one flat list holding the items of every sub-iterable of ``lst``, in order."""
    flat = []
    for sublist in lst:
        flat.extend(sublist)
    return flat

def support(dist, *shapeargs):
    """Return ``dist.interval`` evaluated for the full probability mass (``1.0``).

    The shape parameters are forwarded positionally only: keyword arguments
    are avoided because of a bug in scipy's ``levy_stable``.
    """
    full_mass = 1.0
    return dist.interval(full_mass, *shapeargs)

def make_patches(x, y, width=0.5):
    """Turn (x, y) points into the corner coordinates of vertical bars.

    Each point becomes a bar of the given ``width`` centred on its x value and
    reaching from 0 up to its y value. Returns two lists of 4-element lists:
    the x coordinates and the y coordinates of the bar corners.
    """
    half = width/2
    xs = []
    for center in x:
        left, right = center - half, center + half
        xs.append([left, left, right, right])
    ys = [[0, height, height, 0] for height in y]
    return xs, ys

def shape_params(dist):
    """Return the names of the shape parameters of ``dist``.

    ``dist.shapes`` is expected to be either ``None`` or a string of
    comma-separated names such as ``'a, b'``.

    Returns a list of names, or ``None`` when ``dist.shapes`` is ``None``.
    """
    if dist.shapes is None:
        return None
    # Split on commas and any surrounding whitespace, so that 'a,b' or
    # 'a,  b' are handled as well as the usual 'a, b'.
    return dist.shapes.replace(',', ' ').split()
    
def has_shape_params(dist):
    """Tell whether ``shape_params(dist)`` yields anything other than ``None``."""
    names = shape_params(dist)
    return names is not None

def dist_options(dist_type):
    """Return the name -> distribution mapping to offer for ``dist_type``.

    ``DistType.continuous`` selects ``dist_continuous``; any other value
    selects ``dist_discrete``.
    """
    return dist_continuous if dist_type is DistType.continuous else dist_discrete

def func_options(dist_type):
    """Return the names of the functions that can be plotted for ``dist_type``."""
    if dist_type is not DistType.continuous:
        return ['pmf', 'cdf']
    return ['pdf', 'cdf', 'ppf']

The whole tool is basically about evaluating different functions, e.g. ``pdf``, ``cdf``, etc., of a distribution. So what we need to do is:
1. determine the support of the distribution
2. check if the distribution is continuous or discrete
3. define a set of suitable ``x``-values
4. evaluate the given function on that set of ``x`` and return ``x`` and ``y``

In [17]:
def get_dist_func_xy(dist, func, *shapeargs, **params):
    """Evaluate one function of a distribution on a grid of x values.

    Parameters:
        dist: a distribution object contained in ``dist_continuous`` or
            ``dist_discrete``.
        func: name of the method of ``dist`` to evaluate, e.g. ``'pdf'``,
            ``'cdf'``, ``'ppf'`` or ``'pmf'``.
        *shapeargs: shape parameters, passed positionally both to
            ``support`` and to the evaluated method.
        **params: extra keyword arguments (e.g. ``loc``, ``scale``) passed to
            the evaluated method only; they do not influence the grid.

    Returns:
        ``(x, y)`` where ``x`` is the grid and ``y`` the function values on it.

    Raises:
        RuntimeError: if ``dist`` is in neither ``dist_continuous`` nor
            ``dist_discrete``.
    """
    if func == 'ppf':
        # The quantile function takes probabilities, so the grid spans [0, 1].
        interval = [0., 1.]
    else:
        interval = list(support(dist, *shapeargs))
    if dist in dist_continuous.values():
        for i, bound in enumerate(interval):
            if np.isinf(bound):
                # Replace an unbounded end by a finite limit of +/-100.
                interval[i] = np.sign(bound)*100
            # Move each end 1e-3 inwards (+ on the left, - on the right);
            # presumably to avoid evaluating exactly at the end points.
            interval[i] += (-1)**i*1e-3
        left, right = interval
        # 100 samples per unit length. np.linspace needs an integer sample
        # count; a float count raises TypeError on current NumPy versions.
        x = np.linspace(left, right, int(100*(right - left)))
    elif dist in dist_discrete.values():
        for i, bound in enumerate(interval):
            if np.isinf(bound):
                # Replace an unbounded end by a finite limit of +/-10.
                interval[i] = np.sign(bound)*10
        left, right = interval
        # NOTE(review): the grid starts one above the lower bound returned by
        # support(); presumably scipy reports that bound as one below the
        # first support point of a discrete distribution -- confirm.
        x = np.arange(left + 1, right + 1)
    else:
        raise RuntimeError("Unknown distribution: {}".format(dist.name))
    y = getattr(dist, func)(x, *shapeargs, **params)
    return x, y

In [13]:
def update_type_sel():
    """Offer the distributions matching the currently selected distribution type."""
    selected_type = type_sel.value
    dist_sel.options = dist_options(selected_type)
    
def update_dist_sel():
    """Rebuild the parameter widgets for the selected distribution and redraw.

    Sets the function options for the current type, creates one text input
    per shape parameter (pre-filled from ``DEFAULT_SHAPES``), shows the
    loc/scale sliders only for continuous distributions, and finally calls
    ``update_dist_params``.
    """
    func_sel.options = func_options(type_sel.value)
    dist = dist_sel.value
    if not has_shape_params(dist):
        shape_param_container.children = []
    else:
        shapes = OrderedDict((p, DEFAULT_SHAPES[dist.name][p]) for p in shape_params(dist))
        text_inputs = []
        for key, val in shapes.items():
            box = widgets.BoundedFloatText(value=val, description='{}:'.format(key))
            # NOTE(review): on_trait_change is deprecated in newer traitlets
            # in favour of observe -- confirm before upgrading ipywidgets.
            box.on_trait_change(update_dist_params, name='value')
            text_inputs.append(box)
        shape_param_container.children = text_inputs
    param_container.children = [loc_slider, scale_slider] if type_sel.value is DistType.continuous else []
    update_dist_params()

def update_continuous(fig, data, *shapeargs):
    """Fill ``data`` with the curve of the selected function and rescale ``fig``.

    The curve is evaluated with the current loc/scale slider values. The
    y-range is derived from the minimum and maximum of the curve; the x-range
    covers the data plus a small offset, limited to [-5, 5].
    """
    data['x'], data['y'] = get_dist_func_xy(
        dist_sel.value, func_sel.value, *shapeargs,
        loc=loc_slider.value, scale=scale_slider.value)

    y_bottom = max(np.max(data['y']) - 5, 1.1*np.min(data['y']))
    y_top = min(np.min(data['y']) + 5, 1.1*np.max(data['y']))
    fig.y_range.start, fig.y_range.end = y_bottom, y_top

    offset, lim = 1e-1, 5
    x_left = max(-lim, np.min(data['x']) - offset)
    x_right = min(lim, np.max(data['x']) + offset)
    fig.x_range.start, fig.x_range.end = x_left, x_right
    
    
def update_discrete(fig, data, *shapeargs):
    """Fill ``data`` with bars for the selected function and rescale ``fig``.

    The y-range runs from 0 to 1.1 times the tallest bar; the x-range spans
    all bar corners whose height exceeds 0.001, padded by 1 on both sides.
    """
    points, values = get_dist_func_xy(dist_sel.value, func_sel.value, *shapeargs)
    data['xs'], data['ys'] = make_patches(points, values)

    y_top = 1.1*max(max(data['ys']))
    fig.y_range.start, fig.y_range.end = 0., y_top

    x_high_value = []
    for corner_x, corner_y in zip(flatten(data['xs']), flatten(data['ys'])):
        if corner_y > 0.001:
            x_high_value.append(corner_x)
    x_left, x_right = min(x_high_value) - 1, max(x_high_value) + 1
    fig.x_range.start, fig.x_range.end = x_left, x_right
    
    
def update_dist_params():
    """Redraw the plot from the current widget values.

    Both data sources are emptied first, then the one matching the selected
    distribution type is refilled. Any exception raised while doing so is
    shown in ``error_text`` instead of propagating; on success the error text
    is hidden. The notebook output is pushed in either case.
    """
    shapeargs = [child.value for child in shape_param_container.children]
    # Empty the line data and the patch data before redrawing.
    l_data['x'] = []
    l_data['y'] = []
    p_data['xs'] = []
    p_data['ys'] = []
    try:
        if type_sel.value is not DistType.continuous:
            update_discrete(fig, p_data, *shapeargs)
        else:
            update_continuous(fig, l_data, *shapeargs)
    except Exception as err:
        error_text.value = "Invalid parameters! Choose again.<br>ERROR: {}".format(err)
        error_text.visible = True
    else:
        error_text.visible = False
    push_notebook()
    

In [14]:
def get_dist_fig_data():
    """Create the figure with an empty patches renderer and an empty line renderer.

    Returns ``(figure, line_data, patch_data)`` where the two data objects are
    the ``data`` of the respective renderer's data source.
    """
    plot = figure(width=700, height=700, title=None, x_range=(-1, 1), y_range=(0, 1))
    patch_renderer = plot.patches([[]], [[]], line_width=3, alpha=0.3)
    line_renderer = plot.line([], [], line_width=3)
    line_data = line_renderer.data_source.data
    patch_data = patch_renderer.data_source.data
    return plot, line_data, patch_data

In [15]:
# Selector widgets: distribution type, distribution and function to plot.
# Each one gets a handler that runs when its 'value' changes.
type_sel = widgets.Dropdown(options=dist_types, value=DistType.continuous, description='type:')
type_sel.on_trait_change(update_type_sel, name='value')
dist_sel = widgets.Dropdown(options=dist_options(type_sel.value), description='dist:')
dist_sel.on_trait_change(update_dist_sel, name='value')
func_sel = widgets.Dropdown(options=func_options(type_sel.value), description='func:')
func_sel.on_trait_change(update_dist_sel, name='value')

# loc/scale sliders; update_dist_sel shows them for continuous distributions only.
loc_slider = widgets.FloatSlider(value=0., min=-5.0, max=5.0, step=0.1, description='loc:')
loc_slider.on_trait_change(update_dist_params, name='value')
scale_slider = widgets.FloatSlider(value=1., min=0.01, max=10.0, step=0.01, description='scale:')
scale_slider.on_trait_change(update_dist_params, name='value')

# Row holding the three selectors.
dist_container = widgets.HBox()  
dist_container.children = [type_sel, dist_sel, func_sel]

# Column holding the loc/scale sliders.
param_container = widgets.VBox()
param_container.children = [loc_slider, scale_slider]

# Filled by update_dist_sel with one text input per shape parameter.
shape_param_container = widgets.HBox()
# Shows the error message set by update_dist_params.
error_text = widgets.HTML()

display(dist_container)
display(param_container)
display(shape_param_container)
display(error_text)

# Module-level figure and data objects that the update_* functions read and write.
fig, l_data, p_data = get_dist_fig_data()

show(fig)
# Select the normal distribution; assigning the value is expected to fire the
# 'value' handler registered on dist_sel above (update_dist_sel), which draws
# the initial plot.
dist_sel.value = dist_continuous['norm']