# Waterfall Chart

See:

- [Wikipedia article](https://en.wikipedia.org/wiki/Waterfall_chart)
- [ggplot2 + waterfalls](https://r-charts.com/flow/waterfall-chart/)

In [1]:
from lets_plot import *

In [2]:
# Set up Lets-Plot for rendering plots as HTML output inside the notebook.
LetsPlot.setup_html()

In [3]:
# Sample input for the demos below: one category per bar and its signed change
# (values >= 0 are drawn as increases, negative values as decreases).
data = dict(
    x = ["A", "B", "C", "D", "E"],
    y = [100, 200, -400, 500, -200],
)

In [4]:
# Column names of the computed ("stat") data; they are also referenced
# from the default tooltip template as @dy / @cumsum / @initial.
_DY_NAME = "dy"
_CUMSUM_NAME = "cumsum"
_INITIAL_NAME = "initial"
# Human-readable captions shown next to those columns in the default tooltip.
_DY_TITLE = "Difference"
_CUMSUM_TITLE = "Cumulative sum"
_INITIAL_TITLE = "Initial"
# Name of the stat-data column holding the flow type of each bar; this is the
# variable mapped to `fill` / `color` when colouring by flow type.
_FLOW_TYPE_TITLE = "Flow type"
# Internal flow-type key -> display name stored in the flow-type column.
# waterfall_plot() replaces the "total" entry when `total_title` is given.
_FLOW_TYPE_NAMES = {
    "increase": "Increase",
    "decrease": "Decrease",
    "total": "Total",
}
# Internal flow-type key -> bar fill colour (fed to scale_fill_manual).
_FLOW_TYPE_BOX_COLORS = {
    "increase": "#4daf4a",
    "decrease": "#e41a1c",
    "total": "#377eb8",
}
# Internal flow-type key -> label text colour (fed to scale_color_manual).
_FLOW_TYPE_TEXT_COLORS = {
    "increase": "#b2df8a",
    "decrease": "#fb9a99",
    "total": "#a6cee3",
}
# Sentinel accepted by the `fill` and `label_color` parameters of
# waterfall_plot(): "map this aesthetic to the flow type" rather than
# treating the value as a constant colour.
_FLOW_TYPE_COLOR_VALUE = 'flow_type'

# Defaults for the waterfall_plot() keyword parameters.

# Constant fill colour. NOTE(review): not referenced by the code visible in
# this notebook (waterfall_plot defaults `fill` to the flow-type sentinel).
_FILL_DEF = "lightgray"
_SHOW_LEGEND_DEF = False
# Default tooltip: category as the title, then one "caption|@column" line for
# the initial value, the difference and the cumulative sum of the bar.
_TOOLTIPS_DEF = (
    layer_tooltips()
    .title("^x")
    .line(f"{_INITIAL_TITLE}|@{_INITIAL_NAME}")
    .line(f"{_DY_TITLE}|@{_DY_NAME}")
    .line(f"{_CUMSUM_TITLE}|@{_CUMSUM_NAME}")
    .disable_splitting()
)
# Keep the input order unless sorting is requested.
_SORTED_VALUE_DEF = False
# Total: append a trailing "total" bar.
_CALC_TOTAL_DEF = True
# Horizontal line at y = 0: off by default; drawn above the bars when enabled.
_HLINE_DEF = False
_HLINE_ONTOP_DEF = True
# Connector lines between consecutive bars.
_CONNECTOR_LINES_DEF = True
# Labels inside the bars.
_LABELS_DEF = True
_LABEL_COLOR_DEF = "white"

def _get_stat_data(data, x, y, sorted_value, calc_total, threshold, max_values, flow_type_names):
    """Compute the per-bar columns of a waterfall chart.

    Parameters
    ----------
    data : mapping
        Source data; ``data[x]`` and ``data[y]`` must be iterables.
    x, y : str
        Keys of the category column and of the signed-change column.
    sorted_value : bool
        If true, order the bars by decreasing absolute change.
    calc_total : bool
        If true, append a final "total" bar.
    threshold : number or None
        If given, bars with ``abs(change) < threshold`` are removed and their
        sum is shown as a single "Other" bar (omitted when that sum is zero).
        Takes precedence over ``max_values``.
    max_values : int or None
        If given (and ``threshold`` is None), keep only the ``max_values``
        bars with the largest absolute change, in their current order; the
        rest are summed into an "Other" bar (omitted when that sum is zero).
    flow_type_names : mapping
        Display names for the "increase", "decrease" and "total" flow types.

    Returns
    -------
    dict of lists
        ``'x'`` (categories as strings), the initial value, cumulative sum
        and difference of each bar, ``'ymin'`` / ``'ymax'`` (bar extent) and
        the flow type of each bar. All columns are plain lists of equal
        length; the result may be empty when there is nothing to draw and
        ``calc_total`` is false.

    Raises
    ------
    AssertionError
        If the category values contain duplicates.
    """
    # Work on list copies: the caller's data is never mutated and every code
    # path below yields the same column type.
    xs, ys = list(data[x]), list(data[y])
    assert len(xs) == len(set(xs)), "x values shouldn't contains duplicates"
    if sorted_value:
        # sorted() is stable, so ties keep their original relative order.
        ranked = sorted(zip(xs, ys), key=lambda p: abs(p[1]), reverse=True)
        xs = [p[0] for p in ranked]
        ys = [p[1] for p in ranked]
    xs = [str(v) for v in xs]
    if threshold is not None:
        other_value = sum(v for v in ys if abs(v) < threshold)
        # Build the kept columns explicitly; unpacking ``zip(*pairs)`` would
        # fail when no value reaches the threshold.
        kept = [p for p in zip(xs, ys) if abs(p[1]) >= threshold]
        xs = [p[0] for p in kept]
        ys = [p[1] for p in kept]
        if abs(other_value) > 0:
            xs.append("Other")
            ys.append(other_value)
    elif max_values is not None:
        ranked = sorted(zip(range(len(xs)), ys), key=lambda p: abs(p[1]), reverse=True)
        # A set gives O(1) membership tests in the filters below.
        keep = {i for i, _ in ranked[:max_values]}
        other_value = sum(v for i, v in enumerate(ys) if i not in keep)
        xs = [v for i, v in enumerate(xs) if i in keep]
        ys = [v for i, v in enumerate(ys) if i in keep]
        if abs(other_value) > 0:
            xs.append("Other")
            ys.append(other_value)
    cum_sum = 0
    yprev = []
    ynext = []
    ymin = []
    ymax = []
    flow_type = []
    for y_val in ys:
        level = cum_sum + y_val
        yprev.append(cum_sum)
        ynext.append(level)
        ymin.append(min(cum_sum, level))
        ymax.append(max(cum_sum, level))
        # A zero change is classified as an increase.
        flow_type.append(flow_type_names["increase"] if y_val >= 0 else flow_type_names["decrease"])
        cum_sum = level
    if calc_total:
        # The total bar reports the first change as its initial value and the
        # remainder as its difference; with no bars left both are zero.
        first = ys[0] if ys else 0
        xs.append(flow_type_names["total"])
        ys.append(cum_sum - first)
        yprev.append(first)
        ynext.append(cum_sum)
        # The total bar always spans from zero to the final cumulative sum.
        ymin.append(min(cum_sum, 0))
        ymax.append(max(cum_sum, 0))
        flow_type.append(flow_type_names["total"])
    return {
        'x': xs,
        _INITIAL_NAME: yprev,
        _CUMSUM_NAME: ynext,
        _DY_NAME: ys,
        'ymin': ymin,
        'ymax': ymax,
        _FLOW_TYPE_TITLE: flow_type,
    }

def _get_annotations_data(stat_data, calc_total):
    """Return a copy of ``stat_data`` extended with label columns.

    Two columns are added (or overwritten):

    * ``'y'`` - vertical centre of each bar, ``(ymin + ymax) / 2``;
    * ``'label'`` - the bar's difference, except for the last bar when
      ``calc_total`` is true, which shows its cumulative sum instead.
    """
    count = len(stat_data['x'])
    total_index = count - 1

    centres = []
    captions = []
    for idx in range(count):
        centres.append((stat_data["ymin"][idx] + stat_data["ymax"][idx]) / 2)
        if calc_total and idx >= total_index:
            # The trailing total bar is labelled with the overall sum.
            captions.append(stat_data[_CUMSUM_NAME][idx])
        else:
            captions.append(stat_data[_DY_NAME][idx])

    annotated = dict(stat_data)
    annotated['y'] = centres
    annotated['label'] = captions
    return annotated

def _get_intermediate_lines(stat_data):
    """Build the segment data for the connector lines between bars.

    For every pair of neighbouring bars a horizontal segment is produced that
    starts at the first bar's category and ends at the second bar's category,
    both at the first bar's cumulative sum.

    Returns a dict with the columns ``'x'``, ``'y'``, ``'xend'``, ``'yend'``.
    """
    from itertools import pairwise

    segments = {'x': [], 'y': [], 'xend': [], 'yend': []}
    for left, right in pairwise(range(len(stat_data['x']))):
        level = stat_data[_CUMSUM_NAME][left]
        segments['x'].append(stat_data['x'][left])
        segments['y'].append(level)
        segments['xend'].append(stat_data['x'][right])
        # Same height at both ends: the connector is horizontal.
        segments['yend'].append(level)
    return segments

def waterfall_plot(data, x, y, *,
                   color=None, fill=_FLOW_TYPE_COLOR_VALUE, size=None, alpha=None, linetype=None, width=None,
                   show_legend=_SHOW_LEGEND_DEF, tooltips=_TOOLTIPS_DEF,
                   sorted_value=_SORTED_VALUE_DEF, threshold=None, max_values=None,
                   calc_total=_CALC_TOTAL_DEF, total_title=None,
                   hline=_HLINE_DEF, hline_ontop=_HLINE_ONTOP_DEF, hline_color=None, hline_size=None, hline_linetype=None,
                   connector_lines=_CONNECTOR_LINES_DEF, connector_linetype=None,
                   labels=_LABELS_DEF, label_color=_LABEL_COLOR_DEF, label_format=None):
    """Build a waterfall chart from a column of signed changes.

    Parameters
    ----------
    data : mapping
        Source data containing the ``x`` and ``y`` columns.
    x, y : str
        Names of the category column and of the signed-change column.
    color, size : optional
        Passed to both the bars and the connector lines.
    fill : optional
        Bar fill; the value ``'flow_type'`` (default) maps the fill to the
        flow type of the bar, anything else is passed as a constant.
    alpha, linetype, width : optional
        Passed to the bars only.
    show_legend, tooltips : optional
        Passed to the bars layer.
    sorted_value, threshold, max_values, calc_total :
        Control which bars are computed; see ``_get_stat_data``.
    total_title : str, optional
        Replaces the display name of the "total" flow type.
    hline, hline_ontop, hline_color, hline_size, hline_linetype :
        Horizontal line at ``y = 0``: whether to draw it, whether it goes
        above or below the bars, and its appearance.
    connector_lines, connector_linetype :
        Whether to draw connectors between bars, and their line type.
    labels, label_color, label_format :
        Whether to draw text labels, their colour (``'flow_type'`` maps the
        colour to the flow type) and their format.

    Returns
    -------
    The plot with the layers above plus manual fill and colour scales for
    the flow types.
    """
    names_by_flow = dict(_FLOW_TYPE_NAMES)
    if total_title is not None:
        names_by_flow['total'] = total_title
    bars = _get_stat_data(data, x, y, sorted_value, calc_total, threshold, max_values, names_by_flow)

    bar_aes = {'x': 'x', 'y': _CUMSUM_NAME, 'ymin': 'ymin', 'ymax': 'ymax'}
    if fill == _FLOW_TYPE_COLOR_VALUE:
        # Colour by flow type: map the aesthetic instead of setting a constant.
        bar_aes['fill'] = _FLOW_TYPE_TITLE
        fill = None

    plot = ggplot()
    zero_line = geom_hline(yintercept=0, color=hline_color, size=hline_size,
                           linetype=hline_linetype, tooltips='none')

    # Layers are added bottom-up; the zero line goes either first or
    # right after the bars.
    if hline and not hline_ontop:
        plot += zero_line
    if connector_lines:
        plot += geom_segment(aes('x', 'y', xend='xend', yend='yend'),
                             data=_get_intermediate_lines(bars),
                             linetype=connector_linetype,
                             color=color, size=size, tooltips='none')
    plot += geom_crossbar(aes(**bar_aes),
                          data=bars,
                          fatten=0,
                          color=color, fill=fill, size=size, alpha=alpha, linetype=linetype,
                          width=width,
                          show_legend=show_legend, tooltips=tooltips)
    if hline and hline_ontop:
        plot += zero_line
    if labels:
        text_aes = {'x': 'x', 'y': 'y', 'label': 'label'}
        color_is_mapped = label_color == _FLOW_TYPE_COLOR_VALUE
        if color_is_mapped:
            text_aes['color'] = _FLOW_TYPE_TITLE
            label_color = None
        # The text legend is only requested when the colour is mapped.
        plot += geom_text(aes(**text_aes),
                          data=_get_annotations_data(bars, calc_total),
                          color=label_color, label_format=label_format,
                          show_legend=(show_legend and color_is_mapped))

    # Without a total bar the "total" entry is left out of both scales.
    used_flow_types = [key for key in names_by_flow if calc_total or key != 'total']
    fill_values = {names_by_flow[key]: _FLOW_TYPE_BOX_COLORS[key] for key in used_flow_types}
    color_values = {names_by_flow[key]: _FLOW_TYPE_TEXT_COLORS[key] for key in used_flow_types}
    return plot + scale_fill_manual(values=fill_values) + scale_color_manual(values=color_values)

## Default

In [5]:
# All defaults: fill mapped to the flow type, a trailing total bar,
# connector lines and white labels; no legend and no zero line.
waterfall_plot(data, 'x', 'y')

## Parameters

### Aesthetics

In [6]:
# color: passed to both the bars and the connector lines
waterfall_plot(data, 'x', 'y', color="magenta")

In [7]:
# fill: a constant fill for every bar instead of the flow-type mapping
waterfall_plot(data, 'x', 'y', fill="blue")

In [8]:
# size: passed to both the bars and the connector lines
waterfall_plot(data, 'x', 'y', size=2)

In [9]:
# remove borders (size=0 is passed to the connector lines as well)
waterfall_plot(data, 'x', 'y', size=0)

In [10]:
# alpha: passed to the bars only
waterfall_plot(data, 'x', 'y', alpha=.5)

In [11]:
# linetype: passed to the bars only (connectors use connector_linetype)
waterfall_plot(data, 'x', 'y', linetype='dashed')

In [12]:
# width: passed to the bars layer
waterfall_plot(data, 'x', 'y', width=.4)

### Standard parameters

In [13]:
# show_legend: with calc_total=False the "total" entry is left out of the
# manual fill/color scales
gggrid([
    waterfall_plot(data, 'x', 'y', show_legend=True) + ggtitle("Show legend", "Default calc_total"),
    waterfall_plot(data, 'x', 'y', show_legend=True, calc_total=False) + ggtitle("Show legend", "calc_total=False"),
])

In [14]:
# tooltips: disabled (left) vs. a custom template (right); @dy, @initial and
# @cumsum refer to the columns of the computed data
gggrid([
    waterfall_plot(data, 'x', 'y', tooltips='none'),
    waterfall_plot(data, 'x', 'y', tooltips=layer_tooltips().line("@dy: from @initial to @cumsum").disable_splitting())
])

### Waterfall-specific parameters

In [15]:
# sorted_value: order the bars by decreasing absolute value
waterfall_plot(data, 'x', 'y', sorted_value=True)

In [16]:
# threshold: bars with |y| < 300 (A, B, E) are summed into one "Other" bar
waterfall_plot(data, 'x', 'y', threshold=300)

In [17]:
# max_values: keep the 3 bars with the largest |y| in their original order;
# the remaining ones are summed into one "Other" bar
waterfall_plot(data, 'x', 'y', max_values=3)

In [18]:
# Use threshold to skip zeros: with threshold=1 the zero-valued bar 'd' is
# dropped, and no "Other" bar is added because the dropped values sum to 0.
data_with_zeros = dict(
    x=['a', 'b', 'c', 'd', 'e'],
    y=[1, -2, 3, 0, 1],
)

gggrid([
    waterfall_plot(data_with_zeros, 'x', 'y'),
    waterfall_plot(data_with_zeros, 'x', 'y', threshold=1),
])

In [19]:
# calc_total: False omits the trailing total bar
waterfall_plot(data, 'x', 'y', calc_total=False)

In [20]:
# total_title: renames the total bar (both its category on the x axis and
# its flow-type name)
waterfall_plot(data, 'x', 'y', total_title="Result", show_legend=True)

### Control additional geometries

In [21]:
# hline: horizontal line at y = 0, added after the bars by default
waterfall_plot(data, 'x', 'y', hline=True)

In [22]:
# hline_ontop: False adds the zero line before the connectors and bars
waterfall_plot(data, 'x', 'y', hline=True, hline_ontop=False)

In [23]:
# hline_color: color of the zero line
waterfall_plot(data, 'x', 'y', hline=True, hline_color="magenta")

In [24]:
# hline_size: size of the zero line
waterfall_plot(data, 'x', 'y', hline=True, hline_size=2)

In [25]:
# hline_linetype: line type of the zero line
waterfall_plot(data, 'x', 'y', hline=True, hline_linetype='dashed')

In [26]:
# connector_lines: False omits the segments between consecutive bars
# (bars narrowed with width=.5)
waterfall_plot(data, 'x', 'y', width=.5, connector_lines=False)

In [27]:
# connector_linetype: line type of the connector segments only
waterfall_plot(data, 'x', 'y', width=.5, connector_linetype='dotted')

In [28]:
# labels: False omits the text layer
waterfall_plot(data, 'x', 'y', labels=False)

In [29]:
# label_color: a constant color for all labels
waterfall_plot(data, 'x', 'y', label_color="yellow")

In [30]:
# label_format: passed to the text layer as its label_format
waterfall_plot(data, 'x', 'y', label_format=".2f")

## Other Customizations

In [31]:
# fill and color: 'flow_type' maps the bar fill / label color to the flow
# type; any other value is used as a constant. Scales appended by the caller
# are added after the ones waterfall_plot() adds itself.

gggrid([
    waterfall_plot(data, 'x', 'y', show_legend=True) + ggtitle("Default"),
    waterfall_plot(data, 'x', 'y', show_legend=True, fill='flow_type') + ggtitle("fill='flow_type'"),
    waterfall_plot(data, 'x', 'y', show_legend=True, label_color='flow_type') + ggtitle("label_color='flow_type'"),
    waterfall_plot(data, 'x', 'y', show_legend=True, fill=None, label_color='flow_type') + ggtitle("fill=None and label_color='flow_type'"),
    waterfall_plot(data, 'x', 'y', show_legend=True, color="#777777", label_color="#777777") + \
        scale_fill_manual({"Increase": "white", "Decrease": "black", "Total": "yellow"}) + \
        ggtitle("Custom scale_fill_manual()"),
    waterfall_plot(data, 'x', 'y', show_legend=True, fill="black", label_color='flow_type') + \
        scale_color_manual({"Increase": "green", "Decrease": "red", "Total": "#bbbbbb"}) + \
        ggtitle("Custom scale_color_manual()"),
    waterfall_plot(data, 'x', 'y', show_legend=True, color="#777777", label_color="#777777") + \
        scale_fill_manual({"Increase": "green", "Decrease": "red", "Total": "yellow"}, labels=["Up", "Down", "Result"]) + \
        ggtitle("Custom flow type names"),
], ncol=3)

In [32]:
# flip coordinates: add coord_flip() to the returned plot
waterfall_plot(data, 'x', 'y') + coord_flip()

In [33]:
# custom theme: add a theme to the returned plot
waterfall_plot(data, 'x', 'y') + theme_bw()