# Explicit Use of Statistics

Basically, for each statistical transformation, there is a geometry that uses it by default.

However, you can simply select an appropriate geometry and explicitly specify the desired statistic for it.

Examples of how to do this can be found further in this notebook.

In [1]:
import pandas as pd

from lets_plot import *
from lets_plot.mapping import as_discrete

In [2]:
# Set up Lets-Plot's HTML output for this notebook session.
# NOTE(review): presumably this has to run before any plotting cell — confirm against the Lets-Plot docs.
LetsPlot.setup_html()

In [3]:
# Read the mpg dataset straight from the lets-plot-docs repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv"
)
# Print the (rows, columns) shape, then let the cell display the first rows.
print(df.shape)
df.head()

(234, 12)


Unnamed: 0.1,Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
2,3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
3,4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
4,5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


## 1. Identity

Set `stat='identity'` explicitly when a geometry applies its own statistic by default and you want to plot values that you have already computed yourself.

In [4]:
# Pre-aggregate the data: mean "cty" per "fl" value, with "fl" restored as a column.
identity_df = (
    df.groupby("fl")["cty"]
    .mean()
    .to_frame()
    .reset_index()
)

# stat='identity' makes the bars use the pre-computed "cty" values as they are.
ggplot(identity_df, aes("fl", "cty")) + geom_bar(stat='identity')

## 2. One Variable

### 2.1. Discrete

#### `'count'`

In [5]:
# First plot: geom_bar() with no stat argument; second: a lollipop geometry with stat='count'.
gggrid([
    (
        ggplot(df, aes("drv"))
        + geom_bar()
        + ggtitle("'count' stat is default for bar geometry")
    ),
    (
        ggplot(df, aes("drv"))
        + geom_lollipop(stat='count')
        + ggtitle("Explicit setting of 'count' stat")
    ),
])

### 2.2. Continuous

#### `'bin'`

In [6]:
# First plot: geom_histogram() with no stat argument; second: a lollipop geometry with stat='bin'.
gggrid([
    (
        ggplot(df, aes("cty"))
        + geom_histogram()
        + ggtitle("'bin' stat is default for histogram geometry")
    ),
    (
        ggplot(df, aes("cty"))
        # The stat must be 'bin' here (not 'count') so the plot matches its title
        # and this section's heading.
        + geom_lollipop(stat='bin')
        + ggtitle("Explicit setting of 'bin' stat")
    ),
])

#### `'density'`

In [7]:
# First plot: geom_density() with no stat argument; second: points with stat='density'.
gggrid([
    (
        ggplot(df, aes("cty"))
        + geom_density()
        + ggtitle("'density' stat is default for density geometry")
    ),
    (
        ggplot(df, aes("cty"))
        + geom_point(stat='density', alpha=.25)
        + ggtitle("Explicit setting of 'density' stat")
    ),
])

#### `'dotplot'`

In [8]:
# First plot: geom_dotplot() with no stat argument; second: lollipops with stat='dotplot',
# mapping y to the stat's '..count..' variable.
gggrid([
    (
        ggplot(df, aes("cty"))
        + geom_dotplot()
        + ggtitle("'dotplot' stat is default for dotplot geometry")
    ),
    (
        ggplot(df, aes("cty"))
        + geom_lollipop(aes(y='..count..'), stat='dotplot')
        + ggtitle("Explicit setting of 'dotplot' stat")
    ),
])

#### `'ecdf'`

In [9]:
# First plot: stat_ecdf() layer; second: a line geometry with stat='ecdf'.
gggrid([
    (
        ggplot(df, aes("cty"))
        + stat_ecdf()
        + ggtitle("'ecdf' stat is default for ecdf statistic")
    ),
    (
        ggplot(df, aes("cty"))
        + geom_line(stat='ecdf')
        + ggtitle("Explicit setting of 'ecdf' stat")
    ),
])

#### `'qq'` and `'qq_line'`

In [10]:
# First plot: geom_qq() + geom_qq_line(); second: point and line geometries
# with stat='qq' and stat='qq_line'. Both map "cty" to the `sample` aesthetic.
gggrid([
    (
        ggplot(df)
        + geom_qq(aes(sample="cty"))
        + geom_qq_line(aes(sample="cty"))
        + ggtitle("'qq' and 'qq_line' stats are default\nfor geom_qq() and geom_qq_line()")
    ),
    (
        ggplot(df)
        + geom_point(aes(sample="cty"), stat='qq')
        + geom_line(aes(sample="cty"), stat='qq_line')
        + ggtitle("Explicit setting of 'qq' and 'qq_line' stats")
    ),
])

## 3. Two Variables

### 3.1. Both Discrete

#### `'count2d'`

In [11]:
# First plot: geom_pie() filled by "drv"; second: points with stat='count2d',
# grouped and coloured by "drv" and sized by the stat's '..count..' variable.
gggrid([
    (
        ggplot(df, aes(as_discrete("cyl", order=1), as_discrete("year")))
        + geom_pie(aes(fill="drv"))
        + ggtitle("'count2d' stat is default for pie geometry")
    ),
    (
        ggplot(df, aes(as_discrete("cyl", order=1), as_discrete("year")))
        + geom_point(
            aes(group="drv", color="drv", size='..count..'),
            stat='count2d',
            alpha=.2,
        )
        + scale_size(range=[2, 30], guide='none')
        + ggtitle("Explicit setting of 'count2d' stat")
    ),
])

#### `'sum'`

In [12]:
# First plot: stat_sum() layer; second: a point geometry with stat='sum'.
gggrid([
    (
        ggplot(df, aes("cty", "hwy"))
        + stat_sum()
        + ggtitle("'sum' stat is default for sum statistic")
    ),
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_point(stat='sum')
        + ggtitle("Explicit setting of 'sum' stat")
    ),
])

### 3.2. One Discrete, One Continuous

#### `'boxplot'` and `'boxplot_outlier'`

In [13]:
# First plot: geom_boxplot() with no stat argument; second: a box plot assembled
# from separate layers that use the 'boxplot' and 'boxplot_outlier' stats.
gggrid([
    (
        ggplot(df, aes("drv", "cty"))
        + geom_boxplot()
        # Title grammar fixed ("stats ... are default") to agree with the
        # other two-stat titles in this notebook.
        + ggtitle("'boxplot' and 'boxplot_outlier' stats\nare default for boxplot geometry")
    ),
    (
        ggplot(df, aes("drv", "cty"))
        # Error bar driven by the 'boxplot' stat with its default mappings.
        + geom_errorbar(stat='boxplot')
        # Crossbar spanning the stat's '..lower..' to '..upper..' values.
        + geom_crossbar(
            aes(ymin='..lower..', ymax='..upper..'),
            stat='boxplot', fatten=0, tooltips='none',
        )
        # Error bar with ymin == ymax == '..middle..', i.e. collapsed onto one value.
        + geom_errorbar(
            aes(ymin='..middle..', ymax='..middle..'), stat='boxplot',
            width=.9, size=2.5/2,
        )
        # Points produced by the separate 'boxplot_outlier' stat.
        + geom_point(stat='boxplot_outlier')
        + ggtitle("Explicit setting of 'boxplot'\nand 'boxplot_outlier' stats")
    ),
])

#### `'densityridges'`

In [14]:
# First plot: geom_area_ridges() with no stat argument; second: points with
# stat='densityridges', y mapped to the stat's '..height..' and faceted by "drv".
gggrid([
    (
        ggplot(df, aes("cty", "drv"))
        + geom_area_ridges()
        + ggtitle("'densityridges' stat is default for area ridges geometry")
    ),
    (
        ggplot(df, aes("cty", "drv"))
        + geom_point(aes(y='..height..'), stat='densityridges', alpha=.2)
        + facet_grid(y="drv")
        + ggtitle("Explicit setting of 'densityridges' stat")
    ),
])

#### `'summary'`

In [15]:
# First plot: stat_summary() layer; second: a crossbar geometry with stat='summary'.
gggrid([
    (
        ggplot(df, aes("drv", "cty"))
        + stat_summary()
        + ggtitle("'summary' stat is default for stat_summary()")
    ),
    (
        ggplot(df, aes("drv", "cty"))
        + geom_crossbar(stat='summary')
        + ggtitle("Explicit setting of 'summary' stat")
    ),
])

#### `'ydensity'`

In [16]:
# First plot: geom_violin() with no stat argument; second: points with
# stat='ydensity', x mapped to the stat's '..violinwidth..' and faceted by "drv".
gggrid([
    (
        ggplot(df, aes("drv", "cty"))
        + geom_violin()
        + ggtitle("'ydensity' stat is default for violin geometry")
    ),
    (
        ggplot(df, aes("drv", "cty"))
        + geom_point(aes(x='..violinwidth..'), stat='ydensity', alpha=.2)
        + facet_grid(x="drv")
        + ggtitle("Explicit setting of 'ydensity' stat")
    ),
])

#### `'ydotplot'`

In [17]:
# First plot: geom_ydotplot() with no stat argument; second: horizontal lollipops
# (dir='h') with stat='ydotplot', x mapped to '..count..' and faceted by "drv".
gggrid([
    (
        ggplot(df, aes("drv", "cty"))
        + geom_ydotplot()
        + ggtitle("'ydotplot' stat is default for ydotplot geometry")
    ),
    (
        ggplot(df, aes("drv", "cty"))
        + geom_lollipop(aes(x='..count..'), stat='ydotplot', dir='h', fatten=1.5)
        + facet_grid(x="drv")
        + ggtitle("Explicit setting of 'ydotplot' stat")
    ),
])

### 3.3. Both Continuous

#### `'bin2d'`

In [18]:
# First plot: geom_bin2d() with no stat argument; second: points with stat='bin2d',
# coloured by the stat's '..count..' variable.
gggrid([
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_bin2d()
        + ggtitle("'bin2d' stat is default for bin2d geometry")
    ),
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_point(aes(color='..count..'), stat='bin2d')
        + ggtitle("Explicit setting of 'bin2d' stat")
    ),
])

#### `'contour'` and `'contourf'`

In [19]:
def get_contour_data(n=50, seed=42):
    """Evaluate a correlated bivariate normal density on a square grid.

    The grid has ``n`` evenly spaced points from -1 to 1 along each axis.
    The distribution has zero mean, unit variances and covariance 0.75.

    Parameters
    ----------
    n : int
        Number of grid points per axis; the result has ``n * n`` rows.
    seed : int
        Accepted but never read by the body; nothing here is random.

    Returns
    -------
    pandas.DataFrame
        Columns ``x`` and ``y`` (flattened grid coordinates) and ``z``
        (the density evaluated at each grid point).
    """
    import numpy as np
    from scipy.stats import multivariate_normal

    # The same 1-D axis is used for both coordinates.
    axis = np.linspace(-1, 1, n)
    grid_x, grid_y = np.meshgrid(axis, axis)

    covariance = [[1, .75],
                  [.75, 1]]
    distribution = multivariate_normal(np.zeros(2), covariance)

    # Stack the coordinates along a trailing axis so each grid node is an (x, y) pair.
    density = distribution.pdf(np.dstack((grid_x, grid_y)))

    return pd.DataFrame({
        "x": grid_x.flatten(),
        "y": grid_y.flatten(),
        "z": density.flatten(),
    })

# Gridded density data shared by the four plots in this cell.
contour_df = get_contour_data()

# Plots 1-2 use the 'contour' stat (default geometry, then points);
# plots 3-4 use the 'contourf' stat the same way. The grid has two columns.
gggrid([
    (
        ggplot(contour_df, aes("x", "y", z="z"))
        + geom_contour()
        + coord_fixed(ratio=.75)
        + ggtitle("'contour' stat is default\nfor contour geometry")
    ),
    (
        ggplot(contour_df, aes("x", "y", z="z"))
        + geom_point(stat='contour', alpha=.2)
        + coord_fixed(ratio=.75)
        + ggtitle("Explicit setting of 'contour' stat")
    ),
    (
        ggplot(contour_df, aes("x", "y", z="z"))
        + geom_contourf(aes(fill='..level..'), show_legend=False)
        + coord_fixed(ratio=.75)
        + ggtitle("'contourf' stat is default\nfor filled contour geometry")
    ),
    (
        ggplot(contour_df, aes("x", "y", z="z"))
        + geom_point(
            aes(color='..level..'),
            stat='contourf',
            alpha=.2,
            show_legend=False,
        )
        + coord_fixed(ratio=.75)
        + ggtitle("Explicit setting of 'contourf' stat")
    ),
], ncol=2)

#### `'density2d'` and `'density2df'`

In [20]:
# Plots 1-2 use the 'density2d' stat (default geometry, then points);
# plots 3-4 use the 'density2df' stat the same way. The grid has two columns.
gggrid([
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_density2d()
        + coord_fixed(ratio=.5)
        + ggtitle("'density2d' stat is default\nfor density2d geometry")
    ),
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_point(stat='density2d', alpha=.2)
        + coord_fixed(ratio=.5)
        + ggtitle("Explicit setting of 'density2d' stat")
    ),
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_density2df(aes(fill='..level..'), show_legend=False)
        + coord_fixed(ratio=.5)
        + ggtitle("'density2df' stat is default\nfor density2df geometry")
    ),
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_point(
            aes(color='..level..'),
            stat='density2df',
            alpha=.2,
            show_legend=False,
        )
        + coord_fixed(ratio=.5)
        + ggtitle("Explicit setting of 'density2df' stat")
    ),
], ncol=2)

#### `'qq2'` and `'qq2_line'`

In [21]:
# First plot: geom_qq2() + geom_qq2_line(); second: point and line geometries
# with stat='qq2' and stat='qq2_line'.
gggrid([
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_qq2()
        + geom_qq2_line()
        + ggtitle("'qq2' and 'qq2_line' stats are default\nfor geom_qq2() and geom_qq2_line()")
    ),
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_point(stat='qq2')
        + geom_line(stat='qq2_line')
        + ggtitle("Explicit setting of 'qq2' and 'qq2_line' stats")
    ),
])

#### `'smooth'`

In [22]:
# First plot: raw points plus geom_smooth(); second: raw points plus a ribbon
# and a line geometry that both use stat='smooth'.
gggrid([
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_point()
        + geom_smooth()
        + ggtitle("'smooth' stat is default for smooth geometry")
    ),
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_point()
        + geom_ribbon(stat='smooth', fill="yellow", size=0)
        + geom_line(stat='smooth', color="red")
        + ggtitle("Explicit setting of 'smooth' stat")
    ),
])

#### `'summarybin'`

In [23]:
# First plot: stat_summary_bin() layer; second: a crossbar geometry with stat='summarybin'.
gggrid([
    (
        ggplot(df, aes("cty", "hwy"))
        + stat_summary_bin()
        + ggtitle("'summarybin' stat is default for stat_summary_bin()")
    ),
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_crossbar(stat='summarybin')
        + ggtitle("Explicit setting of 'summarybin' stat")
    ),
])