In this notebook we'll look at how data visualization can help us understand the relationship between two features (x and y). This is based on a research study called <a href="https://www.autodeskresearch.com/publications/samestats" target="_blank">The Datasaurus Dozen</a> by Autodesk Research and the original <a href="http://www.thefunctionalart.com/2016/08/download-datasaurus-never-trust-summary.html" target="_blank">Datasaurus</a> provided by Alberto Cairo.

Takeaway: <b>Never trust summary statistics alone; always visualize your data</b>

Use the dropdown to select different datasets. Note that the basic stats (first and second moments, and correlation) are almost the same for all the datasets even though the relationships between `x` and `y` are quite different (as is evident from the scatter plot and histograms).

In [None]:
import pandas as pd

import ipywidgets as widgets
import bqplot.pyplot as plt

In [None]:
# Render floats with two decimal places whenever pandas displays them.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Load the tab-separated Datasaurus Dozen data from the local data directory.
datasaurus_data = pd.read_csv('data/DatasaurusDozen.tsv', delimiter='\t')

# One group per value of the 'dataset' column; reused by the later cells.
dataset_gby = datasaurus_data.groupby('dataset')

# Select x and y explicitly. This fixes the column order so it always matches
# the labels assigned to stats_df below, and it keeps the grouping column out
# of `apply`, which newer pandas versions warn about.
xy_gby = dataset_gby[['x', 'y']]

# First two moments (mean and standard deviation) of x and y per dataset.
stats = xy_gby.agg(['mean', 'std'])

# Correlation between x and y within each dataset.
corr = xy_gby.apply(lambda g: g['x'].corr(g['y']))

# Combined summary table: one row per dataset, flat column labels.
stats_df = pd.concat([stats, corr], axis=1)
stats_df.columns = ['x_mean', 'x_std', 'y_mean', 'y_std', 'corr']

In [None]:
# Dropdown offering one entry per group key of the dataset groupby.
type_dropdown = widgets.Dropdown(
    options=list(dataset_gby.groups),
    description='Dataset',
)

# Empty container whose children are replaced with the stats table
# each time a dataset is selected.
stats_table_placeholder = widgets.Box()

In [None]:
# Scatter plot of y against x; it starts empty and is filled in by the
# dropdown callback.
scat_fig = plt.figure(animation_duration=1000, preserve_aspect=True)
scat_fig.layout.height = '650px'
scat_fig.layout.width = '800px'
scat = plt.scatter(
    [], [],
    default_size=40,
    stroke='black',
    colors=['deepskyblue'],
)
plt.xlabel('X')
plt.ylabel('Y')

# Histograms of X and Y: both figures share one layout and one title template.
hist_title_tmpl = 'Histogram of {dataset}[{var}]'
hist_layout = widgets.Layout(width='400px', height='320px')

x_hist_fig = plt.figure(layout=hist_layout)
x_hist = plt.hist([], bins=30, colors=['orangered'])

y_hist_fig = plt.figure(layout=hist_layout)
y_hist = plt.hist([], bins=30, colors=['lightgreen'])

# Turn off grid lines on every axis of both histogram figures.
for axis in [*x_hist_fig.axes, *y_hist_fig.axes]:
    axis.grid_lines = 'none'
        
# create a callback to update the scatter and the stats table
def update(*args):
    """Refresh the scatter plot, both histograms and the stats table.

    Reads the currently selected dataset name from ``type_dropdown`` and
    pushes that dataset's x/y values and summary row into the widgets.
    Any positional arguments (e.g. the change notification passed by
    ``observe``) are accepted and ignored.
    """
    selected = type_dropdown.value
    scat_fig.title = selected

    # Assign both coordinates inside hold_sync so the scatter mark is
    # synced once with a consistent x/y pair.
    with scat.hold_sync():
        group_xy = dataset_gby.get_group(selected)[['x', 'y']]
        x, y = group_xy.values.T
        scat.x, scat.y = x, y

    # Update each histogram's sample first, then its figure title.
    hist_targets = (
        (x_hist, x_hist_fig, 'x', x),
        (y_hist, y_hist_fig, 'y', y),
    )
    for hist_mark, hist_fig, var_name, sample in hist_targets:
        hist_mark.sample = sample
        hist_fig.title = hist_title_tmpl.format(dataset=selected, var=var_name)

    # Render the selected dataset's summary row as a one-column table in a
    # fresh Output widget and swap it into the placeholder box.
    table_out = widgets.Output()
    with table_out:
        display(stats_df.loc[selected].to_frame())
    stats_table_placeholder.children = [table_out]

# Re-run the callback whenever the dropdown's selected value changes.
type_dropdown.observe(update, names='value')

# Populate the plots and the table for the initially selected dataset.
update(None)

# Layout: dropdown on top; below it the scatter plot, the stacked
# histograms and the stats table side by side. The final expression is
# the cell's displayed output.
histograms = widgets.VBox(children=[x_hist_fig, y_hist_fig])
widgets.VBox(children=[
    type_dropdown,
    widgets.HBox(children=[scat_fig, histograms, stats_table_placeholder]),
])