# EDDI19 - Workshop

## Setup

In [99]:
import numpy as np
import pandas as pd

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
# Route Bokeh output to the notebook so that show() renders figures inline
output_notebook()

In [100]:
from bokeh.models import ColumnDataSource, Span, CDSView, RangeSlider, CustomJS, CustomJSFilter, DatetimeTickFormatter, HoverTool, PanTool, ZoomInTool, ZoomOutTool, ResetTool, Legend
from bokeh.palettes import Spectral6
from bokeh.transform import factor_cmap
from bokeh.layouts import column, widgetbox, row

In [101]:
def get_df(filename):
    """Read the CSV file at ``filename`` and return its content as a pandas DataFrame.

    The file is parsed with ``pd.read_csv`` using its default options.
    """
    frame = pd.read_csv(filename)
    return frame

# Load the four workshop datasets, in this order, from the local `data/` directory
panelists, events, interactions, surveys = map(get_df, (
    'data/panelists_ndob.csv',
    'data/events.csv',
    'data/survey_interactions_2019_light.csv',
    'data/surveys.csv',
))

## First example

In [102]:
# Number of panelists for each distinct value of the `sex` column
s = panelists['sex'].value_counts()

# Bar categories and bar heights, kept in matching order
genders = list(s.index)
counts = list(s.values)

source = ColumnDataSource(data={'genders': genders, 'counts': counts})

# Categorical x axis: one slot per gender
p = figure(
    title="Panelist count by gender",
    x_range=genders,
    plot_width=400,
    plot_height=400,
)

# One bar per gender, coloured by category and listed in the legend
p.vbar(
    source=source,
    x='genders',
    top='counts',
    width=0.9,
    fill_color=factor_cmap('genders', palette=Spectral6, factors=genders),
    legend_field='genders',
)

# Cosmetics: bars start at zero, no vertical grid lines, horizontal legend at the bottom
p.y_range.start = 0
p.xgrid.grid_line_color = None
p.legend.location = "bottom_center"
p.legend.orientation = "horizontal"

show(p)

## Second example

In [103]:
# Ids of the events whose codename is 'survey_start'
EVENT_START_IDS = events.loc[events['codename'].isin(['survey_start']), 'id']

# Keep only the 'survey_start' interactions and attach each panelist's birthdate.
# The merge matches `panelist_id` against the panelists' `id`; that key column is
# dropped afterwards.
iactions = (
    interactions[interactions['event_id'].isin(EVENT_START_IDS)]
    .merge(panelists[['id', 'birthdate']], left_on='panelist_id', right_on='id')
    .drop(columns=['id'])
)

# Parse the `date` and `birthdate` columns into datetimes
iactions['date'] = pd.to_datetime(iactions['date'])
iactions['birthdate'] = pd.to_datetime(iactions['birthdate'])

# Adds `age` column
from dateutil.relativedelta import relativedelta
import calendar  # no longer used by get_age; kept because other cells may rely on it
def get_age(date, now=None):
    """Return the age, in fractional years, of someone born on ``date``.

    The integer part is the number of completed years between ``date`` and
    ``now``. The fractional part is the share of the current "birthday year"
    already elapsed, i.e. the position of ``now`` between the last and the next
    birthday anniversary. Measuring against the anniversaries (rather than
    comparing day-of-year numbers) avoids being off by a full year on the
    birthday itself or when the birth year and the current year differ in
    leap-year status.

    Parameters
    ----------
    date : pandas.Timestamp or datetime
        Birth date. A missing value (``NaT``/``None``/``NaN``) yields ``NaN``.
    now : pandas.Timestamp or datetime, optional
        Reference instant. Defaults to ``pd.to_datetime('now')``.

    Returns
    -------
    float
        Age in years at ``now``.
    """
    if pd.isna(date):
        return float('nan')
    if now is None:
        now = pd.to_datetime('now')

    # Completed years, then the two anniversaries that bracket `now`
    years = relativedelta(now, date).years
    last_birthday = date + relativedelta(years=years)
    next_birthday = date + relativedelta(years=years + 1)

    return years + (now - last_birthday) / (next_birthday - last_birthday)
# One age value per interaction, derived from the panelist's birthdate
iactions['age'] = iactions['birthdate'].apply(get_age)


# Parse the surveys' `start_date` column into datetimes (updates `surveys` itself)
surveys['start_date'] = pd.to_datetime(surveys['start_date'])

# Start dates of the surveys that appear in the filtered interactions
survey_ids = iactions['survey_id'].unique()
survey_start_dates = surveys.loc[surveys['id'].isin(survey_ids), 'start_date']

source = ColumnDataSource(iactions)

# Scatter plot: interaction date on the x axis, panelist age on the y axis
p = figure(
    title="Survey start date by age",
    x_axis_type='datetime',
    plot_width=600,
    plot_height=600,
)
p.circle(x='date', y='age', source=source, line_alpha=0.15, fill_alpha=0.15)

# One vertical line per survey start date
for date in survey_start_dates:
    p.add_layout(Span(dimension='height', location=date, line_width=1))

show(p)

## Third example

In [104]:
from datetime import timedelta

def seconds_to_datetime(df):
    """Convert a number of seconds into a ``datetime.timedelta``.

    Despite the function and parameter names, ``df`` is used as a duration in
    seconds and the result is a ``timedelta``, not a ``datetime``.
    """
    return timedelta(0, df)

def get_completion_time(df):
    """Return a copy of ``df`` with a ``completion_time`` column, in seconds.

    ``df`` is expected to hold the interactions of one group (it is used with
    ``groupby(...).apply`` in this notebook). When the group has exactly two
    rows, ``completion_time`` is the absolute difference between their two
    ``date`` values, in seconds, repeated on both rows. For any other number
    of rows the column is filled with ``NaN``.

    The input frame is never modified: functions passed to ``groupby.apply``
    should not mutate the group they receive. The column is always created so
    that later column selections work even when no group has exactly two rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like ``date`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the additional ``completion_time`` column.
    """
    dates = list(df['date'])
    if len(dates) == 2:
        seconds = abs((dates[0] - dates[1]).total_seconds())
    else:
        seconds = float('nan')
    return df.assign(completion_time=seconds)

def q1(x):
    """Return the first quartile (25th percentile) of ``x``.

    The function name doubles as the column label when passed to ``agg``.
    """
    return x.quantile(q=0.25)

def q3(x):
    """Return the third quartile (75th percentile) of ``x``.

    The function name doubles as the column label when passed to ``agg``.
    """
    return x.quantile(q=0.75)


# Interactions used for the completion times. The start/end event filter below
# is currently disabled, so every interaction is taken into account.
#EVENT_START_END_IDS = events[events['codename'].isin(['survey_start', 'survey_end'])]['id']
#iactions2 = interactions[interactions['event_id'].isin(EVENT_START_END_IDS)]
iactions2 = interactions
# `iactions2` is the same object as `interactions`, so this also converts `interactions['date']`
iactions2['date'] = pd.to_datetime(iactions2['date'])

# Per (panelist, survey): completion time -> one row per pair -> drop pairs without
# a completion time -> per-survey statistics of the completion time (in seconds)
iactions2 = (
    iactions2
    .groupby(['panelist_id', 'survey_id'])
    .apply(get_completion_time)
    [['completion_time', 'panelist_id', 'survey_id']]
    .drop_duplicates()
    .dropna(subset=['completion_time'])
    .groupby(['survey_id'])
    .agg({'completion_time': ['mean', 'min', q1, 'median', q3, 'max']})
)

# Flatten the two-level column index, keeping only the statistic names
iactions2.columns = iactions2.columns.get_level_values(1)

# Attach the survey title and estimated time
iactions2 = iactions2.merge(
    surveys[['id', 'title', 'estimated_time']],
    left_on='survey_id',
    right_on='id',
)

# Convert the statistics (seconds) and the estimated time (minutes) to timedeltas
iactions2[['mean', 'min', 'q1', 'median', 'q3', 'max']] = (
    iactions2[['mean', 'min', 'q1', 'median', 'q3', 'max']].apply(pd.to_timedelta, unit='seconds')
)
iactions2[['estimated_time']] = iactions2[['estimated_time']].apply(pd.to_timedelta, unit='minutes')

In [105]:
# `df` is the same object as `iactions2`; survey ids become strings so they can
# be used as categories on the y axis
df = iactions2
df['id'] = df['id'].astype(str)
source = ColumnDataSource(df)

ids = df['id']
# Smallest / largest id (string comparison); not used further in the cells shown here
rng_start = ids.min()
rng_end = ids.max()
rng_value = (rng_start, rng_end)

p = figure(
    plot_height=800,
    plot_width=800,
    title="Response time distribution by survey",
    x_axis_type='datetime',
    y_range=list(ids.values),  # one categorical row per survey id
    tools='save'
)

# The x axis shows durations: start at zero, initially span 1h15 (the datetime axis
# works in milliseconds), and never pan/zoom below zero
p.x_range.start = 0
p.x_range.end = pd.to_timedelta('01:15:00').total_seconds()*1000
p.x_range.bounds = (0, None)
# Each scale of DatetimeTickFormatter takes a *list* of format strings in the
# Bokeh versions this notebook targets (plot_height / widgetbox era), so
# `hourmin` is passed as a list, consistently with `minutes`
p.xaxis.formatter = DatetimeTickFormatter(minutes=['%M min'], hourmin=['%H h %M min'])
p.ygrid.grid_line_color = None
p.xgrid.grid_line_dash = 'dashed'

# Lower half of the box: from the 25th percentile to the median
q2 = p.hbar(source=source, y='id', left='q1', right='median', height=0.8, color='limegreen', alpha=0.5)

# Upper half of the box: from the median to the 75th percentile
# (this rebinds the notebook-level name `q3`, which is also the name of the 75% quantile helper)
q3 = p.hbar(source=source, y='id', left='median', right='q3', height=0.8, color='indianred', alpha=0.5)

# Tooltip shown when hovering either half of the box
p.add_tools(HoverTool(
    renderers=[q2, q3],
    point_policy='follow_mouse',
    formatters=dict.fromkeys(
        ['mean', 'min', 'q1', 'median', 'q3', 'max', 'estimated_time'],
        'datetime',
    ),
    tooltips=[
        ('Survey', '@title'),
        ('Estimated time', '@estimated_time{%H:%M:%S}'),
        ('Minimum', '@min{%H:%M:%S}'),
        ('25%', '@{q1}{%H:%M:%S}'),
        ('Median', '@{median}{%H:%M:%S}'),
        ('75%', '@{q3}{%H:%M:%S}'),
        ('Maximum', '@max{%H:%M:%S}'),
    ],
))

# Whiskers: thin marks at the minimum and at the maximum of each survey
p.rect(source=source, x='min', y='id', width=0.01, height=0.8, color='gray')
p.rect(source=source, x='max', y='id', width=0.01, height=0.8, color='gray')

# Stems: a line from the minimum to the maximum of each survey
s = p.segment(source=source, x0='min', y0='id', x1='max', y1='id', color='gray')

# Estimated completion time of each survey
e = p.circle(source=source, x='estimated_time', y='id', size=8, color="black")

# Legend, placed to the right of the plot area
legend = Legend(
    location=(10, 300),
    items=[
        ('Estimated time', [e]),
        ('25%-50%', [q2]),
        ('50%-75%', [q3]),
        ('min-max', [s]),
    ],
)
p.add_layout(legend, 'right')

# Navigation tools: pan and zoom are restricted to the horizontal direction
p.add_tools(
    PanTool(dimensions="width"),
    ZoomInTool(dimensions="width"),
    ZoomOutTool(dimensions="width"),
    ResetTool(),
)

show(p)