In [None]:
# Auto-reload frequently changed files
%load_ext autoreload
%autoreload 2
%aimport utils

import pandas as pd
import numpy as np
import altair as alt
from altair_saver import save
from os.path import join
from web import for_website

from constants import COLUMNS
from utils import (
    read_combined_labs_df, read_combined_by_country_labs_df, read_combined_by_site_labs_df,
    read_loinc_df,
    apply_theme
)

In [None]:
# Shared settings for every visualization below; run this cell before rendering any chart.

# Titles
# NUM_SITES counts the distinct site ids present in the by-site dataset.
NUM_SITES = len(read_combined_by_site_labs_df()[COLUMNS.SITE_ID].unique())
DATA_DATE = "2020-04-07"
VIS_DATE = "2020-04-09"
NUM_PATIENTS = "15,427"
SUBTITLE = f"Data as of {DATA_DATE} | {NUM_SITES} Sites | Plots generated on {VIS_DATE}"

# Where to save visualization *.PNG files
SAVE_DIR = join("..", "output")

# Colors
COMBINED = "All countries"
COMBINED_COLOR = "#444444"

# Country names and their colors are parallel lists (same index = same country)
COUNTRIES = ["France", "Germany", "Italy", "Singapore", "USA"]
COUNTRY_COLOR = ["#0072B2", "#E69F00", "#009E73", "#CC79A7", "#D55E00"]
COLOR_BY_COUNTRY = dict(zip(COUNTRIES, COUNTRY_COLOR))

# Site-level colors
# SITES and SITES_COUNTRY are parallel lists: each site is colored by its country.
SITES = [
    'APHP', 'FRBDX', 'UKER', 'UKFR', 'ICSM1', 'ICSM20', 'ICSM5', 'POLIMI',
    'BCH', 'BIDMC', 'CHOP', 'KUMC', 'MAYOC', 'MGB', 'MUSC', 'UCLA', 'UMICH', 'UPenn', 'UTSW',
]
# Anonymous labels are numbered from 01 in the order sites appear in SITES
SITES_ANONYMOUS = [f"SITE {rank:02d}" for rank in range(1, len(SITES) + 1)]
SITES_TO_ANONYMOUS = dict(zip(SITES, SITES_ANONYMOUS))
SITES_COUNTRY = [
    "France", "France", "Germany", "Germany", "Italy", "Italy", "Italy", "Italy",
    "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA",
]
SITE_COLOR = [COLOR_BY_COUNTRY[country] for country in SITES_COUNTRY]
COLOR_BY_SITE = dict(zip(SITES, SITE_COLOR))

# Same as the country lists, with the all-countries aggregate placed first
COUNTRIES_AND_COMBINED = [COMBINED, *COUNTRIES]
COUNTRY_AND_COMBINED_COLOR = [COMBINED_COLOR, *COUNTRY_COLOR]
COLOR_BY_COUNTRY_AND_COMBINED = dict(zip(COUNTRIES_AND_COMBINED, COUNTRY_AND_COMBINED_COLOR))

# Generic 20-color categorical palette
COLOR20 = [
    "#3366cc", "#dc3912", "#ff9900", "#109618", "#990099",
    "#0099c6", "#dd4477", "#66aa00", "#b82e2e", "#316395",
    "#994499", "#22aa99", "#aaaa11", "#6633cc", "#e67300",
    "#8b0707", "#651067", "#329262", "#5574a6", "#3b3eac",
]

# Required Setup
- All combined datasets should be placed in `../data/combined` (e.g., `../data/combined/Labs-Combinedyymmdd.csv`).
- To save PNG files for visualizations, a folder named "output" should be present (i.e., `../output/`).

# Data Preprocessing

In [None]:
def process_labs_df(df_lb):
    """
    Clean a combined labs frame and add the derived columns used by the charts.

    The input frame is left untouched; all work happens on a copy.

    Steps, in order:
      1. Negative patient counts, mean values and standard deviations are set to 0.
      2. "upper" / "under" are the mean value plus / minus the standard deviation.
      3. "under_p" is the patient count and "upper_p" adds the masked upper bound to it;
         the patient count itself is then moved to the midpoint of that interval.
      4. "loinc_name" is the lab name from `read_loinc_df()`, with the unit appended in
         parentheses unless the unit is the string "-1".
      5. The number-of-sites column is copied from the unmasked-sites column, after which
         the masked/unmasked helper columns are dropped.

    Parameters
    ----------
    df_lb : pandas.DataFrame
        Labs data containing the `COLUMNS` fields referenced above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned values and the derived columns.

    Raises
    ------
    KeyError
        If a LOINC code in `df_lb` is not present in the LOINC lookup table.
    """
    # Work on a copy so the caller's frame is never modified as a side effect
    df = df_lb.copy()

    # Negative values to zeros
    for column in (COLUMNS.NUM_PATIENTS, COLUMNS.MEAN_VALUE, COLUMNS.STDEV_VAL):
        df.loc[df[column] < 0, column] = 0

    # Upper and lower bounds for values
    df["upper"] = df[COLUMNS.MEAN_VALUE] + df[COLUMNS.STDEV_VAL]
    df["under"] = df[COLUMNS.MEAN_VALUE] - df[COLUMNS.STDEV_VAL]

    # Bounds for the patient count; the count itself becomes the midpoint of the two
    df["upper_p"] = df[COLUMNS.NUM_PATIENTS] + df[COLUMNS.MASKED_UPPER_BOUND_NUM_PATIENTS]
    df["under_p"] = df[COLUMNS.NUM_PATIENTS]
    df[COLUMNS.NUM_PATIENTS] += (df["upper_p"] - df["under_p"]) / 2.0

    # Add readable names for LOINC
    loinc_df = read_loinc_df().set_index(COLUMNS.LOINC).rename(columns={'labTest': 'name'})

    def readable_name(code):
        """Return the lab name for `code`, suffixed with its unit unless the unit is "-1"."""
        name = loinc_df.at[code, "name"]
        unit = loinc_df.at[code, "unit"]
        return name if unit == "-1" else name + " (" + unit + ")"

    df["loinc_name"] = df[COLUMNS.LOINC].apply(readable_name)

    # Number of sites
    df[COLUMNS.NUM_SITES] = df[COLUMNS.UNMASKED_SITES_NUM_PATIENTS]

    # Drop unused columns
    return df.drop(columns=[
        COLUMNS.MASKED_UPPER_BOUND_NUM_PATIENTS,
        COLUMNS.UNMASKED_SITES_NUM_PATIENTS,
        COLUMNS.MASKED_SITES_NUM_PATIENTS
    ])

# Read each aggregation level and run it through the shared preprocessing
df_lb = process_labs_df(read_combined_by_country_labs_df())
df_lb_combined = process_labs_df(read_combined_labs_df())
df_lb_site = process_labs_df(read_combined_by_site_labs_df())

# Stack all levels into one frame; country rows (including the all-country
# aggregate) are flagged True, site rows False
df_lb = pd.concat([df_lb, df_lb_combined])
df_lb[COLUMNS.IS_COUNTRY] = True
df_lb_site[COLUMNS.IS_COUNTRY] = False
df_lb = pd.concat([df_lb, df_lb_site])

# Give the all-country aggregate its display label
df_lb.loc[df_lb[COLUMNS.SITE_ID] == "Combined", COLUMNS.SITE_ID] = COMBINED

# Keep only rows to which at least one site contributed
df_lb = df_lb.loc[df_lb[COLUMNS.NUM_SITES].ne(0)]

# Axis extents, computed over the remaining rows of every level
DAYS_SINCE_EXTENT = [min(df_lb[COLUMNS.DAYS_SINCE_POSITIVE]), max(df_lb[COLUMNS.DAYS_SINCE_POSITIVE])]
NUM_SITES_EXTENT = [0, max(df_lb[COLUMNS.NUM_SITES])]
NUM_PATIENTS_EXTENT = [0, max(df_lb[COLUMNS.NUM_PATIENTS])]

# Anonymize: swap each known site id for its "SITE nn" alias
for site, alias in SITES_TO_ANONYMOUS.items():
    df_lb.loc[df_lb[COLUMNS.SITE_ID] == site, COLUMNS.SITE_ID] = alias

df_lb

# Visualizations

In [None]:
# Distinct lab display names (used as dropdown options) and the LOINC codes present in the data
LOINCS = df_lb["loinc_name"].unique()
LOINC_IDS = df_lb["loinc"].unique()

# Tooltip shared by every mark in the lab charts.
# Each entry is (field, title shown to the user, extra Tooltip options).
LAB_TOOLTIP = [
    alt.Tooltip(field, title=title, **options)
    for field, title, options in (
        (COLUMNS.SITE_ID, "Country", {}),
        (COLUMNS.DAYS_SINCE_POSITIVE, "Days since positive", {}),
        (COLUMNS.MEAN_VALUE, "Mean value", {"format": ".2f"}),
        (COLUMNS.NUM_PATIENTS, "# of patients", {}),
        (COLUMNS.NUM_SITES, "# of institutions", {}),
    )
]
class DATA_LEVEL:
    """Names of the aggregation levels accepted by `lab_by_date`."""
    # All countries merged / one series per country / one series per site
    COMBINED, COUNTRY, SITE = "combined", "country", "site"

def lab_by_date(data_level=DATA_LEVEL.COMBINED):
    """
    Build the interactive three-panel lab chart for one aggregation level.

    The panels are stacked vertically and share the x axis (days since positive):
    mean lab value as a line with points on top, number of patients as bars in the
    middle, and number of sites as bars at the bottom. A dropdown selects the lab,
    the legend toggles series, and hovering marks the nearest day.

    Reads the module-level `df_lb` frame, `LOINCS`, `LAB_TOOLTIP`, and the
    color / extent constants.

    Parameters
    ----------
    data_level : str
        One of the `DATA_LEVEL` values. `DATA_LEVEL.COMBINED` keeps only the
        all-countries rows and `DATA_LEVEL.COUNTRY` keeps the per-country rows;
        any other value falls through to the site-level rows.

    Returns
    -------
    The vertically concatenated Altair chart with all three selections attached.
    """
    chart_width = 700
    days_field = f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q"
    series_field = f"{COLUMNS.SITE_ID}:N"

    # --- Selections ---
    # Hover selection on the nearest day; empty until the mouse enters, cleared on mouseout
    nearest = alt.selection(
        type="single", nearest=True, on="mouseover",
        fields=[COLUMNS.DAYS_SINCE_POSITIVE], empty='none', clear="mouseout"
    )

    # Dropdown that picks which lab is displayed; starts on the first lab name
    lab_dropdown = alt.binding_select(options=LOINCS)
    lab_selection = alt.selection_single(
        fields=["loinc_name"], bind=lab_dropdown, name="Lab", init={"loinc_name": LOINCS[0]}
    )

    # Clicking legend entries filters the displayed series
    legend_selection = alt.selection_multi(fields=[COLUMNS.SITE_ID], bind="legend")

    # --- Select rows (Altair is not good at handling large number of rows) ---
    if data_level == DATA_LEVEL.COMBINED:
        df = df_lb[df_lb[COLUMNS.SITE_ID] == COMBINED]
        color_scale = alt.Scale(domain=[COMBINED], range=[COMBINED_COLOR])
    elif data_level == DATA_LEVEL.COUNTRY:
        df = df_lb[df_lb[COLUMNS.IS_COUNTRY] == True]
        df = df[df[COLUMNS.SITE_ID] != COMBINED]
        color_scale = alt.Scale(domain=COUNTRIES, range=COUNTRY_COLOR)
    else:
        df = df_lb[df_lb[COLUMNS.IS_COUNTRY] == False]
        # NOTE(review): both an explicit `range` and a `scheme` are passed here;
        # presumably only one of them takes effect - confirm which is intended.
        color_scale = alt.Scale(domain=SITES_ANONYMOUS, range=SITE_COLOR, scheme="category20")

    # --- Rules ---
    # Fixed dashed vertical line at day -1
    df_v_rule = pd.DataFrame({"date": [-1]})
    v_rule = alt.Chart(df_v_rule).mark_rule(color="gray", strokeDash=[3,3]).encode(
        x="date:Q"
    )

    # Thin vertical line that follows the hovered day
    nearest_rule = alt.Chart(df).mark_rule(color="black").encode(
        x=days_field,
        size=alt.value(0.5)
    ).transform_filter(
        nearest
    )

    # --- Base chart: rows restricted to the legend and dropdown selections ---
    filtered_chart = alt.Chart(df).transform_filter(
        legend_selection
    ).transform_filter(
        lab_selection
    )

    # --- Top chart: mean lab value over time ---
    line = filtered_chart.mark_line(size=2, opacity=0.7).encode(
        x=alt.X(
            days_field,
            title=None,
            axis=alt.Axis(grid=True, labels=False, ticks=False, domain=True),
            scale=alt.Scale(domain=DAYS_SINCE_EXTENT)
        ),
        y=alt.Y(
            f"{COLUMNS.MEAN_VALUE}:Q",
            title="Mean value",
        ),
        color=alt.Color(series_field, scale=color_scale),
        tooltip=LAB_TOOLTIP
    )

    # Points reuse the line encoding; the hovered day's points are drawn larger
    circle = line.mark_circle(size=30, opacity=0.7).encode(
        size=alt.condition(~nearest, alt.value(30), alt.value(60))
    )

    top_chart = (circle + line + nearest_rule).properties(height=300, width=chart_width).interactive()

    # --- Middle chart: number of patients per day ---
    bar = filtered_chart.mark_bar(size=8).encode(
        y=alt.Y(
            f"sum({COLUMNS.NUM_PATIENTS}):Q",
            title="# of patients"
        ),
        x=alt.X(
            days_field,
            title=None,
            axis=alt.Axis(grid=True, labels=False, ticks=False, domain=True)
        ),
        color=alt.Color(series_field, scale=color_scale, title=None),
        tooltip=LAB_TOOLTIP,
    )

    middle_chart = (bar + v_rule + nearest_rule).properties(height=60, width=chart_width)

    # --- Bottom chart: number of sites per day (carries the visible x axis) ---
    bottom_bar = filtered_chart.mark_bar(size=8).encode(
        y=alt.Y(
            f"sum({COLUMNS.NUM_SITES}):Q",
            title="# of sites",
        ),
        x=alt.X(
            days_field,
            title="Days since positive"
        ),
        color=alt.Color(series_field, scale=color_scale),
        tooltip=LAB_TOOLTIP,
    )
    bottom_chart = (bottom_bar + v_rule + nearest_rule).properties(height=60, width=chart_width).interactive()

    # --- Assemble: shared x and color scales, independent y scales ---
    result_vis = alt.vconcat(top_chart, middle_chart, bottom_chart, spacing=5).resolve_scale(
        y="independent", x="shared", color="shared"
    ).add_selection(
        legend_selection
    ).add_selection(
        nearest
    ).add_selection(
        lab_selection
    )

    return result_vis

## Lab values

In [None]:
# All-countries view. NOTE: despite its name, `country_level_lab` holds the
# combined (all-countries) chart in this cell.
country_level_lab = apply_theme(
    lab_by_date(data_level=DATA_LEVEL.COMBINED).properties(
        title=dict(text="Lab values", subtitle=[SUBTITLE], subtitleColor="gray", dx=60)
    ),
    legend_orient="right",
)

# TODO: Remove this before deploying notebook
for_website(country_level_lab, "Labs", "Lab values")

country_level_lab

## Lab values by country

In [None]:
# Per-country view: one series per country, themed with the legend on the right
country_level_lab = apply_theme(
    lab_by_date(data_level=DATA_LEVEL.COUNTRY).properties(
        title=dict(text="Lab values by country", subtitle=[SUBTITLE], subtitleColor="gray", dx=60)
    ),
    legend_orient="right",
)

# TODO: Remove this before deploying notebook
for_website(country_level_lab, "Labs", "Lab values by country")

country_level_lab

## Lab values by site

In [None]:
# Per-site view: one series per anonymized site, themed with the legend on the right
site_level_lab = apply_theme(
    lab_by_date(data_level=DATA_LEVEL.SITE).properties(
        title=dict(text="Lab values by site", subtitle=[SUBTITLE], subtitleColor="gray", dx=60)
    ),
    legend_orient="right",
)

# TODO: Remove this before deploying notebook
for_website(site_level_lab, "Labs", "Lab values by site")

site_level_lab