In [None]:
# Auto-reload frequently changed files
%load_ext autoreload
%autoreload 2
%aimport utils

import pandas as pd
import numpy as np
import altair as alt
from altair_saver import save
from ipywidgets import interact
from os.path import join

from constants import COLUMNS
from utils import (
    read_combined_daily_counts_df, read_combined_by_country_daily_counts_df,
    read_combined_demographics_df, read_combined_by_country_demographics_df,
    read_combined_labs_df, read_combined_by_country_labs_df,
    read_combined_diagnoses_df, read_combined_by_country_diagnoses_df,
    apply_theme, apply_grouped_bar_theme, apply_trellis_theme,
    read_icd_df, read_loinc_df
)

In [None]:
# Shared metadata printed in every chart subtitle; update these values before each rendering run
NUM_SITES = "12"
DATA_DATE = "2020-04-05"
NUM_PATIENTS = "15,427"
VIS_DATE = "2020-04-07"
SUBTITLE = (
    f"Data as of {DATA_DATE} | {NUM_SITES} Sites | {NUM_PATIENTS} Patients"
    f" | Plots generated on {VIS_DATE}"
)

# Directory that receives the exported visualization *.PNG files
SAVE_DIR = join("..", "output")

# Label and color used for the series that aggregates every country
COMBINED = "All countries"
COMBINED_COLOR = "#444444"

# Countries and their colors, paired by position ("Germany" / "#E69F00" currently left out)
COUNTRIES = ["France", "Italy", "USA"]
COUNTRY_COLOR = ["#0072B2", "#009E73", "#D55E00"]
COLOR_BY_COUNTRY = dict(zip(COUNTRIES, COUNTRY_COLOR))

# Same pairing with the aggregated series placed first
COUNTRIES_AND_COMBINED = [COMBINED, *COUNTRIES]
COUNTRY_AND_COMBINED_COLOR = [COMBINED_COLOR, *COUNTRY_COLOR]
COLOR_BY_COUNTRY_AND_COMBINED = dict(zip(COUNTRIES_AND_COMBINED, COUNTRY_AND_COMBINED_COLOR))

# 20-color categorical palette
COLOR20 = [
    "#3366cc", "#dc3912", "#ff9900", "#109618", "#990099",
    "#0099c6", "#dd4477", "#66aa00", "#b82e2e", "#316395",
    "#994499", "#22aa99", "#aaaa11", "#6633cc", "#e67300",
    "#8b0707", "#651067", "#329262", "#5574a6", "#3b3eac",
]

# Required Setups

- All four combined datasets should be placed in `../data/combined` (e.g., `../data/combined/DailyCounts-Combinedyymmdd.csv` for the DailyCounts file).
- To save PNG files for visualizations, a folder named "output" should be present (i.e., `../output/`).

# Labs

In [None]:
def process_labs_df(df_lb):
    """Prepare a raw labs frame for plotting and return the processed frame.

    The work is done on a copy, so the frame passed by the caller is not
    modified. Steps, in order:

    1. Replace negative patient counts, mean values and stdev values with 0.
    2. Add ``upper``/``under`` (mean value plus/minus stdev).
    3. Add ``upper_p``/``under_p`` (patient count with/without the masked
       upper bound) and move the patient count to the middle of that range.
    4. Add ``loinc_name``: the lab test name from the LOINC lookup table,
       followed by the unit in parentheses unless the unit is ``"-1"``.
    5. Set the number of sites from the unmasked-sites column.
    6. Drop the masking/site helper columns.

    A LOINC code missing from the lookup table is not handled here; the
    ``.at`` lookup is expected to raise for it.
    """
    # Copy first: every step below assigns into the frame
    df_lb = df_lb.copy()

    # Zero negative values
    for column in (COLUMNS.NUM_PATIENTS, COLUMNS.MEAN_VALUE, COLUMNS.STDEV_VAL):
        df_lb.loc[df_lb[column] < 0, column] = 0

    # Band around the mean value
    df_lb["upper"] = df_lb[COLUMNS.MEAN_VALUE] + df_lb[COLUMNS.STDEV_VAL]
    df_lb["under"] = df_lb[COLUMNS.MEAN_VALUE] - df_lb[COLUMNS.STDEV_VAL]

    # Range of the patient count; the plotted count is the midpoint of that range
    df_lb["upper_p"] = df_lb[COLUMNS.NUM_PATIENTS] + df_lb[COLUMNS.MASKED_UPPER_BOUND_NUM_PATIENTS]
    df_lb["under_p"] = df_lb[COLUMNS.NUM_PATIENTS]
    df_lb[COLUMNS.NUM_PATIENTS] += (df_lb["upper_p"] - df_lb["under_p"]) / 2.0

    loinc_df = read_loinc_df().set_index('loinc').rename(columns={'labTest': 'name'})

    def _display_name(code):
        # "-1" is the lookup table's marker for "no unit to show"
        name = loinc_df.at[code, "name"]
        unit = loinc_df.at[code, "unit"]
        return name if unit == "-1" else name + " (" + unit + ")"

    df_lb["loinc_name"] = df_lb[COLUMNS.LOINC].apply(_display_name)

    # Number of sites (masked sites are deliberately not added)
    df_lb[COLUMNS.NUM_SITES] = df_lb[COLUMNS.UNMASKED_SITES_NUM_PATIENTS] # + df_lb[COLUMNS.MASKED_SITES_NUM_PATIENTS]

    # Drop unused columns
    df_lb = df_lb.drop(columns=[
        COLUMNS.MASKED_UPPER_BOUND_NUM_PATIENTS,
        COLUMNS.UNMASKED_SITES_NUM_PATIENTS,
        COLUMNS.MASKED_SITES_NUM_PATIENTS
    ])

    return df_lb

# Load the per-country and the all-country lab datasets and process both the same way
df_lb = read_combined_by_country_labs_df()
df_lb = process_labs_df(df_lb)

df_lb_combined = read_combined_labs_df()
df_lb_combined = process_labs_df(df_lb_combined)

# Merge two
df_lb = pd.concat([df_lb, df_lb_combined])

# Use more readable names
df_lb.loc[df_lb[COLUMNS.SITE_ID] == "Combined", COLUMNS.SITE_ID] = COMBINED

# Axis extents shared by every lab chart.
# Series.max()/min() skip NaN; the builtin max()/min() give a result that depends on
# where a NaN happens to sit in the column.
NUM_PATIENTS_EXTENT = [0, df_lb[COLUMNS.NUM_PATIENTS].max()]
NUM_SITES_EXTENT = [0, df_lb[COLUMNS.NUM_SITES].max()]
DAYS_SINCE_EXTENT = [df_lb[COLUMNS.DAYS_SINCE_POSITIVE].min(), df_lb[COLUMNS.DAYS_SINCE_POSITIVE].max()]

df_lb

In [None]:
# Lab test names and their LOINC codes. Both arrays come from the same de-duplicated
# frame (one row per test name, first occurrence kept) so that LOINCS[i] and LOINC_IDS[i]
# always describe the same test, even if two codes were to share a display name.
_lab_tests = df_lb[["loinc_name", "loinc"]].drop_duplicates(subset="loinc_name")
LOINCS = _lab_tests["loinc_name"].to_numpy()
LOINC_IDS = _lab_tests["loinc"].to_numpy()

# Tooltip fields shared by the lab charts
LAB_TOOLTIP = [
    alt.Tooltip(COLUMNS.SITE_ID, title="Country"),
    alt.Tooltip(COLUMNS.DAYS_SINCE_POSITIVE, title="Days since positive"),
    alt.Tooltip(COLUMNS.MEAN_VALUE, title="Mean value"),
    alt.Tooltip(COLUMNS.NUM_PATIENTS, title="# of patients"),
    alt.Tooltip(COLUMNS.NUM_SITES, title="# of institutions")
]

# Color per country (plus the all-country series) and the hover selection on the day axis
color_scale = alt.Scale(domain=COUNTRIES_AND_COMBINED, range=COUNTRY_AND_COMBINED_COLOR)
nearest = alt.selection(type='single', nearest=True, on='mouseover', fields=[COLUMNS.DAYS_SINCE_POSITIVE], empty='none', clear="mouseout")

def lab_by_date(test, country):
    """Build the stacked three-panel lab chart for one test and one country.

    Reads the module-level ``df_lb``, keeping rows whose ``loinc_name`` equals
    ``test`` and whose site id equals ``country``. The panels share the x axis
    (days since positive):

    * top: mean value as a line with points, plus the ``under``/``upper`` band
    * middle: bar of the summed patient count with its ``under_p``/``upper_p`` range
    * bottom: bar of the number of sites

    The hover selection ``nearest`` is attached to the returned chart.
    """
    days_q = f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q"
    site_n = f"{COLUMNS.SITE_ID}:N"

    # Reference rules: a dashed red line at x = 1 and a thin black line on the hovered day
    v_rule = (
        alt.Chart(pd.DataFrame({"date": [1]}))
        .mark_rule(color="red", strokeDash=[3, 3])
        .encode(x="date:Q")
    )
    nearest_rule = (
        alt.Chart(df_lb)
        .mark_rule(color="black")
        .encode(x=days_q, size=alt.value(0.5))
        .transform_filter(nearest)
    )

    # Base chart restricted to the requested test and country
    filtered_chart = (
        alt.Chart(df_lb)
        .transform_filter(alt.datum["loinc_name"] == test)
        .transform_filter(alt.datum[COLUMNS.SITE_ID] == country)
    )

    # The y range covers all countries of this test, not only the requested one
    rows_of_test = df_lb["loinc_name"] == test
    y_extent = [min(df_lb.loc[rows_of_test, "under"]), max(df_lb.loc[rows_of_test, "upper"])]

    # --- Top panel: mean value with stdev band ---
    line = filtered_chart.mark_line(size=1, opacity=1).encode(
        x=alt.X(
            days_q,
            title=None,
            axis=alt.Axis(grid=True, labels=False, ticks=False, domain=True),
            scale=alt.Scale(domain=DAYS_SINCE_EXTENT),
        ),
        y=alt.Y(
            f"{COLUMNS.MEAN_VALUE}:Q",
            title="Mean value (stdev)",
            scale=alt.Scale(domain=y_extent),
        ),
        color=alt.Color(site_n, scale=color_scale, legend=None),
        tooltip=LAB_TOOLTIP,
    )
    # Points grow from 10 to 50 on the hovered day
    circle = line.mark_circle(size=10).encode(
        size=alt.condition(~nearest, alt.value(10), alt.value(50))
    )
    errorband = filtered_chart.mark_errorband().encode(
        x=alt.X(days_q, title=None),
        y=alt.Y("under:Q", title=""),
        y2="upper:Q",
        color=alt.Color(site_n, scale=color_scale, legend=None),
        tooltip=LAB_TOOLTIP,
    )
    white_errorline = errorband.mark_errorbar().encode(
        size=alt.value(1),
        opacity=alt.value(0.3),
    )
    top_chart = (
        circle + line + errorband + white_errorline + v_rule + nearest_rule
    ).properties(height=200, width=500)

    # --- Middle panel: number of tested patients with its range ---
    bar = filtered_chart.mark_bar().encode(
        y=alt.Y(
            f"sum({COLUMNS.NUM_PATIENTS}):Q",
            title="# of tested",
            scale=alt.Scale(domain=NUM_PATIENTS_EXTENT),
        ),
        x=alt.X(
            days_q,
            title=None,
            axis=alt.Axis(grid=True, labels=False, ticks=False, domain=True),
        ),
        color=alt.value("gray"),
        tooltip=LAB_TOOLTIP,
    )
    errorbar = filtered_chart.mark_errorbar().encode(
        x=alt.X(days_q),
        y=alt.Y("under_p:Q", title=""),
        y2="upper_p:Q",
        color=alt.value("black"),
        size=alt.value(1),
    )
    middle_chart = (bar + errorbar + v_rule + nearest_rule).properties(height=60)

    # --- Bottom panel: number of sites ---
    bottom_bar = filtered_chart.mark_bar().encode(
        y=alt.Y(
            f"{COLUMNS.NUM_SITES}:Q",
            title="# of sites",
            scale=alt.Scale(domain=NUM_SITES_EXTENT),
        ),
        x=alt.X(days_q, title="Days since positive"),
        color=alt.value("gray"),
        tooltip=LAB_TOOLTIP,
    )
    bottom_chart = (bottom_bar + v_rule + nearest_rule).properties(height=60)

    # Stack the panels: independent y scales, one shared x scale
    stacked = alt.vconcat(top_chart, middle_chart, bottom_chart, spacing=5)
    stacked = stacked.resolve_scale(y="independent", x="shared")
    stacked = stacked.properties(title={"text": test, "subtitle": SUBTITLE})
    return stacked.add_selection(nearest)

def lab_on_positive_by_country(test):
    """Compare the mean value of one lab test across sites at days_since_positive == 1.

    Rows of the module-level ``df_lb`` are filtered on ``loinc_name == test``
    and on the day column being 1. Each site gets a colored bar, a black
    point at the mean value, and a black error bar from ``under`` to ``upper``.
    """
    site_n = f"{COLUMNS.SITE_ID}:N"

    # Keep the requested test on day 1 only (a hover-driven filter was considered but is not used)
    day_one = (
        alt.Chart(df_lb)
        .transform_filter(alt.datum["loinc_name"] == test)
        .transform_filter(alt.datum[COLUMNS.DAYS_SINCE_POSITIVE] == 1)
    )

    bar = (
        day_one.mark_bar()
        .encode(
            x=alt.X(site_n),
            y=alt.Y(f"{COLUMNS.MEAN_VALUE}:Q"),
            color=alt.Color(site_n, scale=color_scale, legend=None),
        )
        .properties(width=500, height=365)
    )
    circle = bar.mark_circle().encode(color=alt.value("black"))
    errorbar = day_one.mark_errorbar(ticks=True).encode(
        x=alt.X(site_n, title=None),
        y=alt.Y("upper:Q", title="Mean value (stdev)"),
        y2="under:Q",
        color=alt.value("black"),
    )

    chart_title = {
        "text": "Lab results on the date of positive result",
        "subtitle": "(days_since_positive == 1)",
        "subtitleColor": "red",
        "dx": 50,
    }
    return (bar + circle + errorbar).properties(title=chart_title)

is_save = True   # write each figure to SAVE_DIR as PNG
is_debug = True  # stop after the first lab test
for i, t in enumerate(LOINCS):
    t_id = LOINC_IDS[i]

    # First row: all-country time series next to the day-1 comparison across countries
    combined_chart = lab_by_date(test=t, country=COMBINED).properties(title={
        "text": [COMBINED],
        "subtitle": [SUBTITLE],
        "color": COLOR_BY_COUNTRY_AND_COMBINED[COMBINED],
        "subtitleColor": "gray",
        "dx": 60,
    })
    right_chart = lab_on_positive_by_country(test=t)
    v = alt.vconcat()
    v &= (combined_chart | right_chart)

    # Second row: one time-series chart per country, side by side
    h = alt.hconcat()
    for c in COUNTRIES:
        country_title = {
            "text": c,
            "subtitle": [SUBTITLE],
            "color": COLOR_BY_COUNTRY_AND_COMBINED[c],
            "subtitleColor": "gray",
            "dx": 60,
        }
        h |= lab_by_date(test=t, country=c).properties(title=country_title)
    v &= h

    # Overall title is the lab test name
    out = apply_theme(v.properties(title={"text": t, "dx": 60, "dy": -10, "fontSize":24}))
    out.display()

    if is_save:
        save(out, join(SAVE_DIR, f"lab_by_date_{t_id}.png"))
    if is_debug:
        break

# Daily Counts

In [None]:
CATEGORY = "category"

def preprocess_daily_df(df_dc):
    """Reshape a wide daily-counts frame into long form, one row per category.

    The three count columns (new positive cases, patients in ICU, new deaths)
    are melted into a ``category`` column and a patient-count column. For each
    category the function then fills, on that category's rows only:

    * ``upper``: count plus the category's masked upper bound
    * ``under``: the count as read
    * the count itself, moved up by half of the masked upper bound
    * the number of sites: unmasked plus masked sites of the category

    The masking/site helper columns are dropped from the returned frame.
    """
    count_columns = [COLUMNS.NEW_POSITIVE_CASES, COLUMNS.PATIENTS_IN_ICU, COLUMNS.NEW_DEATHS]

    # Carried through the melt as id columns, then removed at the end
    helper_columns = [
        COLUMNS.MASKED_UPPER_BOUND_NEW_POSITIVE_CASES,
        COLUMNS.MASKED_UPPER_BOUND_PATIENTS_IN_ICU,
        COLUMNS.MASKED_UPPER_BOUND_NEW_DEATHS,
        COLUMNS.UNMASKED_SITES_NEW_POSITIVE_CASES,
        COLUMNS.UNMASKED_SITES_PATIENTS_IN_ICU,
        COLUMNS.UNMASKED_SITES_NEW_DEATHS,
        COLUMNS.MASKED_SITES_NEW_POSITIVE_CASES,
        COLUMNS.MASKED_SITES_PATIENTS_IN_ICU,
        COLUMNS.MASKED_SITES_NEW_DEATHS,
    ]

    # Wide to long
    long_df = pd.melt(df_dc, id_vars=[COLUMNS.SITE_ID, COLUMNS.DATE] + helper_columns)
    long_df = long_df.rename(columns={"variable": CATEGORY, "value": COLUMNS.NUM_PATIENTS})

    for name in count_columns:
        rows = long_df[CATEGORY] == name
        # Snapshots taken before the count column is overwritten below
        observed = long_df.loc[rows, COLUMNS.NUM_PATIENTS]
        bound = long_df.loc[rows, "masked_upper_bound_" + name]

        long_df.loc[rows, "upper"] = observed + bound
        long_df.loc[rows, "under"] = observed
        long_df.loc[rows, COLUMNS.NUM_PATIENTS] = observed + bound / 2.0

        # Number of sites; the full-length sum is aligned to the selected rows by index
        long_df.loc[rows, COLUMNS.NUM_SITES] = long_df["unmasked_sites_" + name] + long_df["masked_sites_" + name]

    return long_df.drop(columns=helper_columns)

# Read and reshape the per-country file first, then the all-country file
by_country_daily = preprocess_daily_df(read_combined_by_country_daily_counts_df())
df_dc_combined = preprocess_daily_df(read_combined_daily_counts_df())

# Stack both into one frame
df_dc = pd.concat([by_country_daily, df_dc_combined])

# Show the aggregated rows under the readable label
is_combined_row = df_dc[COLUMNS.SITE_ID] == "Combined"
df_dc.loc[is_combined_row, COLUMNS.SITE_ID] = COMBINED

df_dc

In [None]:
# Categories rendered by the daily-count charts, in rendering order
CATEGORIES = [COLUMNS.NEW_POSITIVE_CASES, COLUMNS.NEW_DEATHS, COLUMNS.PATIENTS_IN_ICU]

# Human-readable phrase per category; it is inserted into chart titles
# (e.g. "Cumulative positive cases by date"), so spelling matters here.
TITLE_BY_CATEGORY = {
    COLUMNS.NEW_POSITIVE_CASES: "positive cases",
    COLUMNS.NEW_DEATHS: "deaths",
    COLUMNS.PATIENTS_IN_ICU: "ICU admissions"
}

def dailycount_by_date(category, is_cum=True, is_only_combined=False):
    """Chart one daily-count category over time, with a site-count bar chart below.

    ``category`` selects rows of the module-level ``df_dc`` through its
    ``category`` column. When ``is_only_combined`` is true only the
    all-country series is drawn; otherwise every series except that one.
    When ``is_cum`` is true the count and its ``upper``/``under`` bounds are
    replaced by window sums computed per site in date order.

    Returns the themed chart (line, points and band on top; bars below).
    """
    date_t = f"{COLUMNS.DATE}:T"
    site_n = f"{COLUMNS.SITE_ID}:N"

    # Pick the color scale and the site filter together
    if is_only_combined:
        color_scale = alt.Scale(domain=[COMBINED], range=[COMBINED_COLOR])
        site_filter = alt.datum[COLUMNS.SITE_ID] == COMBINED
    else:
        color_scale = alt.Scale(domain=COUNTRIES, range=COUNTRY_COLOR)
        site_filter = alt.datum[COLUMNS.SITE_ID] != COMBINED

    filtered_chart = (
        alt.Chart(df_dc)
        .transform_filter(alt.datum[CATEGORY] == category)
        .transform_filter(site_filter)
    )

    # Fields to plot; swapped for the window-sum fields in cumulative mode
    y_field, upper, under = COLUMNS.NUM_PATIENTS, "upper", "under"
    if is_cum:
        window_opts = dict(sort=[{"field": COLUMNS.DATE}], groupby=[COLUMNS.SITE_ID])
        filtered_chart = (
            filtered_chart
            .transform_window(cum_val=f"sum({COLUMNS.NUM_PATIENTS})", **window_opts)
            .transform_window(cum_upper="sum(upper)", **window_opts)
            .transform_window(cum_under="sum(under)", **window_opts)
        )
        y_field, upper, under = "cum_val", "cum_upper", "cum_under"

    # Top: line + points + band
    line = filtered_chart.mark_line(size=3).encode(
        x=alt.X(date_t, axis=alt.Axis(tickCount=7), title=None),
        y=alt.Y(f"{y_field}:Q", axis=alt.Axis(tickCount=5), title="Number of patients"),
        color=alt.Color(site_n, scale=color_scale, legend=alt.Legend(title=None)),
    )
    point = line.mark_circle(size=40)
    errorband = filtered_chart.mark_errorband().encode(
        x=alt.X(date_t, axis=alt.Axis(tickCount=7), title=None),
        y=alt.Y(f"{upper}:Q", title=""),
        y2=f"{under}:Q",
        color=alt.Color(site_n, scale=color_scale, legend=alt.Legend(title=None)),
    )
    top_line = (
        (line + point + errorband)
        .resolve_scale(color="shared")
        .properties(width=750, height=400)
        .interactive()
    )

    # Bottom: number of sites per date
    bottom_bar = filtered_chart.mark_bar(size=10).encode(
        x=alt.X(date_t, axis=alt.Axis(tickCount=7), title=None),
        y=alt.Y(f"{COLUMNS.NUM_SITES}:Q", title="# of sites"),
        color=alt.Color(site_n, scale=color_scale, legend=alt.Legend(title=None)),
    ).properties(height=60)

    prefix = "Cumulative" if is_cum else "New"
    title = f"{prefix} {TITLE_BY_CATEGORY[category]} by date"

    # Theme first, then share the x scale and set the title
    return apply_theme(top_line & bottom_bar).resolve_scale(x="shared").properties(title={
        "text": title,
        "subtitle": SUBTITLE,
        "subtitleColor": "gray",
        "dx": 60,
    })

is_cum = True   # plot cumulative values instead of daily values
is_save = True  # write each figure to SAVE_DIR as PNG
is_cum_str = "_cum" if is_cum else ""  # file-name suffix for cumulative plots

for category in CATEGORIES:
    result_vis = dailycount_by_date(category=category, is_cum=is_cum)

    # Show in the notebook, then optionally export
    result_vis.display()
    if not is_save:
        continue
    save(result_vis, join(SAVE_DIR, f"dailycount_by_date_{category}{is_cum_str}.png"))

In [None]:
import datetime


def _iso_week(date_str):
    """Return the ISO week number of a 'YYYY-MM-DD' date string."""
    return datetime.datetime.strptime(date_str, '%Y-%m-%d').isocalendar()[1]


# Week-of-year column used as the x axis of the heatmap
df_dc["week"] = df_dc["date"].map(_iso_week)


def dailycount_by_day_and_week(country, category):
    """Heatmap of one category for one country: day of week (rows) by week of year (columns).

    Rows of the module-level ``df_dc`` are filtered on ``category`` and on
    the site id being ``country``; cell color encodes the summed patient count.
    """
    heatmap = (
        alt.Chart(df_dc)
        .transform_filter(alt.datum[CATEGORY] == category)
        .transform_filter(alt.datum[COLUMNS.SITE_ID] == country)
        .mark_rect()
        .encode(
            y=alt.Y("day(date):O", title="Day of the week"),
            x=alt.X('week:O', title="Week of the year"),
            color=alt.Color('sum(num_patients):Q', title=None, scale=alt.Scale(scheme="lightorange")),
        )
    )

    chart_title = {
        "text": f"New {TITLE_BY_CATEGORY[category]} ({country})",
        "subtitle": SUBTITLE,
        "color": COLOR_BY_COUNTRY_AND_COMBINED[country],
        "subtitleColor": "gray",
        "dx": 60,
    }
    return heatmap.properties(height=220, width=380, title=chart_title)

is_save = True  # write each figure to SAVE_DIR as PNG
for category in CATEGORIES:
    # One heatmap per series (all countries first), stacked vertically
    v = alt.vconcat()
    for country in COUNTRIES_AND_COMBINED:
        v &= dailycount_by_day_and_week(country=country, category=category)

    # Theme with the legend on the right; each heatmap keeps its own color scale
    themed = apply_theme(v, legend_orient="right", legend_stroke_color="white", legend_padding=0)
    result_vis = themed.resolve_scale(color="independent", x="shared")

    # Show in the notebook, then optionally export
    result_vis.display()
    if is_save:
        save(result_vis, join(SAVE_DIR, f"dailycount_by_day_and_week_{category}.png"))

# Demographics

In [None]:
def preprocess_demo_df(df_dm):
    """Reshape a wide demographics frame into long form, one row per site, sex and age group.

    Site-count and total-patient columns are dropped first. The nine age
    columns are then melted into an age-group column and a patient-count
    column. For each age group, on that group's rows only, ``upper`` is the
    count plus the group's masked upper bound, ``under`` is the count as
    read, and the count is moved up by half of the masked upper bound.
    Rows whose sex is ``"All"`` are removed, as are the masked-upper-bound
    columns.
    """
    # Attribute-name suffixes of COLUMNS, youngest to oldest
    age_keys = [
        "AGE_0TO2", "AGE_3TO5", "AGE_6TO11", "AGE_12TO17", "AGE_18TO25",
        "AGE_26TO49", "AGE_50TO69", "AGE_70TO79", "AGE_80PLUS",
    ]
    age_columns = [getattr(COLUMNS, key) for key in age_keys]
    bound_columns = [getattr(COLUMNS, "MASKED_UPPER_BOUND_" + key) for key in age_keys]

    # Not needed for this chart: per-site counts and the totals
    site_count_columns = [
        getattr(COLUMNS, prefix + key)
        for prefix in ("UNMASKED_SITES_", "MASKED_SITES_")
        for key in ["TOTAL_PATIENTS"] + age_keys
    ]
    trimmed = df_dm.drop(columns=site_count_columns + [
        COLUMNS.MASKED_UPPER_BOUND_TOTAL_PATIENTS,
        COLUMNS.TOTAL_PATIENTS,
    ])

    # Wide to long; the bounds ride along as id columns
    long_df = pd.melt(trimmed, id_vars=[COLUMNS.SITE_ID, COLUMNS.SEX] + bound_columns)
    long_df = long_df.rename(columns={"variable": COLUMNS.AGE_GROUP, "value": COLUMNS.NUM_PATIENTS})

    for age in age_columns:
        rows = long_df[COLUMNS.AGE_GROUP] == age
        # Snapshots taken before the count column is overwritten below
        observed = long_df.loc[rows, COLUMNS.NUM_PATIENTS]
        bound = long_df.loc[rows, "masked_upper_bound_" + age]

        long_df.loc[rows, "upper"] = observed + bound
        long_df.loc[rows, "under"] = observed
        long_df.loc[rows, COLUMNS.NUM_PATIENTS] = observed + bound / 2.0

    # Keep the per-sex rows only, then remove the helper columns
    per_sex = long_df[long_df[COLUMNS.SEX] != "All"]
    return per_sex.drop(columns=bound_columns)

# Read and reshape the per-country file first, then the all-country file
by_country_demo = preprocess_demo_df(read_combined_by_country_demographics_df())
df_dm_combined = preprocess_demo_df(read_combined_demographics_df())

# Stack both into one frame
df_dm = pd.concat([by_country_demo, df_dm_combined])

# Show the aggregated rows under the readable label
is_combined_row = df_dm[COLUMNS.SITE_ID] == "Combined"
df_dm.loc[is_combined_row, COLUMNS.SITE_ID] = COMBINED

df_dm

In [None]:
# Color per sex. NOTE: this rebinds the notebook-level name `color_scale`.
color_scale = alt.Scale(domain=["Male", "Female", "Other"], range=COLOR20[:2] + ["gray"])

def demographics(country):
    """Grouped bar chart of patient counts by sex, one column of bars per age group.

    Rows of the module-level ``df_dm`` are filtered on the site id being
    ``country``. Only the bars are returned; the error-bar layer is built but
    left out of the result (see the note next to it).
    """
    sex_n = f"{COLUMNS.SEX}:N"
    age_order = [
        "age_0to2", "age_3to5", "age_6to11", "age_12to17", "age_18to25",
        "age_26to49", "age_50to69", "age_70to79", "age_80plus",
    ]

    per_country = alt.Chart(df_dm).transform_filter(
        alt.datum[COLUMNS.SITE_ID] == country
    )

    bar = (
        per_country.mark_bar()
        .encode(
            x=alt.X(sex_n, title=None, axis=None),
            y=alt.Y(f"{COLUMNS.NUM_PATIENTS}:Q", title="Number of patients", axis=alt.Axis(tickCount=5)),
            color=alt.Color(sex_n, title=None, scale=color_scale),
        )
        .properties(width=60, height=300)
    )

    # Currently unused. Layering it with the bars and faceting the layer by age group
    # fails with: Javascript Error: Undefined data set name: "scale_concat_2_child_layer_0_main"
    # See https://github.com/vega/vega-lite/issues/4680
    errorbar = per_country.mark_errorbar().encode(
        x=alt.X(sex_n, title=None),
        y=alt.Y("upper:Q", title=""),
        y2="under:Q",
        color=alt.value("black"),
        size=alt.value(1.5),
    )

    age_columns = alt.Column(
        "age_group:O",
        sort=age_order,
        header=alt.Header(labelOrient="bottom", title=None, titleOrient="bottom"),
    )
    chart_title = {
        "text": f"Demographics ({country})",
        "subtitle": SUBTITLE,
        "color": COLOR_BY_COUNTRY_AND_COMBINED[country],
        "subtitleColor": "gray",
        "anchor": "start",
        "dx": 60,
    }
    return bar.encode(column=age_columns).properties(title=chart_title)

is_save = True  # write each figure to SAVE_DIR as PNG
for country in COUNTRIES_AND_COMBINED:
    chart = demographics(country=country)
    result_vis = apply_grouped_bar_theme(chart, strokeColor="lightgray")

    # Show in the notebook, then optionally export (file name is lower-cased)
    result_vis.display()
    if is_save:
        file_name = f"demographics_{country}.png".lower()
        save(result_vis, join(SAVE_DIR, file_name), scalefactor=10.0)

# https://github.com/vega/vega-lite/issues/4680
# Error msg: Javascript Error: Undefined data set name: "scale_concat_2_child_layer_0_main"
# h = alt.vconcat()
# for c in COUNTRIES:
#     h &= demo_chart(country=c)
    
# apply_grouped_bar_theme(h, strokeColor="lightgray").resolve_scale(color="independent")
# .properties(
#     title={
#         "text": "Demographics", 
#         "subtitle": SUBTITLE,
#         "subtitleColor": "gray"
#     }
# )

# Diagnoses

# DEPRECATED CODE BELOW

In [None]:
def process_diagnoses_df(df_dg):
    """Prepare a raw diagnoses frame for plotting and return the processed frame.

    Steps, in order:

    1. Drop the unmasked/masked site-count columns.
    2. Rename the masked upper bound to ``upper`` and add the patient count to
       it; ``under`` is the patient count as read; the patient count itself is
       moved to the middle of that range.
    3. Remove dots from the ICD codes (the lookup table has none) and
       left-merge the lookup table on the code.
    4. Where the lookup gave no description or category, use the ICD code.
    5. Capitalize descriptions and categories.
    """
    # Drop unused columns before preprocessing for the simplicity
    df_dg = df_dg.drop(columns=[
        COLUMNS.UNMASKED_SITES_NUM_PATIENTS,
        COLUMNS.MASKED_SITES_NUM_PATIENTS
    ])

    # Range of the patient count; the plotted count is the midpoint of that range
    df_dg = df_dg.rename(columns={COLUMNS.MASKED_UPPER_BOUND_NUM_PATIENTS: "upper"})
    df_dg["upper"] += df_dg[COLUMNS.NUM_PATIENTS]
    df_dg["under"] = df_dg[COLUMNS.NUM_PATIENTS]
    df_dg[COLUMNS.NUM_PATIENTS] = df_dg[COLUMNS.NUM_PATIENTS] + (df_dg["upper"] - df_dg["under"]) / 2.0

    # Our lookup table does not contain dots
    df_dg[COLUMNS.ICD_CODE] = df_dg[COLUMNS.ICD_CODE].apply(lambda x: f"{x}".replace(".", ""))

    # Merge with a lookup table
    icd_df = read_icd_df()
    df_dg = df_dg.merge(icd_df, how="left", left_on=COLUMNS.ICD_CODE, right_on="ICDcode")

    # Handle the missing data: fall back to the ICD code itself
    for column in ("ICDdescription", "Category"):
        is_missing = pd.isna(df_dg[column])
        df_dg.loc[is_missing, column] = df_dg.loc[is_missing, COLUMNS.ICD_CODE]

    # Consistent capitalization
    df_dg["ICDdescription"] = df_dg["ICDdescription"].apply(lambda x: x.capitalize())
    df_dg["Category"] = df_dg["Category"].apply(lambda x: x.capitalize())

    return df_dg

# Read and process the per-country file first, then the all-country file
by_country_diagnoses = process_diagnoses_df(read_combined_by_country_diagnoses_df())
df_dg_combined = process_diagnoses_df(read_combined_diagnoses_df())

# Stack both into one frame
df_dg = pd.concat([by_country_diagnoses, df_dg_combined])

# Keep diagnoses with at least 10 patients
has_enough_patients = df_dg[COLUMNS.NUM_PATIENTS] >= 10
df_dg = df_dg[has_enough_patients]

df_dg

In [None]:
# Add filter

def diagnoses_chart(YAxis):
    """Dot chart of patient counts per diagnosis with an under/upper range bar.

    ``YAxis`` chooses what labels the rows: ``"ICD Description"``,
    ``"ICD Category"``, or anything else for the ICD code. Rows are ordered by
    descending patient count, taken from the module-level ``df_dg``.

    Returns the themed layered chart.
    """
    field_by_choice = {
        "ICD Description": "ICDdescription",
        "ICD Category": "Category",
    }
    yfield = field_by_choice.get(YAxis, "icd_code")

    # Row order: labels by descending patient count (first occurrence wins)
    sort = df_dg.sort_values(by=[COLUMNS.NUM_PATIENTS], ascending=False)[yfield].unique()

    errorbar = alt.Chart(df_dg).mark_errorbar().encode(
        x=alt.X("upper:Q", title=""),
        x2=alt.X2("under:Q"),
        y=alt.Y(f"{yfield}:N", title=None, sort=sort),
        size=alt.value(1)
    )

    # Both layers feed one shared y scale, so both declare the same sort order;
    # declaring it on only one layer leaves the merged order ambiguous.
    base = alt.Chart(df_dg).mark_circle(size=50, color="black").encode(
        x=alt.X(f"sum({COLUMNS.NUM_PATIENTS}):Q", title="Number of patients", axis=alt.Axis(tickCount=5)),
        y=alt.Y(f"{yfield}:N", title=None, sort=sort, axis=alt.Axis(grid=True))
    ).properties(
        title={
            "text": "Diagnoses starting 7 days before positive test (Patients >= 10)",
            "subtitle": SUBTITLE
        },
        width=500
    )

    chart = apply_theme(base + errorbar)
    return chart

# Dropdown that re-renders the diagnoses chart with the chosen y-axis labelling
Y_AXIS_CHOICES = ["ICD Description", "ICD Code", "ICD Category"]
interact(diagnoses_chart, YAxis=Y_AXIS_CHOICES)

In [None]:
# Lab test names and their LOINC codes. Both arrays come from the same de-duplicated
# frame (one row per test name, first occurrence kept) so that LOINCS[i] and LOINC_IDS[i]
# always describe the same test, even if two codes were to share a display name.
_lab_tests = df_lb[["loinc_name", "loinc"]].drop_duplicates(subset="loinc_name")
LOINCS = _lab_tests["loinc_name"].to_numpy()
LOINC_IDS = _lab_tests["loinc"].to_numpy()

# Tooltip fields shared by the lab charts
LAB_TOOLTIP = [
    alt.Tooltip(COLUMNS.SITE_ID, title="Country"),
    alt.Tooltip(COLUMNS.DAYS_SINCE_POSITIVE, title="Days since positive"),
    alt.Tooltip(COLUMNS.MEAN_VALUE, title="Mean value"),
    alt.Tooltip(COLUMNS.NUM_PATIENTS, title="# of patients"),
    alt.Tooltip(COLUMNS.NUM_SITES, title="# of institutions")
]

# Hover selection is disabled for the overlay chart:
# nearest = alt.selection(type='single', nearest=True, on='mouseover', fields=[COLUMNS.DAYS_SINCE_POSITIVE], empty='none', clear="mouseout")

def lab_chart(test):
    """Build a three-panel chart for one lab test with all sites overlaid.

    Rows of the module-level ``df_lb`` are filtered on ``loinc_name == test``
    only, so every site present in the data is drawn together and colored by
    site id. Panels, top to bottom: mean value (line and points), summed
    patient count with its ``under_p``/``upper_p`` range, and number of sites.
    A dashed red rule at x = 1 is layered on each panel.

    Returns the vertically concatenated chart titled with ``test``.
    """
    # One-row frame that positions the dashed red rule at x = 1
    df_v_rule = pd.DataFrame({"date": [1]})
    v_rule = alt.Chart(df_v_rule).mark_rule(color="red", strokeDash=[3,3]).encode(
        x="date:Q"
    )
    # The hover rule is disabled here together with the `nearest` selection
#     nearest_rule = alt.Chart(df_lb).mark_rule(color="black").encode(
#         x=f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
#         size=alt.value(0.5)
#     ).transform_filter(
#         nearest
#     )

    """
    Data preprocessing
    """
    # Filter on the test only; all sites stay in the chart
    filtered_chart = alt.Chart(df_lb).transform_filter(
        alt.datum["loinc_name"] == test
    )
    
    # y range for the top panel: smallest `under` to largest `upper` of this test
    Y_EXTENT = [
        min(df_lb.loc[df_lb["loinc_name"] == test, "under"]), 
        max(df_lb.loc[df_lb["loinc_name"] == test, "upper"])
    ]
    
    """
    Top Chart
    """
    line = filtered_chart.mark_line(size=2, opacity=1).encode(
        x=alt.X(
            f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q", 
            title=None, 
            axis=alt.Axis(
                grid=True,
                labels=False, ticks=False, domain=True
                # labelFontSize=0, labelOpacity=0, tickOpacity=0
            ),
            scale=alt.Scale(domain=DAYS_SINCE_EXTENT)
        ),
        y=alt.Y(
            f"{COLUMNS.MEAN_VALUE}:Q", 
            title="Mean value (stdev)",
            scale=alt.Scale(domain=Y_EXTENT)
        ),
        # Color scale is built inline (countries first, then the all-country series);
        # this layer has no `legend=None`, unlike the error band below
        color=alt.Color(f"{COLUMNS.SITE_ID}:N", scale=alt.Scale(domain=COUNTRIES + [COMBINED], range=COUNTRY_COLOR + [COMBINED_COLOR]), title=None),
        tooltip=LAB_TOOLTIP,
    )

    circle = line.mark_circle(size=30)
#     TODO: Too slow
#     .encode(
#         size=alt.condition(~nearest, alt.value(10), alt.value(50))
#     )
    # NOTE(review): `errorband` and `white_errorline` are built but are not part of
    # `top_chart` below — confirm whether leaving the band out of the overlay is intended
    errorband = filtered_chart.mark_errorband().encode(
        x=alt.X(
            f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q", 
            title=None,
        ),
        y=alt.Y(
            f"under:Q",
            title="", 
        ),
        y2="upper:Q",
        color=alt.Color(f"{COLUMNS.SITE_ID}:N", scale=alt.Scale(domain=COUNTRIES + [COMBINED], range=COUNTRY_COLOR + [COMBINED_COLOR]), legend=None),
        tooltip=LAB_TOOLTIP
    )   
    white_errorline = errorband.mark_errorbar().encode(
        size=alt.value(1),
        opacity=alt.value(0.3)
    )

    # Only points, line and the red rule are layered in the top panel
    top_chart = (circle + line + v_rule).properties(height=150, width=500)

    """
    Middle Chart
    """
    # Summed patient count per day
    bar = filtered_chart.mark_bar().encode(
        y=alt.Y(
            f"sum({COLUMNS.NUM_PATIENTS}):Q", 
            title="# of tested",
            scale=alt.Scale(domain=NUM_PATIENTS_EXTENT)
        ),
        x=alt.X(
            f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
            # bin=alt.Bin(step=1),
            title=None,
            axis=alt.Axis(
                grid=True,
                labels=False, ticks=False, domain=True, 
                # labelFontSize=0, labelOpacity=0, tickOpacity=0
            )
        ),
        color=alt.value("gray"),
        tooltip=LAB_TOOLTIP
    )

    # Range of the patient count, from `under_p` to `upper_p`
    errorbar = filtered_chart.mark_errorbar().encode(
        x=alt.X(
            f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
        ),
        y=alt.Y(
            f"under_p:Q",
            title="", 
        ),
        y2="upper_p:Q",
        color=alt.value("black"),
        size=alt.value(1)
    )
    
    middle_chart = (bar + errorbar + v_rule).properties(height=60)

    """
    Bottom Chart
    """
    # Number of sites per day; the label expression blanks labels at odd axis values
    bottom_bar = filtered_chart.mark_bar().encode(
        y=alt.Y(
            f"{COLUMNS.NUM_SITES}:Q", 
            title="# of sites",
            scale=alt.Scale(domain=NUM_SITES_EXTENT)
        ),
        x=alt.X(
            f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
            # bin=alt.Bin(step=1),
            title="Days since positive",
            axis=alt.Axis(
                grid=True,
                labelExpr="abs(parseInt(datum.value)) % 2 == 1 ? null : datum.label"
            )
        ),
        color=alt.value("gray"),
        tooltip=LAB_TOOLTIP
    )

    bottom_chart = (bottom_bar + v_rule).properties(height=60)

    # Stack the panels: independent y scales, one shared x scale
    return alt.vconcat(top_chart, middle_chart, bottom_chart, spacing=5).resolve_scale(y="independent", x="shared").properties(
        title={
            "text": test,
            "subtitle": SUBTITLE
        })
        #.add_selection(nearest)

for i, t in enumerate(LOINCS):
    t_id = LOINC_IDS[i]

    # Overlay chart for this lab test, retitled and themed
    overlay_title = {
        "text": [f"{t}"],
        "subtitle": [SUBTITLE],
        "subtitleColor": "gray",
        "dx": 60,
    }
    out = apply_theme(lab_chart(test=t).properties(title=overlay_title))

    # Show in the notebook and export
    out.display()
    save(out, join(SAVE_DIR, f"lab-overlay-{t_id}.png"))
    break # TODO: for debug

In [None]:
# Trellis of all lab tests: one row per test (mean line, points, under/upper band),
# with a single bar chart of tested patients underneath.
# The charts read `df_lb`, the processed labs frame built earlier in this notebook;
# the name `df` used previously is not defined anywhere in the notebook.
line = alt.Chart(df_lb).mark_line(size=1, opacity=1).encode(
    x=alt.X(
        f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q", 
        title=None, 
        axis=alt.Axis(
            grid=True,
            labelOpacity=0, tickOpacity=0
        )
    ),
    y=alt.Y(
        f"mean({COLUMNS.MEAN_VALUE}):Q", 
        title=None, 
        axis=alt.Axis(orient="right")
    ),
    color=alt.Color("loinc_name:N", scale=alt.Scale(scheme="category20"), legend=None),
).properties(height=150, width=500)

circle = line.mark_circle(size=10)
errorband = alt.Chart(df_lb).mark_errorband().encode(
    x=alt.X(
        f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q", 
        title=None,
    ),
    y=alt.Y(
        "under:Q",
        title=None, 
    ),
    y2="upper:Q",
    color=alt.Color("loinc_name:N", scale=alt.Scale(scheme="category20"), legend=None),
)   

# One facet row per lab test, each with its own y scale
top_chart = (circle + line + errorband).facet(
    row=alt.Row(
        "loinc_name:N",
        header=alt.Header(labelAngle=0, labelAlign="left", labelAnchor="middle", labelColor="black", title=None)
    ),
).resolve_scale(y="independent")

# Bars reuse the line chart's data and replace its x/y/color encodings
bottom_chart = line.mark_bar().encode(
    y=alt.Y(
        f"sum({COLUMNS.NUM_PATIENTS}):Q", 
        title="Number of tested patients", 
        axis=alt.Axis(
            tickCount=2, 
            titleAngle=0,
            titleAlign="right", 
            titleBaseline="middle",
            titlePadding=-545,
            orient="right"
        )
    ),
    x=alt.X(
        f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
        # bin=alt.Bin(step=1),
        title="Days since positive",
        axis=alt.Axis(
            grid=True,
            # Blank the labels at odd axis values
            labelExpr="abs(parseInt(datum.value)) % 2 == 1 ? null : datum.label"
        )
    ),
    color=alt.value("gray")
).properties(height=45)

apply_trellis_theme(
    top_chart & bottom_chart
).resolve_scale(y="independent", x="shared").properties(
    title={
        "text": "LOINC test results",
        "subtitle": SUBTITLE
    })