In [1]:
# Auto-reload frequently changed files
%load_ext autoreload
%autoreload 2
%aimport utils

import pandas as pd
import numpy as np
import altair as alt
from ipywidgets import interact
from os.path import join

from constants import COLUMNS
from utils import (
    read_combined_daily_counts_df,
    read_combined_demographics_df, 
    read_combined_diagnoses_df,
    read_combined_labs_df,
    apply_theme, apply_grouped_bar_theme, apply_trellis_theme,
    read_icd_df, read_loinc_df
)

In [60]:
# Shared settings for every visualization below; review them before rendering.
DATA_DATE = "2020-04-01"  # date stamp shown in each chart subtitle
NUM_SITES = "Four"        # institution count, spelled out for the subtitle
SUBTITLE = f"Data as of {DATA_DATE} | {NUM_SITES} Institutions"

# Where to save visualization *.PNG files
SAVE_DIR = join("..", "output")

# Site names and their colors, paired by position when building color scales.
COUNTRIES = ["Italy", "France", "Germany", "USA"]
COUNTRY_COLOR = ["#009E73", "#0072B2", "#E69F00", "#D55E00"]

# Label and color used for the series aggregated over all sites.
COMBINED = "Combined"
COMBINED_COLOR = "#444444"

# 20-entry categorical palette.
COLOR20 = [
    "#3366cc", "#dc3912", "#ff9900", "#109618", "#990099",
    "#0099c6", "#dd4477", "#66aa00", "#b82e2e", "#316395",
    "#994499", "#22aa99", "#aaaa11", "#6633cc", "#e67300",
    "#8b0707", "#651067", "#329262", "#5574a6", "#3b3eac",
]

# Required Setup

- All four combined datasets should be placed in `../data/combined` (e.g., `../data/combined/DailyCounts-Combinedyymmdd.csv` for the DailyCounts file).
- To save PNG files for visualizations, a folder named "output" should be present (i.e., `../output/`).

# Daily Counts

In [61]:
# Load the combined daily counts and discard the UNMASKED_SITES_* / MASKED_SITES_*
# columns, which are not used anywhere below.
df_dc = read_combined_daily_counts_df().drop(columns=[
    COLUMNS.UNMASKED_SITES_NEW_POSITIVE_CASES,
    COLUMNS.UNMASKED_SITES_PATIENTS_IN_ICU,
    COLUMNS.UNMASKED_SITES_NEW_DEATHS,
    COLUMNS.MASKED_SITES_NEW_POSITIVE_CASES,
    COLUMNS.MASKED_SITES_PATIENTS_IN_ICU,
    COLUMNS.MASKED_SITES_NEW_DEATHS
])

# Wide to long: one row per (site, date, category). The masked-upper-bound
# columns ride along as id columns so each row can pick its own bound below.
CATEGORY = "category"
masked_bound_columns = [
    COLUMNS.MASKED_UPPER_BOUND_NEW_POSITIVE_CASES,
    COLUMNS.MASKED_UPPER_BOUND_PATIENTS_IN_ICU,
    COLUMNS.MASKED_UPPER_BOUND_NEW_DEATHS,
]
df_dc = df_dc.melt(
    id_vars=[COLUMNS.SITE_ID, COLUMNS.DATE] + masked_bound_columns
).rename(columns={"variable": CATEGORY, "value": COLUMNS.NUM_PATIENTS})

# For each category, derive the plotted range from that category's own bound:
#   under = reported count, upper = reported count + masked upper bound,
#   num_patients = midpoint of the two.
for category_name in (COLUMNS.NEW_POSITIVE_CASES, COLUMNS.PATIENTS_IN_ICU, COLUMNS.NEW_DEATHS):
    rows = df_dc[CATEGORY] == category_name
    # Read both inputs once, before num_patients is overwritten.
    reported = df_dc.loc[rows, COLUMNS.NUM_PATIENTS]
    masked_bound = df_dc.loc[rows, "masked_upper_bound_" + category_name]
    df_dc.loc[rows, "upper"] = reported + masked_bound
    df_dc.loc[rows, "under"] = reported
    df_dc.loc[rows, COLUMNS.NUM_PATIENTS] = reported + masked_bound / 2.0

# The bound columns have served their purpose.
df_dc = df_dc.drop(columns=masked_bound_columns)

# Sum over sites per (date, category) and label the result as the combined series.
df_dc_combined = df_dc.groupby([COLUMNS.DATE, CATEGORY]).agg("sum").reset_index()
df_dc_combined[COLUMNS.SITE_ID] = COMBINED

# Per-site rows followed by the combined rows.
df_dc = pd.concat([df_dc, df_dc_combined])

df_dc

Unnamed: 0,siteid,date,category,num_patients,upper,under
0,Italy,2/21/20,new_positive_cases,1.0,1.0,1.0
1,Italy,2/22/20,new_positive_cases,1.0,1.0,1.0
2,Italy,2/29/20,new_positive_cases,1.0,1.0,1.0
3,Italy,3/1/20,new_positive_cases,1.0,1.0,1.0
4,Italy,3/2/20,new_positive_cases,4.0,4.0,4.0
...,...,...,...,...,...,...
91,Combined,3/8/20,new_positive_cases,12.0,12.0,12.0
92,Combined,3/8/20,patients_in_icu,0.0,0.0,0.0
93,Combined,3/9/20,new_deaths,0.0,0.0,0.0
94,Combined,3/9/20,new_positive_cases,4.0,4.0,4.0


In [64]:
# TODO: Add cumulative values

CATEGORIES = [COLUMNS.NEW_POSITIVE_CASES, COLUMNS.NEW_DEATHS, COLUMNS.PATIENTS_IN_ICU]

def dailycount_chart(category):
    """Return the themed, interactive daily-count chart for one category.

    Rows of `df_dc` whose CATEGORY column equals `category` are drawn as a line
    with point markers (num_patients over date) plus an error band spanning the
    "under" and "upper" columns. Series are colored per site, with the combined
    series included in the color scale.
    """
    # Encodings shared by the line/point layers and the error band.
    date_x = alt.X(f"{COLUMNS.DATE}:T", axis=alt.Axis(tickCount=7), title=None)
    site_color = alt.Color(
        f"{COLUMNS.SITE_ID}:N",
        scale=alt.Scale(
            domain=COUNTRIES + [COMBINED],
            range=COUNTRY_COLOR + [COMBINED_COLOR]
        ),
        legend=alt.Legend(title=None)
    )

    category_rows = alt.Chart(df_dc).transform_filter(
        alt.datum[CATEGORY] == category
    )
    # To hide the combined series, additionally apply:
    #   .transform_filter(alt.datum[COLUMNS.SITE_ID] != COMBINED)

    trend = category_rows.mark_line(size=2).encode(
        x=date_x,
        y=alt.Y(
            f"{COLUMNS.NUM_PATIENTS}:Q",
            axis=alt.Axis(tickCount=5),
            title="Number of patients",
            scale=alt.Scale(domain=[-1,200])
        ),
        color=site_color
    )
    # Same encoding as the line, drawn as circles on each data point.
    markers = trend.mark_circle(size=30)
    band = category_rows.mark_errorband().encode(
        x=date_x,
        y=alt.Y("upper:Q", title=""),
        y2=alt.Y2("under:Q"),
        color=site_color
    )

    layered = (trend + markers + band).resolve_scale(color="shared").properties(
        width=750, height=400
    )
    # TODO: Use this line for country level facet
    # .facet(
    #     row=alt.Row(f"{CATEGORY}", title=None)
    # )

    return apply_theme(layered).properties(
        title={
            "text": category,
            "subtitle": SUBTITLE
        }
    ).interactive()

# Render one chart per category.
for category_name in CATEGORIES:
    dailycount_chart(category=category_name).display()

# Demographics

In [67]:
# Load the combined demographics and discard the columns not used below: the
# UNMASKED_SITES_* / MASKED_SITES_* columns and the total-patient columns.
df_dm = read_combined_demographics_df().drop(columns=[
    COLUMNS.UNMASKED_SITES_TOTAL_PATIENTS,
    COLUMNS.UNMASKED_SITES_AGE_0TO2,
    COLUMNS.UNMASKED_SITES_AGE_3TO5,
    COLUMNS.UNMASKED_SITES_AGE_6TO11,
    COLUMNS.UNMASKED_SITES_AGE_12TO17,
    COLUMNS.UNMASKED_SITES_AGE_18TO25,
    COLUMNS.UNMASKED_SITES_AGE_26TO49,
    COLUMNS.UNMASKED_SITES_AGE_50TO69,
    COLUMNS.UNMASKED_SITES_AGE_70TO79,
    COLUMNS.UNMASKED_SITES_AGE_80PLUS,
    COLUMNS.MASKED_SITES_TOTAL_PATIENTS,
    COLUMNS.MASKED_SITES_AGE_0TO2,
    COLUMNS.MASKED_SITES_AGE_3TO5,
    COLUMNS.MASKED_SITES_AGE_6TO11,
    COLUMNS.MASKED_SITES_AGE_12TO17,
    COLUMNS.MASKED_SITES_AGE_18TO25,
    COLUMNS.MASKED_SITES_AGE_26TO49,
    COLUMNS.MASKED_SITES_AGE_50TO69,
    COLUMNS.MASKED_SITES_AGE_70TO79,
    COLUMNS.MASKED_SITES_AGE_80PLUS,
    COLUMNS.MASKED_UPPER_BOUND_TOTAL_PATIENTS,
    COLUMNS.TOTAL_PATIENTS,
])

# Age-group value columns and their masked-upper-bound companions.
age_columns = [
    COLUMNS.AGE_0TO2,
    COLUMNS.AGE_3TO5,
    COLUMNS.AGE_6TO11,
    COLUMNS.AGE_12TO17,
    COLUMNS.AGE_18TO25,
    COLUMNS.AGE_26TO49,
    COLUMNS.AGE_50TO69,
    COLUMNS.AGE_70TO79,
    COLUMNS.AGE_80PLUS,
]
masked_bound_columns = [
    COLUMNS.MASKED_UPPER_BOUND_AGE_0TO2,
    COLUMNS.MASKED_UPPER_BOUND_AGE_3TO5,
    COLUMNS.MASKED_UPPER_BOUND_AGE_6TO11,
    COLUMNS.MASKED_UPPER_BOUND_AGE_12TO17,
    COLUMNS.MASKED_UPPER_BOUND_AGE_18TO25,
    COLUMNS.MASKED_UPPER_BOUND_AGE_26TO49,
    COLUMNS.MASKED_UPPER_BOUND_AGE_50TO69,
    COLUMNS.MASKED_UPPER_BOUND_AGE_70TO79,
    COLUMNS.MASKED_UPPER_BOUND_AGE_80PLUS,
]

# Wide to long: one row per (site, sex, age group). The bound columns ride
# along as id columns so each row can pick its own bound below.
df_dm = df_dm.melt(
    id_vars=[COLUMNS.SITE_ID, COLUMNS.SEX] + masked_bound_columns
).rename(columns={"variable": COLUMNS.AGE_GROUP, "value": COLUMNS.NUM_PATIENTS})

# For each age group, derive the plotted range from that group's own bound:
#   under = reported count, upper = reported count + masked upper bound,
#   num_patients = midpoint of the two.
for age_column in age_columns:
    rows = df_dm[COLUMNS.AGE_GROUP] == age_column
    # Read both inputs once, before num_patients is overwritten.
    reported = df_dm.loc[rows, COLUMNS.NUM_PATIENTS]
    masked_bound = df_dm.loc[rows, "masked_upper_bound_" + age_column]
    df_dm.loc[rows, "upper"] = reported + masked_bound
    df_dm.loc[rows, "under"] = reported
    df_dm.loc[rows, COLUMNS.NUM_PATIENTS] = reported + masked_bound / 2.0

# Drop the rows whose sex is "ALL" (presumably the all-sexes total - confirm),
# leaving only the individual sex values.
df_dm = df_dm[df_dm[COLUMNS.SEX] != "ALL"]

# The bound columns have served their purpose.
df_dm = df_dm.drop(columns=masked_bound_columns)

# Sum over sites per (sex, age group) and label the result as the combined series.
df_dm_combined = df_dm.groupby([COLUMNS.SEX, COLUMNS.AGE_GROUP]).agg("sum").reset_index()
df_dm_combined[COLUMNS.SITE_ID] = COMBINED

# Per-site rows followed by the combined rows.
df_dm = pd.concat([df_dm, df_dm_combined])

df_dm

Unnamed: 0,siteid,sex,age_group,num_patients,upper,under
1,Italy,Female,age_0to2,4.5,9.0,0.0
2,Italy,Male,age_0to2,4.5,9.0,0.0
3,Italy,Other,age_0to2,0.0,0.0,0.0
5,France,Female,age_0to2,4.5,9.0,0.0
6,France,Male,age_0to2,4.5,9.0,0.0
...,...,...,...,...,...,...
22,Combined,Other,age_3to5,0.0,0.0,0.0
23,Combined,Other,age_50to69,0.0,0.0,0.0
24,Combined,Other,age_6to11,0.0,0.0,0.0
25,Combined,Other,age_70to79,0.0,0.0,0.0


In [73]:
# Sex -> bar color; "Other" is drawn in gray.
color_scale = alt.Scale(domain=["Male", "Female", "Other"], range=COLOR20[:2] + ["gray"])

# Facet order for the age groups. The entries must be the values actually stored
# in the age-group column (the COLUMNS.AGE_* names, e.g. "age_0to2"). A list of
# labels that never occur in the data is ignored and the facets fall back to
# alphabetical order, which puts "age_12to17" before "age_3to5".
AGE_GROUP_ORDER = [
    COLUMNS.AGE_0TO2,
    COLUMNS.AGE_3TO5,
    COLUMNS.AGE_6TO11,
    COLUMNS.AGE_12TO17,
    COLUMNS.AGE_18TO25,
    COLUMNS.AGE_26TO49,
    COLUMNS.AGE_50TO69,
    COLUMNS.AGE_70TO79,
    COLUMNS.AGE_80PLUS,
]

# Only per-site rows are drawn; the combined series is excluded.
filtered_chart = alt.Chart(df_dm).transform_filter(
    alt.datum[COLUMNS.SITE_ID] != COMBINED
)

# One bar per sex within each age-group facet.
base = filtered_chart.mark_bar().encode(
    x=alt.X(f"{COLUMNS.SEX}:N", title=None, axis=None),
    y=alt.Y(f"{COLUMNS.NUM_PATIENTS}:Q", title="Number of patients", axis=alt.Axis(tickCount=5)),
    color=alt.Color(f"{COLUMNS.SEX}:N", title=None, scale=color_scale),
).properties(
    width=70,
    height=400
)

# Error bar spanning the "under".."upper" range of each bar.
errorbar = filtered_chart.mark_errorbar().encode(
    x=alt.X(
        f"{COLUMNS.SEX}:N", title=None,
    ),y=alt.Y(
        "upper:Q", title=""
    ),y2=alt.Y2(
        "under:Q"
    ),
    color=alt.value("black"),
    size=alt.value(1.5)
)

apply_grouped_bar_theme(
    alt.layer(base, errorbar).facet(
        column=alt.Column(
            f"{COLUMNS.AGE_GROUP}:O",
            sort=AGE_GROUP_ORDER,
            header=alt.Header(labelOrient="bottom", title="Age group", titleOrient="bottom")
        )
    ).properties(
        title={
            "text": "Demographics",
            "subtitle": SUBTITLE
        }
    ),
    strokeColor="lightgray"
)

# Labs

In [None]:
# Load the combined lab results and discard the site-count columns not used below.
df_lb = read_combined_labs_df().drop(columns=[
    COLUMNS.UNMASKED_SITES_NUM_PATIENTS,
    COLUMNS.MASKED_SITES_NUM_PATIENTS
])

# Zero negative values
for column in (COLUMNS.NUM_PATIENTS, COLUMNS.MEAN_VALUE, COLUMNS.STDEV_VAL):
    df_lb.loc[df_lb[column] < 0, column] = 0

# Band around the mean lab value: mean +/- one standard deviation.
df_lb["upper"] = df_lb[COLUMNS.MEAN_VALUE] + df_lb[COLUMNS.STDEV_VAL]
df_lb["under"] = df_lb[COLUMNS.MEAN_VALUE] - df_lb[COLUMNS.STDEV_VAL]

# Range of the patient count: reported count up to reported + masked upper bound;
# num_patients itself becomes the midpoint of that range.
df_lb["upper_p"] = df_lb[COLUMNS.NUM_PATIENTS] + df_lb[COLUMNS.MASKED_UPPER_BOUND_NUM_PATIENTS]
df_lb["under_p"] = df_lb[COLUMNS.NUM_PATIENTS]
df_lb[COLUMNS.NUM_PATIENTS] += (df_lb["upper_p"] - df_lb["under_p"]) / 2.0

# Lookup table: LOINC code -> test name and unit.
loinc_df = read_loinc_df().set_index('loinc').rename(columns={'labTest': 'name'})

def _loinc_display_name(code):
    """Return "<name> (<unit>)" for a LOINC code, or just "<name>" when the unit is "-1".

    Codes missing from the lookup table fall back to the code itself, mirroring
    how the diagnoses cell handles ICD codes without a description, instead of
    aborting the whole cell with a KeyError.
    """
    if code not in loinc_df.index:
        return str(code)
    name = loinc_df.at[code, "name"]
    unit = loinc_df.at[code, "unit"]
    # A unit of "-1" appears to mark a test without a unit - confirm with the lookup file.
    if unit == "-1":
        return name
    return name + " (" + unit + ")"

df_lb["loinc_name"] = df_lb[COLUMNS.LOINC].apply(_loinc_display_name)

# Drop unused columns
df_lb = df_lb.drop(columns=[
    COLUMNS.MASKED_UPPER_BOUND_NUM_PATIENTS
])

df_lb

In [None]:
LOINCS = df_lb["loinc_name"].unique()

# One color per lab test. The palette is cycled so that more than
# len(COLOR20) distinct tests reuse colors instead of raising IndexError.
COLOR_BY_LOINC = {name: COLOR20[i % len(COLOR20)] for i, name in enumerate(LOINCS)}

def lab_chart(test):
    """Return the two-panel chart for one lab test.

    `test` is a value of the "loinc_name" column of `df_lb` and a key of
    COLOR_BY_LOINC. The top panel shows the mean lab value per day since the
    positive test (line and circles) with an "under".."upper" band. The bottom
    panel shows the summed number of tested patients per day as gray bars with
    "under_p".."upper_p" error bars. The filter on "loinc_name" is applied to
    the concatenated chart, so both panels show only rows for `test`.
    """
    test_color = alt.value(COLOR_BY_LOINC[test])

    # Top panel: mean value over time. Axis labels/ticks are hidden because the
    # bottom panel carries the x axis.
    line = alt.Chart(df_lb).mark_line(size=1, opacity=1).encode(
        x=alt.X(
            f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
            title=None,
            axis=alt.Axis(
                grid=True,
                labelFontSize=0,
                labelOpacity=0, tickOpacity=0
            )
        ),
        y=alt.Y(
            f"mean({COLUMNS.MEAN_VALUE}):Q",
            title="Mean value (stdev)"
        ),
        color=test_color
    ).properties(height=150, width=500)

    circle = line.mark_circle(size=10)
    errorband = alt.Chart(df_lb).mark_errorband().encode(
        x=alt.X(
            f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
            title=None,
        ),
        y=alt.Y(
            "under:Q",
            title="",
        ),
        y2="upper:Q",
        color=test_color
    )
    # Thin, faint error bars over the band, using the band's encodings.
    white_errorline = errorband.mark_errorbar().encode(
        size=alt.value(1),
        opacity=alt.value(0.3)
    )

    top_chart = (circle + line + errorband + white_errorline)

    # Bottom panel: number of tested patients per day. Derived from `line`, so
    # it starts from that chart's settings and overrides x, y, color and height.
    bar = line.mark_bar().encode(
        y=alt.Y(
            f"sum({COLUMNS.NUM_PATIENTS}):Q",
            title="# of tested"
        ),
        x=alt.X(
            f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
            # bin=alt.Bin(step=1),
            title="Days since positive",
            axis=alt.Axis(
                grid=True,
                # Blank the label on odd day values.
                labelExpr="abs(parseInt(datum.value)) % 2 == 1 ? null : datum.label"
            ),
            scale=alt.Scale(domain=[-15, 30])
        ),
        color=alt.value("gray")
    ).properties(height=80)

    errorbar = alt.Chart(df_lb).mark_errorbar().encode(
        x=alt.X(
            f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
        ),
        y=alt.Y(
            "under_p:Q",
            title="",
        ),
        y2="upper_p:Q",
        color=alt.value("black"),
        size=alt.value(1)
    )

    bottom_chart = (bar + errorbar)

    return apply_theme(
        top_chart & bottom_chart
    ).transform_filter(
        alt.datum["loinc_name"] == test
    ).resolve_scale(y="independent", x="shared").properties(
        title={
            "text": test,
            "subtitle": SUBTITLE
        })

# interact(lab_chart, test=LOINCS,)

for t in LOINCS:
    lab_chart(test=t).display()
    # break # TODO: for debug
    # break # TODO: for debug

# DEPRECATED CODE BELOW

# Diagnoses

In [None]:
# Load the combined diagnoses and discard the site-count columns not used below.
df_dg = read_combined_diagnoses_df().drop(columns=[
    COLUMNS.UNMASKED_SITES_NUM_PATIENTS,
    COLUMNS.MASKED_SITES_NUM_PATIENTS
])

# Range of the patient count: "under" is the reported count, "upper" adds the
# masked upper bound, and num_patients becomes the midpoint of the two.
df_dg = df_dg.rename(columns={COLUMNS.MASKED_UPPER_BOUND_NUM_PATIENTS: "upper"})
df_dg["upper"] += df_dg[COLUMNS.NUM_PATIENTS]
df_dg["under"] = df_dg[COLUMNS.NUM_PATIENTS]
df_dg[COLUMNS.NUM_PATIENTS] += (df_dg["upper"] - df_dg["under"]) / 2.0

# Our lookup table does not contain dots
df_dg[COLUMNS.ICD_CODE] = df_dg[COLUMNS.ICD_CODE].str.replace(".", "", regex=False)

# Merge with a lookup table
icd_df = read_icd_df()
df_dg = df_dg.merge(icd_df, how="left", left_on=COLUMNS.ICD_CODE, right_on="ICDcode")

# Handle the missing data: codes absent from the lookup table are shown by
# their ICD code, then capitalization is made consistent.
for lookup_column in ("ICDdescription", "Category"):
    df_dg[lookup_column] = (
        df_dg[lookup_column]
        .fillna(df_dg[COLUMNS.ICD_CODE])
        .str.capitalize()
    )

# Keep only rows with at least 10 patients (num_patients is the midpoint computed above).
df_dg = df_dg[df_dg[COLUMNS.NUM_PATIENTS] >= 10]

df_dg

In [None]:
# Add filter

def diagnoses_chart(YAxis):
    """Return the diagnoses dot plot with error bars for the chosen y-axis field.

    `YAxis` selects which column of `df_dg` labels the rows: "ICD Description",
    "ICD Category", or anything else for the ICD code itself. Rows are ordered
    by descending num_patients. Circles show the summed num_patients and the
    error bars span "under".."upper".
    """
    yfield = {
        "ICD Description": "ICDdescription",
        "ICD Category": "Category",
    }.get(YAxis, COLUMNS.ICD_CODE)

    # Row labels ordered from most to fewest patients.
    sort = df_dg.sort_values(by=[COLUMNS.NUM_PATIENTS], ascending=False)[yfield].unique()

    errorbar = alt.Chart(df_dg).mark_errorbar().encode(
        x=alt.X(
            "upper:Q", title=""
        ),
        x2=alt.X2(
            "under:Q"
        ),
        y=alt.Y(f"{yfield}:N", title=None, sort=sort),
        size=alt.value(1)
    )

    # Both layers share the y axis, so both declare the same ordering; leaving
    # it on one layer only gives the layers conflicting sort specifications.
    base = alt.Chart(df_dg).mark_circle(size=50, color="black").encode(
        x=alt.X(f"sum({COLUMNS.NUM_PATIENTS}):Q", title="Number of patients", axis=alt.Axis(tickCount=5)),
        y=alt.Y(f"{yfield}:N", title=None, sort=sort, axis=alt.Axis(grid=True))
    ).properties(
        title={
            "text": "Diagnoses starting 7 days before positive test (Patients >= 10)",
            "subtitle": SUBTITLE
        },
        width=500
    )

    chart = apply_theme(base + errorbar)
    return chart

interact(
    diagnoses_chart,
    YAxis=["ICD Description", "ICD Code", "ICD Category"]
)

In [None]:
# Deprecated trellis view of all lab tests at once. The chart reads `df_lb`,
# the labs frame built above, which holds every field encoded here
# (days since positive, mean value, "under"/"upper", "loinc_name").
line = alt.Chart(df_lb).mark_line(size=1, opacity=1).encode(
    x=alt.X(
        f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
        title=None,
        axis=alt.Axis(
            grid=True,
            labelOpacity=0, tickOpacity=0
        )
    ),
    y=alt.Y(
        f"mean({COLUMNS.MEAN_VALUE}):Q",
        title=None,
        axis=alt.Axis(orient="right")
    ),
    color=alt.Color("loinc_name:N", scale=alt.Scale(scheme="category20"), legend=None),
).properties(height=150, width=500)

circle = line.mark_circle(size=10)
errorband = alt.Chart(df_lb).mark_errorband().encode(
    x=alt.X(
        f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
        title=None,
    ),
    y=alt.Y(
        "under:Q",
        title=None,
    ),
    y2="upper:Q",
    color=alt.Color("loinc_name:N", scale=alt.Scale(scheme="category20"), legend=None),
)

# One row per lab test, each with its own y scale.
top_chart = (circle + line + errorband).facet(
    row=alt.Row(
        "loinc_name:N",
        header=alt.Header(labelAngle=0, labelAlign="left", labelAnchor="middle", labelColor="black", title=None)
    ),
).resolve_scale(y="independent")

# Summed number of tested patients per day, drawn below the trellis.
bottom_chart = line.mark_bar().encode(
    y=alt.Y(
        f"sum({COLUMNS.NUM_PATIENTS}):Q",
        title="Number of tested patients",
        axis=alt.Axis(
            tickCount=2,
            titleAngle=0,
            titleAlign="right",
            titleBaseline="middle",
            titlePadding=-545,
            orient="right"
        )
    ),
    x=alt.X(
        f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q",
        # bin=alt.Bin(step=1),
        title="Days since positive",
        axis=alt.Axis(
            grid=True,
            # Blank the label on odd day values.
            labelExpr="abs(parseInt(datum.value)) % 2 == 1 ? null : datum.label"
        )
    ),
    color=alt.value("gray")
).properties(height=45)

apply_trellis_theme(
    top_chart & bottom_chart
).resolve_scale(y="independent", x="shared").properties(
    title={
        "text": "LOINC test results",
        "subtitle": SUBTITLE
    })