In [None]:
# Auto-reload frequently changed files
%load_ext autoreload
%autoreload 2
%aimport utils

import pandas as pd
import numpy as np
import altair as alt
from ipywidgets import interact
from os.path import join

from constants import COLUMNS
from utils import (
    read_combined_daily_counts_df,
    read_combined_demographics_df, 
    read_combined_diagnoses_df,
    read_combined_labs_df,
    apply_theme, apply_grouped_bar_theme, apply_trellis_theme,
    read_icd_df, read_loinc_df
)

# Daily Counts

In [None]:
# Load the combined daily counts and reshape them into a long table that holds,
# for every site/date/category row, a midpoint estimate with lower/upper bounds.
df = read_combined_daily_counts_df()

# The per-site breakdown columns are not used below; remove them first so the
# melt only has to deal with the count columns.
site_breakdown_columns = [
    COLUMNS.UNMASKED_SITES_NEW_POSITIVE_CASES,
    COLUMNS.UNMASKED_SITES_PATIENTS_IN_ICU,
    COLUMNS.UNMASKED_SITES_NEW_DEATHS,
    COLUMNS.MASKED_SITES_NEW_POSITIVE_CASES,
    COLUMNS.MASKED_SITES_PATIENTS_IN_ICU,
    COLUMNS.MASKED_SITES_NEW_DEATHS,
]
df = df.drop(columns=site_breakdown_columns)

# Wide to long: every remaining count column becomes one row per category,
# while the masked-upper-bound columns ride along as identifier columns.
CATEGORY = "category"
masked_bound_columns = [
    COLUMNS.MASKED_UPPER_BOUND_NEW_POSITIVE_CASES,
    COLUMNS.MASKED_UPPER_BOUND_PATIENTS_IN_ICU,
    COLUMNS.MASKED_UPPER_BOUND_NEW_DEATHS,
]
df = pd.melt(df, id_vars=[COLUMNS.SITE_ID, COLUMNS.DATE] + masked_bound_columns)
df = df.rename(columns={"variable": CATEGORY, "value": COLUMNS.NUM_PATIENTS})

# For each category, only the bound column that belongs to it is relevant:
#   under = reported count
#   upper = reported count + masked upper bound
#   num_patients = midpoint between the two
# NOTE(review): presumably the masked upper bound is the largest count that
# masking may have hidden — confirm against the data dictionary.
for category in (COLUMNS.NEW_POSITIVE_CASES, COLUMNS.PATIENTS_IN_ICU, COLUMNS.NEW_DEATHS):
    in_category = df[CATEGORY] == category
    reported = df.loc[in_category, COLUMNS.NUM_PATIENTS]
    masked_bound = df.loc[in_category, "masked_upper_bound_" + category]
    df.loc[in_category, "upper"] = reported + masked_bound
    df.loc[in_category, "under"] = reported
    df.loc[in_category, COLUMNS.NUM_PATIENTS] = reported + masked_bound / 2.0

# The bound columns have been folded into 'upper'/'under'; they are no longer needed.
df = df.drop(columns=masked_bound_columns)

df.head()

In [None]:
# Categories drawn in the chart and the colors assigned to them (paired by position).
MID_FIELDS = [COLUMNS.NEW_POSITIVE_CASES, COLUMNS.NEW_DEATHS, COLUMNS.PATIENTS_IN_ICU]
THREE_COLORS = ["#377FB8", "#CA2026", "#60B75D"]
# NOTE(review): COLOR_BY_FIELD is not referenced in this cell, and its pairing
# differs from the positional MID_FIELDS/THREE_COLORS pairing used by the chart.
# Confirm which mapping is the intended one.
COLOR_BY_FIELD = {
    COLUMNS.NEW_POSITIVE_CASES: "#CA2026",
    COLUMNS.NEW_DEATHS: "#60B75D",
    COLUMNS.PATIENTS_IN_ICU: "#377FB8"
}
GRAY_COLOR = "lightgray"


def _daily_date_x():
    """Build the temporal x encoding shared by the line and the error band."""
    return alt.X(f"{COLUMNS.DATE}:T", axis=alt.Axis(tickCount=7), title=None)


def _daily_category_color(**color_options):
    """Build the per-category color encoding; extra keyword options go to alt.Color."""
    category_scale = alt.Scale(domain=MID_FIELDS, range=THREE_COLORS)
    return alt.Color(f"{CATEGORY}:N", scale=category_scale, **color_options)


# Midpoint estimate per date, drawn as a line with a dot on every observation.
line = alt.Chart(df).mark_line(size=2).encode(
    x=_daily_date_x(),
    y=alt.Y(
        f"{COLUMNS.NUM_PATIENTS}:Q",
        axis=alt.Axis(tickCount=5),
        title="Number of patients",
        scale=alt.Scale(domain=[-1,170])
    ),
    color=_daily_category_color(legend=alt.Legend(title=None)),
)
point = line.mark_circle(size=30)

# Band spanning the 'under'..'upper' range of every observation.
errorband = alt.Chart(df).mark_errorband().encode(
    x=_daily_date_x(),
    y=alt.Y("upper:Q", title=""),
    y2=alt.Y2("under:Q"),
    color=_daily_category_color(),
)

# All three layers use one color scale so the legend covers every layer.
agg_chart = (line + point + errorband).resolve_scale(color="shared")

daily_counts_chart = apply_theme(agg_chart).properties(
    width=750,
    height=400,
    title="Number of Positive Cases, Patients in ICU, and Deaths",
)
daily_counts_chart.interactive()

# Demographics

In [None]:
# Load the combined demographics and reshape them into a long table with one
# row per site/sex/age group, carrying a midpoint estimate and its bounds.
df = read_combined_demographics_df()

# Age-group count columns, youngest first.
age_groups = [
    COLUMNS.AGE_0TO2,
    COLUMNS.AGE_3TO5,
    COLUMNS.AGE_6TO11,
    COLUMNS.AGE_12TO17,
    COLUMNS.AGE_18TO25,
    COLUMNS.AGE_26TO49,
    COLUMNS.AGE_50TO69,
    COLUMNS.AGE_70TO79,
    COLUMNS.AGE_80PLUS,
]

# Masked-upper-bound column of every age group, in the same order.
masked_bound_columns = [
    COLUMNS.MASKED_UPPER_BOUND_AGE_0TO2,
    COLUMNS.MASKED_UPPER_BOUND_AGE_3TO5,
    COLUMNS.MASKED_UPPER_BOUND_AGE_6TO11,
    COLUMNS.MASKED_UPPER_BOUND_AGE_12TO17,
    COLUMNS.MASKED_UPPER_BOUND_AGE_18TO25,
    COLUMNS.MASKED_UPPER_BOUND_AGE_26TO49,
    COLUMNS.MASKED_UPPER_BOUND_AGE_50TO69,
    COLUMNS.MASKED_UPPER_BOUND_AGE_70TO79,
    COLUMNS.MASKED_UPPER_BOUND_AGE_80PLUS,
]

# Columns that play no role in the age/sex breakdown: the per-site breakdowns
# and everything about the total patient count.
unmasked_site_columns = [
    COLUMNS.UNMASKED_SITES_TOTAL_PATIENTS,
    COLUMNS.UNMASKED_SITES_AGE_0TO2,
    COLUMNS.UNMASKED_SITES_AGE_3TO5,
    COLUMNS.UNMASKED_SITES_AGE_6TO11,
    COLUMNS.UNMASKED_SITES_AGE_12TO17,
    COLUMNS.UNMASKED_SITES_AGE_18TO25,
    COLUMNS.UNMASKED_SITES_AGE_26TO49,
    COLUMNS.UNMASKED_SITES_AGE_50TO69,
    COLUMNS.UNMASKED_SITES_AGE_70TO79,
    COLUMNS.UNMASKED_SITES_AGE_80PLUS,
]
masked_site_columns = [
    COLUMNS.MASKED_SITES_TOTAL_PATIENTS,
    COLUMNS.MASKED_SITES_AGE_0TO2,
    COLUMNS.MASKED_SITES_AGE_3TO5,
    COLUMNS.MASKED_SITES_AGE_6TO11,
    COLUMNS.MASKED_SITES_AGE_12TO17,
    COLUMNS.MASKED_SITES_AGE_18TO25,
    COLUMNS.MASKED_SITES_AGE_26TO49,
    COLUMNS.MASKED_SITES_AGE_50TO69,
    COLUMNS.MASKED_SITES_AGE_70TO79,
    COLUMNS.MASKED_SITES_AGE_80PLUS,
]
total_patient_columns = [
    COLUMNS.MASKED_UPPER_BOUND_TOTAL_PATIENTS,
    COLUMNS.TOTAL_PATIENTS,
]
df = df.drop(columns=unmasked_site_columns + masked_site_columns + total_patient_columns)

# Wide to long: each age-group column becomes a row; the bound columns are
# kept as identifiers so they can be matched to their age group below.
df = pd.melt(df, id_vars=[COLUMNS.SITE_ID, COLUMNS.SEX] + masked_bound_columns)
df = df.rename(columns={"variable": COLUMNS.AGE_GROUP, "value": COLUMNS.NUM_PATIENTS})

# For each age group use only its own bound column:
#   under = reported count, upper = reported count + masked upper bound,
#   num_patients = midpoint between the two.
for age_group in age_groups:
    in_group = df[COLUMNS.AGE_GROUP] == age_group
    reported = df.loc[in_group, COLUMNS.NUM_PATIENTS]
    masked_bound = df.loc[in_group, "masked_upper_bound_" + age_group]
    df.loc[in_group, "upper"] = reported + masked_bound
    df.loc[in_group, "under"] = reported
    df.loc[in_group, COLUMNS.NUM_PATIENTS] = reported + masked_bound / 2.0

# Rows whose sex is "ALL" are excluded; only the per-sex rows are kept.
df = df[df[COLUMNS.SEX] != "ALL"]

# The bound columns have been folded into 'upper'/'under'.
df = df.drop(columns=masked_bound_columns)

df

In [None]:
# Bar color per sex value.
# NOTE(review): assumes the sex column uses exactly these labels — confirm against the data.
color_scale = alt.Scale(domain=["Male", "Female", "Other"], range=["#377FB8", "#CA2026", "gray"])

# Youngest-to-oldest facet order. The age-group column holds the names of the
# melted count columns (the COLUMNS.AGE_* values), so the order is built from
# those same constants; a hand-written list of labels only sorts correctly if
# it happens to match those values character for character.
age_group_order = [
    COLUMNS.AGE_0TO2,
    COLUMNS.AGE_3TO5,
    COLUMNS.AGE_6TO11,
    COLUMNS.AGE_12TO17,
    COLUMNS.AGE_18TO25,
    COLUMNS.AGE_26TO49,
    COLUMNS.AGE_50TO69,
    COLUMNS.AGE_70TO79,
    COLUMNS.AGE_80PLUS,
]

# Both layers must address the same field, so take its name from COLUMNS
# instead of repeating the literal column name.
sex_field = f"{COLUMNS.SEX}:N"

# One bar per sex showing the midpoint patient count.
base = alt.Chart(df).mark_bar().encode(
    x=alt.X(sex_field, title=None, axis=None),
    y=alt.Y(f"{COLUMNS.NUM_PATIENTS}:Q", title="Number of patients", axis=alt.Axis(tickCount=5)),
    color=alt.Color(sex_field, title=None, scale=color_scale),
).properties(
    width=70,
    height=400
)

# Black error bar spanning the 'under'..'upper' range of each bar.
errorbar = alt.Chart(df).mark_errorbar().encode(
    x=alt.X(sex_field, title=None),
    y=alt.Y("upper:Q", title=""),
    y2=alt.Y2("under:Q"),
    color=alt.value("black"),
    size=alt.value(1.5)
)

# One column of bars per age group, with the group label underneath.
apply_grouped_bar_theme(
    alt.layer(base, errorbar).facet(
        column=alt.Column(
            f"{COLUMNS.AGE_GROUP}:O",
            sort=age_group_order,
            header=alt.Header(labelOrient="bottom", title="Age group", titleOrient="bottom")
        )
    ).properties(title="Demographics"),
    strokeColor="lightgray"
)

# Diagnoses

In [None]:
# Load the combined diagnoses, derive count bounds, and attach readable ICD
# descriptions/categories from the lookup table.
df = read_combined_diagnoses_df()

# The per-site breakdown columns are not used below.
df = df.drop(columns=[
    COLUMNS.UNMASKED_SITES_NUM_PATIENTS,
    COLUMNS.MASKED_SITES_NUM_PATIENTS
])

# Turn the masked upper bound into an absolute upper value, keep the reported
# count as the lower value, and replace the count by the midpoint of the two.
df = df.rename(columns={COLUMNS.MASKED_UPPER_BOUND_NUM_PATIENTS: "upper"})
df["upper"] += df[COLUMNS.NUM_PATIENTS]
df["under"] = df[COLUMNS.NUM_PATIENTS]
df[COLUMNS.NUM_PATIENTS] = df[COLUMNS.NUM_PATIENTS] + (df["upper"] - df["under"]) / 2.0

# Our lookup table does not contain dots. regex=False makes "." a literal dot
# (as a regular expression it would match every character), and the
# vectorised string method leaves missing codes as missing instead of raising.
df[COLUMNS.ICD_CODE] = df[COLUMNS.ICD_CODE].str.replace(".", "", regex=False)

# Merge with the lookup table; codes without a match keep their row (left join).
icd_df = read_icd_df()
df = df.merge(icd_df, how="left", left_on=COLUMNS.ICD_CODE, right_on="ICDcode")

# Codes missing from the lookup fall back to the bare ICD code, then both
# label columns get consistent capitalization.
for label_column in ("ICDdescription", "Category"):
    df[label_column] = df[label_column].fillna(df[COLUMNS.ICD_CODE]).str.capitalize()

# Keep only diagnoses whose midpoint patient count is at least 10.
df = df[df[COLUMNS.NUM_PATIENTS] >= 10]

df

In [None]:
# Add filter

def diagnoses_chart(YAxis):
    """Build the diagnoses dot plot with error bars for the chosen y-axis grouping.

    Args:
        YAxis: Label of the grouping to put on the y axis. "ICD Description"
            and "ICD Category" select the corresponding lookup columns; any
            other value (including "ICD Code") groups by the raw ICD code.

    Returns:
        The layered chart (dots plus error bars) after ``apply_theme``.
    """
    field_by_label = {
        "ICD Description": "ICDdescription",
        "ICD Category": "Category",
    }
    # Use the shared column constant for the fallback instead of repeating the
    # column name as a literal.
    yfield = field_by_label.get(YAxis, COLUMNS.ICD_CODE)

    # Order of the y axis: values as they first appear when rows are sorted by
    # descending patient count. A plain list keeps the chart spec serializable.
    sort = df.sort_values(by=[COLUMNS.NUM_PATIENTS], ascending=False)[yfield].unique().tolist()

    # Horizontal bar spanning the 'under'..'upper' range of each row.
    errorbar = alt.Chart(df).mark_errorbar().encode(
        x=alt.X("upper:Q", title=""),
        x2=alt.X2("under:Q"),
        y=alt.Y(f"{yfield}:N", title=None, sort=sort),
        size=alt.value(1)
    )

    # Dot at the summed patient count of each y value. The layers share one y
    # scale, so the same explicit order is given here as well; otherwise the
    # two layers would declare different sort orders for that shared scale.
    base = alt.Chart(df).mark_circle(size=50, color="black").encode(
        x=alt.X(f"sum({COLUMNS.NUM_PATIENTS}):Q", title="Number of patients", axis=alt.Axis(tickCount=5)),
        y=alt.Y(f"{yfield}:N", title=None, sort=sort, axis=alt.Axis(grid=True))
    ).properties(
        title="Diagnoses starting 7 days before positive test (Patients >= 10)",
        width=500
    )

    chart = apply_theme(base + errorbar)
    return chart

# Let the reader choose the y-axis grouping: ipywidgets builds a selection
# widget from the list of options and calls diagnoses_chart with the chosen
# label, re-rendering the chart on every change.
interact(
    diagnoses_chart, 
    YAxis=["ICD Description", "ICD Code", "ICD Category"]
)

# Labs

In [None]:
# Load the combined lab results, derive the value band (mean ± stdev) and the
# patient-count bounds, and attach a readable test name.
df = read_combined_labs_df()

# The per-site breakdown columns are not used below.
site_breakdown_columns = [
    COLUMNS.UNMASKED_SITES_NUM_PATIENTS,
    COLUMNS.MASKED_SITES_NUM_PATIENTS,
]
df = df.drop(columns=site_breakdown_columns)

# Negative entries are replaced by zero in each of these columns.
# NOTE(review): presumably negative numbers are masking/missing markers — confirm.
for clamped_column in (COLUMNS.NUM_PATIENTS, COLUMNS.MEAN_VALUE, COLUMNS.STDEV_VAL):
    is_negative = df[clamped_column] < 0
    df.loc[is_negative, clamped_column] = 0

# Band of one standard deviation around the mean lab value.
mean_value = df[COLUMNS.MEAN_VALUE]
stdev_value = df[COLUMNS.STDEV_VAL]
df["upper"] = mean_value + stdev_value
df["under"] = mean_value - stdev_value

# Patient-count bounds: reported count up to reported count + masked upper
# bound; the count itself is moved to the midpoint of that range.
df["upper_p"] = df[COLUMNS.NUM_PATIENTS] + df[COLUMNS.MASKED_UPPER_BOUND_NUM_PATIENTS]
df["under_p"] = df[COLUMNS.NUM_PATIENTS]
df[COLUMNS.NUM_PATIENTS] = df[COLUMNS.NUM_PATIENTS] + (df["upper_p"] - df["under_p"]) / 2.0

# LOINC lookup table indexed by code, with the test label exposed as 'name'.
loinc_df = read_loinc_df().rename(columns={'labTest': 'name'}).set_index('loinc')


def _loinc_display_name(code):
    """Return the capitalized lookup name of a LOINC code (KeyError if unknown)."""
    return loinc_df.at[code, "name"].capitalize()


df["loinc_name"] = df[COLUMNS.LOINC].apply(_loinc_display_name)

# The bound column has been folded into 'upper_p'.
df = df.drop(columns=[COLUMNS.MASKED_UPPER_BOUND_NUM_PATIENTS])

df

In [None]:
# Lab tests to chart, in display order.
# NOTE(review): the names are presumably expected to equal the lookup table's
# test labels once capitalized — confirm against the LOINC lookup.
LOINCS = [
    "alanine aminotransferase (ALT)",
    "white blood cell count (Leukocytes)",
    "neutrophil count",
    "lymphocyte count",
    "albumin",
    "lactate dehydrogenase (LDH)",
    "aspartate aminotransferase (AST)",
    "total bilirubin",
    "creatinine",
    "cardiac troponin",
    "D-dimer",
    "prothrombin time (PT)",
    "procalcitonin",
    "C-reactive protein (CRP)",
]

# Twenty-color categorical palette; only the first len(LOINCS) entries are used.
COLOR20 = [
    "#3366cc", "#dc3912", "#ff9900", "#109618", "#990099",
    "#0099c6", "#dd4477", "#66aa00", "#b82e2e", "#316395",
    "#994499", "#22aa99", "#aaaa11", "#6633cc", "#e67300",
    "#8b0707", "#651067", "#329262", "#5574a6", "#3b3eac",
]

# Capitalized test name -> color, paired by position in the two lists above.
COLOR_BY_LOINC = dict(zip((name.capitalize() for name in LOINCS), COLOR20))

def lab_chart(test):
    """Build the two-panel chart of one lab test.

    The top panel shows the mean lab value per day as a line with dots plus a
    band spanning 'under'..'upper'. The bottom panel shows the summed number
    of tested patients per day as bars with 'under_p'..'upper_p' error bars.
    Both panels read the notebook-level ``df`` and are filtered to rows whose
    ``loinc_name`` equals the capitalized test name.

    Args:
        test: Lab test name. It is capitalized and then used both as the
            filter value and as the key into ``COLOR_BY_LOINC`` (an unknown
            name raises ``KeyError``).

    Returns:
        The themed, vertically concatenated chart titled with the test name.
    """
    test = test.capitalize()
    test_color = COLOR_BY_LOINC[test]
    days_field = f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q"

    # --- Top panel: mean value with its band --------------------------------
    # The x labels and ticks are hidden here; the bottom panel carries them.
    hidden_label_axis = alt.Axis(
        grid=True,
        labelFontSize=0,
        labelOpacity=0,
        tickOpacity=0,
    )
    line = alt.Chart(df).mark_line(size=1, opacity=1).encode(
        x=alt.X(days_field, title=None, axis=hidden_label_axis),
        y=alt.Y(f"mean({COLUMNS.MEAN_VALUE}):Q", title="Mean value (stdev)"),
        color=alt.value(test_color),
    ).properties(height=150, width=500)
    circle = line.mark_circle(size=10)

    errorband = alt.Chart(df).mark_errorband().encode(
        x=alt.X(days_field, title=None),
        y=alt.Y("under:Q", title=""),
        y2="upper:Q",
        color=alt.value(test_color),
    )

    top_chart = circle + line + errorband

    # --- Bottom panel: number of tested patients ----------------------------
    # Derived from `line`, so it keeps that chart's data and width; x, y,
    # color and height are overridden.
    labelled_axis = alt.Axis(
        grid=True,
        # Show a label on every second day only.
        labelExpr="abs(parseInt(datum.value)) % 2 == 1 ? null : datum.label",
    )
    bar = line.mark_bar().encode(
        x=alt.X(
            days_field,
            title="Days since positive",
            axis=labelled_axis,
            scale=alt.Scale(domain=[-15, 30]),
        ),
        y=alt.Y(f"sum({COLUMNS.NUM_PATIENTS}):Q", title="# of tested"),
        color=alt.value("gray"),
    ).properties(height=80)

    errorbar = alt.Chart(df).mark_errorbar().encode(
        x=alt.X(days_field),
        y=alt.Y("under_p:Q", title=""),
        y2="upper_p:Q",
        color=alt.value("black"),
        size=alt.value(1),
    )

    bottom_chart = bar + errorbar

    # --- Combine: stack the panels and restrict them to the requested test --
    stacked = apply_theme(top_chart & bottom_chart)
    only_this_test = stacked.transform_filter(alt.datum["loinc_name"] == test)
    return only_this_test.resolve_scale(y="independent", x="shared").properties(title=test)

# Alternative: let the reader pick a single test interactively instead of
# rendering every chart.
# interact(lab_chart, test=LOINCS,)

# Render one chart per lab test listed in LOINCS.
for t in LOINCS:
    lab_chart(test=t).display()
    # break  # uncomment while debugging to render only the first chart

In [None]:
# Trellis overview of all lab tests: one row of mean values (with band) per
# test on top, and one bar chart of tested patients underneath.
days_since_positive_field = f"{COLUMNS.DAYS_SINCE_POSITIVE}:Q"


def _loinc_color():
    """Build the per-test color encoding (category20 scheme, no legend)."""
    return alt.Color("loinc_name:N", scale=alt.Scale(scheme="category20"), legend=None)


# Mean lab value per day; x labels and ticks are hidden because the bottom
# chart carries them, and the y axis is drawn on the right-hand side.
line = alt.Chart(df).mark_line(size=1, opacity=1).encode(
    x=alt.X(
        days_since_positive_field,
        title=None,
        axis=alt.Axis(grid=True, labelOpacity=0, tickOpacity=0),
    ),
    y=alt.Y(
        f"mean({COLUMNS.MEAN_VALUE}):Q",
        title=None,
        axis=alt.Axis(orient="right"),
    ),
    color=_loinc_color(),
).properties(height=150, width=500)

circle = line.mark_circle(size=10)

# Band spanning 'under'..'upper' around the mean.
errorband = alt.Chart(df).mark_errorband().encode(
    x=alt.X(days_since_positive_field, title=None),
    y=alt.Y("under:Q", title=None),
    y2="upper:Q",
    color=_loinc_color(),
)

# One row per lab test, each with its own y scale.
row_header = alt.Header(
    labelAngle=0,
    labelAlign="left",
    labelAnchor="middle",
    labelColor="black",
    title=None,
)
top_chart = (circle + line + errorband).facet(
    row=alt.Row("loinc_name:N", header=row_header),
).resolve_scale(y="independent")

# Summed number of tested patients per day. Derived from `line`, so it keeps
# that chart's data and width; x, y, color and height are overridden.
tested_axis = alt.Axis(
    tickCount=2,
    titleAngle=0,
    titleAlign="right",
    titleBaseline="middle",
    titlePadding=-545,
    orient="right",
)
days_axis = alt.Axis(
    grid=True,
    # Show a label on every second day only.
    labelExpr="abs(parseInt(datum.value)) % 2 == 1 ? null : datum.label",
)
bottom_chart = line.mark_bar().encode(
    x=alt.X(days_since_positive_field, title="Days since positive", axis=days_axis),
    y=alt.Y(
        f"sum({COLUMNS.NUM_PATIENTS}):Q",
        title="Number of tested patients",
        axis=tested_axis,
    ),
    color=alt.value("gray"),
).properties(height=45)

trellis = apply_trellis_theme(top_chart & bottom_chart)
trellis.resolve_scale(y="independent", x="shared").properties(title="LOINC test results")