In [None]:
# Auto-reload frequently changed files
%load_ext autoreload
%autoreload 2
%aimport utils

import pandas as pd
import numpy as np
import altair as alt
from altair_saver import save
from os.path import join
import datetime
import dateutil.parser
from web import for_website

from constants import COLUMNS, DATA_AGGREGATE_TYPES
from utils import (
    read_combined_daily_counts_df, read_combined_by_country_daily_counts_df, read_combined_by_site_daily_counts_df,
    apply_theme
)

In [None]:
"""
Common info that should be defined every time before rendering visualizations
"""
# Unique site ids found in the combined per-site daily counts
SITES = read_combined_by_site_daily_counts_df()[COLUMNS.SITE_ID].unique()

# Text used in chart titles
NUM_SITES = len(SITES)
DATA_DATE = "2020-04-10"
VIS_DATE = "2020-04-10"
SUBTITLE = f"Data as of {DATA_DATE} | {NUM_SITES} Sites | Plots generated on {VIS_DATE}"

# Directory where visualization *.PNG files are saved
SAVE_DIR = join("..", "output")

# Colors: one for the combined series, one per country
COMBINED = "All countries"
COMBINED_COLOR = "#444444"

COUNTRIES = ["France", "Germany", "Italy", "Singapore", "USA"]
COUNTRY_COLOR = ["#0072B2", "#E69F00", "#009E73", "#CC79A7", "#D55E00"]
# COUNTRIES and COUNTRY_COLOR are paired by position
COLOR_BY_COUNTRY = dict(zip(COUNTRIES, COUNTRY_COLOR))


In [None]:
CATEGORY = "category"

def preprocess_daily_df(df_dc):
    """
    Reshape a wide daily-counts frame into long format, one row per category.

    Every column other than the site id, the date and the masking helper
    columns is melted into a (CATEGORY, COLUMNS.NUM_PATIENTS) pair. For rows
    belonging to new positive cases, patients in ICU and new deaths, the
    function then derives:

    - "under": the reported value,
    - "upper": the reported value plus that category's masked upper bound,
    - COLUMNS.NUM_PATIENTS: the midpoint between "under" and "upper",
    - COLUMNS.NUM_SITES: unmasked plus masked site count of that category.

    The helper columns are dropped from the returned frame.
    """
    # Per-category helper columns: carried through the melt, removed at the end
    helper_columns = [
        COLUMNS.MASKED_UPPER_BOUND_NEW_POSITIVE_CASES,
        COLUMNS.MASKED_UPPER_BOUND_PATIENTS_IN_ICU,
        COLUMNS.MASKED_UPPER_BOUND_NEW_DEATHS,
        COLUMNS.UNMASKED_SITES_NEW_POSITIVE_CASES,
        COLUMNS.UNMASKED_SITES_PATIENTS_IN_ICU,
        COLUMNS.UNMASKED_SITES_NEW_DEATHS,
        COLUMNS.MASKED_SITES_NEW_POSITIVE_CASES,
        COLUMNS.MASKED_SITES_PATIENTS_IN_ICU,
        COLUMNS.MASKED_SITES_NEW_DEATHS,
    ]

    # Wide to long
    long_df = pd.melt(df_dc, id_vars=[COLUMNS.SITE_ID, COLUMNS.DATE] + helper_columns)
    long_df = long_df.rename(columns={"variable": CATEGORY, "value": COLUMNS.NUM_PATIENTS})

    # Fill the derived columns only on the rows of the matching category
    for c in (COLUMNS.NEW_POSITIVE_CASES, COLUMNS.PATIENTS_IN_ICU, COLUMNS.NEW_DEATHS):
        rows = long_df[CATEGORY] == c
        reported = long_df.loc[rows, COLUMNS.NUM_PATIENTS]
        masked_bound = long_df.loc[rows, "masked_upper_bound_" + c]

        long_df.loc[rows, "upper"] = reported + masked_bound
        long_df.loc[rows, "under"] = reported
        long_df.loc[rows, COLUMNS.NUM_PATIENTS] = reported + masked_bound / 2.0

        # Number of sites (right-hand side is aligned to the selected rows by index)
        long_df.loc[rows, COLUMNS.NUM_SITES] = (
            long_df["unmasked_sites_" + c] + long_df["masked_sites_" + c]
        )

    return long_df.drop(columns=helper_columns)

# Load the per-country and per-site daily counts in long format
df_dc = preprocess_daily_df(read_combined_by_country_daily_counts_df())
df_dc_site = preprocess_daily_df(read_combined_by_site_daily_counts_df())

# Keep only the per-country rows whose number of sites is non-zero
df_dc = df_dc.loc[df_dc[COLUMNS.NUM_SITES] != 0]

# Only the last expression of a cell is rendered, i.e. df_dc_site
df_dc
df_dc_site

In [None]:
for_category = "new_positive_cases"

# Keep a single category and switch to the column names used by the charts below
df_dc = (
    df_dc
    .loc[df_dc["category"] == for_category]
    .rename(columns={"siteid": "country", "num_patients": "count"})
)

# Countries present in our data; used to filter the JHU data
unique_countries = df_dc["country"].unique().tolist()
df_dc

In [None]:
def convert_date(date_str):
    """
    Parse a date string with dateutil.

    Returns the value produced by `dateutil.parser.parse`, or `np.nan` when the
    input cannot be parsed as a date.
    """
    try:
        return dateutil.parser.parse(date_str)
    except (ValueError, OverflowError, TypeError):
        # Only parse failures are mapped to NaN; anything else (KeyboardInterrupt,
        # NameError from a missing import, ...) propagates instead of being hidden.
        return np.nan

In [None]:

# Countries have different ids in the JHU data than in ours:
# keys are the JHU names, values are the ids used in our data.
country_map = dict(US="USA")

In [None]:
# JHU CSSE time series of confirmed cases: one row per country/state, one column per date
jhu_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
jhu_df = pd.read_csv(jhu_url)

jhu_df = jhu_df.rename(columns={"Country/Region": "country", "Province/State": "state"})
jhu_df = jhu_df.drop(columns=["Lat", "Long"])

# Translate JHU country names to our ids; names without a mapping are kept unchanged
jhu_df["country"] = jhu_df["country"].map(lambda c: country_map.get(c, c))
jhu_df = jhu_df.loc[jhu_df["country"].isin(unique_countries)]
# Keep only the rows that have no state value
jhu_df = jhu_df.loc[jhu_df["state"].isna()]
jhu_df = jhu_df.drop(columns=["state"])

# Wide to long: one row per (country, date)
jhu_df = jhu_df.melt(id_vars=["country"], var_name="date", value_name="cumulative_count")

jhu_df["date"] = jhu_df["date"].astype(str)
jhu_df["date"] = jhu_df["date"].apply(convert_date)
# np.diff / np.gradient below are order-dependent, so the JHU frame itself must be
# in chronological order (the previous code sorted df_dc here by mistake).
jhu_df = jhu_df.sort_values(by="date", ascending=True)

# Collect the per-country frames and concatenate once; DataFrame.append was removed
# in pandas 2.0 and re-copied the whole frame on every iteration.
jhu_country_frames = []
for country, country_df in jhu_df.groupby("country"):
    country_df = country_df.copy()
    cumulative_values = country_df["cumulative_count"].values

    # Day-over-day difference (no previous day for the first row) and numerical gradient
    country_df["change"] = np.concatenate(([np.nan], np.diff(cumulative_values)))
    country_df["gradient"] = np.gradient(cumulative_values)
    # Zero counts become NaN (the charts below draw cumulative counts on a log scale)
    country_df["cumulative_count"] = country_df["cumulative_count"].replace(0, np.nan)

    # Express the daily change as a fraction of the country's maximum cumulative count
    country_df["change"] = country_df["change"] / country_df["cumulative_count"].max()

    jhu_country_frames.append(country_df)

# Column names match what the loop actually produces ("change", not "diff")
jhu_roc_columns = ["country", "date", "cumulative_count", "change", "gradient"]
if jhu_country_frames:
    jhu_roc_df = pd.concat(jhu_country_frames, ignore_index=True)
else:
    jhu_roc_df = pd.DataFrame(columns=jhu_roc_columns)
jhu_roc_df

In [None]:
# Work on an explicit copy so the column assignment below does not write into a
# slice of the previous frame (avoids pandas' SettingWithCopy warning).
df_dc = df_dc.loc[df_dc["category"] == for_category].copy()
df_dc["date"] = df_dc["date"].astype(str).apply(convert_date)
# The cumulative sums below are order-dependent: sort chronologically first
df_dc = df_dc.sort_values(by="date", ascending=True)

# Collect the per-country frames and concatenate once; DataFrame.append was removed
# in pandas 2.0 and re-copied the whole frame on every iteration.
dc_country_frames = []
for country, country_df in df_dc.groupby("country"):
    country_df = country_df.copy()

    # Running totals of the midpoint, upper and lower estimates.
    # Zero totals become NaN (the charts below draw them on a log scale).
    country_df["cumulative_count"] = np.cumsum(country_df["count"].values)
    country_df["cumulative_count"] = country_df["cumulative_count"].replace(0, np.nan)

    country_df["cumulative_upper"] = np.cumsum(country_df["upper"].values)
    country_df["cumulative_upper"] = country_df["cumulative_upper"].replace(0, np.nan)

    country_df["cumulative_under"] = np.cumsum(country_df["under"].values)
    country_df["cumulative_under"] = country_df["cumulative_under"].replace(0, np.nan)

    # Daily values as a fraction of the country's maximum cumulative (midpoint) count
    cumulative_count_max = country_df["cumulative_count"].max()

    country_df["change"] = country_df["count"] / cumulative_count_max
    country_df["change_upper"] = country_df["upper"] / cumulative_count_max
    country_df["change_under"] = country_df["under"] / cumulative_count_max

    dc_country_frames.append(country_df)

# Fallback columns for an empty input: the ones the loop reads and writes
dc_roc_columns = [
    "country", "date", "count", "upper", "under",
    "cumulative_count", "cumulative_upper", "cumulative_under",
    "change", "change_upper", "change_under",
]
if dc_country_frames:
    dc_roc_df = pd.concat(dc_country_frames, ignore_index=True)
else:
    dc_roc_df = pd.DataFrame(columns=dc_roc_columns)
dc_roc_df

In [None]:
# Date window: fixed start, end at the latest date found in either data set
min_date = datetime.datetime(2020, 1, 20)
max_date = max(dc_roc_df["date"].max(), jhu_roc_df["date"].max())

dc_roc_df = dc_roc_df[dc_roc_df["date"] >= min_date]
jhu_roc_df = jhu_roc_df[jhu_roc_df["date"] >= min_date]

# Clicking a legend entry selects that country; unselected countries are drawn in light gray
country_selection = alt.selection_multi(fields=["country"], bind="legend")
country = alt.condition(country_selection, alt.Color("country:N"), alt.value("#EAEAEA"))

# The same x encoding (and date domain) is used by every panel
date_domain = [
    alt.DateTime(year=d.year, month=d.month, date=d.day)
    for d in (min_date, max_date)
]
date_scale = alt.X("date:T", scale=alt.Scale(domain=date_domain))

pct_domain = [0.0, 0.22]
count_domain = [1, 1000000]

# --- Top left: 4CE rate of change, line plus upper/under band ---
roc_4ce_line = (
    alt.Chart(dc_roc_df)
    .mark_line()
    .encode(
        x=date_scale,
        y=alt.Y("change:Q", scale=alt.Scale(domain=pct_domain)),
        color=country,
    )
    .properties(title="Rate of Change per Country (4CE)")
)
roc_4ce_band = (
    alt.Chart(dc_roc_df)
    .mark_errorband()
    .encode(
        x=date_scale,
        y=alt.Y("change_upper:Q", scale=alt.Scale(domain=pct_domain)),
        y2="change_under:Q",
        color=country,
    )
)
roc_4ce_panel = (roc_4ce_line + roc_4ce_band).resolve_scale(y="shared").interactive()

# --- Top right: 4CE cumulative counts (log scale), line plus upper/under band ---
cumulative_4ce_line = (
    alt.Chart(dc_roc_df)
    .mark_line()
    .encode(
        x=date_scale,
        y=alt.Y("cumulative_count:Q", scale=alt.Scale(type="log", domain=count_domain)),
        color=country,
    )
    .properties(title="Country Cumulative Counts (4CE)")
)
cumulative_4ce_band = (
    alt.Chart(dc_roc_df)
    .mark_errorband()
    .encode(
        x=date_scale,
        y=alt.Y("cumulative_upper:Q", scale=alt.Scale(type="log", domain=count_domain)),
        y2="cumulative_under:Q",
        color=country,
    )
)
cumulative_4ce_panel = (cumulative_4ce_line + cumulative_4ce_band).resolve_scale(
    color="shared", y="shared", x="shared"
)

row_4ce = roc_4ce_panel | cumulative_4ce_panel

# --- Bottom row: the same two views for the JHU data (lines only) ---
roc_jhu_panel = (
    alt.Chart(jhu_roc_df)
    .mark_line()
    .encode(
        x=date_scale,
        y=alt.Y("change:Q", scale=alt.Scale(domain=pct_domain)),
        color=country,
    )
    .properties(title="Rate of Change per Country (JHU)")
)
cumulative_jhu_panel = (
    alt.Chart(jhu_roc_df)
    .mark_line()
    .encode(
        x=date_scale,
        y=alt.Y("cumulative_count:Q", scale=alt.Scale(type="log", domain=count_domain)),
        color=country,
    )
    .properties(title="Country Cumulative Counts (JHU)")
)
row_jhu = roc_jhu_panel | cumulative_jhu_panel

# Stack the 4CE row above the JHU row and attach the legend selection to the whole figure
plot = (row_4ce & row_jhu).add_selection(country_selection)

plot