## World Data charting

In [0]:
from datetime import datetime, timedelta, timezone
import dateutil.parser as du_parser
import pandas as pd
import eloader as el
import eplotter as ep

# Fetch every raw dataset through the `eloader` helper module.
df_regions_daily, df_countries_population = el.load_opencovid19_data()
df_world_last_day = el.load_latest_johnhopkins_daily()
df_it_daily, df_it_regional_daily = el.load_pcmdpc_it_data()
df_us_daily, df_us_states_daily, df_us_states_latest = el.load_covidtracking_us_data()

# Fuse the three daily frames into the single frame used by all the plots below.
# NOTE(review): the return value of add_canonical_differentials() is not captured,
# so it presumably updates the frame in place - confirm against eloader.
df_fused_daily = el.fuse_daily_sources(df_regions_daily, df_us_daily, df_it_daily)
el.add_canonical_differentials(df_fused_daily)

### Global death rate by country

In [0]:
# Single death-count threshold, shared by the ranking filter and by the
# "secondary series" test below so that the two cannot drift apart.
min_deaths = 500

# Rank countries by death rate, using only the rows with more than `min_deaths` deaths.
df_over_500_deaths = df_fused_daily[df_fused_daily['Deaths'] > min_deaths]
countries_by_deaths = ep.rank_data_by_metric(df_over_500_deaths, metric='Death_rate', unique_key='CountryName')

ep.scatter_plot_by_series(
    _df=df_fused_daily,
    x_key='X', y_key='Death_rate',
    series_key='CountryName', series_names=countries_by_deaths['CountryName'],
    # a series is secondary when its last row has a population below 5M or fewer than `min_deaths` deaths
    series_is_secondary=(lambda df: df['Population'].iloc[-1] < 5E+06 or df['Deaths'].iloc[-1] < min_deaths),
    series_secondary_width=1,
    y_filter='expo',
    # NOTE(review): bounds looks like [x_min, x_max, y_min, y_max] - confirm against eplotter
    bounds=[70, None, 0, 16],
    legend_decimals=1,
    data_labels="legend", data_labels_align="right",
    title="Death rate by Country, over time",
    label_x="Day of 2020", label_y="Reported deaths / Total cases (percent)",
    # the secondary test above uses deaths (not cases), so the stamp says so
    stamp_1='Grayed-out: low population or death count',
)

### Global confirmed cases by country

In [0]:
# Rank countries by confirmed cases, using only the rows above 1,000 confirmed.
has_over_1000_cases = df_fused_daily['Confirmed'] > 1000
df_over_1000_cases = df_fused_daily[has_over_1000_cases]
ranked_countries_by_cases = ep.rank_data_by_metric(
    df_over_1000_cases,
    metric='Confirmed',
    unique_key='CountryName',
)

# Confirmed-case count handed to `shift_x_to_intersect_y`; also quoted in the title and X label.
intersection_at = 4000

ep.scatter_plot_by_series(
    _df=df_fused_daily,
    x_key='X',
    y_key='Confirmed',
    series_key='CountryName',
    series_names=ranked_countries_by_cases['CountryName'],
    # secondary: last row has a population below 5M or fewer than twice the crossing count
    series_is_secondary=lambda df: (
        df['Population'].iloc[-1] < 5E+06
        or df['Confirmed'].iloc[-1] < 2*intersection_at
    ),
    series_secondary_width=1,
    shift_x_to_intersect_y=intersection_at,
    y_log=True,
    data_labels="series",
    data_labels_align="center",
    title='Confirmed cases by country, since crossing {}'.format(intersection_at),
    label_x='Days since case {}'.format(intersection_at),
)

In [0]:
# Feb 15, 2020 converted by the loader helper; used as the first entry of `bounds`.
# NOTE(review): bounds looks like [x_min, x_max, y_min, y_max] - confirm against eplotter.
feb_15_day_of_year = el.date_to_day_of_year(datetime(2020, 2, 15))

ep.scatter_plot_by_series(
    _df=df_fused_daily,
    x_key='X',
    y_key='Confirmed',
    series_key='CountryName',
    series_names=ranked_countries_by_cases['CountryName'],
    # secondary: last row has a population below 20M or fewer than 5,000 confirmed cases
    series_is_secondary=lambda df: (
        df['Population'].iloc[-1] < 20E+06
        or df['Confirmed'].iloc[-1] < 5000
    ),
    series_secondary_width=1,
    y_log=True,
    bounds=[feb_15_day_of_year, None, 100, None],
    legend_decimals=0,
    data_labels="series",
    data_labels_align="right",
    title='Confirmed cases by country, since Feb 15, 2020',
    label_x='Day of the year, 2020',
    stamp_1='Grayed-out: low population or case count',
)

### Population-normalized Confirmed and Death numbers (linear)

For these plots, we use only the countries with more than 5,000 confirmed cases. This threshold was chosen for 'signal to noise' purposes and will need to be raised in the future.

In [0]:
# Add two population-normalized columns to the fused frame (assigned in place).
population_in_millions = df_fused_daily['Population'] / 1E+06
df_fused_daily['ConfirmedPerMillion'] = df_fused_daily['Confirmed'] / population_in_millions
df_fused_daily['DeathsPerMillion'] = df_fused_daily['Deaths'] / population_in_millions

# Rank countries by {confirmed, deaths} per 1M people, using only rows above 5,000 confirmed cases.
df_over_5000_cases = df_fused_daily[df_fused_daily['Confirmed'] > 5000]
df_ranked_countries_by_cases_per_million = ep.rank_data_by_metric(
    df_over_5000_cases,
    metric='ConfirmedPerMillion',
    unique_key='CountryName',
)
df_ranked_countries_by_deaths_per_million = ep.rank_data_by_metric(
    df_over_5000_cases,
    metric='DeathsPerMillion',
    unique_key='CountryName',
)

# Largest per-million values among those rows, padded by 10% and rounded to an
# integer; the plots below pass them as the last entry of `bounds`.
peak_confirmed_per_million = df_over_5000_cases['ConfirmedPerMillion'].max()
peak_deaths_per_million = df_over_5000_cases['DeathsPerMillion'].max()
max_infected_per_million = int(round(peak_confirmed_per_million * 1.1))
max_deaths_per_million = int(round(peak_deaths_per_million * 1.1))

We first look at the trends, shifting each country's curve in time so that all curves cross a common infection rate at the same point.

In [0]:
# Crossing threshold, expressed as a percentage of a country's population.
intersection_pct = 0.05
# The same threshold in cases per 1M people: 0.05% of 1,000,000 = 500.
# round() is applied before int() so that floating-point noise in the product
# (e.g. 499.99999...) cannot truncate the threshold down by one.
intersection_at = int(round(intersection_pct * 1E+06 / 100))

# Second entry of `bounds`: today's day-of-year (per the loader helper) minus 50.
# NOTE(review): bounds looks like [x_min, x_max, y_min, y_max] - confirm against eplotter.
shifted_x_limit = el.date_to_day_of_year(datetime.now()) - 50

crossing_title = 'Confirmed cases per 1M people, since crossing {} ({}% of the country)'.format(
    intersection_at, intersection_pct)
crossing_label_x = 'Days since crossing {}'.format(intersection_at)

ep.scatter_plot_by_series(
    _df=df_fused_daily,
    x_key='X',
    y_key='ConfirmedPerMillion',
    series_key='CountryName',
    series_names=df_ranked_countries_by_cases_per_million['CountryName'],
    # secondary: last row has a population below 1M or fewer than 8,000 confirmed cases
    series_is_secondary=lambda df: (
        df['Population'].iloc[-1] < 1E+06
        or df['Confirmed'].iloc[-1] < 8000
    ),
    series_secondary_width=1,
    shift_x_to_intersect_y=intersection_at,
    bounds=[None, shifted_x_limit, 0, max_infected_per_million],
    data_labels="series",
    data_labels_align="center",
    title=crossing_title,
    label_x=crossing_label_x,
)

Time plot of normalized confirmed cases:

In [0]:
# March 1, 2020 converted by the loader helper; used as the first entry of `bounds`.
march_1_day_of_year = el.date_to_day_of_year(datetime(2020, 3, 1))

ep.scatter_plot_by_series(
    _df=df_fused_daily,
    x_key='X',
    y_key='ConfirmedPerMillion',
    series_key='CountryName',
    series_names=df_ranked_countries_by_cases_per_million['CountryName'],
    # secondary only when BOTH hold on the last row: under 4,000 cases per million
    # and under 10,000 confirmed cases
    series_is_secondary=lambda df: (
        df['ConfirmedPerMillion'].iloc[-1] < 4000
        and df['Confirmed'].iloc[-1] < 10000
    ),
    series_secondary_width=1,
    bounds=[march_1_day_of_year, None, 0, max_infected_per_million],
    data_labels="legend",
    data_labels_align="right",
    title='Confirmed cases per 1M population',
    label_x='Day of the year, 2020',
    stamp_1='For countries with > 5,000 cases',
)

Time plot of normalized deaths:

In [0]:
# March 1, 2020 converted by the loader helper; used as the first entry of `bounds`.
march_1_day_of_year = el.date_to_day_of_year(datetime(2020, 3, 1))

ep.scatter_plot_by_series(
    _df=df_fused_daily,
    x_key='X',
    y_key='DeathsPerMillion',
    series_key='CountryName',
    series_names=df_ranked_countries_by_deaths_per_million['CountryName'],
    # secondary only when BOTH hold on the last row: under 100 deaths per million
    # and under 10,000 confirmed cases
    series_is_secondary=lambda df: (
        df['DeathsPerMillion'].iloc[-1] < 100
        and df['Confirmed'].iloc[-1] < 10000
    ),
    series_secondary_width=1,
    bounds=[march_1_day_of_year, None, 0, max_deaths_per_million],
    data_labels="legend",
    data_labels_align="right",
    title='Deaths per 1M population',
    label_x='Day of the year, 2020',
    stamp_1='For countries with > 5,000 cases',
)

Infections and deaths per million (latest data), ranked by Infections/M.

For countries with at least 5,000 cases.

In [0]:
# Columns of the summary table, in display order.
summary_columns = [
    'Date',
    'CountryName',
    'Confirmed',
    'Deaths',
    'Population',
    'ConfirmedPerMillion',
    'DeathsPerMillion',
    'Death_rate',
]
# A bare last expression, so the notebook renders the selection as a table.
df_ranked_countries_by_cases_per_million[summary_columns]