## World Data analysis

In [None]:
from datetime import datetime, timedelta, timezone
import dateutil.parser as du_parser
import pandas as pd
import eloader as el
import eplotter as ep

# Pull the daily datasets through the loader helper; only the first element of
# each returned tuple is kept, the remaining ones are discarded.
df_regions_daily, _ = el.load_opencovid19_data()
df_it_daily, _ = el.load_pcmdpc_it_data()
df_us_daily, _, _ = el.load_covidtracking_us_data()

# TEMP: countries dropped from the regional data, with the reason string that
# is handed to the filter helper.
#  - France: it's spiking (on 6/15)
#  - Iraq: the death rate is inconsistent
for excluded_country, exclusion_reason in (
    ('France', 'spiking inconsisntently'),
    ('Iraq', 'death rate is inconsistent'),
):
    df_regions_daily = el.filter_rows_remove_equals(
        df_regions_daily, 'CountryName', excluded_country, exclusion_reason)

# TEMP: set to a country name (e.g. 'Mexico') to keep only that country in the
# regional data; the Italy and US sources are still fused in below.
select_country = None
if select_country is not None:
    keep_selected = df_regions_daily['CountryName'] == select_country
    df_regions_daily = df_regions_daily[keep_selected]

# Fuse daily data, and add missing data
df_fused_daily = el.fuse_daily_sources(df_regions_daily, df_us_daily, df_it_daily)
el.add_canonical_differentials(df_fused_daily)

### Global death rate by country
For countries with more than a thousand deaths.

In [None]:
# Rank on rows with more than 1,000 reported deaths only, asking the ranking
# helper for at most 25 results on the 'Death_rate' metric.
df_over_1000_deaths = df_fused_daily[df_fused_daily['Deaths'] > 1000]
countries_by_deaths = ep.rank_data_by_metric(df_over_1000_deaths, metric='Death_rate', unique_key='CountryName', max_results=25)

ep.scatter_plot_by_series(
    _df=df_fused_daily,
    x_key='X', y_key='Death_rate',
    series_key='CountryName', series_names=countries_by_deaths['CountryName'],
    # A series is flagged as secondary when its last row shows a population
    # under 5M or fewer than 500 deaths.
    series_is_secondary=(lambda df: df['Population'].iloc[-1] < 5E+06 or df['Deaths'].iloc[-1] < 500),
    series_secondary_width=1,
    y_filter='expo',
    bounds=[80, None, 0, 18],
    legend_decimals=1, legend_suffix='%',
    data_labels="legend", data_labels_align="right",
    title="Death rate (deaths/cases) by Country, over time",
    label_x="Day of 2020", label_y="Reported deaths / Total cases (percent)",
    # The stamp spells out the thresholds used by series_is_secondary above:
    # the criterion is on deaths, not on the case count.
    stamp_1='Grayed-out: population < 5M or deaths < 500',
)

### Global confirmed cases by country

In [None]:
# Confirmed-case threshold used to align the curves on the x axis, to flag
# secondary series, and in the title/label text.
intersection_at = 10000

# Rank on rows above 10,000 confirmed cases, asking for at most 60 results.
above_10000_confirmed = df_fused_daily['Confirmed'] > 10000
df_over_10000_cases = df_fused_daily[above_10000_confirmed]
ranked_countries_by_cases = ep.rank_data_by_metric(
    df_over_10000_cases,
    metric='Confirmed',
    unique_key='CountryName',
    max_results=60,
)

# Upper y bound: the 'Confirmed' value of the first ranked row.
confirmed_upper_bound = ranked_countries_by_cases['Confirmed'].iloc[0] * 1.0

ep.scatter_plot_by_series(
    _df=df_fused_daily,
    x_key='X',
    y_key='Confirmed',
    series_key='CountryName',
    series_names=ranked_countries_by_cases['CountryName'],
    # secondary: last-row population under 1M, or last-row confirmed count below the threshold
    series_is_secondary=(lambda df: df['Population'].iloc[-1] < 1E+06 or df['Confirmed'].iloc[-1] < intersection_at),
    series_secondary_width=1,
    shift_x_to_intersect_y=intersection_at,
    y_log=True,
    bounds=[0, el.current_day_of_year() - 40, None, confirmed_upper_bound],
    legend_off=True,
    data_labels="legend",
    data_labels_align="left",
    title=f'Confirmed cases (log, since crossing {intersection_at})',
    label_x=f'Days since case {intersection_at}',
)

In [None]:
# Left edge of the plot: Feb 15, 2020, converted by the loader helper.
feb_15_day = el.date_to_day_of_year(datetime(2020, 2, 15))

ep.scatter_plot_by_series(
    _df=df_fused_daily,
    x_key='X',
    y_key='Confirmed',
    series_key='CountryName',
    series_names=ranked_countries_by_cases['CountryName'],
    # secondary: last-row population under 40M, or fewer than 10,000 confirmed cases
    series_is_secondary=(lambda df: df['Population'].iloc[-1] < 40E+06 or df['Confirmed'].iloc[-1] < 10000),
    series_secondary_width=1,
    y_log=True,
    bounds=[feb_15_day, None, 10000, None],
    legend_decimals=0,
    data_labels='series',
    data_labels_align='right',
    title='Confirmed cases by country, since Feb 15, 2020',
    label_x='Day of the year, 2020',
    stamp_1='Grayed-out: population < 40M',
)

### Population-normalized Confirmed and Death numbers (linear)

For these plots, we use the countries where there have been at least 5,000 confirmed cases. This threshold was chosen for 'signal to noise' purposes, and will need to be raised in the future.

In [None]:
# Add the two population-normalized columns to the fused frame (in place).
population_in_millions = df_fused_daily['Population'] / 1E+06
df_fused_daily['ConfirmedPerMillion'] = df_fused_daily['Confirmed'] / population_in_millions
df_fused_daily['DeathsPerMillion'] = df_fused_daily['Deaths'] / population_in_millions

# Remove countries that are anomalies as of 6/15
df_pm = df_fused_daily
for anomalous_country in ('Qatar', 'Bahrain'):
    df_pm = el.filter_rows_remove_equals(df_pm, 'CountryName', anomalous_country, 'high due to testing')

# Rank countries by {confirmed, deaths} per 1M people, using only rows above
# 5,000 confirmed cases and asking for at most 25 results each.
df_over_5000_cases = df_pm[df_pm['Confirmed'] > 5000]
df_ranked_countries_by_cases_per_million = ep.rank_data_by_metric(
    df_over_5000_cases, metric='ConfirmedPerMillion', unique_key='CountryName', max_results=25)
df_ranked_countries_by_deaths_per_million = ep.rank_data_by_metric(
    df_over_5000_cases, metric='DeathsPerMillion', unique_key='CountryName', max_results=25)

# Plot scaling: 10% above the largest per-million value in the > 5,000-cases rows.
peak_confirmed_per_million = df_over_5000_cases['ConfirmedPerMillion'].max()
peak_deaths_per_million = df_over_5000_cases['DeathsPerMillion'].max()
max_infected_per_million = int(round(peak_confirmed_per_million * 1.1))
max_deaths_per_million = int(round(peak_deaths_per_million * 1.1))

We first look at the trends, shifting each country's curve until they all reach a common infection rate.

In [None]:
# Alignment threshold as a percentage of the population:
# 0.2% of one million people is 2,000 confirmed cases per million.
intersection_pct = 0.2
# Round before converting to int: int() alone truncates, so a float product
# that lands just below a whole number would end up one below the intended value.
intersection_at = int(round(intersection_pct * 1E+06 / 100))

ep.scatter_plot_by_series(
    _df=df_pm,
    x_key='X', y_key='ConfirmedPerMillion',
    series_key='CountryName', series_names=df_ranked_countries_by_cases_per_million['CountryName'],
    # secondary: last-row population under 1M, or fewer than 5,000 confirmed cases
    series_is_secondary=(lambda df: df['Population'].iloc[-1] < 1E+06 or df['Confirmed'].iloc[-1] < 5000),
    series_secondary_width=1,
    shift_x_to_intersect_y=intersection_at,
    bounds=[None, el.current_day_of_year() - 15, 0, max_infected_per_million],
    data_labels="series", data_labels_align="left",
    title='Confirmed cases per 1M people, since crossing ' + str(intersection_at) + " (" + str(intersection_pct) + '%)',
    label_x='Days since crossing ' + str(intersection_at),
    stamp_1='Excluding Qatar, Bahrain',
)

Time plot of normalized confirmed cases:

In [None]:
# Left edge of the plot: Mar 10, 2020, converted by the loader helper.
mar_10_day = el.date_to_day_of_year(datetime(2020, 3, 10))

ep.scatter_plot_by_series(
    _df=df_pm,
    x_key='X',
    y_key='ConfirmedPerMillion',
    series_key='CountryName',
    series_names=df_ranked_countries_by_cases_per_million['CountryName'],
    # secondary: last row is below 2,000 cases per million and below 10,000 confirmed cases
    series_is_secondary=(lambda df: df['ConfirmedPerMillion'].iloc[-1] < 2000 and df['Confirmed'].iloc[-1] < 10000),
    series_secondary_width=1,
    bounds=[mar_10_day, None, 0, max_infected_per_million],
    data_labels='legend',
    data_labels_align='right',
    title='Confirmed cases per 1M population',
    label_x='Day of the year, 2020',
    stamp_1='Excluding Qatar, Bahrain',
)

Trends plot of normalized deaths:

In [None]:
# Alignment threshold as a percentage of the population:
# 0.005% of one million people is 50 deaths per million.
intersection_pct = 0.005
# Round before converting to int: int() alone truncates, so a float product
# that lands just below a whole number would end up one below the intended value.
intersection_at = int(round(intersection_pct * 1E+06 / 100))

ep.scatter_plot_by_series(
    # passed by keyword, like every other call to this helper in the notebook
    _df=df_pm,
    x_key='X', y_key='DeathsPerMillion',
    series_key='CountryName', series_names=df_ranked_countries_by_deaths_per_million['CountryName'],
    # secondary: last-row population under 1M, or fewer than 5,000 confirmed cases
    series_is_secondary=(lambda df: df['Population'].iloc[-1] < 1E+06 or df['Confirmed'].iloc[-1] < 5000),
    series_secondary_width=1,
    shift_x_to_intersect_y=intersection_at,
    bounds=[60, el.current_day_of_year() - 30, 0, max_deaths_per_million],
    data_labels="series", data_labels_align="left",
    title='Deaths per Million, since crossing ' + str(intersection_at),
    label_x='Days since crossing ' + str(intersection_at),
    stamp_1='Excluding Qatar, Bahrain',
)

Time plot of normalized deaths:

In [None]:
# Left edge of the plot: Mar 10, 2020, converted by the loader helper.
deaths_plot_first_day = el.date_to_day_of_year(datetime(2020, 3, 10))

ep.scatter_plot_by_series(
    _df=df_pm,
    x_key='X',
    y_key='DeathsPerMillion',
    series_key='CountryName',
    series_names=df_ranked_countries_by_deaths_per_million['CountryName'],
    # secondary: last row is below 50 deaths per million and below 10,000 confirmed cases
    series_is_secondary=(lambda df: df['DeathsPerMillion'].iloc[-1] < 50 and df['Confirmed'].iloc[-1] < 10000),
    series_secondary_width=1,
    bounds=[deaths_plot_first_day, None, 0, max_deaths_per_million],
    data_labels='legend',
    data_labels_align='right',
    title='Deaths per Million',
    label_x='Day of the year, 2020',
    stamp_1='Excluding Qatar, Bahrain',
)

Infections and deaths per million (latest data), ranked by Infections/M.

For countries with at least 5,000 cases.

In [None]:
# Show the key columns of the table ranked by confirmed cases per million.
summary_columns = [
    'Date', 'CountryName',
    'Confirmed', 'Deaths', 'Population',
    'ConfirmedPerMillion', 'DeathsPerMillion', 'Death_rate',
]
df_ranked_countries_by_cases_per_million[summary_columns]

In [None]:
# Inspect every fused row whose country is the United States.
is_usa_row = df_fused_daily['CountryName'] == 'United States of America'
df_fused_daily[is_usa_row]