# Comparative visualization of COVID-19 over time

## Prepare required Python packages

In [None]:
%pip install numpy pandas plotly --upgrade

## Load Python packages

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

## Postprocessing functions

Define functions to modify/analyze the data.

In [None]:
def isolate(df,labels):
    """Build a new dataframe from selected columns of dataframe "df".

    The dictionary "labels" maps the name of each column in the result
    to the name of the column in "df" that it is taken from.
    """

    selected = {}
    for new_name,old_name in labels.items():
        selected[new_name] = df[old_name]

    return pd.DataFrame(selected)

In [None]:
def add_rate(df):
    """Add the row-to-row (daily) difference in cases and deaths, separately per location.

    For each of the columns cases, deaths, cases_avg, and deaths_avg that is
    present in "df", a float column "<name>_perDay" is added. It holds the
    difference between a row and the preceding row of the same location; the
    first row of a location is compared against zero, i.e. keeps its own value.

    Rows are used in the order in which they appear in "df"; no sorting by
    date is done here.

    Returns a modified copy, "df" itself is left unchanged.
    """

    label = 'perDay'
    mydf = df.copy()

    for what in ['cases',
                 'deaths',
                 'cases_avg',
                 'deaths_avg',
                ]:

        if what in df.columns:
            # value of the preceding row within the same location (0 before the first row)
            previous = df.groupby('location',sort=False)[what].shift(1,fill_value=0)
            # Assign the complete column at once: creating it as integer 0 and
            # filling it with floats per location relies on implicit upcasting.
            mydf[f'{what}_{label}'] = (df[what]-previous).astype(float)

    return mydf

In [None]:
def add_active(df,period=20):
    """Accumulate (active) cases during a rolling period, separately per location.

    For each of the columns cases and cases_avg that is present in "df", a
    float column "<name>_active" is added. It holds the difference between a
    row and the row "period" rows earlier of the same location. Rows that have
    fewer than "period" predecessors are compared against zero, i.e. keep
    their own value.

    Rows are used in the order in which they appear in "df"; no sorting by
    date is done here.

    Returns a modified copy, "df" itself is left unchanged.
    """

    label = 'active'
    mydf = df.copy()

    for what in ['cases',
                 'cases_avg',
                ]:

        if what in df.columns:
            # value "period" rows earlier within the same location (0 where there is none)
            earlier = df.groupby('location',sort=False)[what].shift(period,fill_value=0)
            # Assign the complete column at once: creating it as integer 0 and
            # filling it with floats per location relies on implicit upcasting.
            mydf[f'{what}_{label}'] = (df[what]-earlier).astype(float)

    return mydf

In [None]:
def add_normalization(df):
    """Normalize values by population.

    For each known data set column that is present in "df", a float column
    "<name>_normalized" is added that holds the value divided by the
    "population" entry of the same row.

    The division is aligned on the index of "df", so frames without a default
    integer index (for instance subsets created with "query") work as well.

    Returns a modified copy, "df" itself is left unchanged.
    """

    label = 'normalized'
    mydf = df.copy()

    for what in ['cases',
                 'cases_avg',
                 'cases_avg_active',
                 'cases_avg_perDay',
                 'cases_active',
                 'cases_active_avg',
                 'cases_perDay',
                 'cases_perDay_avg',
                 'deaths',
                 'deaths_avg',
                 'deaths_avg_new',
                 'deaths_perDay',
                 'deaths_perDay_avg',
                ]:

        if what in df.columns:
            # Row-wise division; no per-location split is needed because the
            # population is stored on every row. (Positional lookup through
            # iloc with index labels breaks on non-default indices.)
            mydf[f'{what}_{label}'] = (df[what]/df['population']).astype(float)

    return mydf

In [None]:
def add_averaging(df,period=7,center=True):
    """Average over a rolling period, separately per location.

    For each known data set column that is present in "df", a float column
    "<name>_avg" is added that holds the rolling mean over "period" rows of
    the same location. For locations with fewer than "period" rows the window
    is shortened to the number of rows available. "center" is handed to the
    pandas rolling window (centered window if True, trailing window otherwise).
    Rows without a complete window are NaN.

    Returns a modified copy, "df" itself is left unchanged.
    """

    label = 'avg'
    mydf = df.copy()

    def smooth(series):
        # the window must not be longer than the series of this location
        window = min(len(series),period)
        return series.rolling(window,center=center).mean()

    for what in ['cases',
                 'cases_active',
                 'cases_active_normalized',
                 'cases_perDay',
                 'cases_perDay_normalized',
                 'deaths',
                 'deaths_perDay',
                 'deaths_perDay_normalized',
                ]:

        if what in df.columns:
            # Assign the complete column at once: creating it as integer 0 and
            # filling it with floats per location relies on implicit upcasting.
            averaged = df.groupby('location',sort=False)[what].transform(smooth)
            mydf[f'{what}_{label}'] = averaged.astype(float)

    return mydf

In [None]:
def datasets(source,echo=True):
    """List the data sets available in data frame "source".

    Every column other than date, population, and location counts as a data set.
    With echo==True the names are printed in sorted order, one per line, and
    nothing is returned; otherwise the names are returned as a set.
    """

    excluded = {'date','population','location'}
    valid = set(source.columns).difference(excluded)

    if not echo:
        return valid

    print('\n'.join(sorted(valid)))


In [None]:
def plot(df,what,width=800,height=600,y_min=None,y_max=None):
    """Generate a semilog plot of the time series of "what".

    df            : data frame containing the columns date, location, population,
                    and the requested data set(s).
    what          : name of a data set or list of such names (see "datasets").
                    Names that are not available are reported and skipped.
    width, height : size of the figure.
    y_min, y_max  : limits of the logarithmic y axis. If None, they are taken
                    from the absolute values of the plotted data, with the
                    lower limit being at least 1e-9.

    One line is drawn per location and data set. For each line, an initially
    hidden click-to-show annotation with the trace name is placed at its last
    value that is neither NaN nor zero. Nothing is drawn if none of the
    requested data sets is available.
    """

    def lastValid(series,index=False):
        """Return the last entry of "series" that is neither NaN nor zero (or its position if index==True).

        Without such an entry, NaN (or -1 as position) is returned.
        """
        for i,item in enumerate(series[::-1]):
            if not (np.isnan(item) or item == 0):
                return len(series)-(i+1) if index else item
        return -1 if index else np.nan

    possible = datasets(df,echo=False)
    requested = what if isinstance(what,list) else [what]

    invalid = set(requested) - possible
    if invalid:
        print(f'invalid data: {invalid}')

    # Keep the order given by the caller (dropping duplicates) so that traces
    # and axis title do not depend on the arbitrary iteration order of a set.
    valid = [name for name in dict.fromkeys(requested) if name in possible]

    if valid:
        y_min = max(1e-9,min(abs(df[valid]).min())) if y_min is None else y_min
        y_max = max(abs(df[valid]).max()) if y_max is None else y_max

        p = px.line(df,
                    x='date',
                    y=valid,
                    color='location',
                    color_discrete_sequence=px.colors.qualitative.Alphabet,
                    line_group='variable',
                    hover_name='location',
                    hover_data=['population',],
                    log_y=True,
                    range_y=[y_min,y_max],
                    )
        annotations = []
        p.for_each_trace(
            lambda trace: annotations.append(dict(x=trace.x[lastValid(trace.y,index=True)],
                                                  # annotation positions on a log axis are given as log10 of the value
                                                  y=np.log10(lastValid(trace.y)),
                                                  text=trace.name,
                                                  clicktoshow='onoff',
                                                  visible=False,
                                                  opacity=0.5,
                                            )),
        )

        p.update_xaxes(title_text='<b>date</b>')
        p.update_yaxes(title_text='<br>'.join(map(lambda x:f'<b>{x}</b>',valid)))
        p.update_layout(annotations=annotations,
                        template='plotly_white',
                        height=height,
                        width=width,
                       )

        p.show()

## Data acquisition

Read the data and transform it into the minimum required format, which contains

* date
* location
* population
* cases
* [deaths]

In [None]:
# Our World in Data: keep only the required columns, renamed to the common format.
World = isolate(
    pd.read_csv(
        'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv',
        parse_dates=['date'],
    ),
    dict(date='date',
         location='location',
         population='population',
         cases='total_cases',
         deaths='total_deaths',
        ),
)

# New York Times per-state data: ordered by state, then date, and renamed to the common format.
US = isolate(
    pd.read_csv(
        'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv',
        parse_dates=['date'],
    ).sort_values(by=['state','date'],ignore_index=True),
    dict(date='date',
         location='state',
         cases='cases',
         deaths='deaths',
        ),
)

# Attach the population of each location as a new column.
# Locations that are not listed in this table end up with NaN as population
# (and therefore NaN in every normalized data set).
# NOTE(review): the origin and reference year of these figures are not recorded here -- confirm.
US['population'] = US['location'].map(
    {
        'California':    39937489,
        'Texas':         29472295,
        'Florida':       21992985,
        'New York':      19440469,
        'Pennsylvania':  12820878,
        'Illinois':      12659682,
        'Ohio':          11747694,
        'Georgia':       10736059,
        'North Carolina':10611862,
        'Michigan':      10045029,
        'New Jersey':     8936574,
        'Virginia':       8626207,
        'Washington':     7797095,
        'Arizona':        7378494,
        'Massachusetts':  6976597,
        'Tennessee':      6897576,
        'Indiana':        6745354,
        'Missouri':       6169270,
        'Maryland':       6083116,
        'Wisconsin':      5851754,
        'Colorado':       5845526,
        'Minnesota':      5700671,
        'South Carolina': 5210095,
        'Alabama':        4908621,
        'Louisiana':      4645184,
        'Kentucky':       4499692,
        'Oregon':         4301089,
        'Oklahoma':       3954821,
        'Connecticut':    3563077,
        'Utah':           3282115,
        'Iowa':           3179849,
        'Nevada':         3139658,
        'Arkansas':       3038999,
        'Puerto Rico':    3032165,
        'Mississippi':    2989260,
        'Kansas':         2910357,
        'New Mexico':     2096640,
        'Nebraska':       1952570,
        'Idaho':          1826156,
        'West Virginia':  1778070,
        'Hawaii':         1412687,
        'New Hampshire':  1371246,
        'Maine':          1345790,
        'Montana':        1086759,
        'Rhode Island':   1056161,
        'Delaware':        982895,
        'South Dakota':    903027,
        'North Dakota':    761723,
        'Alaska':          734002,
        'District of Columbia':720687,
        'Vermont':         628061,
        'Wyoming':         567025,
    }
)

# Read the German data and reshape it from wide to long format.
# The nationwide sums are renamed to 'DE_cases'/'DE_deaths' so that they follow
# the same '<location>_<type>' naming that the splitting below relies on.
# melt() then yields one row per (time, original column) with the original
# column name in 'tmp' and the entry in 'value'.
# NOTE(review): presumably all other data columns are named like 'DE-BW_cases' -- confirm against the CSV.
df = pd.read_csv('https://raw.githubusercontent.com/jgehrcke/covid-19-germany-gae/master/data.csv',
                 parse_dates=['time_iso8601']).\
        rename(columns={'sum_cases': 'DE_cases',
                        'sum_deaths':'DE_deaths'}).\
        melt('time_iso8601',var_name='tmp')
# Split the original column name at '_': first part is the location, last part the type.
df['location'] = df['tmp'].apply(lambda x: (x.split('_'))[0])
df['type'] = df.pop('tmp').apply(lambda x: (x.split('_'))[-1])
# Drop rows stemming from 'source...' columns, then turn the type level into columns,
# giving columns ('value','cases') and ('value','deaths') indexed by time and location.
# NOTE(review): if the dropped 'source' entries are strings, 'value' may be of object dtype
# at this point; a conversion to numeric might be advisable -- verify.
tmp = df[ df['location'] != 'source' ].set_index(['time_iso8601','location','type']).unstack(level=-1)

# Flatten into the common format (date, location, cases, deaths).
Germany = isolate(pd.DataFrame({'deaths': tmp[('value',  'deaths')],
                                 'cases': tmp[('value',  'cases')],
                               }).reset_index(),
             {'date':     'time_iso8601',
              'location': 'location',
              'cases':    'cases',
              'deaths':   'deaths',
             })



# Attach the population of each location as a new column.
# 'DE' is the nationwide entry; the other keys are presumably the codes of the
# German states as used in the column names of the data file -- confirm.
# Locations that are not listed in this table end up with NaN as population.
# NOTE(review): the origin and reference year of these figures are not recorded here -- confirm.
Germany['population'] = Germany['location'].map(
    {
        'DE-BW':11069533,
        'DE-BY':13076721,
        'DE-BE': 3644826,
        'DE-BB': 2511917,
        'DE-HB':  682986,
        'DE-HH': 1841179,
        'DE-HE': 6265809,
        'DE-NI': 7982448,
        'DE-MV': 1609675,
        'DE-NW':17932651,
        'DE-RP': 4084844,
        'DE-SL':  990509,
        'DE-SN': 4077937,
        'DE-ST': 2208321,
        'DE-SH': 2896712,
        'DE-TH': 2143145,
        'DE':   83019213,
    }
)

# Derive all additional data sets for each source.
# Every step only handles columns that already exist, hence the order:
# active cases -> daily rates -> rolling averages -> normalization by population.
World = add_normalization(add_averaging(add_rate(add_active(World))))

US = add_normalization(add_averaging(add_rate(add_active(US))))

Germany = add_normalization(add_averaging(add_rate(add_active(Germany))))


## Visualization of results

### Data sources

There are presently three separate data sources available:

*   World
*   US
*   Germany

### Data source subsets

Specific subsets of those data sources can be created using the `query` method.

#### Examples

*   Select three particular US states:
```python
US.query('location in ["California","Texas","New York"]')
```

*   Restrict to countries with more than 100 million people:
```python
World.query('population > 100e6')
```

### Available data sets

Use the function `datasets` to interrogate what data can be plotted for each source.

In [None]:
# Print the names of all data sets that can be plotted for the US source.
datasets(US)

### Plotting

The function `plot` can take a single item or list of available data sets to create a visualization of the time evolution for one data source.

Visibility of curves in the resulting graph can be toggled by a single click on their respective legend. A double click on a legend item toggles between *only this curve* and *all curves* being shown.

#### Examples

*   Daily change of cases smoothed over seven days (the default averaging period) and normalized by population size for all reporting countries in the world:
```
plot(World,'cases_perDay_avg_normalized')
```
*   Comparison of cumulative cases *and* deaths, i.e. two separate data sets, across German Bundesländer (states) normalized by population size:
```
plot(Germany,['cases_normalized','deaths_normalized'])
```
*   Daily change of deaths smoothed over seven days (the default averaging period) and normalized by population size for the US states North Carolina and South Carolina:
```
plot(US.query('location in ["North Carolina","South Carolina"]'),'deaths_perDay_avg_normalized')
```

In [None]:
# Active cases relative to population for all locations of the World source,
# with the lower end of the y axis fixed at 1e-6.
plot(World,['cases_active_normalized'],y_min=1e-6)