# Comparative visualization of COVID-19 over time

In [1]:
# NOTE(review): plot() further down is called with a list of columns as `y`;
# this presumably relies on the wide-form support added in plotly 4.8, which
# would explain the upgrade - confirm, and consider pinning the version.
%pip install plotly --upgrade

Collecting plotly
[?25l  Downloading https://files.pythonhosted.org/packages/70/56/eabdc7b7187cdb9d6121f6de2831ad5b85f7d002fa4bfe0476dbdb554bf6/plotly-4.8.1-py2.py3-none-any.whl (11.5MB)
[K     |████████████████████████████████| 11.5MB 357kB/s 
Installing collected packages: plotly
  Found existing installation: plotly 4.4.1
    Uninstalling plotly-4.4.1:
      Successfully uninstalled plotly-4.4.1
Successfully installed plotly-4.8.1


Load all necessary Python packages

In [0]:
import numpy as np
import pandas as pd
import plotly.express as px

Define functions to modify/analyze the data.

In [0]:
def isolate(df,labels):
  """Build a new frame that holds only the requested columns under new names.

  `labels` maps each output column name to the name of the column of `df`
  it is taken from.
  """
  selected = {}
  for new_name, source_name in labels.items():
    selected[new_name] = df[source_name]
  return pd.DataFrame(selected)

In [0]:
def add_new(df):
    """Add the per-location row-over-row increase of the cumulative columns.

    For every column among ``cases``, ``deaths``, ``cases_avg`` and
    ``deaths_avg`` that is present in ``df``, a float column ``<name>_new``
    is added.  It holds the difference to the previous row of the same
    ``location``; the first row of a location is compared against zero and
    therefore keeps its own value.  Rows are used in the order in which they
    appear in ``df`` (no sorting by date is performed here).

    Returns a modified copy; ``df`` itself is left untouched.
    """

    label = 'new'
    mydf = df.copy()

    for what in ['cases',
                 'deaths',
                 'cases_avg',
                 'deaths_avg',
                ]:

        if what in df.columns:
            # Value of the previous row within the same location.  Only the
            # slot opened by the shift is filled with 0, so missing values
            # already present in the data still propagate as NaN.
            previous = df.groupby('location')[what].shift(1, fill_value=0)
            # Assign the whole column at once as float instead of filling an
            # integer column piecewise, which would need a silent upcast.
            mydf[f'{what}_{label}'] = (df[what] - previous).astype(float)

    return mydf

In [0]:
def add_active(df,period=20):
    """Add the number of cases accumulated during a rolling period.

    If ``df`` has a ``cases`` column, a float column ``cases_active`` is added
    that holds, per ``location``, the value of ``cases`` minus its value
    ``period`` rows earlier.  Where a location has no row that far back, zero
    is subtracted, so the column equals ``cases`` there.  Rows are used in the
    order in which they appear in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``location`` column; ``cases`` is optional.
    period : int
        Number of rows to look back.  Must not be negative; ``0`` yields a
        column of zeros.

    Returns a modified copy; ``df`` itself is left untouched.
    """

    if period < 0:
        raise ValueError(f'period must not be negative, got {period}')

    label = 'active'
    mydf = df.copy()

    for what in ['cases',
                ]:

        if what in df.columns:
            # Value `period` rows earlier within the same location; slots
            # before the start of a location are filled with 0.
            earlier = df.groupby('location')[what].shift(period, fill_value=0)
            # Assign the whole column at once as float instead of filling an
            # integer column piecewise, which would need a silent upcast.
            mydf[f'{what}_{label}'] = (df[what] - earlier).astype(float)

    return mydf

In [0]:
def add_normalization(df):
    """Add per-capita versions of the daily-change columns.

    For every column among ``cases_new``, ``deaths_new``, ``cases_new_avg``,
    ``deaths_new_avg``, ``cases_avg_new`` and ``deaths_avg_new`` that is
    present in ``df``, a column ``<name>_normalized`` is added that holds the
    value divided by the ``population`` of the same row.

    The division is aligned on the index labels, so ``df`` does not need a
    default ``RangeIndex``.

    Returns a modified copy; ``df`` itself is left untouched.
    """

    label = 'normalized'
    mydf = df.copy()

    for what in ['cases_new',
                 'deaths_new',
                 'cases_new_avg',
                 'deaths_new_avg',
                 'cases_avg_new',
                 'deaths_avg_new',
                ]:

        if what in df.columns:
            # Each row carries its own population, so no split by location
            # is needed for a row-wise division.
            mydf[f'{what}_{label}'] = df[what] / df['population']

    return mydf

In [0]:
def add_averaging(df,period=3):
    """Add centered rolling means of the daily-change columns.

    For every column among ``cases_new``, ``cases_new_normalized``,
    ``deaths_new`` and ``deaths_new_normalized`` that is present in ``df``, a
    float column ``<name>_avg`` is added.  It holds, per ``location``, the
    mean over a centered window of ``period`` rows.  For a location with
    fewer rows than ``period`` the window shrinks to the number of rows.
    Rows whose window is incomplete (at the start and end of a location)
    are NaN.

    Returns a modified copy; ``df`` itself is left untouched.
    """

    label = 'avg'
    mydf = df.copy()

    def centered_mean(series):
        # The window can never be longer than the location's own history.
        window = min(len(series), period)
        return series.rolling(window, center=True).mean()

    for what in ['cases_new',
                 'cases_new_normalized',
                 'deaths_new',
                 'deaths_new_normalized',
                ]:

        if what in df.columns:
            averaged = df.groupby('location')[what].transform(centered_mean)
            # Assign the whole column at once as float instead of filling an
            # integer column piecewise, which would need a silent upcast.
            mydf[f'{what}_{label}'] = averaged.astype(float)

    return mydf

In [0]:
def add_population(df):
    """Placeholder for attaching a ``population`` column to ``df``.

    The lookup of population figures from a csv file is not implemented.
    Like the other ``add_*`` helpers this returns a copy of ``df``; the copy
    is currently unchanged.
    """

    # TODO: read the population figures and add them as a 'population' column.
    return df.copy()



In [0]:
def plot(df,what,width=800,height=600,y_min=None,y_max=None):
    """Show a semilog line plot of the time series of "what", colored by location.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``date`` and ``location`` plus the column(s) named
        in ``what``.
    what : str or list of str
        Name of a single column or a list of column names to draw.
    width, height : int
        Figure size handed to plotly.
    y_min, y_max : float, optional
        Limits of the logarithmic y axis.  By default they are the smallest
        and largest absolute value found in the selected column(s); the
        lower limit is never taken below 1e-7.
    """

    # Accept a bare column name as well as a list of names when determining
    # the data range; `what` itself is handed to plotly unchanged.
    columns = [what] if isinstance(what, str) else list(what)
    magnitudes = df[columns].abs()

    if y_min is None:
        y_min = max(1e-7, magnitudes.min().min())
    if y_max is None:
        y_max = magnitudes.max().max()

    p = px.line(df,
                x='date',
                y=what,
                color='location',color_discrete_sequence=px.colors.qualitative.Alphabet,
                log_y=True,
                range_y=[y_min,y_max],
                template='plotly_white',
                height=height,
                width=width,
                )

    p.show()

In [0]:
def data(what):
    """Print the name of every data column of the frame `what`, one per line.

    The bookkeeping columns 'date', 'population' and 'location' are skipped.
    """
    skipped = ('date', 'population', 'location')
    for column in what.columns:
        if column in skipped:
            continue
        print(column)

Read the data and transform it into the minimum required format, which contains the following columns:

* date
* location
* population
* cases
* deaths (not available for the Germany data set)

In [0]:
# Worldwide data set: keep only the needed columns and give them the common names.
World = isolate(
    pd.read_csv(
        'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv',
        parse_dates=['date'],
    ),
    {
        'date': 'date',
        'location': 'location',
        'population': 'population',
        'cases': 'total_cases',
        'deaths': 'total_deaths',
    },
)

# US data set: the `state` column of the source becomes the common `location` column.
US = isolate(
    pd.read_csv(
        'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv',
        parse_dates=['date'],
    ),
    {
        'date': 'date',
        'location': 'state',
        'cases': 'cases',
        'deaths': 'deaths',
    },
)

# Attach the population of each US location by name.  `Series.map` leaves NaN
# for every location that is missing from this table.
# NOTE(review): the source and reference year of these figures are not
# recorded here - confirm before relying on the normalized values.
US['population'] = US['location'].map(
    {
        'California':39937489,
        'Texas':29472295,
        'Florida':21992985,
        'New York':19440469,
        'Pennsylvania':12820878,
        'Illinois':12659682,
        'Ohio':11747694,
        'Georgia':10736059,
        'North Carolina':10611862,
        'Michigan':10045029,
        'New Jersey':8936574,
        'Virginia':8626207,
        'Washington':7797095,
        'Arizona':7378494,
        'Massachusetts':6976597,
        'Tennessee':6897576,
        'Indiana':6745354,
        'Missouri':6169270,
        'Maryland':6083116,
        'Wisconsin':5851754,
        'Colorado':5845526,
        'Minnesota':5700671,
        'South Carolina':5210095,
        'Alabama':4908621,
        'Louisiana':4645184,
        'Kentucky':4499692,
        'Oregon':4301089,
        'Oklahoma':3954821,
        'Connecticut':3563077,
        'Utah':3282115,
        'Iowa':3179849,
        'Nevada':3139658,
        'Arkansas':3038999,
        'Puerto Rico':3032165,
        'Mississippi':2989260,
        'Kansas':2910357,
        'New Mexico':2096640,
        'Nebraska':1952570,
        'Idaho':1826156,
        'West Virginia':1778070,
        'Hawaii':1412687,
        'New Hampshire':1371246,
        'Maine':1345790,
        'Montana':1086759,
        'Rhode Island':1056161,
        'Delaware':982895,
        'South Dakota':903027,
        'North Dakota':761723,
        'Alaska':734002,
        'District of Columbia':720687,
        'Vermont':628061,
        'Wyoming':567025,
    }
)

# Germany data set: `sum_cases` is renamed to 'DE' so that it gets a location
# name of its own; melting then turns every column except the timestamp into
# (location, cases) rows.  This source provides no deaths column here.
Germany = isolate(
    (
        pd.read_csv(
            'https://raw.githubusercontent.com/jgehrcke/covid-19-germany-gae/master/cases-rki-by-state.csv',
            parse_dates=['time_iso8601'],
        )
        .rename(columns={'sum_cases': 'DE'})
        .melt('time_iso8601', var_name='location', value_name='cases')
    ),
    {
        'date': 'time_iso8601',
        'location': 'location',
        'cases': 'cases',
    },
)

# Attach the population of each German location by its code; 'DE' is the entry
# for the renamed `sum_cases` column.  `Series.map` leaves NaN for every
# location that is missing from this table.
# NOTE(review): the source and reference year of these figures are not
# recorded here - confirm before relying on the normalized values.
Germany['population'] = Germany['location'].map(
    {
        'DE-BW':11069533,
        'DE-BY':13076721,
        'DE-BE':3644826,
        'DE-BB':2511917,
        'DE-HB':682986,
        'DE-HH':1841179,
        'DE-HE':6265809,
        'DE-NI':7982448,
        'DE-MV':1609675,
        'DE-NW':17932651,
        'DE-RP':4084844,
        'DE-SL':990509,
        'DE-SN':4077937,
        'DE-ST':2208321,
        'DE-SH':2896712,
        'DE-TH':2143145,
        'DE':83019213,
    }
)

def _add_derived_columns(df, averaging_period=5):
    """Run the chain of add_* steps on one data set and return the result.

    The steps are applied in this order: active cases, daily changes,
    averaging of the daily changes over `averaging_period` rows, and finally
    normalization by population, so that both the raw and the averaged daily
    changes get a normalized counterpart.
    """
    return add_normalization(
        add_averaging(add_new(add_active(df)), averaging_period)
    )


World = _add_derived_columns(World)
US = _add_derived_columns(US)
Germany = _add_derived_columns(Germany)


## Plot the result

There are three separate data sets available:

*   World
*   US
*   Germany

Use the function `data` to list the columns of a data set that can be plotted.

In [12]:
# Print every column name of the US data set except date, population and location.
data(US)

cases
deaths
cases_active
cases_new
deaths_new
cases_new_avg
deaths_new_avg
cases_new_normalized
deaths_new_normalized
cases_new_avg_normalized
deaths_new_avg_normalized


The function `plot` takes a single column name or a list of column names and visualizes their evolution over time.

In [13]:
# Averaged daily new cases and deaths of the US data set, normalized by population.
plot(US, ['cases_new_avg_normalized', 'deaths_new_avg_normalized'])
