# First Steps
+ Nbstripout should be initialized so that notebook metadata is stripped on commit. To do so, uncomment the command in the next cell (nbstripout is only required if you want to commit and push changes), so only uncomment it if you want to contribute ;-)
+ The folder COVID-19 contains the current data of the Johns Hopkins CSSE for 2019-nCoV. It was initialized as a git submodule, so run the corresponding cell to activate it
+ In the 3rd cell the current data is pulled from the Johns Hopkins repo

In [None]:
# Install nbstripout
#!cd ~/corona_jupyternotebooks && nbstripout --install

# Imports

In [None]:
# Imports
import os
import glob
import math
import pandas as pd
import numpy as np
import plotly
# Libraries for visualization
import cufflinks as cf
import ipywidgets
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
from ipywidgets import interact, interact_manual
from IPython.display import Javascript
import datetime
import plotly.io as pio
try:
    import wget
except:
    !pip3 install wget

In [None]:
# Use this setting only if running JupyterLab, otherwise visualisations won't work
#pio.renderers.default = 'colab'

# Initialize dataset
**Important** You have 2 possibilities:
+ Pull via Git (slower, but some more files to work with are pulled too)
+ Just download the csv (faster, but only needed files are downloaded)
**Choose from dropdown**

In [None]:
# Dropdown that selects how the data set is obtained.
choice = ipywidgets.Dropdown(
    options=['GIT', 'Download'],
    value='GIT',
    description='Choices:',
    disabled=False,
)
btn_conf = ipywidgets.Button(
    description='Confirm',
    disabled=False,
    button_style='',
    tooltip='Confirm selected data source',
    icon='check'
)

# Start from the dropdown's default so that the following cell also works
# when the Confirm button was never clicked.
selection = choice.value


def update(self):
    """Store the dropdown's current value in the global ``selection``.

    Registered as the click handler of ``btn_conf``; ``self`` receives the
    argument passed by the click event and is not used.
    """
    global selection
    selection = choice.value


btn_conf.on_click(update)
ipywidgets.HBox([choice, btn_conf])

In [None]:
if(selection=='GIT'):
    print('Using GIT Repo, this may take a while...')
    # Initialisieren des GIT submoduls
    !cd ~/corona_jupyternotebooks/COVID-19 && git submodule init && git submodule update
    # Update Data
    !cd ~/corona_jupyternotebooks/COVID-19 && git checkout master && git pull
if(selection=='Download'):
    print('Using wget to download data')
    confirmed_url='https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    death_url='https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
    recovered_url='https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
    urls=[confirmed_url,death_url,recovered_url]
    os.chdir(os.path.expanduser("~")+'/corona_jupyternotebooks/')

    if not os.path.exists('COVID-19/csse_covid_19_data/csse_covid_19_time_series/'):
        os.makedirs('COVID-19/csse_covid_19_data/csse_covid_19_time_series/')
        os.chdir('COVID-19/csse_covid_19_data/csse_covid_19_time_series/')
        for url in urls:
            wget.download(url)
        os.chdir(os.path.expanduser("~")+'/corona_jupyternotebooks/')
    else:
        os.chdir('COVID-19/csse_covid_19_data/csse_covid_19_time_series/')
        for url in urls:
            filename = os.path.basename(url) # get the full path of the file
            print(filename)
            if os.path.exists(filename):
                os.remove(filename) # if exist, remove it directly
            wget.download(url)
        os.chdir(os.path.expanduser("~")+'/corona_jupyternotebooks/')

# Read data from Johns Hopkins into a dataframe and prepare it
For better visualization it is helpful to build a new dataframe, because e.g. political territories outside the EU are listed under the states that administer them, with naturally much smaller numbers.
In addition, a clean dataframe helps with further data processing

In [None]:
# EU member states shown in the notebook.
# NOTE(review): the previous comment said the UK is part of this list, but
# 'United Kingdom' is not in it -- confirm which of the two is intended.
eu_countries = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia', 'Denmark', 'Estonia', 
                'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 
                'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden']
# Check the file COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv for more countries
non_eu_countries = ['US', 'New Zealand']
countries = eu_countries + non_eu_countries
# Countries for which only the row without a 'Province/State' value is used.
adm_states = ['France', 'United Kingdom', 'Australia', 'Netherlands', 'Denmark']


def _select_countries(df_series, countries, adm_states):
    """Build a date-indexed frame with one column per requested country.

    Parameters
    ----------
    df_series : pandas.DataFrame
        Time series table with the columns 'Province/State' and
        'Country/Region'. Assumes these are the two leading columns and that
        every remaining column label is a date string -- TODO confirm.
    countries : list of str
        Values of 'Country/Region' to extract, one output column each.
    adm_states : list of str
        Countries for which only the row with an empty 'Province/State' is
        taken.

    Returns
    -------
    pandas.DataFrame
        One column per country, indexed by the parsed dates.

    Raises
    ------
    ValueError
        Raised by pandas when a country does not resolve to exactly one row.
    """
    cleaned = pd.DataFrame()
    for country in countries:
        rows = df_series[df_series['Country/Region'] == country]
        if country in adm_states:
            rows = rows[rows['Province/State'].isnull()]
        # Transpose so dates become the index, skipping the two label rows.
        column = rows.transpose()[2:]
        column.columns = [country]
        cleaned[country] = column.loc[:, country]
    cleaned.index = pd.to_datetime(cleaned.index)
    return cleaned


# Data cleaning for infections data
df_infected = pd.read_csv('COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv', sep=',')
df_series_filter_infected = df_infected.drop(columns=['Lat', 'Long'])
df_infected_cleaned = _select_countries(df_series_filter_infected, countries, adm_states)

# Data cleaning for death data (now date-indexed like the infections frame)
df_death = pd.read_csv('COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv', sep=',')
df_series_filter_death = df_death.drop(columns=['Lat', 'Long'])
df_death_cleaned = _select_countries(df_series_filter_death, countries, adm_states)


# Select countries for visualisation

In [None]:
# Multi-select list of all countries available for plotting.
selected = ipywidgets.SelectMultiple(
    options=countries,
    value=['Germany'],
    rows=20,
    description='Countries',
    disabled=False
)
print('Select countries to work with. Holding Strg let you select multiple')

btn_conf = ipywidgets.Button(
    description='Confirm',
    disabled=False,
    button_style='',
    tooltip='Confirm selected countries',
    icon='check'
)

# Start with the widget's default selection so that the plotting cells below
# also work when the Confirm button was never clicked.
df_sel_countries_infected = df_infected_cleaned[list(selected.value)].copy()


def update(self):
    """Rebuild the global ``df_sel_countries_infected`` from the selection.

    Registered as the click handler of ``btn_conf``; ``self`` receives the
    argument passed by the click event and is not used.
    """
    global df_sel_countries_infected
    sel_countries = list(selected.value)
    # Independent copy: later cells must not write into the cleaned source.
    df_sel_countries_infected = df_infected_cleaned[sel_countries].copy()


btn_conf.on_click(update)
ipywidgets.HBox([selected, btn_conf])

In [None]:
start = ipywidgets.DatePicker(
    description='From',
    disabled=False,
    value=datetime.datetime(2020, 1, 22)
)
end = ipywidgets.DatePicker(
    description='Until',
    disabled=False,
    value=datetime.datetime.now()
)


def infected_persons(s, e):
    """Plot the confirmed cases of the selected countries between two dates.

    Parameters
    ----------
    s, e : date-like or None
        Values of the 'From' and 'Until' pickers as passed in by
        ``interact``. ``None`` (an emptied picker) leaves that side open.

    The passed values are used instead of the globals ``start``/``end``,
    because later cells rebind those names to their own pickers.
    """
    frame = df_sel_countries_infected
    start_date = frame.index.min() if s is None else pd.to_datetime(s)
    end_date = frame.index.max() if e is None else pd.to_datetime(e)
    in_range = (frame.index >= start_date) & (frame.index <= end_date)
    return frame.loc[in_range].iplot()


interact(infected_persons, s=start, e=end)

# Visualization of infected persons in absolute values
**Starting date is 1/22/2020 (Day of first datapoint)**

# Decadic logarithm visualisation

In [None]:
start = ipywidgets.DatePicker(
    description='From',
    disabled=False,
    value=datetime.datetime(2020, 1, 22)
)
end = ipywidgets.DatePicker(
    description='Until',
    disabled=False,
    value=datetime.datetime.now()
)


def _log10_or_zero(value):
    """Return log10 of ``value``, mapping a count of 0 to 0."""
    return math.log10(value) if value != 0 else 0


# Decadic logarithm of the selected countries' case counts.
clean_log10 = df_sel_countries_infected.copy()
for c in clean_log10.columns:
    clean_log10[c] = clean_log10[c].apply(_log10_or_zero)


def infected_persons(s, e):
    """Plot the log10 case counts between two dates.

    Parameters
    ----------
    s, e : date-like or None
        Values of the 'From' and 'Until' pickers as passed in by
        ``interact``. ``None`` (an emptied picker) leaves that side open.

    The passed values are used instead of the globals ``start``/``end``,
    because later cells rebind those names to their own pickers.
    """
    start_date = clean_log10.index.min() if s is None else pd.to_datetime(s)
    end_date = clean_log10.index.max() if e is None else pd.to_datetime(e)
    in_range = (clean_log10.index >= start_date) & (clean_log10.index <= end_date)
    return clean_log10.loc[in_range].iplot()


interact(infected_persons, s=start, e=end)

# New infections compared to the previous day
**Includes days with 0 reports; the next chapter filters them out**

In [None]:
start = ipywidgets.DatePicker(
    description='From',
    disabled=False,
    value=datetime.datetime(2020, 1, 22)
)
end = ipywidgets.DatePicker(
    description='Until',
    disabled=False,
    value=datetime.datetime.now()
)

# Pull out incorrect data: a cumulative count must never decrease (Spain,
# for example, reported fewer total cases on a following day). A running
# maximum carries the previous value forward over such dips. This works on
# a converted copy, so the shared selection frame is left untouched.
daily_corrected = df_sel_countries_infected.astype(float).cummax()

# Day-over-day difference; the first day has no predecessor and keeps its
# cumulative value.
daily_growth = daily_corrected.diff()
if len(daily_growth):
    daily_growth.iloc[0] = daily_corrected.iloc[0]

# Historical name of the result, kept for code that still reads it.
growth = daily_growth


def new_infections(s, e):
    """Plot the daily new infections between two dates.

    Parameters
    ----------
    s, e : date-like or None
        Values of the 'From' and 'Until' pickers as passed in by
        ``interact``. ``None`` (an emptied picker) leaves that side open.

    Reads ``daily_growth`` and the passed dates rather than the globals
    ``growth``/``start``/``end``, because later cells rebind those names.
    """
    frame = daily_growth
    start_date = frame.index.min() if s is None else pd.to_datetime(s)
    end_date = frame.index.max() if e is None else pd.to_datetime(e)
    in_range = (frame.index >= start_date) & (frame.index <= end_date)
    return frame.loc[in_range].iplot()


interact(new_infections, s=start, e=end)

# 7-day incidence (absolute)

In [None]:
start = ipywidgets.DatePicker(
    description='From',
    disabled=False,
    value=datetime.datetime(2020, 1, 22)
)
end = ipywidgets.DatePicker(
    description='Until',
    disabled=False,
    value=datetime.datetime.now()
)

# Pull out incorrect data: a cumulative count must never decrease (Spain,
# for example, reported fewer total cases on a following day). A running
# maximum carries the previous value forward over such dips. This works on
# a converted copy, so the shared selection frame is left untouched.
weekly_corrected = df_sel_countries_infected.astype(float).cummax()

# Cases of the last 7 days: cumulative value minus the value 7 rows earlier.
# Rows without a value 7 rows back subtract 0 and keep the cumulative count.
# The row at position 7 is now a real difference as well (it used to be
# treated like the first seven rows).
weekly_growth = weekly_corrected - weekly_corrected.shift(7, fill_value=0)

# Historical name of the result, kept for code that still reads it.
growth = weekly_growth


def new_infections(s, e):
    """Plot the absolute 7-day incidence between two dates.

    Parameters
    ----------
    s, e : date-like or None
        Values of the 'From' and 'Until' pickers as passed in by
        ``interact``. ``None`` (an emptied picker) leaves that side open.

    Zero values are replaced by NaN before plotting so they are not drawn.
    Reads ``weekly_growth`` and the passed dates rather than the globals
    ``growth``/``start``/``end``, because other cells rebind those names.
    """
    frame = weekly_growth
    start_date = frame.index.min() if s is None else pd.to_datetime(s)
    end_date = frame.index.max() if e is None else pd.to_datetime(e)
    in_range = (frame.index >= start_date) & (frame.index <= end_date)
    return frame.loc[in_range].replace(0, np.nan).iplot()


interact(new_infections, s=start, e=end)

# 7-day incidence (per 100.000)


In [None]:
# Import population data from population.json into a dict
# (country name -> population).
import json
with open('population.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
population = {entry['country']: int(entry['population']) for entry in data}

start = ipywidgets.DatePicker(
    description='From',
    disabled=False,
    value=datetime.datetime(2020, 1, 22)
)
end = ipywidgets.DatePicker(
    description='Until',
    disabled=False,
    value=datetime.datetime.now()
)

# Pull out incorrect data: a cumulative count must never decrease (Spain,
# for example, reported fewer total cases on a following day). A running
# maximum carries the previous value forward over such dips. This works on
# a converted copy, so the shared selection frame is left untouched.
relative_corrected = df_sel_countries_infected.astype(float).cummax()

# Cases of the last 7 days: cumulative value minus the value 7 rows earlier.
# Rows without a value 7 rows back subtract 0 and keep the cumulative count.
# The row at position 7 is now a real difference as well (it used to be
# treated like the first seven rows).
weekly_growth_per_100k = relative_corrected - relative_corrected.shift(7, fill_value=0)

# Scale every country to cases per 100.000 inhabitants. A country missing
# from population.json raises KeyError here.
for country in weekly_growth_per_100k.columns:
    weekly_growth_per_100k[country] = (
        weekly_growth_per_100k[country] * 100000 / population[country]
    )

# Historical name of the result, kept for code that still reads it.
growth = weekly_growth_per_100k


def new_infections(s, e):
    """Plot the 7-day incidence per 100.000 inhabitants between two dates.

    Parameters
    ----------
    s, e : date-like or None
        Values of the 'From' and 'Until' pickers as passed in by
        ``interact``. ``None`` (an emptied picker) leaves that side open.

    Zero values are replaced by NaN before plotting so they are not drawn.
    Reads ``weekly_growth_per_100k`` and the passed dates rather than the
    globals ``growth``/``start``/``end``, because other cells rebind them.
    """
    frame = weekly_growth_per_100k
    start_date = frame.index.min() if s is None else pd.to_datetime(s)
    end_date = frame.index.max() if e is None else pd.to_datetime(e)
    in_range = (frame.index >= start_date) & (frame.index <= end_date)
    return frame.loc[in_range].replace(0, np.nan).iplot()


interact(new_infections, s=start, e=end)

# Percentage of infected persons in the total population
**IMPORTANT** Population Data is a little bit old (2016)

In [None]:
# Import population data from population.json into a dict
# (country name -> population).
import json
with open('population.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
population = {entry['country']: int(entry['population']) for entry in data}

start = ipywidgets.DatePicker(
    description='From',
    disabled=False,
    value=datetime.datetime(2020, 1, 22)
)
end = ipywidgets.DatePicker(
    description='Until',
    disabled=False,
    value=datetime.datetime.now()
)

# Share of each country's population that is infected, in percent. The
# factor 100 turns the fraction into the percentage that the chapter title
# and the variable name announce. A country missing from population.json
# raises KeyError here.
df_percentage = df_sel_countries_infected.astype(float)
for country in df_percentage.columns:
    df_percentage[country] = df_percentage[country] / population[country] * 100


def population_percentage_infected(s, e):
    """Plot the infected share of the population (in %) between two dates.

    Parameters
    ----------
    s, e : date-like or None
        Values of the 'From' and 'Until' pickers as passed in by
        ``interact``. ``None`` (an emptied picker) leaves that side open.

    The passed values are used instead of the globals ``start``/``end``,
    because other cells rebind those names to their own pickers.
    """
    start_date = df_percentage.index.min() if s is None else pd.to_datetime(s)
    end_date = df_percentage.index.max() if e is None else pd.to_datetime(e)
    in_range = (df_percentage.index >= start_date) & (df_percentage.index <= end_date)
    return df_percentage.loc[in_range].iplot()


interact(population_percentage_infected, s=start, e=end)


#df_percentage = df_cleaned.copy()

#for c in countries:
#    df_percentage[c]=df_percentage[c].apply(lambda x: x/population[c] if x != 0 else 0)
#def population_percentage_infected(offset):
#    return df_percentage.iloc[slide_n_pop.value:].iplot()
#interact(population_percentage_infected, offset=slide_n_pop)

# ToDo
+ Common start time at which the number of infected was greater than or equal to 100, for better comparison
+ Select starting day for all graphics on top with date widget DONE
+ Implement deaths, recovered