In [1]:
from bokeh.layouts import row, column
from bokeh.models import Title, ColumnDataSource, HoverTool, LinearAxis, LabelSet, Div, SingleIntervalTicker
from bokeh.palettes import Category10
from bokeh.io import output_file, show
from bokeh.models import BasicTickFormatter
from bokeh.plotting import figure
from bokeh.models import Span
from bokeh.models import Label
from bokeh.models import LogScale

In [2]:
import numpy as np
from bokeh.io import show
import re

In [3]:
# Line colours for the dashed lockdown markers; the list is indexed by lockdown level.
# The plotting functions skip level 0, so the first entry ("purple") is never drawn by them.
COLORS_FOR_LOCKDOWN_LEVEL = ["purple", "blue", "green", "orange", "red"]

In [4]:
import pandas as pd
import datetime
import numpy as np
import warnings
# NOTE(review): this installs an ignore-everything filter, so no warning of any category is
# shown from here on (pandas and bokeh notices included); consider narrowing it to a category.
warnings.filterwarnings('ignore')

In [21]:
# Load the case reports and keep the distinct country names they contain.
data = pd.read_csv("Reports (18).csv")
countries = data["country"].unique()
available_countries = countries
# print(sorted(available_countries))

In [22]:
# Show the first rows to check which columns the reports file provides.
data.head()

Unnamed: 0,country,state,county,city,population,update_time,db_source_name,db_source_url,db_source_time,confirmed,...,home_confinement,current_positive_cases,new_positive_cases,note_it,note_en,cases/100k pop.,hospitalized,percent,tests,quarantine
0,United States of America,Alaska,,,731545.0,2020-03-28 20:00:00+00:00,The COVID Tracking Project,https://covidtracking.com/api/,2020-03-28 00:00:00+00:00,85.0,...,,,,,,,,,,
1,United States of America,Alabama,,,4903185.0,2020-03-28 20:00:00+00:00,The COVID Tracking Project,https://covidtracking.com/api/,2020-03-28 00:00:00+00:00,696.0,...,,,,,,,,,,
2,United States of America,Arkansas,,,3017804.0,2020-03-28 20:00:00+00:00,The COVID Tracking Project,https://covidtracking.com/api/,2020-03-28 00:00:00+00:00,404.0,...,,,,,,,,,,
3,United States of America,American Samoa,,,,2020-03-28 20:00:00+00:00,The COVID Tracking Project,https://covidtracking.com/api/,2020-03-28 00:00:00+00:00,0.0,...,,,,,,,,,,
4,United States of America,Arizona,,,7278717.0,2020-03-28 20:00:00+00:00,The COVID Tracking Project,https://covidtracking.com/api/,2020-03-28 00:00:00+00:00,873.0,...,,,,,,,,,,


In [7]:
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists;
# consider a path relative to the notebook.
lockdown_dataframe = pd.read_csv(r"C:\Users\shira\Downloads\all_states (5).csv")
# for now we'll focus only on these columns
lockdown_dataframe = lockdown_dataframe[["lockdown_level", "country", "province", "start_date"]]
# Rename "US" so the country name matches the spelling used in the case reports.
lockdown_dataframe = lockdown_dataframe.replace("US", "United States of America")
# Collect the country names only after the rename, so the list holds the normalised
# spelling rather than the stale "US".
available_countries = countries = lockdown_dataframe["country"].unique()

In [8]:
# verifying that each state is updated once per day means that we don't need to worry about double counting
# a state twice in a single day
def verify_each_state_is_updated_at_most_once_per_day(frame=None):
    """Print every state that has two or more rows sharing one ``update_time``.

    The previous check compared the length of a column with the length of its
    own frame, which is always equal, so nothing could ever be reported. A
    state is now reported when any of its ``update_time`` values repeats.

    Args:
        frame: table with ``state`` and ``update_time`` columns. Defaults to
            the module-level ``data`` frame when omitted.

    Returns:
        None. Offending state names are printed, one per line.
    """
    if frame is None:
        frame = data
    for state in frame["state"].unique():
        # Rows whose state is missing never match this equality test, so they are not checked.
        state_data = frame[frame["state"] == state]
        if state_data["update_time"].duplicated().any():
            print(state)
verify_each_state_is_updated_at_most_once_per_day()

In [9]:
def get_data_for_country(data, country, db_source=None, state=None):
    """Return the report rows for one country, optionally narrowed further.

    Args:
        data: reports table with ``country``, ``state`` and ``db_source_url`` columns.
        country: country name to keep.
        db_source: when given, keep only rows whose ``db_source_url`` equals it.
        state: when given, keep only rows whose ``state`` equals it.

    Returns:
        An independent copy of the matching rows. Callers add columns to the
        result, so a copy is returned rather than a filtered selection of ``data``.
    """
    country_data = data[data["country"] == country]
    # Some sources also publish a country-wide total with the state left empty (see Italy);
    # keeping those rows next to the per-state rows would count cases twice.
    # TODO: decide how missing states should be handled for other countries - sometimes
    # they are fine, sometimes (Italy) they are not.
    if country == "Italy":
        country_data = country_data[country_data['state'].notna()]
    # Rows labelled 'sum' are dropped for every country.
    country_data = country_data[country_data['state'] != 'sum']
    if state is not None:
        country_data = country_data[country_data["state"] == state]
        print("Found {} entries for country {} from state {}".format(len(country_data), country, state))
    if db_source is not None:
        country_data = country_data[country_data["db_source_url"] == db_source]
        print("Found {} entries for country {} from source {}".format(len(country_data), country, db_source))
    print("Found {} entries for country {}".format(len(country_data), country))
    return country_data.copy()

In [10]:
def get_lockdown_data_for_country(data, country, province=None):
    """Return the lockdown rows for one country.

    Args:
        data: lockdown table with ``country`` and ``province`` columns.
        country: country name to keep.
        province: province to keep; when None, only the rows whose province
            is the literal ``"all"`` are kept.

    Returns:
        An independent copy of the matching rows. Callers add a column to the
        result, so a copy is returned rather than a filtered selection of ``data``.
    """
    country_data = data[data["country"] == country]
    if province is not None:
        country_data = country_data[country_data["province"] == province]
        print("Found {} entries for country {} from province {}".format(len(country_data), country, province))
    else:
        # No province requested: use the rows recorded for province "all".
        country_data = country_data[country_data["province"] == "all"]
        print("Found {} entries for country {} from all provinces".format(len(country_data), country))
    return country_data.copy()

In [11]:
def get_unique_dates_for_country(country_data, country):
    """Return the sorted distinct ``update_time`` values of rows with at least one confirmed case.

    ``country`` is accepted but not used in the computation.
    """
    has_confirmed_case = country_data["confirmed"] >= 1
    timestamps = country_data.loc[has_confirmed_case, "update_time"]
    return sorted(timestamps.unique())

In [12]:
def string_date_to_datetime_day(date):
    """Convert a ``YYYY-MM-DD...`` string into a ``datetime.date``, discarding the time part.

    The string is cut at every '-' and ':'; the first two pieces are the year
    and month, and the first two characters of the third piece are the day.
    """
    fields = re.split('-|:', date)
    year = int(fields[0])
    month = int(fields[1])
    # The third piece may still carry the hour (e.g. "28 20"), so only its first two characters are read.
    day_of_month = int(fields[2][:2])
    return datetime.date(year, month, day_of_month)

In [13]:
def compute_days_since_patient_zero(country_data, country_unique_dates):
    """Add a ``days_since_patient_zero`` column to ``country_data`` in place.

    The reference day is the first entry of ``country_unique_dates``; each
    row's ``update_time`` is turned into a whole-day offset from that day.
    Nothing is returned.
    """
    first_day = string_date_to_datetime_day(country_unique_dates[0])
    country_data["days_since_patient_zero"] = [
        (string_date_to_datetime_day(timestamp) - first_day).days
        for timestamp in country_data["update_time"]
    ]

In [14]:
def compute_days_since_patient_zero_for_lockdown_data(country_data, country_lockdown_data, country_unique_dates):
    """Add a ``days_since_patient_zero`` column to ``country_lockdown_data`` in place.

    Each ``start_date`` becomes a whole-day offset from the first entry of
    ``country_unique_dates``. ``country_data`` is accepted but not used.
    Nothing is returned.
    """
    first_day = string_date_to_datetime_day(country_unique_dates[0])
    # Lockdown level 0 apparently carries the start date 1/1/2019, which lies before the
    # first case; negative offsets are clamped to 0 (the exact value is not important).
    country_lockdown_data["days_since_patient_zero"] = [
        max(0, (string_date_to_datetime_day(start) - first_day).days)
        for start in country_lockdown_data["start_date"]
    ]

In [15]:
def get_total_growth_rate_per_day(country_data):
    """Return per-day confirmed totals and their day-over-day differences.

    Rows are grouped by ``days_since_patient_zero`` (ascending) and their
    ``confirmed`` values are summed within each day.

    Args:
        country_data: table with ``days_since_patient_zero`` and ``confirmed`` columns.

    Returns:
        ``(total_cases, new_cases_per_day)``: two lists of ints, one entry per
        distinct day. The first difference is taken against 0.
    """
    # One grouped sum replaces re-filtering the whole frame for every single day.
    daily_totals = (
        country_data.groupby("days_since_patient_zero")["confirmed"].sum().to_numpy()
    )
    daily_increase = np.diff(daily_totals, prepend=0)
    total_cases = [int(total) for total in daily_totals]
    new_cases_per_day = [int(delta) for delta in daily_increase]
    return total_cases, new_cases_per_day

In [36]:
def get_R_rate_per_day(country_data):
    """Return, for every day, the ratio of that day's confirmed total to the previous day's.

    Rows are grouped by ``days_since_patient_zero`` (ascending) and their
    ``confirmed`` values are summed within each day. The first day is divided by 1.

    Args:
        country_data: table with ``days_since_patient_zero`` and ``confirmed`` columns.

    Returns:
        A list of floats, one per distinct day. When the previous day's total
        is 0 the entry is ``inf`` (or ``nan`` for 0/0), not an error.
    """
    daily_totals = (
        country_data.groupby("days_since_patient_zero")["confirmed"]
        .sum()
        .to_numpy(dtype=float)
    )
    if daily_totals.size == 0:
        return []
    # Each day is compared with the day before it; the very first day is compared with 1.
    previous_totals = np.concatenate(([1.0], daily_totals[:-1]))
    # A zero total on the previous day yields inf/nan; that is expected, so keep numpy quiet about it.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio_per_day = daily_totals / previous_totals
    return ratio_per_day.tolist()

In [108]:
def plot_total_cumulative_cases(countries, states, db_sources, use_log_scale=False):
    """Plot cumulative confirmed cases against days since the first confirmed case.

    One scatter series is drawn per (country, db_source, state) triple, taken
    position by position from the three lists. Case rows are read from the
    module-level ``data`` frame and lockdown rows from the module-level
    ``lockdown_dataframe``. Every lockdown level other than 0 is marked with a
    dashed vertical line and a text label at the earliest day recorded for it.

    Args:
        countries: country names to plot.
        states: per-country state name, or None for no state filter.
        db_sources: per-country ``db_source_url`` to keep, or None for all sources.
        use_log_scale: when True, the y scale is replaced by a ``LogScale``.

    Returns:
        None. The figure is displayed with ``show``.
    """
    # NOTE(review): `plot_width`/`plot_height` here and `legend=` below are the spellings of
    # older Bokeh releases (newer ones use `width`/`height` and `legend_label=`) - confirm
    # against the installed version before changing them.
    p = figure(title="Coronavirus Cumulative Total Cases for Countries: ",
               tools='pan,wheel_zoom,box_zoom,reset,save',
               plot_width=1100, plot_height=800)

    # Human-readable name of every series, also used for the subtitle, legend and labels.
    country_text = []
    for country, state in zip(countries, states):
        print(country, state)
        if state is not None:
            country_text.append('{} - {}'.format(country, state))
        else:
            country_text.append('{}'.format(country))

    p.add_layout(Title(text=", ".join(country_text), text_font_style='italic', text_font_size='15pt'), 'above')

    palette = Category10[10]
    for idx, (country, db_source, state) in enumerate(zip(countries, db_sources, states)):
        # Wrap around the palette so that more than ten series do not raise IndexError.
        series_color = palette[idx % len(palette)]

        country_data = get_data_for_country(data, country, db_source, state)
        country_unique_dates = get_unique_dates_for_country(country_data, country)
        compute_days_since_patient_zero(country_data, country_unique_dates)

        total_cases, _ = get_total_growth_rate_per_day(country_data)
        country_growth_data = pd.DataFrame()
        country_growth_data["days_since_patient_zero"] = sorted(country_data["days_since_patient_zero"].unique())
        country_growth_data["total_number_of_cases"] = total_cases
        country_growth_data["country"] = [country] * len(total_cases)
        growth_source = ColumnDataSource.from_df(country_growth_data)

        s = p.scatter('days_since_patient_zero', 'total_number_of_cases', size=5,
                      source=growth_source,
                      color=series_color, fill_color=series_color, line_color=series_color,
                      hover_fill_color=series_color,
                      legend='{}'.format(country_text[idx]))

        # One hover tool per series so the tooltip only reacts to that series' points.
        tool_tips = [("Country", "@country"),
                     ('Days Since Patient Zero', '@days_since_patient_zero'),
                     ('Total Number of Cases', '@total_number_of_cases')]
        hover = HoverTool(renderers=[s], tooltips=tool_tips)
        p.add_tools(hover)

        # here we add data for lockdowns
        country_lockdown_data = get_lockdown_data_for_country(lockdown_dataframe, country, province=state)
        compute_days_since_patient_zero_for_lockdown_data(country_data, country_lockdown_data, country_unique_dates)

        lockdown_data = pd.DataFrame()
        lockdown_data["level"] = country_lockdown_data["lockdown_level"]
        lockdown_data["days_since_patient_0"] = country_lockdown_data["days_since_patient_zero"]
        # Keep only the earliest day on which each level appears.
        lockdown_data = lockdown_data.sort_values('days_since_patient_0', ascending=True).drop_duplicates('level').sort_index()

        for level, day in zip(lockdown_data["level"], lockdown_data["days_since_patient_0"]):
            if level == 0:
                continue
            vline = Span(location=day, dimension='height', line_dash='dashed',
                         line_color=COLORS_FOR_LOCKDOWN_LEVEL[level], line_width=2)
            # Annotations are attached with add_layout rather than by editing p.renderers directly.
            p.add_layout(vline)

            # Stagger the labels vertically by level so that they do not sit on top of each other.
            level_text = Label(x=day + 0.1,
                               y=np.max(total_cases) * (level / 4),
                               text='{} \nLevel: {}'.format(country_text[idx], level))
            p.add_layout(level_text)

    if use_log_scale:
        # NOTE(review): the scale is swapped after the figure was built; confirm that the
        # axis ticks render as intended (figure(y_axis_type="log") is the usual route).
        p.y_scale = LogScale()
    p.yaxis.formatter = BasicTickFormatter(use_scientific=False)
    p.title.text_font_size = '15pt'
    p.legend.location = "top_left"
    p.legend.click_policy = 'hide'
    p.legend.background_fill_color = '#fefefe'
    p.xaxis.axis_label = "Days Since Patient Zero"
    p.yaxis.axis_label = "Total Number of Cases"
    p.grid.grid_line_color = "gray"
    p.xaxis.axis_label_text_font_size = '14pt'
    p.yaxis.axis_label_text_font_size = '14pt'
    p.xaxis.major_label_text_font_size = '10pt'
    p.yaxis.major_label_text_font_size = '10pt'

    show(p)

In [109]:
def plot_daily_new_cases(countries, states, db_sources, use_log_scale=True):
    """Plot the daily number of new confirmed cases against days since the first confirmed case.

    One scatter series is drawn per (country, db_source, state) triple, taken
    position by position from the three lists. Case rows are read from the
    module-level ``data`` frame and lockdown rows from the module-level
    ``lockdown_dataframe``. Every lockdown level other than 0 is marked with a
    dashed vertical line and a text label at the earliest day recorded for it.

    Args:
        countries: country names to plot.
        states: per-country state name, or None for no state filter.
        db_sources: per-country ``db_source_url`` to keep, or None for all sources.
        use_log_scale: when True (the default), the y scale is replaced by a ``LogScale``.

    Returns:
        None. The figure is displayed with ``show``.
    """
    # NOTE(review): `plot_width`/`plot_height` here and `legend=` below are the spellings of
    # older Bokeh releases (newer ones use `width`/`height` and `legend_label=`) - confirm
    # against the installed version before changing them.
    p = figure(title="Coronavirus Daily Number of New Cases for Countries: ",
               tools='pan,wheel_zoom,box_zoom,reset,save',
               plot_width=1100, plot_height=800)

    # Human-readable name of every series, also used for the subtitle, legend and labels.
    country_text = []
    for country, state in zip(countries, states):
        if state is not None:
            country_text.append('{} - {}'.format(country, state))
        else:
            country_text.append('{}'.format(country))

    p.add_layout(Title(text=", ".join(country_text), text_font_style='italic', text_font_size='15pt'), 'above')

    palette = Category10[10]
    for idx, (country, db_source, state) in enumerate(zip(countries, db_sources, states)):
        # Wrap around the palette so that more than ten series do not raise IndexError.
        series_color = palette[idx % len(palette)]

        country_data = get_data_for_country(data, country, db_source, state)
        country_unique_dates = get_unique_dates_for_country(country_data, country)

        compute_days_since_patient_zero(country_data, country_unique_dates)

        _, daily_new_cases = get_total_growth_rate_per_day(country_data)
        country_growth_data = pd.DataFrame()
        country_growth_data["days_since_patient_zero"] = sorted(country_data["days_since_patient_zero"].unique())
        country_growth_data["daily_new_cases"] = daily_new_cases
        country_growth_data["country"] = [country] * len(daily_new_cases)

        growth_source = ColumnDataSource.from_df(country_growth_data)

        s = p.scatter('days_since_patient_zero', 'daily_new_cases', size=5,
                      source=growth_source,
                      color=series_color, fill_color=series_color, line_color=series_color,
                      hover_fill_color=series_color,
                      legend='{}'.format(country_text[idx]))

        # One hover tool per series so the tooltip only reacts to that series' points.
        tool_tips = [("Country", "@country"),
                     ('Days Since Patient Zero', '@days_since_patient_zero'),
                     ('Daily Number of New Cases', '@daily_new_cases')]
        hover = HoverTool(renderers=[s], tooltips=tool_tips)
        p.add_tools(hover)

        # here we add data for lockdowns
        country_lockdown_data = get_lockdown_data_for_country(lockdown_dataframe, country, province=state)
        compute_days_since_patient_zero_for_lockdown_data(country_data, country_lockdown_data, country_unique_dates)

        lockdown_data = pd.DataFrame()
        lockdown_data["level"] = country_lockdown_data["lockdown_level"]
        lockdown_data["days_since_patient_0"] = country_lockdown_data["days_since_patient_zero"]
        # Keep only the earliest day on which each level appears.
        lockdown_data = lockdown_data.sort_values('days_since_patient_0', ascending=True).drop_duplicates('level').sort_index()

        for level, day in zip(lockdown_data["level"], lockdown_data["days_since_patient_0"]):
            if level == 0:
                continue
            vline = Span(location=day, dimension='height', line_dash='dashed',
                         line_color=COLORS_FOR_LOCKDOWN_LEVEL[level], line_width=2)
            # Annotations are attached with add_layout rather than by editing p.renderers directly.
            p.add_layout(vline)

            # Stagger the labels vertically by level so that they do not sit on top of each other.
            level_text = Label(x=day + 0.1,
                               y=np.max(daily_new_cases) * (level / 4),
                               text='{} \nLevel: {}'.format(country_text[idx], level))
            p.add_layout(level_text)

    if use_log_scale:
        # NOTE(review): the scale is swapped after the figure was built; confirm that the
        # axis ticks render as intended (figure(y_axis_type="log") is the usual route).
        p.y_scale = LogScale()
    p.yaxis.formatter = BasicTickFormatter(use_scientific=False)
    p.title.text_font_size = '15pt'
    p.legend.location = "top_left"
    p.legend.click_policy = 'hide'
    p.legend.background_fill_color = '#fefefe'
    p.xaxis.axis_label = "Days Since Patient Zero"
    p.yaxis.axis_label = "Daily Number of New Cases"
    p.grid.grid_line_color = "gray"
    p.xaxis.axis_label_text_font_size = '14pt'
    p.yaxis.axis_label_text_font_size = '14pt'
    p.xaxis.major_label_text_font_size = '10pt'
    p.yaxis.major_label_text_font_size = '10pt'

    show(p)

In [141]:
def plot_R_ratio_daily(countries, states, db_sources, use_log_scale=True):
    """Plot the day-over-day increase factor of confirmed cases.

    One scatter series is drawn per (country, db_source, state) triple, taken
    position by position from the three lists. Case rows are read from the
    module-level ``data`` frame and lockdown rows from the module-level
    ``lockdown_dataframe``. Every lockdown level other than 0 is marked with a
    dashed vertical line and a text label at the earliest day recorded for it.

    Args:
        countries: country names to plot.
        states: per-country state name, or None for no state filter.
        db_sources: per-country ``db_source_url`` to keep, or None for all sources.
        use_log_scale: accepted but currently ignored; the y axis is always
            linear and fixed to the range 0 to 3.

    Returns:
        None. The figure is displayed with ``show``.
    """
    # NOTE(review): `plot_width`/`plot_height` here and `legend=` below are the spellings of
    # older Bokeh releases (newer ones use `width`/`height` and `legend_label=`) - confirm
    # against the installed version before changing them.
    p = figure(title="Coronavirus Daily Increase Factor: ",
               tools='pan,wheel_zoom,box_zoom,reset,save',
               plot_width=1100, plot_height=800)

    # Human-readable name of every series, also used for the subtitle, legend and labels.
    country_text = []
    for country, state in zip(countries, states):
        if state is not None:
            country_text.append('{} - {}'.format(country, state))
        else:
            country_text.append('{}'.format(country))

    p.add_layout(Title(text=", ".join(country_text), text_font_style='italic', text_font_size='15pt'), 'above')

    palette = Category10[10]
    for idx, (country, db_source, state) in enumerate(zip(countries, db_sources, states)):
        # Wrap around the palette so that more than ten series do not raise IndexError.
        series_color = palette[idx % len(palette)]

        country_data = get_data_for_country(data, country, db_source, state)
        country_unique_dates = get_unique_dates_for_country(country_data, country)

        compute_days_since_patient_zero(country_data, country_unique_dates)

        daily_R_ratio = get_R_rate_per_day(country_data)
        country_R_data = pd.DataFrame()
        country_R_data["days_since_patient_zero"] = sorted(country_data["days_since_patient_zero"].unique())
        country_R_data["daily_R_ratio"] = daily_R_ratio
        country_R_data["country"] = [country] * len(daily_R_ratio)

        R_source = ColumnDataSource.from_df(country_R_data)

        s = p.scatter('days_since_patient_zero', 'daily_R_ratio', size=5,
                      source=R_source,
                      color=series_color, fill_color=series_color, line_color=series_color,
                      hover_fill_color=series_color,
                      legend='{}'.format(country_text[idx]))

        # One hover tool per series so the tooltip only reacts to that series' points.
        tool_tips = [("Country", "@country"),
                     ('Days Since Patient Zero', '@days_since_patient_zero'),
                     ('Daily Increase Factor', '@daily_R_ratio')]
        hover = HoverTool(renderers=[s], tooltips=tool_tips)
        p.add_tools(hover)

        # here we add data for lockdowns
        country_lockdown_data = get_lockdown_data_for_country(lockdown_dataframe, country, province=state)
        compute_days_since_patient_zero_for_lockdown_data(country_data, country_lockdown_data, country_unique_dates)

        lockdown_data = pd.DataFrame()
        lockdown_data["level"] = country_lockdown_data["lockdown_level"]
        lockdown_data["days_since_patient_0"] = country_lockdown_data["days_since_patient_zero"]
        # Keep only the earliest day on which each level appears.
        lockdown_data = lockdown_data.sort_values('days_since_patient_0', ascending=True).drop_duplicates('level').sort_index()

        for level, day in zip(lockdown_data["level"], lockdown_data["days_since_patient_0"]):
            if level == 0:
                continue
            vline = Span(location=day, dimension='height', line_dash='dashed',
                         line_color=COLORS_FOR_LOCKDOWN_LEVEL[level], line_width=2)
            # Annotations are attached with add_layout rather than by editing p.renderers directly.
            p.add_layout(vline)

            # Labels sit near the bottom of the fixed 0-3 range, staggered by level.
            level_text = Label(x=day + 0.1,
                               y=0.1 * (level / 4),
                               text='{} \nLevel: {}'.format(country_text[idx], level))
            p.add_layout(level_text)

    # The log-scale switch is deliberately not applied here; the y range below starts at 0.
    p.yaxis.formatter = BasicTickFormatter(use_scientific=False)
    p.title.text_font_size = '15pt'
    p.legend.location = "top_left"
    p.legend.click_policy = 'hide'
    p.legend.background_fill_color = '#fefefe'
    p.xaxis.axis_label = "Days Since Patient Zero"
    p.yaxis.axis_label = "Daily Increase Factor"
    p.grid.grid_line_color = "gray"
    p.xaxis.axis_label_text_font_size = '14pt'
    p.yaxis.axis_label_text_font_size = '14pt'
    p.xaxis.major_label_text_font_size = '10pt'
    p.yaxis.major_label_text_font_size = '10pt'
    p.y_range.end = 3
    p.y_range.start = 0

    show(p)

In [136]:
# Selection to plot. The three lists are matched by position: countries[i] is plotted for
# states[i] (None = no state filter) using only the rows whose db_source_url equals
# db_sources[i] (None = all sources).
# NOTE: the plots for this are messed up because of the overlaps
# (presumably overlapping series/labels when several entries are selected - TODO confirm).
countries = ["Sweden"]
states = [None]
db_sources = ["github.com/CSSEGISandData/COVID-19.git"]

In [137]:
# Cumulative cases for the current selection, with the log-scale option switched on.
plot_total_cumulative_cases(countries, states, db_sources, use_log_scale=True)

Sweden None
Found 58 entries for country Sweden from source github.com/CSSEGISandData/COVID-19.git
Found 58 entries for country Sweden
Found 0 entries for country Sweden from all provinces


In [122]:
# NOTE(review): the saved output under this cell reports Spain although `countries` is set to
# ["Sweden"], and the execution counts are out of order - the output is stale; re-run the
# notebook top to bottom before sharing it.
plot_daily_new_cases(countries, states, db_sources, use_log_scale=True)

Found 475 entries for country Spain from source git://github.com/victorvicpal/COVID19_es
Found 475 entries for country Spain
Found 8 entries for country Spain from all provinces


In [140]:
# NOTE(review): plot_R_ratio_daily never applies a log scale, so use_log_scale=True has no effect here.
plot_R_ratio_daily(countries, states, db_sources, use_log_scale=True)

Found 58 entries for country Sweden from source github.com/CSSEGISandData/COVID-19.git
Found 58 entries for country Sweden
Found 0 entries for country Sweden from all provinces
