**Sources of information** and last update:

In [26]:
# @title { vertical-output: true, display-mode: "form" }
import pandas as pd
from datetime import datetime, timedelta


# Consider adding:
#  https://github.com/COVID19Tracking

# https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data
# FIPS, Admin2, Province_State, Country_Region, Last_Update, Lat, Long_, Confirmed, Deaths, Recovered, Active, Combined_Key
# Issue: the US and some other countries are broken down into sub-national rows, while others (Italy, for example) are reported as a single nationwide row
def load_latest_johnhopkins_daily(tries=3):
    """Download the most recent daily report published by the CSSE repository.

    Starts from today's UTC date and walks back one day at a time until a
    daily-report CSV can be read.

    Args:
        tries: maximum number of consecutive days to attempt (today first).

    Returns:
        The DataFrame read from the first daily report that could be loaded,
        after a one-line summary has been printed through also_print_df.

    Raises:
        RuntimeError: if none of the attempted days could be loaded; the last
            download error (if any) is attached as the cause.
    """
    # strftime fills in the date directives embedded in the file name
    loc_template = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/%m-%d-%Y.csv"
    try_date_utc = datetime.utcnow()
    last_error = None
    for _ in range(tries):
        try_url = try_date_utc.strftime(loc_template)
        try:
            df = pd.read_csv(try_url)
        except Exception as error:
            # Exception (rather than a bare except) keeps Ctrl-C / kernel interrupts working
            last_error = error
            try_date_utc = try_date_utc - timedelta(days=1)
            continue
        # reported outside the try block so that only download failures trigger a retry
        return also_print_df(df, "John Hopkins", try_date_utc)
    # raise instead of exit(1): stops the cell with a clear error without ending the session
    raise RuntimeError(
        "Out of tries looking for John Hopkins' data (walking back 1 day at a time)"
    ) from last_error


# https://github.com/open-covid-19
#  Date, CountryCode, CountryName, RegionCode, RegionName, Confirmed, Deaths, Latitude, Longitude, Population --- sorted by date, ascending
def load_latest_opencovid_feed():
    """Download the Open COVID-19 aggregated feed.

    The 'Date' value of the last row is parsed (format YYYY-MM-DD) and used as
    the date reported in the printed summary.

    Returns:
        The DataFrame read from the feed, as returned by also_print_df.
    """
    feed = pd.read_csv(
        "https://raw.githubusercontent.com/open-covid-19/data/master/output/data.csv"
    )
    newest_row = feed.iloc[-1]
    newest_day = datetime.strptime(newest_row['Date'], '%Y-%m-%d')
    return also_print_df(feed, "Open COVID-19", newest_day)


# https://github.com/pcm-dpc/COVID-19/
#  data, stato, ricoverati_con_sintomi, terapia_intensiva, totale_ospedalizzati, isolamento_domiciliare, totale_attualmente_positivi, nuovi_attualmente_positivi, dimessi_guariti, deceduti, totale_casi, tamponi, note_it, note_en
def load_latest_italian_dpc_nationwide_by_date():
    """Download the Italian PCM-DPC nationwide time series.

    The 'data' value of the last row is parsed (format YYYY-MM-DDTHH:MM:SS) and
    used as the date reported in the printed summary.

    Returns:
        The DataFrame read from the CSV, as returned by also_print_df.
    """
    nationwide = pd.read_csv(
        "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv"
    )
    newest_row = nationwide.iloc[-1]
    newest_day = datetime.strptime(newest_row['data'], '%Y-%m-%dT%H:%M:%S')
    return also_print_df(nationwide, "Italian PCM-DPC (nationwide by date)", newest_day)


#  data, stato, codice_regione, denominazione_regione, lat, long, ricoverati_con_sintomi, terapia_intensiva, totale_ospedalizzati, isolamento_domiciliare, totale_attualmente_positivi, nuovi_attualmente_positivi, dimessi_guariti, deceduti, totale_casi, tamponi, note_it, note_en
def load_latest_italian_dpc_regional_latest():
    """Download the Italian PCM-DPC latest regional snapshot.

    The 'data' value of the last row is parsed (format YYYY-MM-DDTHH:MM:SS) and
    used as the date reported in the printed summary.

    Returns:
        The DataFrame read from the CSV, as returned by also_print_df.
    """
    regional = pd.read_csv(
        "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni-latest.csv"
    )
    newest_row = regional.iloc[-1]
    newest_day = datetime.strptime(newest_row['data'], '%Y-%m-%dT%H:%M:%S')
    return also_print_df(regional, "Italian PCM-DPC (regional snapshot)", newest_day)


def also_print_df(df, name, date):
    """Print a one-line summary of a freshly loaded dataset and hand it back.

    Args:
        df: the loaded DataFrame.
        name: human-readable name of the data source.
        date: datetime shown in the summary, formatted as YYYY-MM-DD.

    Returns:
        The same df object, unchanged, so the call can be used in a return statement.
    """
    row_count, column_count = df.shape
    column_names = ", ".join(list(df))
    # the trailing "\n" plus print's own newline leaves a blank line after each summary
    summary = "Loaded latest {} dataset ({}): [{} rows x {} columns]: {}\n".format(
        name, date.strftime("%Y-%m-%d"), row_count, column_count, column_names
    )
    print(summary)
    return df


# Load the latest data sets.
# Each loader prints a one-line summary (source name, date, shape, columns) and returns the DataFrame.
df_jh = load_latest_johnhopkins_daily()  # CSSE daily report (most recent day available)
df_oc = load_latest_opencovid_feed()  # Open COVID-19 feed
df_it_nat_daily = load_latest_italian_dpc_nationwide_by_date()  # Italian PCM-DPC, nationwide rows by date
df_it_by_reg = load_latest_italian_dpc_regional_latest()  # Italian PCM-DPC, latest regional snapshot


Loaded latest John Hopkins dataset (2020-03-25): [3420 rows x 12 columns]: FIPS, Admin2, Province_State, Country_Region, Last_Update, Lat, Long_, Confirmed, Deaths, Recovered, Active, Combined_Key

Loaded latest Open COVID-19 dataset (2020-03-26): [11287 rows x 10 columns]: Date, CountryCode, CountryName, RegionCode, RegionName, Confirmed, Deaths, Latitude, Longitude, Population

Loaded latest Italian PCM-DPC (nationwide by date) dataset (2020-03-25): [31 rows x 14 columns]: data, stato, ricoverati_con_sintomi, terapia_intensiva, totale_ospedalizzati, isolamento_domiciliare, totale_attualmente_positivi, nuovi_attualmente_positivi, dimessi_guariti, deceduti, totale_casi, tamponi, note_it, note_en

Loaded latest Italian PCM-DPC (regional snapshot) dataset (2020-03-25): [21 rows x 18 columns]: data, stato, codice_regione, denominazione_regione, lat, long, ricoverati_con_sintomi, terapia_intensiva, totale_ospedalizzati, isolamento_domiciliare, totale_attualmente_positivi, nuovi_attualmente

Global analysis:

In [27]:
# starting from the 'Open COVID-19' feed, get clean daily data for all countries
def get_countries_data_per_day(df):
    """Reduce a feed to country-level daily rows with population-normalized columns.

    Args:
        df: DataFrame with at least the columns RegionCode, Population, Date,
            CountryName, Confirmed and Deaths.

    Returns:
        A new DataFrame with the columns Date, CountryName, Confirmed, Deaths,
        Population, plus Confirmed_Share and Deaths_Share (each expressed as a
        percentage of Population). The input frame is not modified.
    """
    kept_columns = ['Date', 'CountryName', 'Confirmed', 'Deaths', 'Population']

    # regional rows carry a RegionCode; country-level rows do not
    country_level = df[df['RegionCode'].isna()]

    # drop countries with no population data (Kosovo and Antilles) - for uniformity
    with_population = country_level[country_level['Population'].notna()]

    # keep only the columns we care about
    per_country = with_population.loc[:, kept_columns]

    # normalize Confirmed and Deaths to the population (%)
    return per_country.assign(
        Confirmed_Share=100 * per_country['Confirmed'] / per_country['Population'],
        Deaths_Share=100 * per_country['Deaths'] / per_country['Population'],
    )


def get_highest_share_countries(df):
    """Rank countries by their most recent Confirmed_Share, dropping noisy entries.

    Args:
        df: DataFrame with the columns CountryName, Population and
            Confirmed_Share; for each country the last row is treated as the
            freshest one.

    Returns:
        One row per country (its last occurrence in df), sorted by
        Confirmed_Share in descending order, keeping only countries with more
        than 1M people and a Confirmed_Share above 0.005%.
    """
    # freshest row for every country
    freshest = df.drop_duplicates('CountryName', keep='last')

    # highest share first
    ranked = freshest.sort_values("Confirmed_Share", ascending=False)

    # [snr] countries with fewer than 1M people are dropped
    populous = ranked.loc[ranked['Population'].gt(1E+06)]

    # [snr] so are countries with less than 0.005% share (China: 0.005708 as of now)
    return populous.loc[populous['Confirmed_Share'].gt(0.005)]


# Clean the Open COVID-19 feed down to per-country daily rows, then rank the freshest row of each country
countries_daily = get_countries_data_per_day(df_oc)
countries_latest_top = get_highest_share_countries(countries_daily)
# Debugging aid: to zoom into a single country (e.g. China), filter the ranking with
#   countries_latest_top[countries_latest_top['CountryName'] == "China"]
# The bare expression below renders the ranking as the cell output.

countries_latest_top


Unnamed: 0,Date,CountryName,Confirmed,Deaths,Population,Confirmed_Share,Deaths_Share
11286,2020-03-26,Italy,74386,7503.0,60550080.0,0.12285,0.012391
10975,2020-03-25,Switzerland,8789,86.0,8591365.0,0.1023,0.001001
11285,2020-03-26,Spain,47610,3434.0,46736780.0,0.101868,0.007348
10944,2020-03-25,Austria,5282,30.0,8955102.0,0.058983,0.000335
11161,2020-03-25,Norway,2566,10.0,5378857.0,0.047705,0.000186
11001,2020-03-25,Germany,31554,149.0,83517040.0,0.037782,0.000178
10958,2020-03-25,Belgium,4269,122.0,11539330.0,0.036995,0.001057
11051,2020-03-25,France,22302,1100.0,65129730.0,0.034242,0.001689
11160,2020-03-25,Netherlands,5560,276.0,17097130.0,0.03252,0.001614
11095,2020-03-25,Iran,24811,1934.0,82913910.0,0.029924,0.002333
