# Loading libraries and data 

In [1]:
import pandas as pd
import altair as alt

from github import Github

import corona_secret_api as secret

g = Github(secret.api_key)

In [2]:
START_CASES_INFECTION = 100
START_CASES_DEATH = 5

In [3]:
URL = "https://covid.ourworldindata.org/data/ecdc/full_data.csv"

In [4]:
df = pd.read_csv(URL)

In [5]:
df.date = pd.to_datetime(df.date)

In [6]:
df = df.sort_values(['location','date'], ascending=True)

In [7]:
df.head()

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths
0,2019-12-31,Afghanistan,0,0,0,0
1,2020-01-01,Afghanistan,0,0,0,0
2,2020-01-02,Afghanistan,0,0,0,0
3,2020-01-03,Afghanistan,0,0,0,0
4,2020-01-04,Afghanistan,0,0,0,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6271 entries, 0 to 6270
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          6271 non-null   datetime64[ns]
 1   location      6271 non-null   object        
 2   new_cases     6271 non-null   int64         
 3   new_deaths    6271 non-null   int64         
 4   total_cases   6271 non-null   int64         
 5   total_deaths  6271 non-null   int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 342.9+ KB


# Date of data

In [9]:
df.date.max()

Timestamp('2020-03-22 00:00:00')

# Creating data for graphs

In [10]:
df = df.drop(columns=['new_cases', 'new_deaths'])

In [11]:
df_infection = df[df.total_cases >= START_CASES_INFECTION].copy()
df_death = df[df.total_deaths >= START_CASES_DEATH].copy()

In [12]:
df_infection.head()

Unnamed: 0,date,location,total_cases,total_deaths
191,2020-03-20,Argentina,128,3
192,2020-03-21,Argentina,158,3
193,2020-03-22,Argentina,225,4
264,2020-03-19,Armenia,115,0
265,2020-03-20,Armenia,122,0


In [13]:
df_infection[df_infection.location == 'Austria']

Unnamed: 0,date,location,total_cases,total_deaths
420,2020-03-09,Austria,102,0
421,2020-03-10,Austria,131,0
422,2020-03-11,Austria,182,0
423,2020-03-12,Austria,246,0
424,2020-03-13,Austria,361,1
425,2020-03-14,Austria,504,1
426,2020-03-15,Austria,655,1
427,2020-03-16,Austria,860,1
428,2020-03-17,Austria,1016,3
429,2020-03-18,Austria,1332,3


In [14]:
df_death.head()

Unnamed: 0,date,location,total_cases,total_deaths
160,2020-03-18,Algeria,60,5
161,2020-03-19,Algeria,73,6
162,2020-03-20,Algeria,82,7
163,2020-03-21,Algeria,94,10
164,2020-03-22,Algeria,94,10


In [15]:
df_infection['total_cases_normalized'] = df_infection.groupby(
    "location")[['total_cases']].transform(lambda x: x / x.min() * START_CASES_INFECTION)
df_infection.total_cases_normalized = df_infection.total_cases_normalized.astype("int")

df_death['total_deaths_normalized'] = df_death.groupby(
    "location")[['total_deaths']].transform(lambda x: x / x.min() * START_CASES_DEATH)
df_death.total_deaths_normalized = df_death.total_deaths_normalized.astype("int")

In [16]:
df_infection["days_after"] = df_infection.groupby("location").cumcount()
df_death["days_after"] = df_death.groupby("location").cumcount()

In [17]:
df_infection.head()

Unnamed: 0,date,location,total_cases,total_deaths,total_cases_normalized,days_after
191,2020-03-20,Argentina,128,3,100,0
192,2020-03-21,Argentina,158,3,123,1
193,2020-03-22,Argentina,225,4,175,2
264,2020-03-19,Armenia,115,0,100,0
265,2020-03-20,Armenia,122,0,106,1


In [18]:
df_death.head()

Unnamed: 0,date,location,total_cases,total_deaths,total_deaths_normalized,days_after
160,2020-03-18,Algeria,60,5,5,0
161,2020-03-19,Algeria,73,6,6,1
162,2020-03-20,Algeria,82,7,7,2
163,2020-03-21,Algeria,94,10,10,3
164,2020-03-22,Algeria,94,10,10,4


In [19]:
df_summary = df[df.date == df.date.max()].drop(columns='date')
df_summary['cases'] = df_summary.total_cases
df_summary = df_summary.drop(['total_cases', 'total_deaths'], axis=1)

In [20]:
df_summary.head()

Unnamed: 0,location,cases
72,Afghanistan,24
86,Albania,76
164,Algeria,94
173,Andorra,88
174,Angola,2


In [21]:
df_infection.location.unique()

array(['Argentina', 'Armenia', 'Australia', 'Austria', 'Bahrain',
       'Belgium', 'Brazil', 'Bulgaria', 'Canada', 'Chile', 'China',
       'Colombia', 'Costa Rica', 'Croatia', 'Czech Republic', 'Denmark',
       'Dominican Republic', 'Ecuador', 'Egypt', 'Estonia', 'Finland',
       'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'India',
       'Indonesia', 'International', 'Iran', 'Iraq', 'Ireland', 'Israel',
       'Italy', 'Japan', 'Kuwait', 'Latvia', 'Lebanon', 'Lithuania',
       'Luxembourg', 'Malaysia', 'Mexico', 'Netherlands', 'Norway',
       'Pakistan', 'Panama', 'Peru', 'Philippines', 'Poland', 'Portugal',
       'Qatar', 'Romania', 'Russia', 'San Marino', 'Saudi Arabia',
       'Serbia', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa',
       'South Korea', 'Spain', 'Sweden', 'Switzerland', 'Taiwan',
       'Thailand', 'Turkey', 'United Arab Emirates', 'United Kingdom',
       'United States', 'Uruguay', 'World'], dtype=object)

In [22]:
df_death.location.unique()

array(['Algeria', 'Australia', 'Austria', 'Belgium', 'Brazil', 'Canada',
       'China', 'Denmark', 'Ecuador', 'Egypt', 'France', 'Germany',
       'Greece', 'Indonesia', 'International', 'Iran', 'Iraq', 'Italy',
       'Japan', 'Luxembourg', 'Netherlands', 'Norway', 'Peru',
       'Philippines', 'Poland', 'Portugal', 'San Marino', 'South Korea',
       'Spain', 'Sweden', 'Switzerland', 'Turkey', 'United Kingdom',
       'United States', 'World'], dtype=object)

In [23]:
COUNTRIES = set(df[(df.date == df.date.max())].nlargest(columns='total_cases', n=34).sort_values('location').location)
COUNTRIES.remove('World')

# Altair

## Storing data locally 

In [24]:
url_infection = 'data_infection.json'
url_death = 'data_death.json'
url_summary = 'data_summary.json'

df_infection[df_infection.location.isin(COUNTRIES)].to_json(url_infection, orient='records')
df_death[df_death.location.isin(COUNTRIES)].to_json(url_death, orient='records')
df_summary[df_summary.location.isin(COUNTRIES)].to_json(url_summary, orient='records')

## Uploading data to github

In [25]:
repo = g.get_repo("Datenspieler/notebooks_for_blog")

In [26]:
FILELIST = ['data_infection.json', 'data_death.json', 'data_summary.json']

In [38]:
for file_for_upload in FILELIST:
    print('Uploading', file_for_upload, end=' - ')
    with open(file_for_upload) as f:
        data = f.read()
    contents = repo.get_contents("2020-corona/" + file_for_upload)
    repo.update_file(contents.path, "Update data as of " + df.date.max().strftime('%Y-%m-%d'), data, 
                     contents.sha, branch="master")
    print(contents.last_modified)

Uploading data_infection.json - Sun, 22 Mar 2020 20:15:13 GMT
Uploading data_death.json - Sun, 22 Mar 2020 20:19:16 GMT
Uploading data_summary.json - Sun, 22 Mar 2020 20:19:18 GMT


## Preparing link to data in github 

In [None]:
# In future version files should be pushed to git automatically, this time I uploaded them 

BASEURL = 'https://raw.githubusercontent.com/Datenspieler/notebooks_for_blog/master/2020-corona/'

url_infection = BASEURL + url_infection
url_death = BASEURL + url_death
url_summary = BASEURL + url_summary

In [None]:
for repo in g.get_user().get_repos():
    print(repo.name)
    print(repo)

In [None]:
repo = g.get_repo("Datenspieler/notebooks_for_blog")

In [None]:
contents = repo.get_contents("/2020-corona")

In [None]:
contents

In [None]:
content = contents[0]

In [None]:
content.update()

In [None]:
#repo.create_file("2020-corona/test.txt", 
#                 "test", "test_content", branch="master")

In [None]:
contents = repo.get_contents("2020-corona/test.txt")

In [None]:
contents = repo.get_contents("2020-corona/data_death.json")

In [None]:
repo.update_file(contents.path, "more tests", data, 
                 contents.sha, branch="master")


In [None]:
contents.update()

In [None]:
contents.last_modified

In [None]:
with open('data_summary.json') as f:
    data = f.read()

In [None]:
data[:100]

## Plotting data 

In [None]:
highlight = alt.selection(type='single', on='mouseover',
                          fields=['location'], nearest=True)

base_infection = alt.Chart(url_infection).encode(
    alt.X('days_after', type='quantitative', scale=alt.Scale(domain=[0,20], type='ordinal'),
         title='Days since the 100th confirmed infection'),
    alt.Y('total_cases_normalized', type='quantitative', scale=alt.Scale(type='log', base=10), 
          title='Total confirmed infections of COVID-19, normalized'),
    alt.Color('location:N', title="Country", legend=None),
    alt.Tooltip(['location:N', 'total_cases:Q', 'date:T'])
).properties(
    title='Development of Corona infections',
)

base_death = alt.Chart(url_death).encode(
    alt.X('days_after', type='quantitative',  scale=alt.Scale(domain=[0,20], type='ordinal', bins=list(range(100))),
         title='Days since the 5th confirmed death'),
    alt.Y('total_deaths_normalized', type='quantitative', scale=alt.Scale(type='log', base=10), 
          title='Total confirmed deaths of COVID-19, normalized'),
    alt.Color('location:N', title="Country", legend=None),
    alt.Tooltip(['location:N', 'total_deaths:Q', 'date:T'])
).properties(
    title='Development of Corona deaths',
)

base_summary = alt.Chart(url_summary).mark_bar().encode(
    x = alt.X('cases', type='quantitative', title='Confirmed cases', scale=alt.Scale(type='linear')),
    y = alt.Y('location:N', title='Country'),
    color = alt.condition(highlight, alt.Color('location:N', title="Country", legend=None), alt.ColorValue("grey")),
    tooltip = alt.Tooltip(['location:N', 'cases:Q'])
).add_selection(
    highlight
).properties(
    width=200,
    title='Corona cases by country'
)


points_infection = base_infection.mark_circle().encode(
    opacity=alt.value(0)
).add_selection(
    highlight
).properties(
    width=600
).interactive(
    bind_y = False
)

points_death = base_death.mark_circle().encode(
    opacity=alt.value(0)
).add_selection(
    highlight
).properties(
    width=600
).interactive(
    bind_y = False
)


lines_infection = base_infection.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1), alt.value(5), legend=None)
)

lines_death = base_death.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1), alt.value(5), legend=None)
)

chart = alt.vconcat(points_infection + lines_infection, points_death + lines_death)

chart = alt.hconcat(base_summary, chart)


#chart.save('corona.html')
chart.save('corona.json')

chart