In [576]:
import pandas as pd
import ssl
import json
import urllib

import plotly
import plotly.graph_objs as go
import plotly.express as px
import collections 
import numpy as np
import matplotlib.pyplot as plt

# Retrieve COVID data from ECDC

In [577]:
covid_url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/json/"
ssl._create_default_https_context = ssl._create_unverified_context

In [578]:
covid_json_unformated = urllib.request.urlopen(covid_url).read().decode("utf-8")
covid_json = json.loads(covid_json_unformated)


In [579]:
cdf = pd.DataFrame(covid_json['records'])
date = pd.to_datetime(cdf["year"] + "-" + cdf["month"] + "-" + cdf["day"]).dt.date
date = date.to_frame(name = "date_reported")
cdf = date.join(cdf.drop(columns = ["dateRep", "day", "month", "year"]))
cdf.rename(columns = {
    "countriesAndTerritories": "Place",
    "geoId" : "ISO_alpha-2_code",
    "countryterritoryCode" : "ISO_alpha-3_code",
    "popData2019": "Population",
    "continentExp" : "Continent",
    "Cumulative_number_for_14_days_of_COVID-19_cases_per_100000" : "14d-incidence"
    }, inplace = True)
cdf["14d-incidence"] = pd.to_numeric(cdf["14d-incidence"], errors = "coerce")

In [580]:
cdf.dtypes

date_reported        object
cases                 int64
deaths                int64
Place                object
ISO_alpha-2_code     object
ISO_alpha-3_code     object
Population          float64
Continent            object
14d-incidence       float64
dtype: object

In [581]:
start_of_recording = min(cdf["date_reported"])
deltaTime = (cdf["date_reported"]-start_of_recording)
cdf["deltaTime_since_start_of_recording"] = deltaTime.dt.days

# Cleanup

In [582]:
cdf.describe()

Unnamed: 0,cases,deaths,Population,14d-incidence,deltaTime_since_start_of_recording
count,61900.0,61900.0,61777.0,59021.0,61900.0
mean,1155.147237,26.05546,40987700.0,66.320586,199.795735
std,6779.224479,131.227055,153129400.0,162.32924,89.753042
min,-8261.0,-1918.0,815.0,-147.419587,0.0
25%,0.0,0.0,1293120.0,0.757526,128.0
50%,15.0,0.0,7169456.0,6.724045,202.0
75%,273.0,4.0,28515830.0,52.572719,276.0
max,234633.0,4928.0,1433784000.0,1900.83621,349.0


In [583]:
for column in cdf.columns:
    print(column, "has NaN:", cdf[column].isna().sum())

cdf.shape

date_reported has NaN: 0
cases has NaN: 0
deaths has NaN: 0
Place has NaN: 0
ISO_alpha-2_code has NaN: 0
ISO_alpha-3_code has NaN: 123
Population has NaN: 123
Continent has NaN: 0
14d-incidence has NaN: 2879
deltaTime_since_start_of_recording has NaN: 0


(61900, 10)

In [584]:
cdf = cdf[cdf["14d-incidence"].notna()]
for column in cdf.columns:
    print(column, "has NaN:", cdf[column].isna().sum())
cdf.shape

date_reported has NaN: 0
cases has NaN: 0
deaths has NaN: 0
Place has NaN: 0
ISO_alpha-2_code has NaN: 0
ISO_alpha-3_code has NaN: 0
Population has NaN: 0
Continent has NaN: 0
14d-incidence has NaN: 0
deltaTime_since_start_of_recording has NaN: 0


(59021, 10)

In [585]:
for column in ["cases", "deaths", "14d-incidence"]:
    isnegative = cdf[column] < 0
    print(column, "has negatives:", isnegative.sum())
cdf.shape

cases has negatives: 17
deaths has negatives: 8
14d-incidence has negatives: 40


(59021, 10)

In [586]:
for column in ["cases", "deaths", "14d-incidence"]:
    ispositive = cdf[column] >= 0
    cdf = cdf.loc[ispositive]
    isnegative = cdf[column] < 0
    print(column, "has negatives:", isnegative.sum())
cdf.shape

cases has negatives: 0
deaths has negatives: 0
14d-incidence has negatives: 0


(58959, 10)

In [587]:
cdf.describe()

Unnamed: 0,cases,deaths,Population,14d-incidence,deltaTime_since_start_of_recording
count,58959.0,58959.0,58959.0,58959.0,58959.0
mean,1212.754168,27.316135,41241310.0,66.402666,206.241151
std,6941.109276,133.162595,153765500.0,162.373044,85.97275
min,0.0,0.0,815.0,0.0,13.0
25%,0.0,0.0,1324820.0,0.763241,138.0
50%,20.0,0.0,7813207.0,6.732521,209.0
75%,310.0,5.0,28608720.0,52.601692,279.0
max,234633.0,4928.0,1433784000.0,1900.83621,349.0


In [588]:
layout_hist = {
        "template" : "plotly", 
        "yaxis": {"title": {"text": "frequency"}},
        "xaxis": {"title": {"text": "number of counts"}}
        }

In [589]:
for column in ["cases", "deaths"]:
    counts = cdf[column].value_counts()
    data = go.Scatter(
        mode = "markers",
        x = counts.index,
        y = counts
    )
    fig = go.Figure(data = data, layout = layout_hist)
    fig.update_layout(title = column)
    fig.update_xaxes(type = "log")
    fig.show()

In [590]:
cdf.describe()

Unnamed: 0,cases,deaths,Population,14d-incidence,deltaTime_since_start_of_recording
count,58959.0,58959.0,58959.0,58959.0,58959.0
mean,1212.754168,27.316135,41241310.0,66.402666,206.241151
std,6941.109276,133.162595,153765500.0,162.373044,85.97275
min,0.0,0.0,815.0,0.0,13.0
25%,0.0,0.0,1324820.0,0.763241,138.0
50%,20.0,0.0,7813207.0,6.732521,209.0
75%,310.0,5.0,28608720.0,52.601692,279.0
max,234633.0,4928.0,1433784000.0,1900.83621,349.0


# Questions

## Which country showed the highest/lowest fluctuation in 14d-incidence within a year?

In [591]:
# fluctuation definition: high variance

countries_descr = cdf.groupby(["Place"])
countries_var = cdf[["Place", "14d-incidence"]].groupby(["Place"]).var()

In [592]:
n = 1
fluctuating_countries = countries_var.sort_values(by = "14d-incidence", ascending = False).head(n).index
non_fluctuating_countries = countries_var.sort_values(by = "14d-incidence", ascending = True).head(n).index

In [593]:
fluctuating_countries

Index(['Andorra'], dtype='object', name='Place')

In [594]:
for column in fluctuating_countries:
    data = go.Scatter(
        x = countries_descr.get_group(column)["deltaTime_since_start_of_recording"],
        y = countries_descr.get_group(column)["14d-incidence"]
    )
    fig = go.Figure(data = data, layout = layout_hist)
    fig.update_layout(title = column)
    fig.update_xaxes(type = "log")
    fig.show()

In [595]:
for column in non_fluctuating_countries:
    data = go.Scatter(
        x = countries_descr.get_group(column)["deltaTime_since_start_of_recording"],
        y = countries_descr.get_group(column)["14d-incidence"]
    )
    fig = go.Figure(data = data, layout = layout_hist)
    fig.update_layout(title = column)
    fig.update_xaxes(type = "log")
    fig.show()

In [596]:
cdf.sample(2)

Unnamed: 0,date_reported,cases,deaths,Place,ISO_alpha-2_code,ISO_alpha-3_code,Population,Continent,14d-incidence,deltaTime_since_start_of_recording
55575,2020-10-04,0,0,Thailand,TH,THA,69625581.0,Asia,0.110592,278
56138,2020-11-21,25,1,Togo,TG,TGO,8082359.0,Africa,4.157202,326


## Create a line plot showing the 14-incidence for all European countries. Use groupby operation to generate the data list for the plotly plot.

In [597]:
continents =cdf[["Continent", "Place", "14d-incidence", "date_reported", "deltaTime_since_start_of_recording"]].groupby(["Continent"])
european_countries = continents.get_group("Europe").drop(columns = "Continent")
european_countries.sample(3)

Unnamed: 0,Place,14d-incidence,date_reported,deltaTime_since_start_of_recording
20198,Finland,4.077624,2020-06-17,169
52857,Spain,60.223201,2020-08-01,214
5401,Belarus,1.121407,2020-04-02,93


In [610]:
layout = {"title": "14d-incidence in Europe", 
        "xaxis": {"title": {"text": "number of days since " + str(start_of_recording)}},
        "yaxis": {"title": {"text": "14d-incidence"}}
        }

fig1 = go.Figure(layout = layout)

for place in european_countries["Place"].unique()[0:14]:
    country = european_countries.loc[european_countries["Place"] == place]

    fig1.add_trace(go.Scatter(x = country["deltaTime_since_start_of_recording"],
                        y = country["14d-incidence"],
                        name = place))
fig2 = go.Figure(layout = layout)

for place in european_countries["Place"].unique()[14:28]:
    country = european_countries.loc[european_countries["Place"] == place]

    fig2.add_trace(go.Scatter(x = country["deltaTime_since_start_of_recording"],
                        y = country["14d-incidence"],
                        name = place))

fig3 = go.Figure(layout = layout)

for place in european_countries["Place"].unique()[28:42]:
    country = european_countries.loc[european_countries["Place"] == place]

    fig3.add_trace(go.Scatter(x = country["deltaTime_since_start_of_recording"],
                        y = country["14d-incidence"],
                        name = place))

fig4 = go.Figure(layout = layout)

for place in european_countries["Place"].unique()[42:]:
    country = european_countries.loc[european_countries["Place"] == place]

    fig4.add_trace(go.Scatter(x = country["deltaTime_since_start_of_recording"],
                        y = country["14d-incidence"],
                        name = place))


In [605]:
import ipywidgets

In [611]:
f1 = go.FigureWidget(fig1)
f2 = go.FigureWidget(fig2)
f3 = go.FigureWidget(fig3)
f4 = go.FigureWidget(fig4)

ipywidgets.VBox([f1,f2, f3,f4])

VBox(children=(FigureWidget({
    'data': [{'name': 'Albania',
              'type': 'scatter',
              …

## Create a smoothed version of the 14d-incidence by averaging 3 months.

In [548]:
def averaging(values, months = 3):
    window_size = 30 * months
    window = collections.deque([], maxlen=window_size)
    average = []
    for pos, aa in enumerate(values):
        value = values[pos]
        window.append(value)
        average.append(sum(window)/len(window))
    return average


In [614]:
fig5 = go.Figure(layout = layout)
for place in european_countries["Place"].unique()[42:]:
    country = european_countries.loc[european_countries["Place"] == place]
    
    fig5.add_trace(go.Scatter(x = country["deltaTime_since_start_of_recording"],
                        y = averaging(country["14d-incidence"].values),
                        name = place))

In [616]:
f5 = go.FigureWidget(fig5)
ipywidgets.VBox([f4, f5])

VBox(children=(FigureWidget({
    'data': [{'name': 'Portugal',
              'type': 'scatter',
             …

# Create a radial plot 

Create a radial plot of death rate / 100000 people (see popData2019), where one year completes a circle, i.e. 360˚. Visualize the recored years for Italy, Germany, Sweden and Greece. Hint you might need to turn the dateTime into day within the year (%j) and adjust 365 to 360 degrees.

In [619]:
cdf.head(5)

Unnamed: 0,date_reported,cases,deaths,Place,ISO_alpha-2_code,ISO_alpha-3_code,Population,Continent,14d-incidence,deltaTime_since_start_of_recording
0,2020-12-14,746,6,Afghanistan,AF,AFG,38041757.0,Asia,9.013779,349
1,2020-12-13,298,9,Afghanistan,AF,AFG,38041757.0,Asia,7.052776,348
2,2020-12-12,113,11,Afghanistan,AF,AFG,38041757.0,Asia,6.868768,347
3,2020-12-11,63,10,Afghanistan,AF,AFG,38041757.0,Asia,7.134266,346
4,2020-12-10,202,16,Afghanistan,AF,AFG,38041757.0,Asia,6.968658,345


In [638]:
death_rate = (cdf[["cases"]].values / cdf[["Population"]].values)*100000
cdf["death_rate_per_100k"] = death_rate
cdf.sample(2)

Unnamed: 0,date_reported,cases,deaths,Place,ISO_alpha-2_code,ISO_alpha-3_code,Population,Continent,14d-incidence,deltaTime_since_start_of_recording,death_rate_per_100k
5109,2020-04-22,0,0,Barbados,BB,BRB,287021.0,America,4.180879,113,0.0
20617,2020-04-05,4267,1053,France,FR,FRA,67012883.0,Europe,80.799389,96,6.367432


In [637]:
cols = ["Place", "Continent", "date_reported", "deltaTime_since_start_of_recording", "deaths", "Population", "death_rate_per_100k"]

gr = cdf[cols].groupby(["Place"])
gr.get_group("Germany").sample(2)

Unnamed: 0,Place,Continent,date_reported,deltaTime_since_start_of_recording,deaths,Population,death_rate_per_100k
21918,Germany,Europe,2020-11-06,311,166,83019213.0,25.904847
22158,Germany,Europe,2020-03-11,71,0,83019213.0,0.189113


In [679]:
country = gr.get_group("Germany")
fig = px.line_polar(country, r = "death_rate_per_100k", theta= "deltaTime_since_start_of_recording", line_close = True)
fig.show()

In [649]:
radi_steps = np.arange(0,360, 360/len(gr.get_group("Germany")[["deltaTime_since_start_of_recording"]]) )

In [675]:
data = go.Scatterpolar(r = gr.get_group("Germany")["death_rate_per_100k"],
            theta = radi_steps
            )
fig = go.Figure(data = data)
fig.show()

In [671]:
v = gr.get_group("Germany")["death_rate_per_100k"]
v


21880    19.708691
21881    24.331717
21882    34.254721
21883    35.985646
21884    28.522313
           ...    
22212     0.000000
22213     0.000000
22214     0.000000
22215     0.000000
22216     0.000000
Name: death_rate_per_100k, Length: 337, dtype: float64