### 1. Data preparation

In [1]:
import pandas as pd
from matplotlib.ticker import MultipleLocator, PercentFormatter

# url_1 = 'https://opendata.ecdc.europa.eu/covid19/casedistribution/csv'
# url_2 = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
# data = pd.read_csv(url_1)

# Read the ECDC case-distribution data from the local copy and prepare it:
#   * `date`: the day-first `dateRep` strings parsed into datetimes
#   * `countriesAndTerritories`: the long US label shortened to 'USA'
# Both columns are derived with vectorised operations in a single chain, so
# re-running the cell always starts again from the file on disk.
data = pd.read_csv('data/covid19/cases.csv').assign(
  date=lambda df: pd.to_datetime(df.dateRep, format='%d/%m/%Y'),
  countriesAndTerritories=lambda df: df.countriesAndTerritories.str.replace(
    'United_States_of_America', 'USA', regex=False  # literal text, not a regex
  )
)

# One row per date, one column per country, values = daily new cases.
new_cases = (
  data
  .pivot(index='date', columns='countriesAndTerritories', values='cases')
  .sort_index(axis=0)  # sort the records by date
  .sort_index(axis=1)  # keep the country columns in alphabetical order
  .fillna(0)           # missing values are considered as 0 cases
)

# Each country's share of the world's new cases on a given day.
# The per-date world total is computed once and divided row-wise (axis=0);
# dates whose world total is 0 produce NaN shares.
percent_new_cases = new_cases.div(new_cases.sum(axis=1), axis=0)

# Countries shown in the share-of-world plot.
subset = percent_new_cases.loc[
  :, ['Italy', 'China', 'Spain', 'USA', 'India', 'Brazil']
]

### 2. Exploratory data analysis

In [2]:
# Summary statistics (count, mean, min, quartiles, max, std) for the numeric
# and datetime columns of `data`.
data.describe()

Unnamed: 0,day,month,year,cases,deaths,popData2019,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000,date
count,43718.0,43718.0,43718.0,43718.0,43718.0,43654.0,40937.0,43718
mean,15.646919,5.61899,2019.998467,698.578297,21.792488,42870540.0,33.001167,2020-06-03 09:16:59.278100480
min,1.0,1.0,2019.0,-8261.0,-1918.0,815.0,-147.419587,2019-12-31 00:00:00
25%,8.0,4.0,2020.0,0.0,0.0,1355982.0,0.370634,2020-04-15 00:00:00
50%,16.0,6.0,2020.0,9.0,0.0,8082359.0,4.571738,2020-06-07 00:00:00
75%,23.0,7.0,2020.0,150.0,3.0,29161920.0,26.575105,2020-07-29 00:00:00
max,31.0,12.0,2020.0,97894.0,4928.0,1433784000.0,1058.225943,2020-09-19 00:00:00
std,8.776722,2.206138,0.039118,4352.355124,126.490919,157872000.0,76.067751,


In [3]:
# Column names, non-null counts and dtypes; used to spot columns with missing
# values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43718 entries, 0 to 43717
Data columns (total 13 columns):
 #   Column                                                      Non-Null Count  Dtype         
---  ------                                                      --------------  -----         
 0   dateRep                                                     43718 non-null  object        
 1   day                                                         43718 non-null  int64         
 2   month                                                       43718 non-null  int64         
 3   year                                                        43718 non-null  int64         
 4   cases                                                       43718 non-null  int64         
 5   deaths                                                      43718 non-null  int64         
 6   countriesAndTerritories                                     43718 non-null  object        
 7   geoId                 

**Conclusion:** There are some variables with missing data, so either imputing or careful dropping of the records will be necessary.

In [3]:
# --- Figure 1: each selected country's share of the world's new cases ---
ax = subset.plot(
  title="Percentage of the World's New COVID-19 Cases\n(source: ECDC)",
  figsize=(12, 7),
  # solid lines for the first three columns, a distinct dash pattern for the rest
  style=['-', '-', '-', '--', ':', '-.']
)

# x ticks: every date in the index that falls on the 18th of a month
tick_locs = subset.index[subset.index.day == 18].unique()
tick_labels = [stamp.strftime('%b %d\n%Y') for stamp in tick_locs]

ax.legend(title='Country', framealpha=0.5, ncol=2)
ax.set(
  xlabel='',
  ylabel="Percentage of the world's COVID-19 cases",
  ylim=(0, None)  # start the y axis at 0, leave the upper limit automatic
)
# the plotted values are fractions of 1, hence xmax=1
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))

for spine in ('top', 'right'):
  ax.spines[spine].set_visible(False)

# NOTE(review): show_plot is defined outside this cell; presumably it applies
# the given x ticks/labels and renders the figure -- confirm against its definition.
show_plot(ticks=tick_locs, labels=tick_labels)

# --- Figure 2: daily new cases in New Zealand ---
ax = new_cases['New_Zealand'].loc['2020-04-18':'2020-09-18'].plot(
  title='Daily new COVID-19 cases in New Zealand\n(source: ECDC)'
)
ax.set_xlabel('')
ax.set_ylabel('new COVID-19 cases')

# the default ticks increment by 2.5; case counts read better
# with major ticks at multiples of 3
ax.yaxis.set_major_locator(MultipleLocator(3))

for spine in ('top', 'right'):
  ax.spines[spine].set_visible(False)

show_plot()

ImportError: cannot import name 'get_percent_formatter' from 'plotting' (/home/dusan/projects/data_analytics/plotting.py)