## Global COVID-19 Data Analysis II
The dataset is sourced from this [upstream repository](https://github.com/CSSEGISandData/COVID-19) maintained by the amazing team at [Johns Hopkins University Center for Systems Science and Engineering](https://systems.jhu.edu/) (CSSE). 
The sample dataset used in this notebook is maintained and updated by [laxmimerit](https://github.com/laxmimerit/Covid-19-Preprocessed-Dataset.git).

**With interactive Visualisations**

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [2]:
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import folium
%matplotlib inline

import os
import math
import random
from datetime import datetime

#color palette (cnf = Confirmed, dth = Deaths, rec = Recovered, act = Active)
#Use color picker to select any color of your choice

# Hex colours for the four case categories used by the plots in this notebook.
cnf = '#393e46'   # confirmed
dth = '#ff2e63'   # deaths
rec = '#21bf73'   # recovered
act = '#fe9801'   # active

# Initialise plotly's offline mode for use inside the notebook.
py.offline.init_notebook_mode(connected = True)

In [3]:
file_path = './Covid-19-Preprocessed-Dataset/preprocessed/country_daywise.csv'

In [4]:
# Load the per-country, per-day table, report its (rows, columns) shape
# and preview five random rows.
country_daywise = pd.read_csv(file_path)
print(f'Size: {country_daywise.shape}')
country_daywise.sample(5)

Size: (148302, 9)


Unnamed: 0,Date,Country,Confirmed,Deaths,Recovered,Active,New Cases,New Recovered,New Deaths
90430,2021-07-26,Mongolia,157974,782,154172,3020,1249,0,0
92346,2020-08-29,Mozambique,3760,22,2078,1660,63,23,1
132651,2020-04-10,Timor-Leste,2,0,1,1,1,1,0
128129,2020-03-13,Switzerland,1139,11,4,1124,487,0,1
113104,2020-01-28,San Marino,0,0,0,0,0,0,0


In [5]:
# Inspect the Python type of the first Date value as loaded from the CSV.
print(type(country_daywise.Date[0]))

<class 'str'>


In [6]:
# Convert the Date column to datetime.date objects in one vectorised pass
# instead of calling pd.to_datetime once per row.
# astype(str) lets the cell be re-run after the column has already been
# converted; split()[0] keeps only the date part if a time part follows it.
date_text = country_daywise.Date.astype(str).str.split().str[0]
country_daywise.Date = pd.to_datetime(date_text, format = '%Y-%m-%d').dt.date

In [7]:
# Confirm the conversion: show the type and value of the first Date entry.
print(type(country_daywise.Date[0]))
print(country_daywise.Date[0])

<class 'datetime.date'>
2020-01-23


#### Confirmed Cases with Choropleth Map

In [23]:
# fig = px.choropleth(country_daywise, locations = 'Country', locationmode = 'country names', 
#                     color = np.log(country_daywise.Confirmed), hover_name = 'Country', 
#                     hover_data = ['Confirmed'],
#                     animation_frame = country_daywise.Date.astype(str),
#                     title = 'Cases over time', color_continuous_scale = px.colors.sequential.Inferno)
# fig.update(layout_coloraxis_showscale = True)
# fig.show()

You can slide the animation to any particular date to see the **color code** indicating the **number of confirmed cases** for that day. Lighter colors indicate more confirmed cases while darker colors indicate fewer confirmed cases. The **US, UK, India, Brazil** and **Russia** stand out as countries with a high number of confirmed cases.
#### Confirmed and Death Cases with Static colormap

In [None]:
# fig_c = px.choropleth(country_daywise, locations = 'Country', locationmode = 'country names',
#                      color = np.log(country_daywise.Confirmed), hover_name = 'Country', hover_data = ['Confirmed'])

# temp = country_daywise[country_daywise.Deaths > 0]
# fig_d = px.choropleth(temp, locations = 'Country', locationmode = 'country names',
#                      color = np.log(temp['Deaths']), hover_name = 'Country', hover_data = ['Deaths'])

# fig = make_subplots(rows = 1, cols = 2, subplot_titles = ['Confirmed', 'Deaths'], 
#                     specs = [[{'type': 'choropleth'}, {'type': 'choropleth'}]])

# fig.add_trace(fig_c['data'][0], row = 1, col = 1)
# fig.add_trace(fig_d['data'][0], row = 1, col = 2)
# fig.update(layout_coloraxis_showscale = False)   # remove color bar
# fig.show()

### Deaths and Recoveries per 100 Cases

In [15]:
# Load the day-level table, report its (rows, columns) shape
# and preview five random rows.
file_path = './Covid-19-Preprocessed-Dataset/preprocessed/daywise.csv'
daywise = pd.read_csv(file_path)
print(f'Size: {daywise.shape}')
daywise.sample(5)

Size: (749, 10)


Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active,New Cases,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of Countries
340,2020-12-28,81523538,1836853,46016135,33670550,500751,2.25,56.45,3.99,191
497,2021-06-03,172660276,3705153,109701213,59253910,480163,2.15,63.54,3.38,193
404,2021-03-02,115188451,2627557,64903491,47657403,388379,2.28,56.35,4.05,192
409,2021-03-07,117326721,2674252,66084362,48568107,373549,2.28,56.33,4.05,192
206,2020-08-16,21710625,810061,13678579,7221985,210566,3.73,63.0,5.92,187


In [30]:
# fig_c = px.bar(daywise, x = 'Date', y = 'Confirmed', color_discrete_sequence = [act])
# fig_d = px.bar(daywise, x = 'Date', y = 'Deaths', color_discrete_sequence = [dth])

# fig = make_subplots(rows = 1, cols = 2, shared_xaxes = False, horizontal_spacing = 0.1, 
#                     subplot_titles = ('Confirmed Cases', 'Deaths Cases'))
# fig.add_trace(fig_c['data'][0], row = 1, col = 1)
# fig.add_trace(fig_d['data'][0], row = 1, col = 2)
# fig.update_layout(height = 400)
# fig.show()

As can be seen from the plots, the **confirmed cases** seem to be on a steady rise. This indicates that the virus might still be around for some time, and people are still dying from the virus at a very high rate!
#### Variation in Deaths and Recoveries per 100 Cases

In [31]:
# fig1 = px.line(daywise, x = 'Date', y = 'Deaths / 100 Cases', color_discrete_sequence = [dth])
# fig2 = px.line(daywise, x = 'Date', y = 'Recovered / 100 Cases', color_discrete_sequence = [rec])
# fig3 = px.line(daywise, x = 'Date', y = 'Deaths / 100 Recovered', color_discrete_sequence = [act])

# fig = make_subplots(rows = 1, cols = 3, shared_xaxes = False, 
#                     subplot_titles = ('Deaths / 100 Cases', 'Recovered / 100 Cases', 'Death / 100 Recovered'))

# fig.add_trace(fig1['data'][0], row = 1, col = 1)
# fig.add_trace(fig2['data'][0], row = 1, col = 2)
# fig.add_trace(fig3['data'][0], row = 1, col = 3)

# fig.update_layout(height = 400)

# fig.show()

#### New Cases per Day and No. of Countries

In [32]:
# fig_c = px.bar(daywise, x = 'Date', y = 'New Cases', color_discrete_sequence = [act])
# fig_d = px.bar(daywise, x = 'Date', y = 'No. of Countries', color_discrete_sequence = [dth])

# fig = make_subplots(rows = 1, cols = 2, shared_xaxes = False, subplot_titles = ('No. New Cases per Day', 'No. of Countries'), 
#                    horizontal_spacing = 0.1)

# fig.add_trace(fig_c['data'][0], row = 1, col = 1)
# fig.add_trace(fig_d['data'][0], row = 1, col = 2)
# fig.show()

#### Top 15 Countries Case Analysis

In [35]:
# Load the per-country summary table, report its (rows, columns) shape
# and preview five random rows.
file_path = './Covid-19-Preprocessed-Dataset/preprocessed/countrywise.csv'
countrywise = pd.read_csv(file_path)
print(f'Size: {countrywise.shape}')
countrywise.sample(5)

Size: (198, 14)


Unnamed: 0,Country,Confirmed,Deaths,Recovered,Active,New Cases,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Population,Cases / Million People,Confirmed last week,1 week change,1 week % increase
106,MS Zaandam,9,2,0,7,0,22.22,0.0,0.0,0,90599.186528,9,0,0.0
55,El Salvador,135109,3967,0,131142,0,2.94,0.0,0.0,6486201,20830.0,135109,0,0.0
112,Malta,69533,572,0,68961,172,0.82,0.0,0.0,441539,157479.0,68193,1340,1.97
33,Canada,3159212,35114,0,3124098,10806,1.11,0.0,0.0,76492216,41301.0,3081616,77596,2.52
109,Malaysia,2956332,32065,0,2924267,17134,1.08,0.0,0.0,64998698,45483.0,2882060,74272,2.58


##### Uncomment and run the code below to see the plots

In [75]:
# top = 15
# fig_c = px.bar(countrywise.sort_values('Confirmed').tail(top), x = 'Confirmed', y = 'Country', 
#                text = 'Confirmed', orientation = 'h', color_discrete_sequence = [cnf])

# fig_d = px.bar(countrywise.sort_values('Deaths').tail(top), x = 'Deaths', y = 'Country', 
#                text = 'Deaths', orientation = 'h', color_discrete_sequence = [dth])


# fig = make_subplots(rows = 1, cols = 2, shared_xaxes = False, horizontal_spacing = 0.14, 
#                      subplot_titles = ('Confirmed Cases', 'Deaths Cases'))

# fig.add_trace(fig_c['data'][0], row = 1, col = 1)
# fig.add_trace(fig_d['data'][0], row = 1, col = 2)

# fig.update_layout(height = 500)
# fig.show()

In [74]:
# top = 15

# fig_r = px.bar(countrywise.sort_values('Recovered').tail(top), x = 'Recovered', y = 'Country', 
#                text = 'Recovered', orientation = 'h', color_discrete_sequence = [rec])

# fig_a = px.bar(countrywise.sort_values('Active').tail(top), x = 'Active', y = 'Country', 
#                text = 'Active', orientation = 'h', color_discrete_sequence = [act])


# fig = make_subplots(rows = 5, cols = 2, shared_xaxes = False, horizontal_spacing = 0.14, 
#                      subplot_titles = ('Recovered Cases', 'Active Cases'))

# fig = make_subplots(rows = 1, cols = 2, shared_xaxes = False, horizontal_spacing = 0.14, 
#                      subplot_titles = ('Recovered Cases', 'Active Cases'))

# fig.add_trace(fig_r['data'][0], row = 1, col = 1)
# fig.add_trace(fig_a['data'][0], row = 1, col = 2)

# fig.update_layout(height = 500)
# fig.show()

In [73]:
# top = 15

# fig_dc = px.bar(countrywise.sort_values('Deaths / 100 Cases').tail(top), x = 'Deaths / 100 Cases', y = 'Country', 
#                text = 'Deaths / 100 Cases', orientation = 'h', color_discrete_sequence = ['#a83832'])

# fig_rc = px.bar(countrywise.sort_values('Recovered / 100 Cases').tail(top), x = 'Recovered / 100 Cases', y = 'Country', 
#                text = 'Recovered / 100 Cases', orientation = 'h', color_discrete_sequence = ['#60a832'])


# fig = make_subplots(rows = 1, cols = 2, shared_xaxes = False, horizontal_spacing = 0.14, 
#                      subplot_titles = ('Deaths / 100 Cases', 'Recovered / 100 Cases'))

# fig.add_trace(fig_dc['data'][0], row = 1, col = 1)
# fig.add_trace(fig_rc['data'][0], row = 1, col = 2)

# fig.update_layout(height = 500)
# fig.show()

In [72]:
# top = 15

# fig_nc = px.bar(countrywise.sort_values('New Cases').tail(top), x = 'New Cases', y = 'Country', 
#                text = 'New Cases', orientation = 'h', color_discrete_sequence = ['#a8323a'])

# temp = countrywise[countrywise.Population > 1000000]     # let's consider only with population more than 1 million
# fig_cm = px.bar(temp.sort_values('Cases / Million People').tail(top), x = 'Cases / Million People', y = 'Country', 
#                text = 'Cases / Million People', orientation = 'h', color_discrete_sequence = ['#94a832'])


# fig = make_subplots(rows = 1, cols = 2, shared_xaxes = False, horizontal_spacing = 0.14, 
#                      subplot_titles = ('New Cases', 'Cases / Million People'))

# fig.add_trace(fig_nc['data'][0], row = 1, col = 1)
# fig.add_trace(fig_cm['data'][0], row = 1, col = 2)

# fig.update_layout(height = 500)
# fig.show()

In [71]:
# top = 15

# fig_week1 = px.bar(countrywise.sort_values('1 week change').tail(top), x = '1 week change', y = 'Country', 
#                text = '1 week change', orientation = 'h', color_discrete_sequence = ['#32a877'])
# temp = countrywise[countrywise.Confirmed > 100]     # let's consider only new confirmed cases more than 100
# fig_week1p = px.bar(temp.sort_values('1 week % increase').tail(top), x = '1 week % increase', y = 'Country', 
#                text = '1 week % increase', orientation = 'h', color_discrete_sequence = ['#326da8'])


# fig = make_subplots(rows = 1, cols = 2, shared_xaxes = False, horizontal_spacing = 0.14, 
#                      subplot_titles = ('1 week change', '1 week % increase'))

# fig.add_trace(fig_week1['data'][0], row = 1, col = 1)
# fig.add_trace(fig_week1p['data'][0], row = 1, col = 2)

# fig.update_layout(height = 500)
# fig.show()

**Save the graphs**

In [None]:
## To save static plot first install plotly-orca
#conda install -c plotly plotly-orca==1.2.1 psutil requests OR conda install -c plotly plotly-orca

In [None]:
import os

# Create the folder that exported figures are written to.
# exist_ok avoids the separate exists() check (and its check-then-create
# race) and makes the cell safe to re-run.
os.makedirs('images', exist_ok = True)

In [None]:
# Export the current figure as a static PNG into the images folder.
# NOTE: `fig` must already exist from a previously executed plotting cell;
# the plotting cells above this point are commented out by default.
fig.write_image('images/fig.png')
#fig.write_image('images/fig.jpeg')
#fig.write_image('images/fig.pdf')

#### Scatter Plot for Deaths vs Confirmed Cases

In [70]:
# top = 15
# # fig = px.scatter(country_daywise.sort_values('Deaths', ascending = False).iloc[:15, :])
# fig = px.scatter(countrywise.sort_values('Deaths', ascending = False).head(top), 
#                 x = 'Confirmed', y = 'Deaths', color = 'Country', size = 'Confirmed', height = 700, 
#                 text = 'Country', log_x = True, log_y = True, title = 'Death vs Confirmed Cases (Cases are in log10 scale)')
# fig.update_traces(textposition = 'top center')
# fig.update_layout(showlegend = False)
# fig.update_layout(xaxis_rangeslider_visible = True)
# fig.show()

Comparing the number of **deaths** with the number of **confirmed cases**, we can see that the **US** is the worst-hit country, followed by **Brazil** and **India** (*at the time of this analysis*).
#### Bar plots of Confirmed, Deaths, Recovered and New Cases vs Country and Date

In [66]:
# fig = px.bar(country_daywise, x = 'Date', y = 'Confirmed', color = 'Country', height = 600, 
#             title = 'Confirmed', color_discrete_sequence = px.colors.cyclical.mygbm)
# fig.show()

We can see how each country has contributed to the number of **confirmed cases**.

In [67]:
# fig = px.bar(country_daywise, x = 'Date', y = 'Deaths', color = 'Country', height = 600, 
#             title = 'Deaths', color_discrete_sequence = px.colors.cyclical.mygbm)
# fig.show()

We can see how each country has contributed to the number of **deaths**.

In [68]:
# fig = px.bar(country_daywise, x = 'Date', y = 'Recovered', color = 'Country', height = 600, 
#             title = 'Recovered', color_discrete_sequence = px.colors.cyclical.mygbm)
# fig.show()

We can see how each country has contributed to the number of **recovered cases**.

In [89]:
# fig = px.bar(country_daywise.sort_values('New Cases', ascending = False), x = 'Date', y = 'New Cases', 
#              color = 'Country', height = 600, title = 'New Cases', color_discrete_sequence = px.colors.cyclical.mygbm)
# fig.show()

We can see how each country has contributed to the number of **new cases**.
#### Line plot

In [80]:
# fig = px.line(country_daywise.sort_values('Confirmed', ascending = False), x = 'Date', y = 'Confirmed', 
#               color = 'Country', height = 600, title = 'Confirmed', color_discrete_sequence = px.colors.cyclical.mygbm)

# fig.show()

In [85]:
# fig = px.line(country_daywise.sort_values('Deaths', ascending = False), x = 'Date', y = 'Deaths', 
#               color = 'Country', height = 600, title = 'Deaths', color_discrete_sequence = px.colors.cyclical.mygbm)

# fig.show()

In [87]:
# fig = px.line(country_daywise.sort_values('Recovered', ascending = False), x = 'Date', y = 'Recovered', 
#               color = 'Country', height = 600, title = 'Recovered', color_discrete_sequence = px.colors.cyclical.mygbm)

# fig.show()

## Growth Rate after `100, 1.000, 10.000, 100.000 and 1.000.000` Confirmed Cases
#### Growth Rate after 100 Cases

In [150]:
# Countries that reported more than 100 cumulative confirmed cases on at least one day.
over_100 = country_daywise['Confirmed'] > 100
gt_100 = country_daywise.loc[over_100, 'Country'].unique()
print(f'Number of countries with more than 100 Cases: {len(gt_100)}')

Number of countries with more than 100 Cases: 190


In [115]:
file_path = './Covid-19-Preprocessed-Dataset/preprocessed/covid_19_data_cleaned.csv'

# Province-level table: rename the province column so it can be used as an
# attribute, and replace missing province names with an empty string.
df = pd.read_csv(file_path)
df.rename(columns = {'Province/State': 'ProvinceState'}, inplace = True)
df['ProvinceState'] = df['ProvinceState'].fillna('')

# Keep only rows for countries that passed the 100-case mark.
in_gt_100 = df['Country'].isin(gt_100)
confirmed_gt_100 = df.loc[in_gt_100]
confirmed_gt_100.sample(5)

Unnamed: 0,Date,ProvinceState,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
47364,2022-01-31,Fujian,China,26.0789,117.9874,1455,0,1,1454
203821,2020-02-20,,United Kingdom,55.3781,-3.436,22,8,0,14
179596,2021-10-03,,Spain,40.463667,-3.74922,4961128,0,86463,4874665
195464,2021-12-18,,United Arab Emirates,23.424076,53.847818,743586,0,2151,741435
130411,2020-12-02,,Luxembourg,49.8153,6.1296,35802,26497,334,8971


In [116]:
# Country-level daily totals (provinces summed), keeping only the days on
# which the total is above 100; the index is flattened back into columns.
daily_totals = confirmed_gt_100.groupby(['Country', 'Date'])['Confirmed'].sum()
confirmed_by_country_date = daily_totals[daily_totals > 100].reset_index()
confirmed_by_country_date.head()

Unnamed: 0,Country,Date,Confirmed
0,Afghanistan,2020-03-28,106
1,Afghanistan,2020-03-29,114
2,Afghanistan,2020-03-30,114
3,Afghanistan,2020-03-31,166
4,Afghanistan,2020-04-01,192


In [117]:
# Earliest date per country among the rows kept above.
earliest = confirmed_by_country_date.groupby('Country')['Date'].min()
min_date = earliest.rename('Min_Date').reset_index()
min_date.head()

Unnamed: 0,Country,Min_Date
0,Afghanistan,2020-03-28
1,Albania,2020-03-23
2,Algeria,2020-03-21
3,Andorra,2020-03-22
4,Angola,2020-06-10


In [118]:
# Attach each country's starting date to every one of its rows.
from_100th_case = confirmed_by_country_date.merge(min_date, on = 'Country')
from_100th_case.head()

Unnamed: 0,Country,Date,Confirmed,Min_Date
0,Afghanistan,2020-03-28,106,2020-03-28
1,Afghanistan,2020-03-29,114,2020-03-28
2,Afghanistan,2020-03-30,114,2020-03-28
3,Afghanistan,2020-03-31,166,2020-03-28
4,Afghanistan,2020-04-01,192,2020-03-28


**Change the columns containing dates to datetime objects for easy manipulation**

In [119]:
# Convert both date columns to datetime.date objects in a vectorised pass
# instead of calling pd.to_datetime once per row. astype(str) keeps the cell
# re-runnable after the columns have already been converted.
for date_col in ('Date', 'Min_Date'):
    date_text = from_100th_case[date_col].astype(str).str.split().str[0]
    from_100th_case[date_col] = pd.to_datetime(date_text, format = '%Y-%m-%d').dt.date

In [None]:
# Time elapsed between each row's date and the country's starting date.
from_100th_case['Num_Days'] = from_100th_case['Date'] - from_100th_case['Min_Date']
from_100th_case.sample(5)

In [126]:
# Reduce each elapsed-time value to its whole number of days.
from_100th_case['Num_Days'] = from_100th_case['Num_Days'].apply(lambda delta: delta.days)
from_100th_case.head()

Unnamed: 0,Country,Date,Confirmed,Min_Date,Num_Days
0,Afghanistan,2020-03-28,106,2020-03-28,0
1,Afghanistan,2020-03-29,114,2020-03-28,1
2,Afghanistan,2020-03-30,114,2020-03-28,2
3,Afghanistan,2020-03-31,166,2020-03-28,3
4,Afghanistan,2020-04-01,192,2020-03-28,4


##### Uncomment and run the code to see the plots for each case

In [129]:
# fig = px.line(from_100th_case.sort_values('Confirmed', ascending = False), x = 'Num_Days', y = 'Confirmed', 
#               color = 'Country', title = 'Number of Days from 100 Case', height = 600)
# fig.show()

**NOTE:** You can click on each legend to disable or remove the plot from the graph.
#### Growth Rate after 1000 Cases

In [151]:
# Countries that reported more than 1000 cumulative confirmed cases on at least one day.
over_1000 = country_daywise['Confirmed'] > 1000
gt_1000 = country_daywise.loc[over_1000, 'Country'].unique()
print(f'Number of countries with more than 1000 Cases: {len(gt_1000)}')

# Province-level rows restricted to those countries.
confirmed_gt_1000 = df.loc[df['Country'].isin(gt_1000)]
confirmed_gt_1000.sample(5)

Number of countries with more than 1000 Cases: 187


Unnamed: 0,Date,ProvinceState,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
188592,2021-09-05,,Timor-Leste,-8.874217,125.727539,17618,0,80,17538
194517,2021-06-06,,Ukraine,48.3794,31.1656,2273708,2148010,53258,72440
100324,2020-11-25,,Gambia,13.4432,-15.3101,3727,3587,123,17
15194,2020-06-24,,Bahrain,26.0275,50.55,23570,17977,69,5524
97490,2021-05-18,St Martin,France,18.0708,-63.0501,1839,1399,14,426


In [131]:
# Country-level daily totals (provinces summed), keeping only the days on
# which the total is above 1000; the index is flattened back into columns.
daily_totals = confirmed_gt_1000.groupby(['Country', 'Date'])['Confirmed'].sum()
confirmed_by_country_date = daily_totals[daily_totals > 1000].reset_index()
confirmed_by_country_date.head()

Unnamed: 0,Country,Date,Confirmed
0,Afghanistan,2020-04-21,1026
1,Afghanistan,2020-04-22,1092
2,Afghanistan,2020-04-23,1176
3,Afghanistan,2020-04-24,1226
4,Afghanistan,2020-04-25,1330


In [132]:
# Earliest date per country among the rows kept above.
earliest = confirmed_by_country_date.groupby('Country')['Date'].min()
min_date = earliest.rename('Min_Date').reset_index()
min_date.head()

Unnamed: 0,Country,Min_Date
0,Afghanistan,2020-04-21
1,Albania,2020-05-25
2,Algeria,2020-04-03
3,Andorra,2020-08-17
4,Angola,2020-07-29


In [133]:
# Attach each country's starting date to every one of its rows.
from_1000th_case = confirmed_by_country_date.merge(min_date, on = 'Country')
from_1000th_case.head()

Unnamed: 0,Country,Date,Confirmed,Min_Date
0,Afghanistan,2020-04-21,1026,2020-04-21
1,Afghanistan,2020-04-22,1092,2020-04-21
2,Afghanistan,2020-04-23,1176,2020-04-21
3,Afghanistan,2020-04-24,1226,2020-04-21
4,Afghanistan,2020-04-25,1330,2020-04-21


In [134]:
# Convert both date columns to datetime.date objects in a vectorised pass
# instead of calling pd.to_datetime once per row. astype(str) keeps the cell
# re-runnable after the columns have already been converted.
for date_col in ('Date', 'Min_Date'):
    date_text = from_1000th_case[date_col].astype(str).str.split().str[0]
    from_1000th_case[date_col] = pd.to_datetime(date_text, format = '%Y-%m-%d').dt.date

In [137]:
# Time elapsed between each row's date and the country's starting date.
from_1000th_case['Num_Days'] = from_1000th_case['Date'] - from_1000th_case['Min_Date']
from_1000th_case.sample(5)

Unnamed: 0,Country,Date,Confirmed,Min_Date,Num_Days
98910,Tanzania,2021-10-22,26034,2021-07-29,85 days
81213,Portugal,2020-04-18,19685,2020-03-20,29 days
95755,Suriname,2021-09-17,36428,2020-07-18,426 days
66924,Mexico,2022-02-11,5226269,2020-03-30,683 days
67594,Moldova,2022-02-05,464234,2020-04-07,669 days


In [138]:
# Reduce each elapsed-time value to its whole number of days.
from_1000th_case['Num_Days'] = from_1000th_case['Num_Days'].apply(lambda delta: delta.days)
from_1000th_case.head()

Unnamed: 0,Country,Date,Confirmed,Min_Date,Num_Days
0,Afghanistan,2020-04-21,1026,2020-04-21,0
1,Afghanistan,2020-04-22,1092,2020-04-21,1
2,Afghanistan,2020-04-23,1176,2020-04-21,2
3,Afghanistan,2020-04-24,1226,2020-04-21,3
4,Afghanistan,2020-04-25,1330,2020-04-21,4


In [144]:
# fig = px.line(from_1000th_case.sort_values('Confirmed', ascending = False), x = 'Num_Days', y = 'Confirmed', 
#               color = 'Country', title = 'Number of Days from 1000 Case', height = 600)
# fig.show()

### Growth Rate after 10.000 Cases

In [140]:
# Countries that reported more than 10000 cumulative confirmed cases on at least one day.
over_10000 = country_daywise['Confirmed'] > 10000
gt_10000 = country_daywise.loc[over_10000, 'Country'].unique()
confirmed_gt_10000 = df.loc[df['Country'].isin(gt_10000)]

# Country-level daily totals (provinces summed), restricted to days above the threshold.
daily_totals = confirmed_gt_10000.groupby(['Country', 'Date'])['Confirmed'].sum()
confirmed_by_country_date = daily_totals[daily_totals > 10000].reset_index()

# Earliest such day per country, attached to every row of that country.
earliest = confirmed_by_country_date.groupby('Country')['Date'].min()
min_date = earliest.rename('Min_Date').reset_index()
from_10000th_case = confirmed_by_country_date.merge(min_date, on = 'Country')
from_10000th_case.head()

Unnamed: 0,Country,Date,Confirmed,Min_Date
0,Afghanistan,2020-05-24,10668,2020-05-24
1,Afghanistan,2020-05-25,11180,2020-05-24
2,Afghanistan,2020-05-26,11917,2020-05-24
3,Afghanistan,2020-05-27,12465,2020-05-24
4,Afghanistan,2020-05-28,13102,2020-05-24


In [152]:
# Report how many countries passed the 10,000-case mark.
print(f'Number of countries with more than 10.000 Cases: {len(gt_10000)}')

Number of countries with more than 10.000 Cases: 171


In [141]:
# Convert both date columns to datetime.date objects in a vectorised pass
# instead of calling pd.to_datetime once per row. astype(str) keeps the cell
# re-runnable after the columns have already been converted.
for date_col in ('Date', 'Min_Date'):
    date_text = from_10000th_case[date_col].astype(str).str.split().str[0]
    from_10000th_case[date_col] = pd.to_datetime(date_text, format = '%Y-%m-%d').dt.date

In [142]:
# Whole days elapsed since the country's starting date.
from_10000th_case['Num_Days'] = from_10000th_case['Date'] - from_10000th_case['Min_Date']
from_10000th_case['Num_Days'] = from_10000th_case['Num_Days'].apply(lambda delta: delta.days)
from_10000th_case.head()

Unnamed: 0,Country,Date,Confirmed,Min_Date,Num_Days
0,Afghanistan,2020-05-24,10668,2020-05-24,0
1,Afghanistan,2020-05-25,11180,2020-05-24,1
2,Afghanistan,2020-05-26,11917,2020-05-24,2
3,Afghanistan,2020-05-27,12465,2020-05-24,3
4,Afghanistan,2020-05-28,13102,2020-05-24,4


In [145]:
# fig = px.line(from_10000th_case.sort_values('Confirmed', ascending = False), x = 'Num_Days', y = 'Confirmed', 
#               color = 'Country', title = 'Number of Days from 10000 Case', height = 600)
# fig.show()

### Growth Rate after 100.000 Cases

In [146]:
# Countries that reported more than 100000 cumulative confirmed cases on at least one day.
over_100000 = country_daywise['Confirmed'] > 100000
gt_100000 = country_daywise.loc[over_100000, 'Country'].unique()
confirmed_gt_100000 = df.loc[df['Country'].isin(gt_100000)]

# Country-level daily totals (provinces summed), restricted to days above the threshold.
daily_totals = confirmed_gt_100000.groupby(['Country', 'Date'])['Confirmed'].sum()
confirmed_by_country_date = daily_totals[daily_totals > 100000].reset_index()

# Earliest such day per country, attached to every row of that country.
earliest = confirmed_by_country_date.groupby('Country')['Date'].min()
min_date = earliest.rename('Min_Date').reset_index()
from_100000th_case = confirmed_by_country_date.merge(min_date, on = 'Country')
from_100000th_case.head()

Unnamed: 0,Country,Date,Confirmed,Min_Date
0,Afghanistan,2021-06-19,100521,2021-06-19
1,Afghanistan,2021-06-20,101906,2021-06-19
2,Afghanistan,2021-06-21,103902,2021-06-19
3,Afghanistan,2021-06-22,105749,2021-06-19
4,Afghanistan,2021-06-23,107957,2021-06-19


In [154]:
# Report how many countries passed the 100,000-case mark.
print(f'Number of countries with more than 100.000 Cases: {len(gt_100000)}')

Number of countries with more than 100.000 Cases: 119


In [147]:
# Convert both date columns to datetime.date objects in a vectorised pass
# instead of calling pd.to_datetime once per row. astype(str) keeps the cell
# re-runnable after the columns have already been converted.
for date_col in ('Date', 'Min_Date'):
    date_text = from_100000th_case[date_col].astype(str).str.split().str[0]
    from_100000th_case[date_col] = pd.to_datetime(date_text, format = '%Y-%m-%d').dt.date

In [148]:
# Whole days elapsed since the country's starting date.
from_100000th_case['Num_Days'] = from_100000th_case['Date'] - from_100000th_case['Min_Date']
from_100000th_case['Num_Days'] = from_100000th_case['Num_Days'].apply(lambda delta: delta.days)
from_100000th_case.head()

Unnamed: 0,Country,Date,Confirmed,Min_Date,Num_Days
0,Afghanistan,2021-06-19,100521,2021-06-19,0
1,Afghanistan,2021-06-20,101906,2021-06-19,1
2,Afghanistan,2021-06-21,103902,2021-06-19,2
3,Afghanistan,2021-06-22,105749,2021-06-19,3
4,Afghanistan,2021-06-23,107957,2021-06-19,4


In [156]:
# fig = px.line(from_100000th_case.sort_values('Confirmed', ascending = False), x = 'Num_Days', y = 'Confirmed', 
#               color = 'Country', title = 'Number of Days from 100.000 Case', height = 600)
# fig.show()

### Growth Rate after 1.000.000 Cases

In [157]:
# Countries that reported more than 1000000 cumulative confirmed cases on at least one day.
over_1000000 = country_daywise['Confirmed'] > 1000000
gt_1000000 = country_daywise.loc[over_1000000, 'Country'].unique()
print(f'Number of countries with more than 1.000.000 Cases: {len(gt_1000000)}')
confirmed_gt_1000000 = df.loc[df['Country'].isin(gt_1000000)]

# Country-level daily totals (provinces summed), restricted to days above the threshold.
daily_totals = confirmed_gt_1000000.groupby(['Country', 'Date'])['Confirmed'].sum()
confirmed_by_country_date = daily_totals[daily_totals > 1000000].reset_index()

# Earliest such day per country, attached to every row of that country.
earliest = confirmed_by_country_date.groupby('Country')['Date'].min()
min_date = earliest.rename('Min_Date').reset_index()
from_1000000th_case = confirmed_by_country_date.merge(min_date, on = 'Country')
from_1000000th_case.head()

Number of countries with more than 1.000.000 Cases: 52


Unnamed: 0,Country,Date,Confirmed,Min_Date
0,Argentina,2020-10-19,1002662,2020-10-19
1,Argentina,2020-10-20,1018999,2020-10-19
2,Argentina,2020-10-21,1037325,2020-10-19
3,Argentina,2020-10-22,1053650,2020-10-19
4,Argentina,2020-10-23,1069368,2020-10-19


In [158]:
# Convert both date columns to datetime.date objects in a vectorised pass
# instead of calling pd.to_datetime once per row. astype(str) keeps the cell
# re-runnable after the columns have already been converted.
for date_col in ('Date', 'Min_Date'):
    date_text = from_1000000th_case[date_col].astype(str).str.split().str[0]
    from_1000000th_case[date_col] = pd.to_datetime(date_text, format = '%Y-%m-%d').dt.date

In [159]:
# Whole days elapsed since the country's starting date.
from_1000000th_case['Num_Days'] = from_1000000th_case['Date'] - from_1000000th_case['Min_Date']
from_1000000th_case['Num_Days'] = from_1000000th_case['Num_Days'].apply(lambda delta: delta.days)
from_1000000th_case.head()

Unnamed: 0,Country,Date,Confirmed,Min_Date,Num_Days
0,Argentina,2020-10-19,1002662,2020-10-19,0
1,Argentina,2020-10-20,1018999,2020-10-19,1
2,Argentina,2020-10-21,1037325,2020-10-19,2
3,Argentina,2020-10-22,1053650,2020-10-19,3
4,Argentina,2020-10-23,1069368,2020-10-19,4


In [166]:
# fig = px.line(from_1000000th_case.sort_values('Confirmed', ascending = False), x = 'Num_Days', y = 'Confirmed', 
#               color = 'Country', title = 'Number of Days from 1.000.000 Case', height = 600)
# fig.show()

### Tree Map Analysis
#### Confirmed Cases

In [162]:
# Rows for the most recent date in the table. Series.max() is used instead of
# the builtin max(), which iterates over every element in Python.
full_latest = df[df.Date == df.Date.max()]        # the latest date's data
full_latest.sample(5)

Unnamed: 0,Date,ProvinceState,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
150399,2022-02-11,Cook Islands,New Zealand,-21.2367,-159.7777,2,0,0,2
113551,2022-02-11,,Iran,32.427908,53.688046,6761855,0,133437,6628418
186495,2022-02-11,,Tajikistan,38.861,71.2761,17765,0,125,17640
159423,2022-02-11,,Paraguay,-23.4425,-58.4438,620708,0,17844,602864
139871,2022-02-11,,Micronesia,7.4256,150.5508,1,0,0,1


In [163]:
# Treemap of confirmed cases on the latest date, nested by country and then
# province; tile size is driven by the 'Confirmed' column.
fig = px.treemap(full_latest.sort_values(by = 'Confirmed', ascending = False).reset_index(drop = True), 
                path = ['Country', 'ProvinceState'], values = 'Confirmed', height = 700, 
                 title = 'Number of Confirmed Cases', color_discrete_sequence = px.colors.qualitative.Dark2)
# Set what is printed on each tile of the (single) treemap trace.
fig.data[0].textinfo = 'label+text+value'
fig.show()

#### Death Cases

In [165]:
# fig = px.treemap(full_latest.sort_values(by = 'Deaths', ascending = False).reset_index(drop = True), 
#                 path = ['Country', 'ProvinceState'], values = 'Deaths', height = 700, 
#                  title = 'Number of Death Cases', color_discrete_sequence = px.colors.qualitative.Dark2)
# fig.data[0].textinfo = 'label+text+value'
# fig.show()

### First and Last Case Report Time
When were the **first** and the **last** cases reported in each **country** (*at the time of this analysis*)?

In [174]:
# Earliest date per country on which any row has a positive confirmed count.
first_date = df.loc[df['Confirmed'] > 0]
first_date = first_date.groupby('Country')['Date'].min().to_frame('min').reset_index()
first_date.sample(5)

Unnamed: 0,Country,min
104,Lithuania,2020-02-29
89,Jordan,2020-03-03
193,West Bank and Gaza,2020-03-05
154,Senegal,2020-03-02
150,Samoa,2020-11-18


In [175]:
# Latest date per country on which any row has a positive confirmed count.
last_date = df.loc[df['Confirmed'] > 0]
last_date = last_date.groupby('Country')['Date'].max().to_frame('max').reset_index()
last_date.sample(5)

Unnamed: 0,Country,max
189,Uzbekistan,2022-02-11
34,Central African Republic,2022-02-11
175,Tanzania,2022-02-11
153,Saudi Arabia,2022-02-11
59,Eswatini,2022-02-11


**Latest date any case (`Confirmed, Deaths, Recovered`) was reported by a country** 

In [176]:
# Country-level daily totals, then row-to-row differences (day-over-day change).
# The columns are selected with a list: selecting several columns with a bare
# tuple (['Confirmed', 'Deaths', 'Recovered'] without the inner brackets) was
# deprecated and is rejected by pandas 2.x.
# diff() runs over the whole table, so the first row of each country is
# differenced against the previous country's last row and must be discarded
# before the values are used.
case_columns = ['Confirmed', 'Deaths', 'Recovered']
latest_date = df.groupby(['Country', 'Date'])[case_columns].sum().diff().reset_index()
latest_date.sample(5)

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered
110726,Saint Kitts and Nevis,2020-07-22,0.0,0.0,0.0
82832,Maldives,2020-05-13,51.0,1.0,11.0
47670,France,2020-11-11,36406.0,328.0,1776.0
48848,Gabon,2022-01-11,0.0,0.0,0.0
114998,Sao Tome and Principe,2021-12-16,2.0,0.0,0.0


In [177]:
# True on the first row of each country (the previous row belongs to a
# different country, or there is no previous row).
mask = latest_date['Country'].ne(latest_date['Country'].shift(1))

# Those rows hold differences taken across two countries, so blank them out.
latest_date.loc[mask, ['Confirmed', 'Deaths', 'Recovered']] = np.nan
latest_date.sample(5)

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered
100501,Oman,2021-05-21,0.0,0.0,0.0
82634,Malaysia,2021-11-18,6380.0,55.0,0.0
95426,Netherlands,2021-11-26,21349.0,59.0,0.0
6486,Armenia,2021-05-06,425.0,14.0,714.0
50989,Germany,2021-09-18,7103.0,36.0,0.0


In [178]:
# Keep only the days on which the confirmed count increased.
has_new_cases = latest_date['Confirmed'] > 0
latest_date = latest_date.loc[has_new_cases]
latest_date.sample(5)

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered
66869,Japan,2021-12-15,159.0,1.0,0.0
114274,San Marino,2022-01-13,180.0,0.0,0.0
13366,Belgium,2021-08-26,2467.0,6.0,0.0
5367,Argentina,2020-05-04,104.0,14.0,88.0
87897,Mexico,2021-11-17,4571.0,369.0,0.0


In [179]:
# Most recent day with an increase, one row per country.
latest_date = latest_date.groupby('Country')['Date'].max().to_frame('max').reset_index()
latest_date.sample(5)

Unnamed: 0,Country,max
45,Cuba,2022-02-11
127,New Zealand,2022-02-11
108,Malawi,2022-02-11
154,Senegal,2022-02-11
50,Djibouti,2022-02-11


In [181]:
# Pair each country's first date with its latest date by matching on the
# country name. A positional pd.concat(axis = 1) only lines up correctly when
# both tables contain exactly the same countries in the same order; if one
# table lacks a country, every later row would receive another country's date.
# Countries present in only one of the two tables are dropped.
first_last = pd.merge(first_date, latest_date[['Country', 'max']], on = 'Country')
first_last.sample(5)

Unnamed: 0,Country,min,max
41,Congo (Kinshasa),2020-03-11,2022-02-11
140,Philippines,2020-01-30,2022-02-11
120,Mongolia,2020-03-10,2022-02-11
191,Venezuela,2020-03-14,2022-02-10
153,Saudi Arabia,2020-03-02,2022-02-11


In [185]:
# Convert both date columns to datetime.date objects in a vectorised pass
# instead of calling pd.to_datetime once per row. astype(str) keeps the cell
# re-runnable after the columns have already been converted.
for date_col in ('min', 'max'):
    date_text = first_last[date_col].astype(str).str.split().str[0]
    first_last[date_col] = pd.to_datetime(date_text, format = '%Y-%m-%d').dt.date

In [186]:
# Move the end date forward by one day, so that a country whose first and
# last dates coincide still gets a span of one day rather than zero.
first_last['max'] = first_last['max'] + pd.Timedelta(days = 1)      # add one day to avoid subtracting a day from itself
first_last.sample(5)

Unnamed: 0,Country,min,max
105,Luxembourg,2020-02-29,2022-02-12
149,Saint Vincent and the Grenadines,2020-03-14,2022-02-10
68,Ghana,2020-03-14,2022-02-08
81,Indonesia,2020-03-02,2022-02-12
114,Mauritania,2020-03-14,2022-02-12


**Let's calculate the number of days between first reported case and last reported case for each country**

In [187]:
# Length of each country's reporting window, in whole days.
reporting_span = first_last['max'] - first_last['min']
first_last['Days'] = reporting_span.apply(lambda span: span.days)
first_last.sample(5)

Unnamed: 0,Country,min,max,Days
49,Diamond Princess,2020-02-07,2020-03-19,41
160,Slovenia,2020-03-05,2022-02-12,709
17,Belgium,2020-02-04,2022-02-12,739
123,Mozambique,2020-03-22,2022-02-12,692
58,Estonia,2020-02-27,2022-02-12,716


In [189]:
# Prepare the column names used by the Gantt chart below: 'Task' (a copy of
# the country name), 'Start' and 'Finish'.
first_last['Task'] = first_last['Country']              # Task key is required by figure factory plot
# Earlier naming, kept for reference; it does not match the names the plot expects.
#first_last.rename(columns = {'min': 'StartDate', 'max': 'LastDate'}, inplace = True)
first_last.rename(columns = {'min': 'Start', 'max': 'Finish'}, inplace = True)     # required by figure factory plot
first_last.sample(5)

Unnamed: 0,Country,StartDate,LastDate,Days,Task
101,Liberia,2020-03-17,2022-02-11,696,Liberia
135,Palau,2021-08-22,2022-02-12,174,Palau
82,Iran,2020-02-19,2022-02-12,724,Iran
113,Marshall Islands,2020-10-28,2022-01-07,436,Marshall Islands
107,Madagascar,2020-03-20,2022-02-06,688,Madagascar


In [190]:
# Order countries from the longest reporting span to the shortest.
first_last = first_last.sort_values('Days', ascending = False)
first_last.head(5)

Unnamed: 0,Country,StartDate,LastDate,Days,Task
37,China,2020-01-22,2022-02-12,752,China
88,Japan,2020-01-22,2022-02-12,752,Japan
183,US,2020-01-22,2022-02-12,752,US
176,Thailand,2020-01-22,2022-02-12,752,Thailand
173,Taiwan*,2020-01-22,2022-02-12,752,Taiwan*


In [191]:
# Last five rows of the frame (the shortest spans once it is sorted by Days, descending).
first_last.tail(5)

Unnamed: 0,Country,StartDate,LastDate,Days,Task
49,Diamond Princess,2020-02-07,2020-03-19,41,Diamond Princess
194,Winter Olympics 2022,2022-01-23,2022-02-12,20,Winter Olympics 2022
106,MS Zaandam,2020-03-28,2020-04-02,5,MS Zaandam
5,Antarctica,2021-12-14,2021-12-15,1,Antarctica
117,Micronesia,2021-01-21,2021-01-22,1,Micronesia


#### Visualisation

In [198]:
# One random '#rrggbb' hex colour per row of first_last.
hex_digits = '0123456789abcdef'
colors = [
    '#' + ''.join(random.choice(hex_digits) for _ in range(6))
    for _ in range(len(first_last))
]

In [None]:
# PlotlyError: The columns in your dataframe must include the following keys: Task, Start, Finish
# rename() ignores labels that are not present, so this does nothing when the
# columns are already called Start / Finish.
first_last.rename(columns = dict(StartDate = 'Start', LastDate = 'Finish'), inplace = True)

In [202]:
# Disabled cell: Gantt chart with one bar per country, from Start to Finish.
# NOTE(review): Plotly documents ff.create_gantt as deprecated in favour of
# plotly.express.timeline — consider migrating if this cell is re-enabled.
# fig = ff.create_gantt(first_last, index_col = 'Country', colors = colors, 
#                       show_colorbar = False, bar_width = 0.2, showgrid_x = True, showgrid_y = True, height = 2500)
# fig.show()

### Confirmed Cases by Country and Day

In [203]:
# Peek at five random rows of the per-country daily data.
country_daywise.sample(n = 5)

Unnamed: 0,Date,Country,Confirmed,Deaths,Recovered,Active,New Cases,New Recovered,New Deaths
16887,2021-03-07,Bosnia and Herzegovina,135513,5228,117885,12400,0,0,0
121007,2021-03-16,Solomon Islands,18,0,16,2,0,0,0
49609,2020-07-16,Georgia,1006,15,883,108,2,10,0
20285,2020-03-25,Burkina Faso,146,4,10,132,32,3,0
110167,2020-03-27,Saint Kitts and Nevis,2,0,0,2,0,0,0


In [206]:
# Total confirmed cases per (country, date) pair.
daily_totals = country_daywise.groupby(['Country', 'Date'])['Confirmed'].sum()
confirmed_country_day = daily_totals.reset_index()

# Keep only the countries listed in gt_10000
# (presumably those with more than 10,000 cases — defined earlier in the notebook).
is_selected = confirmed_country_day['Country'].isin(gt_10000)
confirmed_country_day = confirmed_country_day.loc[is_selected]

# Distinct country names that survived the filter, in order of appearance.
countries = confirmed_country_day['Country'].unique()

In [210]:
# Disabled cell: grid of bar charts (3 per row), one subplot per country,
# showing confirmed cases over time.
# NOTE(review): x below passes the Date column for ALL countries while y is
# filtered to a single country, so the two have different lengths; filter x
# with the same `Country == country` mask before re-enabling this cell.
# ncols = 3
# nrows = math.ceil(len(countries) / ncols)

# fig = make_subplots(rows = nrows, cols = ncols, shared_xaxes = False, subplot_titles = countries)

# for idx, country in enumerate(countries):
#     row = int((idx/ncols) + 1)
#     col = int((idx % ncols) + 1)
#     fig.add_trace(go.Bar(x = confirmed_country_day.Date, 
#                          y = confirmed_country_day.loc[confirmed_country_day.Country == country, 'Confirmed'], 
#                          name = country), row = row, col = col)
# fig.update_layout(height = 4000, title_text = 'Confirmed Cases in each Country')
# fig.update_layout(showlegend = False)
# fig.show()

### Covid-19 vs Other Similar Epidemics
*Data from wiki*

In [215]:
# One record per outbreak: (name, start year, end year, confirmed cases, deaths).
# The COVID-19 totals are summed from full_latest; the other figures are fixed values.
epidemic_columns = ['epidemic', 'start_year', 'end_year', 'confirmed', 'deaths']
epidemic_rows = [
    ('COVID-19', 2019, 2022, full_latest.Confirmed.sum(), full_latest.Deaths.sum()),
    ('EBOLA', 2013, 2016, 28646, 11323),
    ('MERS', 2012, 2020, 2519, 866),
    ('H1N1', 2009, 2010, 6724149, 19454),
    ('SARS', 2002, 2004, 8422, 813),
]

# Transpose the records into the column -> list-of-values mapping.
epidemic_data = {
    column: list(values)
    for column, values in zip(epidemic_columns, zip(*epidemic_rows))
}

epidemics = pd.DataFrame(epidemic_data)
epidemics

Unnamed: 0,epidemic,start_year,end_year,confirmed,deaths
0,COVID-19,2019,2022,408442142,5802000
1,EBOLA,2013,2016,28646,11323
2,MERS,2012,2020,2519,866
3,H1N1,2009,2010,6724149,19454
4,SARS,2002,2004,8422,813


In [216]:
# Deaths as a percentage of confirmed cases, rounded to two decimals.
death_share_pct = epidemics['deaths'] / epidemics['confirmed'] * 100
epidemics['mortality'] = death_share_pct.round(2)
epidemics

Unnamed: 0,epidemic,start_year,end_year,confirmed,deaths,mortality
0,COVID-19,2019,2022,408442142,5802000,1.42
1,EBOLA,2013,2016,28646,11323,39.53
2,MERS,2012,2020,2519,866,34.38
3,H1N1,2009,2010,6724149,19454,0.29
4,SARS,2002,2004,8422,813,9.65


In [217]:
# Reshape to long format: one row per (epidemic, measure) pair, with the
# measure name in 'Case' and its number in 'Value'.
measures = ['confirmed', 'deaths', 'mortality']
temp = pd.melt(epidemics, id_vars = ['epidemic'], value_vars = measures,
               var_name = 'Case', value_name = 'Value')

In [218]:
# Display the contents of temp.
temp

Unnamed: 0,epidemic,Case,Value
0,COVID-19,confirmed,408442100.0
1,EBOLA,confirmed,28646.0
2,MERS,confirmed,2519.0
3,H1N1,confirmed,6724149.0
4,SARS,confirmed,8422.0
5,COVID-19,deaths,5802000.0
6,EBOLA,deaths,11323.0
7,MERS,deaths,866.0
8,H1N1,deaths,19454.0
9,SARS,deaths,813.0


In [220]:
# One bar-chart facet per measure in 'Case', one bar per epidemic,
# with each bar labelled by its value.
fig = px.bar(temp, x = 'epidemic', y = 'Value', color = 'epidemic', text = 'Value', facet_col = 'Case', 
            color_discrete_sequence = px.colors.qualitative.Bold)
fig.update_traces(textposition = 'outside')
fig.update_layout(uniformtext_minsize = 8, uniformtext_mode = 'hide')
# Hide the tick labels and give every facet its own y-scale, because the
# measures differ by orders of magnitude. update_yaxes applies to every y-axis
# in the figure, so this keeps working if the number of facets changes
# (hard-coding layout.yaxis2 / layout.yaxis3 does not).
fig.update_yaxes(showticklabels = False, matches = None)
fig.show()

As we can see, **COVID-19** has the highest number of **confirmed cases and deaths**, but a very **low mortality rate** compared with **EBOLA, MERS** and **SARS**.