In [4]:
#Libraries to import
import pandas as pd
import numpy as np
import datetime as dt
import requests
import sys
from itertools import chain
import pycountry
import pycountry_convert as pc
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline

In [5]:
df = pd.read_csv(r'C:\Users\karim\Desktop\covid_19_data.csv\covid_19_data.csv')

In [6]:
df.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109382 entries, 0 to 109381
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   SNo              109382 non-null  int64  
 1   ObservationDate  109382 non-null  object 
 2   Province/State   75706 non-null   object 
 3   Country/Region   109382 non-null  object 
 4   Last Update      109382 non-null  object 
 5   Confirmed        109382 non-null  float64
 6   Deaths           109382 non-null  float64
 7   Recovered        109382 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 6.7+ MB


In [8]:
df.rename(columns={'ObservationDate':'Date','Province/State':'Province_State',
                   'Country/Region':'Country_Region','Confirmed':'ConfirmedCases',
                   'Deaths':'Fatalities'},inplace=True)
df.loc[df['Country_Region']=='Mainland China','Country_Region']='China'
df['Date'] = pd.to_datetime(df['Date'],format='%m/%d/%Y')
df['Day'] = df['Date'].dt.dayofyear
df['cases_lag_1'] = df.groupby(['Country_Region','Province_State'])['ConfirmedCases'].shift(1)
df['deaths_lag_1'] = df.groupby(['Country_Region','Province_State'])['Fatalities'].shift(1)
df['Daily Cases'] = df['ConfirmedCases'] - df['cases_lag_1']
df['Daily Deaths'] = df['Fatalities'] - df['deaths_lag_1']

In [9]:
df.head()

Unnamed: 0,SNo,Date,Province_State,Country_Region,Last Update,ConfirmedCases,Fatalities,Recovered,Day,cases_lag_1,deaths_lag_1,Daily Cases,Daily Deaths
0,1,2020-01-22,Anhui,China,1/22/2020 17:00,1.0,0.0,0.0,22,,,,
1,2,2020-01-22,Beijing,China,1/22/2020 17:00,14.0,0.0,0.0,22,,,,
2,3,2020-01-22,Chongqing,China,1/22/2020 17:00,6.0,0.0,0.0,22,,,,
3,4,2020-01-22,Fujian,China,1/22/2020 17:00,1.0,0.0,0.0,22,,,,
4,5,2020-01-22,Gansu,China,1/22/2020 17:00,0.0,0.0,0.0,22,,,,


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109382 entries, 0 to 109381
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   SNo             109382 non-null  int64         
 1   Date            109382 non-null  datetime64[ns]
 2   Province_State  75706 non-null   object        
 3   Country_Region  109382 non-null  object        
 4   Last Update     109382 non-null  object        
 5   ConfirmedCases  109382 non-null  float64       
 6   Fatalities      109382 non-null  float64       
 7   Recovered       109382 non-null  float64       
 8   Day             109382 non-null  int32         
 9   cases_lag_1     74966 non-null   float64       
 10  deaths_lag_1    74966 non-null   float64       
 11  Daily Cases     74966 non-null   float64       
 12  Daily Deaths    74966 non-null   float64       
dtypes: datetime64[ns](1), float64(7), int32(1), int64(1), object(3)
memory usage: 10.4+ MB


# EDA
 let's perform some EDA on the data in order to get a better understanding of the data and how COVID19 is affecting all of us.

In [11]:
display(df['Date'].min())
display(df['Date'].max())

Timestamp('2020-01-22 00:00:00')

Timestamp('2020-09-13 00:00:00')

## Universal growth of COVID19 over time
In this section, I'll have a look at how COVID19 has been growing throughout the world from 22nd january 2020. I'll be using tree maps to show the share of COVID19 Cases worldwide and chloropleth maps with a time slider to show the daily impact of virus.

In [12]:
class country_utils():
    def __init__(self):
        self.d = {} 
        
    def get_dic(self):
        return self.d
    
    def get_country_details(self,country):
        try:
            country_obj = pycountry.countries.get(name=country)
            if country_obj is None:
                country_obj = pycountry.countries.search_fuzzy(country)[0]
            continent_code = pc.country_alpha2_to_continent_code(country_obj.alpha_2)
            continent = pc.convert_continent_code_to_continent_name(continent_code)
            return country_obj.alpha_3, continent
        except:
            if 'Congo' in country:
                country = 'Congo'
            elif country == 'Diamond Princess' or country == 'Laos' or country == 'MS Zaandam'\
            or country == 'Holy See' or country == 'Timor-Leste':
                return country, country
            elif country == 'Korea, South' or country == 'South Korea':
                country = 'Korea, Republic of'
            elif country == 'Taiwan*':
                country = 'Taiwan'
            elif country == 'Burma':
                country = 'Myanmar'
            elif country == 'West Bank and Gaza':
                country = 'Gaza'
            else:
                return country, country
            country_obj = pycountry.countries.search_fuzzy(country)
            continent_code = pc.country_alpha2_to_continent_code(country_obj[0].alpha_2)
            continent = pc.convert_continent_code_to_continent_name(continent_code)
            return country_obj[0].alpha_3, continent
        
    def get_iso3(self, country):
        return self.d[country]['code']
        
    def get_continent(self,country):
        return self.d[country]['continent']
    
    def add_values(self,country):
        self.d[country] = {}
        self.d[country]['code'],self.d[country]['continent'] = self.get_country_details(country)
        
    def fetch_iso3(self,country):
        if country in self.d.keys():
            return self.get_iso3(country)
        else:
            self.add_values(country)
            return self.get_iso3(country)
        
    def fetch_continent(self,country):
        if country in self.d.keys():
            return self.get_continent(country)
        else:
            self.add_values(country)
            return self.get_continent(country)

In [35]:
df.ConfirmedCases = np.abs(df.ConfirmedCases)
df_tm = df.copy()
date = df_tm.Date.max() #get current date
df_tm = df_tm[df_tm['Date']==date]
obj = country_utils()
df_tm.Province_State.fillna('',inplace=True)
df_tm['continent'] = df_tm.apply(lambda x: obj.fetch_continent(x['Country_Region']), axis=1)
df_tm["world"] = "World"
fig = px.treemap(df_tm, path=['world', 'continent', 'Country_Region'], values='ConfirmedCases',
                  color='ConfirmedCases', hover_data=['Country_Region'],
                  color_continuous_scale='dense', title='Current share of Worldwide COVID19 Cases')
fig.update_layout(width=700,template='seaborn')
fig.show()



In [30]:
obj.get_dic()

{'Afghanistan': {'code': 'AFG', 'continent': 'Asia'},
 'Albania': {'code': 'ALB', 'continent': 'Europe'},
 'Algeria': {'code': 'DZA', 'continent': 'Africa'},
 'Andorra': {'code': 'AND', 'continent': 'Europe'},
 'Angola': {'code': 'AGO', 'continent': 'Africa'},
 'Antigua and Barbuda': {'code': 'ATG', 'continent': 'North America'},
 'Argentina': {'code': 'ARG', 'continent': 'South America'},
 'Armenia': {'code': 'ARM', 'continent': 'Asia'},
 'Austria': {'code': 'AUT', 'continent': 'Europe'},
 'Azerbaijan': {'code': 'AZE', 'continent': 'Asia'},
 'Bahamas': {'code': 'BHS', 'continent': 'North America'},
 'Bahrain': {'code': 'BHR', 'continent': 'Asia'},
 'Bangladesh': {'code': 'BGD', 'continent': 'Asia'},
 'Barbados': {'code': 'BRB', 'continent': 'North America'},
 'Belarus': {'code': 'BLR', 'continent': 'Europe'},
 'Belgium': {'code': 'BEL', 'continent': 'Europe'},
 'Belize': {'code': 'BLZ', 'continent': 'North America'},
 'Benin': {'code': 'BEN', 'continent': 'Africa'},
 'Bhutan': {'code'

In [45]:
dff = df_tm[df_tm['Fatalities'] != 0]
fig = px.treemap(dff, path=['world', 'continent', 'Country_Region'], values='Fatalities',
                  color='Fatalities', hover_data=['Country_Region'],
                  color_continuous_scale='matter', title='Current share of Worldwide COVID19 Deaths')
fig.update_layout(width=700,template='seaborn')
fig.show()

Confirmed Cases and Fatalities are cummulative sums of all the previous days. In order to understand the daily trend, I'll create a column for daily cases and deaths that will be the difference between the current value and the previous day's value

In [46]:
def add_daily_measures(df):
    df.loc[0,'Daily Cases'] = df.loc[0,'ConfirmedCases']
    df.loc[0,'Daily Deaths'] = df.loc[0,'Fatalities']
    for i in range(1,len(df)):
        df.loc[i,'Daily Cases'] = df.loc[i,'ConfirmedCases'] - df.loc[i-1,'ConfirmedCases']
        df.loc[i,'Daily Deaths'] = df.loc[i,'Fatalities'] - df.loc[i-1,'Fatalities']
    #Make the first row as 0 because we don't know the previous value
    df.loc[0,'Daily Cases'] = 0
    df.loc[0,'Daily Deaths'] = 0
    return df

In [49]:
df_world = df.copy()
df_world = df_world.groupby('Date',as_index=False)[['ConfirmedCases','Fatalities','Daily Cases','Daily Deaths']].sum()
df_world = add_daily_measures(df_world)

In [50]:
df_world

Unnamed: 0,Date,ConfirmedCases,Fatalities,Daily Cases,Daily Deaths
0,2020-01-22,555.0,17.0,0.0,0.0
1,2020-01-23,653.0,18.0,98.0,1.0
2,2020-01-24,941.0,26.0,288.0,8.0
3,2020-01-25,1438.0,42.0,497.0,16.0
4,2020-01-26,2118.0,56.0,680.0,14.0
...,...,...,...,...,...
231,2020-09-09,27863733.0,903686.0,292991.0,6303.0
232,2020-09-10,28161885.0,909479.0,298152.0,5793.0
233,2020-09-11,28481413.0,915356.0,319528.0,5877.0
234,2020-09-12,28759036.0,920231.0,277623.0,4875.0


In [51]:
def draw_graph(df,x,y1,y2,title,days=7):
    colors = dict(case='#4285F4',death='#EA4335')
    df['cases_roll_avg'] = df[y1].rolling(days).mean()
    df['deaths_roll_avg'] = df[y2].rolling(days).mean()
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(name='Daily Cases',x=df[x],y=df[y1],mode='lines',
                             line=dict(width=0.5,color=colors['case'])),
                 secondary_y=False)
    fig.add_trace(go.Scatter(name='Daily Deaths',x=df[x],y=df[y2],mode='lines',
                             line=dict(width=0.5,color=colors['death'])),
                 secondary_y=True)
    fig.add_trace(go.Scatter(name='Cases: <br>'+str(days)+'-Day Rolling average',
                             x=df[x],y=df['cases_roll_avg'],mode='lines',
                             line=dict(width=3,color=colors['case'])),
                 secondary_y=False)
    fig.add_trace(go.Scatter(name='Deaths: <br>'+str(days)+'-Day rolling average',
                             x=df[x],y=df['deaths_roll_avg'],mode='lines',
                             line=dict(width=3,color=colors['death'])),
                 secondary_y=True)
    
    fig.update_yaxes(title_text='Cases',title_font=dict(color=colors['case']),secondary_y=False,nticks=5,
                     tickfont=dict(color=colors['case']),linewidth=2,linecolor='black',gridcolor='darkgray',
                    zeroline=False)
    fig.update_yaxes(title_text='Deaths',title_font=dict(color=colors['death']),secondary_y=True,nticks=5,
                     tickfont=dict(color=colors['death']),linewidth=2,linecolor='black',gridcolor='darkgray',
                    zeroline=False)

    fig.update_layout(title=title,height=400,width=700,
                      margin=dict(l=0,r=0,t=60,b=30),hovermode='x',
                      legend=dict(x=0.01,y=0.99,bordercolor='black',borderwidth=1,bgcolor='#EED8E4',
                                  font=dict(family='arial',size=10)),
                     xaxis=dict(mirror=True,linewidth=2,linecolor='black',gridcolor='darkgray'),
                     plot_bgcolor='rgb(255,255,255)')
    return fig

In [52]:
fig = draw_graph(
    df_world,
    'Date',
    'Daily Cases',
    'Daily Deaths',
    '<b>Worldwide: Daily Cases & Deaths</b><br>   With 7-Day Rolling averages')
fig.show()

In [54]:
df_map = df.copy()
df_map['Date'] = df_map['Date'].astype(str)
df_map = df_map.groupby(['Date','Country_Region'], as_index=False)[['ConfirmedCases','Fatalities']].sum()

In [56]:
df_map['iso_alpha'] = df_map.apply(lambda x: obj.fetch_iso3(x['Country_Region']), axis=1)

In [58]:
df_map['log(ConfirmedCases)'] = np.log(df_map.ConfirmedCases + 1)
df_map['log(Fatalities)'] = np.log(df_map.Fatalities + 1)

In [59]:
df_map

Unnamed: 0,Date,Country_Region,ConfirmedCases,Fatalities,iso_alpha,log(ConfirmedCases),log(Fatalities)
0,2020-01-22,China,547.0,17.0,CHN,6.306275,2.890372
1,2020-01-22,Hong Kong,0.0,0.0,HKG,0.000000,0.000000
2,2020-01-22,Japan,2.0,0.0,JPN,1.098612,0.000000
3,2020-01-22,Macau,1.0,0.0,Macau,0.693147,0.000000
4,2020-01-22,South Korea,1.0,0.0,KOR,0.693147,0.000000
...,...,...,...,...,...,...,...
36926,2020-09-13,West Bank and Gaza,30574.0,221.0,PSE,10.327938,5.402677
36927,2020-09-13,Western Sahara,10.0,1.0,Western Sahara,2.397895,0.693147
36928,2020-09-13,Yemen,2011.0,583.0,YEM,7.606885,6.369901
36929,2020-09-13,Zambia,13539.0,312.0,ZMB,9.513404,5.746203


In [60]:
px.choropleth(df_map, 
              locations="iso_alpha", 
              color="log(ConfirmedCases)", 
              hover_name="Country_Region", 
              hover_data=["ConfirmedCases"] ,
              animation_frame="Date",
              color_continuous_scale=px.colors.sequential.dense, 
              title='Total Confirmed Cases growth(Logarithmic Scale)')