<a href="https://colab.research.google.com/github/AbdullahMakhdoom/Covid-19_Analysis_Visualization/blob/main/covid19_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [87]:
# Install the third-party packages this notebook imports (shell commands run via `!`).
# NOTE(review): plotly is upgraded in place; if plotly was already imported in this
# runtime, a restart may be needed before the new version is actually used — confirm.
!pip install folium # for geographical plots
!pip install --upgrade plotly  # for interactive, dynamic plots

Collecting plotly
[?25l  Downloading https://files.pythonhosted.org/packages/c9/09/315462259ab7b60a3d4b7159233ed700733c87d889755bdc00a9fb46d692/plotly-4.14.1-py2.py3-none-any.whl (13.2MB)
[K     |████████████████████████████████| 13.2MB 329kB/s 
Installing collected packages: plotly
  Found existing installation: plotly 4.4.1
    Uninstalling plotly-4.4.1:
      Successfully uninstalled plotly-4.4.1
Successfully installed plotly-4.14.1


In [88]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly as py

import folium

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


%matplotlib inline

import math
import random
from datetime import timedelta # for custom dates

import warnings
warnings.filterwarnings('ignore')

# color palette: hex colours bound to short names for reuse in plots
# (presumably cnf = confirmed, dth = deaths, rec = recovered, act = active — verify)
cnf = '#393e46'
dth = '#ff2e63'
rec = '#21bf73'
act = '#fe9801'


## Data Preparation

In [4]:
# Remove any pre-existing 'Covid-19-Preprocessed-Dataset' folder,
# since clone does not work if the repo directory already exists.
import os
import shutil

DATASET_DIR = "Covid-19-Preprocessed-Dataset"

# os.system() reports failure through its return code and never raises, so the
# previous try/except could not detect anything. Check for the directory
# explicitly and delete it with the portable standard-library helper instead
# of shelling out to `rm -rf`.
if os.path.isdir(DATASET_DIR):
    shutil.rmtree(DATASET_DIR)
else:
    print("File does not exist")

In [5]:
# fetching the latest covid-19 data
!git clone https://github.com/laxmimerit/Covid-19-Preprocessed-Dataset.git

Cloning into 'Covid-19-Preprocessed-Dataset'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 1269 (delta 9), reused 25 (delta 7), pack-reused 1231[K
Receiving objects: 100% (1269/1269), 200.09 MiB | 17.25 MiB/s, done.
Resolving deltas: 100% (438/438), done.
Checking out files: 100% (819/819), done.


In [67]:
# Load the cleaned per-location time series; 'Date' is parsed into datetime values.
cleaned_csv = 'Covid-19-Preprocessed-Dataset/preprocessed/covid_19_data_cleaned.csv'
df = pd.read_csv(cleaned_csv, parse_dates=['Date'])


In [7]:
# Preview the first five rows of the cleaned data.
df.head(n=5)

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
0,2020-01-22,,Afghanistan,33.93911,67.709953,0,0,0,0
1,2020-01-23,,Afghanistan,33.93911,67.709953,0,0,0,0
2,2020-01-24,,Afghanistan,33.93911,67.709953,0,0,0,0
3,2020-01-25,,Afghanistan,33.93911,67.709953,0,0,0,0
4,2020-01-26,,Afghanistan,33.93911,67.709953,0,0,0,0


In [68]:
# Replace missing 'Province/State' entries with an empty string, then display the frame.
province_filled = df['Province/State'].fillna(value="")
df['Province/State'] = province_filled
df

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
0,2020-01-22,,Afghanistan,33.93911,67.709953,0,0,0,0
1,2020-01-23,,Afghanistan,33.93911,67.709953,0,0,0,0
2,2020-01-24,,Afghanistan,33.93911,67.709953,0,0,0,0
3,2020-01-25,,Afghanistan,33.93911,67.709953,0,0,0,0
4,2020-01-26,,Afghanistan,33.93911,67.709953,0,0,0,0
...,...,...,...,...,...,...,...,...,...
95837,2020-12-28,,Timor-Leste,-8.87420,125.727500,0,32,0,-32
95838,2020-12-29,,Timor-Leste,-8.87420,125.727500,0,32,0,-32
95839,2020-12-30,,Timor-Leste,-8.87420,125.727500,0,32,0,-32
95840,2020-12-31,,Timor-Leste,-8.87420,125.727500,0,32,0,-32


In [9]:
# Load the pre-aggregated tables. The two date-indexed files share the same
# read options, so they are read together; 'countrywise.csv' has no 'Date' to parse.
country_daywise, daywise = (
    pd.read_csv(csv_path, parse_dates=['Date'])
    for csv_path in (
        'Covid-19-Preprocessed-Dataset/preprocessed/country_daywise.csv',
        'Covid-19-Preprocessed-Dataset/preprocessed/daywise.csv',
    )
)
countrywise = pd.read_csv('Covid-19-Preprocessed-Dataset/preprocessed/countrywise.csv')


In [11]:
# Preview the first five rows of the per-country, per-day table.
country_daywise.head(n=5)

Unnamed: 0,Date,Country,Confirmed,Deaths,Recovered,Active,New Cases,New Recovered,New Deaths
0,2020-01-23,Afghanistan,0,0,0,0,0,0,0
1,2020-01-24,Afghanistan,0,0,0,0,0,0,0
2,2020-01-25,Afghanistan,0,0,0,0,0,0,0
3,2020-01-26,Afghanistan,0,0,0,0,0,0,0
4,2020-01-27,Afghanistan,0,0,0,0,0,0,0


In [12]:
# Preview the first five rows of the per-country summary table.
countrywise.head(n=5)

Unnamed: 0,Country,Confirmed,Deaths,Recovered,Active,New Cases,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Population,Cases / Million People,Confirmed last week,1 week change,1 week % increase
0,Afghanistan,51405,2181,41612,7612,55,4.24,80.95,5.24,38928341,1321.0,50433,972,1.93
1,Albania,57727,1174,33185,23368,581,2.03,57.49,3.54,2877800,20059.0,54317,3410,6.28
2,Algeria,99311,2751,66855,29705,323,2.77,67.32,4.11,43851043,2265.0,96549,2762,2.86
3,Andorra,7983,84,7384,515,64,1.05,92.5,1.14,77265,103320.0,7669,314,4.09
4,Angola,17433,405,10859,6169,62,2.32,62.29,3.73,32866268,530.0,16931,502,2.96


In [13]:
# Preview the first five rows of the per-day worldwide table.
daywise.head(n=5)

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active,New Cases,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of Countries
0,2020-01-23,654,18,30,606,100,2.75,4.59,60.0,8
1,2020-01-24,941,26,36,879,288,2.76,3.83,72.22,9
2,2020-01-25,1434,42,39,1353,495,2.93,2.72,107.69,11
3,2020-01-26,2118,56,52,2010,684,2.64,2.46,107.69,13
4,2020-01-27,2927,82,61,2784,809,2.8,2.08,134.43,16


In [15]:
# Total confirmed cases per date across all rows of df.
# Select the column before aggregating so only 'Confirmed' is summed, instead of
# summing every column (including the string columns) and discarding the rest.
confirmed = df.groupby('Date')['Confirmed'].sum().reset_index()

In [16]:
# Display the per-date confirmed totals.
confirmed

Unnamed: 0,Date,Confirmed
0,2020-01-22,555
1,2020-01-23,654
2,2020-01-24,941
3,2020-01-25,1434
4,2020-01-26,2118
...,...,...
341,2020-12-28,81285853
342,2020-12-29,81951541
343,2020-12-30,82708280
344,2020-12-31,83424446


In [18]:
# Total recovered cases per date across all rows of df.
# Select the column before aggregating so only 'Recovered' is summed, instead of
# summing every column (including the string columns) and discarding the rest.
recovered = df.groupby('Date')['Recovered'].sum().reset_index()
recovered

Unnamed: 0,Date,Recovered
0,2020-01-22,28
1,2020-01-23,30
2,2020-01-24,36
3,2020-01-25,39
4,2020-01-26,52
...,...,...
341,2020-12-28,45994197
342,2020-12-29,46348935
343,2020-12-30,46731540
344,2020-12-31,47010049


In [19]:
# Total deaths per date across all rows of df.
# Select the column before aggregating so only 'Deaths' is summed, instead of
# summing every column (including the string columns) and discarding the rest.
deaths = df.groupby('Date')['Deaths'].sum().reset_index()
deaths

Unnamed: 0,Date,Deaths
0,2020-01-22,17
1,2020-01-23,18
2,2020-01-24,26
3,2020-01-25,42
4,2020-01-26,56
...,...,...
341,2020-12-28,1774390
342,2020-12-29,1789915
343,2020-12-30,1805008
344,2020-12-31,1818116


In [20]:
# Count the null values in each column of df.
df.isna().sum()

Date              0
Province/State    0
Country           0
Lat               0
Long              0
Confirmed         0
Recovered         0
Deaths            0
Active            0
dtype: int64

In [21]:
# Print the column dtypes, non-null counts and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95842 entries, 0 to 95841
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            95842 non-null  datetime64[ns]
 1   Province/State  95842 non-null  object        
 2   Country         95842 non-null  object        
 3   Lat             95842 non-null  float64       
 4   Long            95842 non-null  float64       
 5   Confirmed       95842 non-null  int64         
 6   Recovered       95842 non-null  int64         
 7   Deaths          95842 non-null  int64         
 8   Active          95842 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(4), object(2)
memory usage: 6.6+ MB


In [24]:
# Show only the rows whose 'Country' is Pakistan.
is_pakistan = df['Country'] == "Pakistan"
df[is_pakistan]

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
69546,2020-01-22,,Pakistan,30.3753,69.3451,0,0,0,0
69547,2020-01-23,,Pakistan,30.3753,69.3451,0,0,0,0
69548,2020-01-24,,Pakistan,30.3753,69.3451,0,0,0,0
69549,2020-01-25,,Pakistan,30.3753,69.3451,0,0,0,0
69550,2020-01-26,,Pakistan,30.3753,69.3451,0,0,0,0
...,...,...,...,...,...,...,...,...,...
69887,2020-12-28,,Pakistan,30.3753,69.3451,475085,425494,9992,39599
69888,2020-12-29,,Pakistan,30.3753,69.3451,477240,430113,10047,37080
69889,2020-12-30,,Pakistan,30.3753,69.3451,479715,435073,10105,34537
69890,2020-12-31,,Pakistan,30.3753,69.3451,482178,437229,10176,34773


## Worldwide Total Confirmed, Recovered & Deaths

In [25]:
# Last five rows of the per-date confirmed totals.
confirmed.tail(n=5)

Unnamed: 0,Date,Confirmed
341,2020-12-28,81285853
342,2020-12-29,81951541
343,2020-12-30,82708280
344,2020-12-31,83424446
345,2021-01-01,83963772


In [26]:
# Last five rows of the per-date recovered totals.
recovered.tail(n=5)

Unnamed: 0,Date,Recovered
341,2020-12-28,45994197
342,2020-12-29,46348935
343,2020-12-30,46731540
344,2020-12-31,47010049
345,2021-01-01,47289078


In [27]:
# Last five rows of the per-date death totals.
deaths.tail(n=5)

Unnamed: 0,Date,Deaths
341,2020-12-28,1774390
342,2020-12-29,1789915
343,2020-12-30,1805008
344,2020-12-31,1818116
345,2021-01-01,1827540


In [32]:
# Make 'colab' plotly's default renderer for the fig.show() calls that follow.
# NOTE(review): presumably chosen because the notebook is run in Google Colab;
# a different renderer is probably needed in other environments — confirm.
import plotly.io as pio
pio.renderers.default = 'colab'

In [42]:
# Line chart of the worldwide confirmed, recovered and death totals per date.
fig = go.Figure()

# One (frame, value column, line colour) entry per trace; the column name doubles
# as the legend label.
trace_specs = [
    (confirmed, 'Confirmed', "Orange"),
    (recovered, 'Recovered', "Green"),
    (deaths, 'Deaths', "Red"),
]
for frame, column, line_color in trace_specs:
    trace = go.Scatter(
        x=frame['Date'],
        y=frame[column],
        mode='lines+markers',
        name=column,
        line=dict(color=line_color, width=2),
    )
    fig.add_trace(trace)

fig.update_layout(
    title="Worldwide Covid-19 Cases",
    xaxis_tickfont_size=14,
    yaxis=dict(title="Number of Cases"),
)
fig.show()

## Cases Density Animation on World Map

In [43]:
# Convert the 'Date' column of df to strings (object dtype), overwriting it in place.
# The column is used as `animation_frame` by the density map that follows.
df['Date'] = df['Date'].astype(str)

In [44]:
# Print the column dtypes of df again to check the result of the 'Date' conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95842 entries, 0 to 95841
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            95842 non-null  object 
 1   Province/State  95842 non-null  object 
 2   Country         95842 non-null  object 
 3   Lat             95842 non-null  float64
 4   Long            95842 non-null  float64
 5   Confirmed       95842 non-null  int64  
 6   Recovered       95842 non-null  int64  
 7   Deaths          95842 non-null  int64  
 8   Active          95842 non-null  int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 6.6+ MB


In [52]:
# Animated density map of the world, one frame per value of 'Date'.
# `z` weights every point by its confirmed-case count. Without it each row counts
# equally, so the heat map only shows where rows exist, not how many cases they report.
fig = px.density_mapbox(
    df,
    lat='Lat',
    lon='Long',
    z='Confirmed',
    hover_name='Country',
    hover_data=['Confirmed', 'Recovered', 'Deaths'],
    animation_frame='Date',
    color_continuous_scale='Portland',
    radius=7,
    zoom=0,
    height=700,
)
# Title and map styling applied in a single layout update.
fig.update_layout(
    title='Worldwide Covid-19 Cases with Time Lapse',
    mapbox_style='open-street-map',
    mapbox_center_lon=0,
)
fig.show()

### Total Cases on Ships

In [53]:
# Convert the 'Date' column of df back to datetime type, overwriting it in place.
df['Date'] = pd.to_datetime(df['Date'])

In [69]:
# Ships affected by Covid: flag every row whose 'Province/State' or 'Country'
# contains the name of one of the listed ships.
province_col = df['Province/State']
country_col = df['Country']

# Start from the first match and OR in the remaining (column, ship name) checks.
ship_rows = province_col.str.contains('Grand Princess')
remaining_checks = [
    (province_col, 'Diamond Princess'),
    (country_col, 'Grand Princess'),
    (country_col, 'Diamond Princess'),
    (country_col, 'MS Zaandam'),
]
for column, ship_name in remaining_checks:
    ship_rows = ship_rows | column.str.contains(ship_name)

ship = df[ship_rows]
ship


Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
14186,2020-01-22,Diamond Princess,Canada,0.0,0.0,0,0,0,0
14187,2020-01-23,Diamond Princess,Canada,0.0,0.0,0,0,0,0
14188,2020-01-24,Diamond Princess,Canada,0.0,0.0,0,0,0,0
14189,2020-01-25,Diamond Princess,Canada,0.0,0.0,0,0,0,0
14190,2020-01-26,Diamond Princess,Canada,0.0,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
59161,2020-12-28,,MS Zaandam,0.0,0.0,9,7,2,0
59162,2020-12-29,,MS Zaandam,0.0,0.0,9,7,2,0
59163,2020-12-30,,MS Zaandam,0.0,0.0,9,7,2,0
59164,2020-12-31,,MS Zaandam,0.0,0.0,9,7,2,0


In [70]:
# Keep only the rows that were not flagged as ship rows.
not_ship = ~ship_rows
df = df[not_ship]

In [71]:
# Ship rows recorded on the most recent date present in `ship`.
latest_ship_date = ship['Date'].max()
ship_latest = ship[ship['Date'] == latest_ship_date]
ship_latest

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
14531,2021-01-01,Diamond Princess,Canada,0.0,0.0,0,0,1,-1
14877,2021-01-01,Grand Princess,Canada,0.0,0.0,13,0,0,13
36329,2021-01-01,,Diamond Princess,0.0,0.0,712,699,13,0
59165,2021-01-01,,MS Zaandam,0.0,0.0,9,7,2,0


In [72]:
# Render the latest ship rows with a background colour gradient.
ship_styler = ship_latest.style
ship_styler.background_gradient(cmap='Pastel1_r')

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
14531,2021-01-01 00:00:00,Diamond Princess,Canada,0.0,0.0,0,0,1,-1
14877,2021-01-01 00:00:00,Grand Princess,Canada,0.0,0.0,13,0,0,13
36329,2021-01-01 00:00:00,,Diamond Princess,0.0,0.0,712,699,13,0
59165,2021-01-01 00:00:00,,MS Zaandam,0.0,0.0,9,7,2,0


## Cases over Time with Area Plot

In [73]:
# Per-date totals of the four case columns over the rows currently in df.
# Columns are selected with a list: selecting several groupby columns with a bare
# tuple (['a', 'b'] without the inner brackets) was deprecated in pandas 1.0 and
# raises in pandas 2.x.
temp = df.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
temp

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active
0,2020-01-22,555,17,28,510
1,2020-01-23,654,18,30,606
2,2020-01-24,941,26,36,879
3,2020-01-25,1434,42,39,1353
4,2020-01-26,2118,56,52,2010
...,...,...,...,...,...
341,2020-12-28,81285119,1774374,45993491,33517254
342,2020-12-29,81950807,1789899,46348229,33812679
343,2020-12-30,82707546,1804992,46730834,34171720
344,2020-12-31,83423712,1818100,47009343,34596269


In [75]:
# Latest data: keep only the row(s) for the most recent date and renumber from 0.
latest_date = temp['Date'].max()
is_latest = temp['Date'] == latest_date
temp = temp[is_latest].reset_index(drop=True)
temp

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active
0,2021-01-01,83963038,1827524,47288372,34847142


In [90]:
# Treemap of the latest totals, one tile per category (Active / Deaths / Recovered).
# NOTE(review): the saved run of this cell ended in a TypeError whose cause is not
# visible here; re-run on a freshly restarted runtime to confirm it renders.
tm = temp.melt(id_vars='Date', value_vars=['Active', 'Deaths', 'Recovered'])

# Bind each category to its palette colour by name. A bare colour sequence is
# assigned by position and does not line up with the category order above.
fig = px.treemap(
    tm,
    path=['variable'],
    values='value',
    height=250,
    width=800,
    color='variable',
    color_discrete_map={'Active': act, 'Deaths': dth, 'Recovered': rec},
)
# update_traces sets the text template on every trace instead of indexing fig.data.
fig.update_traces(textinfo='label+text+value')
fig.show()




TypeError: ignored