# COVID-19 Exploratory Data Analysis Project

Dataset: [Novel Corona Virus 2019 Dataset (Kaggle)](https://www.kaggle.com/datasets/sudalairajkumar/novel-corona-virus-2019-dataset)

In [1]:
import numpy as np
import pandas as pd

In [3]:
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [4]:
# Enable plotly's offline notebook rendering so figures display inline.
# NOTE(review): per the plotly.offline docs, connected=True loads plotly.js
# from a CDN instead of embedding it in the notebook, so viewing the figures
# needs network access -- confirm this is acceptable for offline readers.
init_notebook_mode(connected=True)

In [6]:
# Read the raw observations from the local copy of the Kaggle CSV, then
# preview the first ten rows to check the column layout.
data = pd.read_csv(filepath_or_buffer='data/covid_19_data.csv')
data.head(n=10)

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0
5,6,01/22/2020,Guangdong,Mainland China,1/22/2020 17:00,26.0,0.0,0.0
6,7,01/22/2020,Guangxi,Mainland China,1/22/2020 17:00,2.0,0.0,0.0
7,8,01/22/2020,Guizhou,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
8,9,01/22/2020,Hainan,Mainland China,1/22/2020 17:00,4.0,0.0,0.0
9,10,01/22/2020,Hebei,Mainland China,1/22/2020 17:00,1.0,0.0,0.0


In [24]:
# Shorten the verbose source headers to the names used by the rest of the
# notebook ('Date', 'State', 'Country'). Headers that are not present are
# simply skipped by rename, so re-running this cell is harmless.
data = data.rename(
    {
        'ObservationDate': 'Date',
        'Province/State': 'State',
        'Country/Region': 'Country',
    },
    axis='columns',
)

In [12]:
# 'SNo' is just a running row number in the source file (1, 2, 3, ... in the
# preview above) and carries no information, so remove it.
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
data = data.drop(columns='SNo', errors='ignore')

In [13]:
# Summarise column names, non-null counts and dtypes to spot missing values
# and columns that are still stored as plain strings (dtype 'object').
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306429 entries, 0 to 306428
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   ObservationDate  306429 non-null  object 
 1   State            228329 non-null  object 
 2   Country          306429 non-null  object 
 3   Last Update      306429 non-null  object 
 4   Confirmed        306429 non-null  float64
 5   Deaths           306429 non-null  float64
 6   Recovered        306429 non-null  float64
dtypes: float64(3), object(4)
memory usage: 16.4+ MB


In [14]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows negative minimums for Confirmed,
# Deaths and Recovered, which cannot be real counts -- presumably upstream
# corrections or data-entry errors; verify before trusting per-row values.
data.describe()

Unnamed: 0,Confirmed,Deaths,Recovered
count,306429.0,306429.0,306429.0
mean,85670.91,2036.403268,50420.29
std,277551.6,6410.938048,201512.4
min,-302844.0,-178.0,-854405.0
25%,1042.0,13.0,11.0
50%,10375.0,192.0,1751.0
75%,50752.0,1322.0,20270.0
max,5863138.0,112385.0,6399531.0


In [17]:
# Count missing entries per column (isna is the same check as isnull).
# In the recorded output only 'State' has gaps.
data.isna().sum()

ObservationDate        0
State              78100
Country                0
Last Update            0
Confirmed              0
Deaths                 0
Recovered              0
dtype: int64

In [25]:
# Preview the first five rows again to confirm the renamed columns and the
# removal of 'SNo'.
data.head(n=5)

Unnamed: 0,Date,State,Country,Last Update,Confirmed,Deaths,Recovered
0,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [28]:
# Build one row per country holding the totals from its most recent
# observation date (the counts are presumably cumulative -- verify against
# the dataset description).
#
# Two details matter here:
#   * Only the numeric count columns are summed. A bare .sum() would also try
#     to aggregate the text columns ('State', 'Last Update'), which is
#     meaningless and, depending on the pandas version, slow or an error.
#   * 'Date' is stored as an 'MM/DD/YYYY' string, so sorting it directly is
#     lexicographic and ranks '12/31/2020' above every 2021 date. The sort key
#     parses the strings so "latest" really means chronologically latest,
#     while the 'Date' column itself keeps its original string values.
count_columns = ['Confirmed', 'Deaths', 'Recovered']
df_countries = (
    data.groupby(['Country', 'Date'], as_index=False)[count_columns]
    .sum()
    .sort_values(
        'Date',
        ascending=False,
        key=lambda dates: pd.to_datetime(dates, format='%m/%d/%Y'),
    )
    # After the descending sort, the first row seen for each country is its
    # newest observation.
    .drop_duplicates(subset=['Country'])
)
# Countries with no confirmed cases would only add empty regions to the map.
df_countries = df_countries[df_countries['Confirmed'] > 0]

In [29]:
# World choropleth of confirmed cases, one colour-coded region per country.
#
# The title date is derived from the data rather than hard-coded, so it
# always matches what is actually plotted. It is the newest observation date
# present in df_countries; individual countries may have stopped reporting
# earlier than that.
latest_date = pd.to_datetime(df_countries['Date'], format='%m/%d/%Y').max()
map_title = (
    f'Confirmed Cases as of {latest_date:%B} {latest_date.day}, {latest_date.year}'
)

confirmed_trace = go.Choropleth(
    locations=df_countries['Country'],
    # Regions are matched by country name, so names plotly does not
    # recognise are left blank on the map.
    locationmode='country names',
    z=df_countries['Confirmed'],
    colorscale='Reds',
    marker_line_color='black',
    marker_line_width=0.5,
)
fig = go.Figure(data=confirmed_trace)

# update_layout returns the figure, so leaving it as the last expression
# renders the map as the cell output.
fig.update_layout(
    title_text=map_title,
    title_x=0.5,
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular',
    ),
)