In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import folium
import plotly.express as px
import seaborn as sns

In [21]:
#reading csv file
covid = pd.read_csv("covid19.csv")
covid.head(10)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Thailand,15.0,101.0,1/22/20,2,0,0
1,,Japan,36.0,138.0,1/22/20,2,0,0
2,,Singapore,1.2833,103.8333,1/22/20,0,0,0
3,,Nepal,28.1667,84.25,1/22/20,0,0,0
4,,Malaysia,2.5,112.5,1/22/20,0,0,0
5,British Columbia,Canada,49.2827,-123.1207,1/22/20,0,0,0
6,New South Wales,Australia,-33.8688,151.2093,1/22/20,0,0,0
7,Victoria,Australia,-37.8136,144.9631,1/22/20,0,0,0
8,Queensland,Australia,-28.0167,153.4,1/22/20,0,0,0
9,,Cambodia,11.55,104.9167,1/22/20,0,0,0


In [22]:
#dataFrame info
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10965 entries, 0 to 10964
Data columns (total 8 columns):
Province/State    5457 non-null object
Country/Region    10965 non-null object
Lat               10965 non-null float64
Long              10965 non-null float64
Date              10965 non-null object
Confirmed         10965 non-null int64
Deaths            10965 non-null int64
Recovered         10965 non-null int64
dtypes: float64(2), int64(3), object(3)
memory usage: 556.9+ KB


In [23]:
#checking missing value
covid.isnull().sum()

Province/State    5508
Country/Region       0
Lat                  0
Long                 0
Date                 0
Confirmed            0
Deaths               0
Recovered            0
dtype: int64

# Data Preprocessing

### Cleaning Data

In [24]:
# replace Mainland china with just China
covid['Country/Region'] = covid['Country/Region'].replace('Mainland China','China')

#filling missing values with NA
covid[['Province/State']] = covid[['Province/State']].fillna('NA')
covid

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Thailand,15.0000,101.0000,1/22/20,2,0,0
1,,Japan,36.0000,138.0000,1/22/20,2,0,0
2,,Singapore,1.2833,103.8333,1/22/20,0,0,0
3,,Nepal,28.1667,84.2500,1/22/20,0,0,0
4,,Malaysia,2.5000,112.5000,1/22/20,0,0,0
...,...,...,...,...,...,...,...,...
10960,,Reunion,-21.1151,55.5364,03-12-2020,1,0,0
10961,,Turkey,38.9637,35.2433,03-12-2020,1,0,0
10962,Gibraltar,United Kingdom,36.1408,-5.3536,03-12-2020,1,0,1
10963,,Cuba,22.0000,-80.0000,03-12-2020,3,0,0


## Derived Tables

# Exploratory Data Analysis(EDA)

### Complete Data

In [25]:
temp = covid.groupby(['Country/Region','Province/State'])['Confirmed','Deaths','Recovered'].max()
temp.style.background_gradient(cmap='Pastel1_r')

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Deaths,Recovered
Country/Region,Province/State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,,7,0,0
Albania,,23,1,0
Algeria,,24,1,8
Andorra,,1,0,1
Argentina,,19,1,0
Armenia,,4,0,0
Australia,From Diamond Princess,8,0,0
Australia,New South Wales,65,2,4
Australia,Northern Territory,1,0,0
Australia,Queensland,20,0,8


# Chinese Province Wise Data

In [26]:
#this shows only thats states which are in china
china = covid[covid['Country/Region'] == 'China'].reset_index()
china

Unnamed: 0,index,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,156,Hubei,China,30.9756,112.2707,1/22/20,444,17,28
1,160,Guangdong,China,23.3417,113.4244,1/22/20,26,0,0
2,161,Henan,China,33.8820,113.6140,1/22/20,5,0,0
3,162,Zhejiang,China,29.1832,120.0934,1/22/20,10,0,0
4,163,Hunan,China,27.6104,111.7088,1/22/20,4,0,0
...,...,...,...,...,...,...,...,...,...
1678,10941,Inner Mongolia,China,44.0935,113.9448,03-12-2020,75,1,71
1679,10942,Ningxia,China,37.2692,106.1655,03-12-2020,75,0,72
1680,10946,Qinghai,China,35.7452,95.9956,03-12-2020,18,0,18
1681,10947,Macau,China,22.1667,113.5500,03-12-2020,10,0,10


In [27]:
# temp_f = china[['Province/State', 'Confirmed', 'Deaths', 'Recovered']].groupby(['Province/State']).sum().reset_index()
# temp_f = temp_f.sort_values(by='Confirmed',ascending =False)
# temp_f = temp_f.reset_index(drop=True)
# temp_f.style.background_gradient(cmap='Pastel1_r')

In [28]:
m = folium.Map(location=[30,116],tiles='cartodbpositron',min_zoom=2,max_zoom=5,zoom_start=3)
for i in range(0,len(china)):
    folium.Circle(
        location=[china.iloc[i]['Lat'],china.iloc[i]['Long']],
        color='crimson',
        tooltip = '<li><bold>Country: '+str(china.iloc[i]['Country/Region'])+
        '<li><bold>Province: '+str(china.iloc[i]['Province/State'])+
        '<li><bold>Confirmed: '+str(china.iloc[i]['Confirmed'])+
        '<li><bold>Deaths: '+str(china.iloc[i]['Deaths'])+
        '<li><bold>Recovered: '+str(china.iloc[i]['Recovered']),
        radius = int(china.iloc[i]['Confirmed'])**1).add_to(m)
m

# Plots for Visualization

In [34]:
temp = covid.groupby(['Country/Region', 'Date'])['Confirmed', 'Deaths', 'Recovered'].sum()
temp = temp.reset_index()
# temp.head()

fig = px.bar(temp, x="Date", y="Confirmed", color='Country/Region', orientation='v', height=600,
             title='Cases', color_discrete_sequence = px.colors.cyclical.mygbm)
fig.show()

fig = px.bar(temp,x="Date",y="Deaths",color="Country/Region",orientation='v',height=600,
            title="Deaths",color_discrete_sequence = px.colors.cyclical.mygbm)
fig.show()

# No of Places to which COVID-19 Spread

In [55]:
c_spread = china[china['Confirmed']!=0].groupby('Date')['Province/State'].unique().apply(len)
c_spread = pd.DataFrame(c_spread).reset_index()
fig = px.line(c_spread, x='Date', y='Province/State', 
              title='Number of Provinces/States/Regions of China to which COVID-19 spread over the time')
fig.show()



spread = covid[covid['Confirmed']!=0].groupby('Date')['Country/Region'].unique().apply(len)
spread = pd.DataFrame(spread).reset_index()

fig = px.line(spread, x='Date', y='Country/Region', 
              title='Number of Countries/Regions to which COVID-19 spread over the time')
fig.show()

# Comparison With other Example

In [56]:
epidemics = pd.DataFrame({
    'epidemic' : ['COVID-19', 'SARS', 'EBOLA', 'MERS', 'H1N1'],
    'start_year' : [2019, 2003, 2014, 2012, 2009],
    'end_year' : [2020, 2004, 2016, 2017, 2010],
    'confirmed' : [80000, 8096, 28646, 2494, 6724149],
    'deaths' : [2750, 774, 11323, 858, 19654]
})

epidemics['mortality'] = round((epidemics['deaths']/epidemics['confirmed'])*100, 2)

epidemics.head()

Unnamed: 0,epidemic,start_year,end_year,confirmed,deaths,mortality
0,COVID-19,2019,2020,80000,2750,3.44
1,SARS,2003,2004,8096,774,9.56
2,EBOLA,2014,2016,28646,11323,39.53
3,MERS,2012,2017,2494,858,34.4
4,H1N1,2009,2010,6724149,19654,0.29


In [57]:
temp = epidemics.melt(id_vars='epidemic', value_vars=['confirmed', 'deaths', 'mortality'],
                      var_name='Case', value_name='Value')

fig = px.bar(temp, x="epidemic", y="Value", color='epidemic', text='Value', facet_col="Case",
             color_discrete_sequence = px.colors.qualitative.Bold)
fig.update_traces(textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_yaxes(showticklabels=False)
fig.layout.yaxis2.update(matches=None)
fig.layout.yaxis3.update(matches=None)
fig.show()