In [2]:
from IPython.core.display import HTML

In [3]:


HTML('''<div class="flourish-embed flourish-cards" data-src="visualisation/1810417" data-url="https://flo.uri.sh/visualisation/1810417/embed"><script src="https://public.flourish.studio/resources/embed.js"></script></div>''')



In [3]:
# importing datasets
# ==================

full_table = pd.read_csv('covid_19_clean_complete.csv', 
                         parse_dates=['Date'])
full_table.sample(6)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
8566,,Gambia,13.4432,-15.3101,2020-02-23,0,0,0
18511,,Burkina Faso,12.2383,-1.5616,2020-04-01,282,16,46
8907,,Seychelles,-4.6796,55.492,2020-02-24,0,0,0
22859,,Malta,35.9375,14.3754,2020-04-17,422,3,91
19207,,Somalia,5.1521,46.1996,2020-04-03,7,0,1
11973,Greenland,Denmark,71.7069,-42.6043,2020-03-07,0,0,0


In [32]:
# for offline ploting
# ===================
from plotly.offline import plot, iplot, init_notebook_mode
import pandas as pd
import numpy as np
from datetime import timedelta
import plotly.express as px

init_notebook_mode(connected=True)

# Preprocessing

In [5]:
# Ship
# ====

# ship rows
ship_rows = full_table['Province/State'].str.contains('Grand Princess') | full_table['Province/State'].str.contains('Diamond Princess') | full_table['Country/Region'].str.contains('Diamond Princess') | full_table['Country/Region'].str.contains('MS Zaandam')

# ship
ship = full_table[ship_rows]

# full table 
full_table = full_table[~(ship_rows)]

# Latest cases from the ships
ship_latest = ship[ship['Date']==max(ship['Date'])]

# ship_latest.style.background_gradient(cmap='Pastel1_r')

In [8]:
# Cleaning data
# =============

# Active Case = confirmed - deaths - recovered
full_table['Active'] = full_table['Confirmed'] - full_table['Deaths'] - full_table['Recovered']

# replacing Mainland china with just China
full_table['Country/Region'] = full_table['Country/Region'].replace('Mainland China', 'China')

# filling missing values 
full_table[['Province/State']] = full_table[['Province/State']].fillna('')
full_table[['Confirmed', 'Deaths', 'Recovered', 'Active']] = full_table[['Confirmed', 'Deaths', 'Recovered', 'Active']].fillna(0)

# fixing datatypes
full_table['Recovered'] = full_table['Recovered'].astype(int)

full_table.sample(6)


Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
21854,,Switzerland,46.8182,8.2275,2020-04-13,25688,1138,13700,10850
11484,,Indonesia,-0.7893,113.9213,2020-03-05,2,0,0,2
6824,,Uruguay,-32.5228,-55.7658,2020-02-16,0,0,0,0
28150,Aruba,Netherlands,12.5186,-70.0358,2020-05-07,101,3,89,9
3228,Henan,China,33.882,113.614,2020-02-03,566,2,16,548
27008,Yunnan,China,24.974,101.487,2020-05-03,185,2,181,2


In [11]:
# Grouped by day, country
# =======================

full_grouped = full_table.groupby(['Date', 'Country/Region'])['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()

# new cases ======================================================
temp = full_grouped.groupby(['Country/Region', 'Date', ])['Confirmed', 'Deaths', 'Recovered']
temp = temp.sum().diff().reset_index()

mask = temp['Country/Region'] != temp['Country/Region'].shift(1)

temp.loc[mask, 'Confirmed'] = np.nan
temp.loc[mask, 'Deaths'] = np.nan
temp.loc[mask, 'Recovered'] = np.nan

# renaming columns
temp.columns = ['Country/Region', 'Date', 'New cases', 'New deaths', 'New recovered']
# =================================================================

# merging new values
full_grouped = pd.merge(full_grouped, temp, on=['Country/Region', 'Date'])

# filling na with 0
full_grouped = full_grouped.fillna(0)

# fixing data types
cols = ['New cases', 'New deaths', 'New recovered']
full_grouped[cols] = full_grouped[cols].astype('int')

full_grouped['New cases'] = full_grouped['New cases'].apply(lambda x: 0 if x<0 else x)

full_grouped.head()

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered
0,2020-01-22,Afghanistan,0,0,0,0,0,0,0
1,2020-01-22,Albania,0,0,0,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0,0,0,0
4,2020-01-22,Angola,0,0,0,0,0,0,0


In [12]:
# Day wise
# ========

# table
day_wise = full_grouped.groupby('Date')['Confirmed', 'Deaths', 'Recovered', 'Active', 'New cases'].sum().reset_index()

# number cases per 100 cases
day_wise['Deaths / 100 Cases'] = round((day_wise['Deaths']/day_wise['Confirmed'])*100, 2)
day_wise['Recovered / 100 Cases'] = round((day_wise['Recovered']/day_wise['Confirmed'])*100, 2)
day_wise['Deaths / 100 Recovered'] = round((day_wise['Deaths']/day_wise['Recovered'])*100, 2)

# no. of countries
day_wise['No. of countries'] = full_grouped[full_grouped['Confirmed']!=0].groupby('Date')['Country/Region'].unique().apply(len).values

# fillna by 0
cols = ['Deaths / 100 Cases', 'Recovered / 100 Cases', 'Deaths / 100 Recovered']
day_wise[cols] = day_wise[cols].fillna(0)

day_wise.head()

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active,New cases,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of countries
0,2020-01-22,555,17,28,510,0,3.06,5.05,60.71,6
1,2020-01-23,654,18,30,606,99,2.75,4.59,60.0,8
2,2020-01-24,941,26,36,879,287,2.76,3.83,72.22,9
3,2020-01-25,1434,42,39,1353,493,2.93,2.72,107.69,11
4,2020-01-26,2118,56,52,2010,684,2.64,2.46,107.69,13


In [13]:
# Country wise
# ============

# getting latest values
country_wise = full_grouped[full_grouped['Date']==max(full_grouped['Date'])].reset_index(drop=True).drop('Date', axis=1)

# group by country
country_wise = country_wise.groupby('Country/Region')['Confirmed', 'Deaths', 'Recovered', 'Active', 'New cases'].sum().reset_index()

# per 100 cases
country_wise['Deaths / 100 Cases'] = round((country_wise['Deaths']/country_wise['Confirmed'])*100, 2)
country_wise['Recovered / 100 Cases'] = round((country_wise['Recovered']/country_wise['Confirmed'])*100, 2)
country_wise['Deaths / 100 Recovered'] = round((country_wise['Deaths']/country_wise['Recovered'])*100, 2)

cols = ['Deaths / 100 Cases', 'Recovered / 100 Cases', 'Deaths / 100 Recovered']
country_wise[cols] = country_wise[cols].fillna(0)

country_wise.head()

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered
0,Afghanistan,3563,106,468,2989,171,2.98,13.13,22.65
1,Albania,842,31,605,206,10,3.68,71.85,5.12
2,Algeria,5182,483,2323,2376,185,9.32,44.83,20.79
3,Andorra,752,47,526,179,1,6.25,69.95,8.94
4,Angola,36,2,11,23,0,5.56,30.56,18.18


In [17]:
# load population dataset
pop = pd.read_csv("population_by_country_2020.csv")

# select only population
pop = pop.iloc[:, :2]

# rename column names
pop.columns = ['Country/Region', 'Population']

# merged data
country_wise = pd.merge(country_wise, pop, on='Country/Region', how='left')

# update population
cols = ['Burma', 'Congo (Brazzaville)', 'Congo (Kinshasa)', "Cote d'Ivoire", 'Czechia', 
        'Kosovo', 'Saint Kitts and Nevis', 'Saint Vincent and the Grenadines', 
        'Taiwan*', 'US', 'West Bank and Gaza', 'Sao Tome and Principe']
pops = [54409800, 89561403, 5518087, 26378274, 10708981, 1793000, 
        53109, 110854, 23806638, 330541757, 4543126, 219159]
for c, p in zip(cols, pops):
    country_wise.loc[country_wise['Country/Region']== c, 'Population'] = p
    
# missing values
# country_wise.isna().sum()
# country_wise[country_wise['Population'].isna()]['Country/Region'].tolist()

# Cases per population
country_wise['Cases / Million People'] = round((country_wise['Confirmed'] / country_wise['Population']) * 1000000)

country_wise.head()


Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Population,Cases / Million People
0,Afghanistan,3563,106,468,2989,171,2.98,13.13,22.65,38742911.0,92.0
1,Albania,842,31,605,206,10,3.68,71.85,5.12,2878420.0,293.0
2,Algeria,5182,483,2323,2376,185,9.32,44.83,20.79,43685618.0,119.0
3,Andorra,752,47,526,179,1,6.25,69.95,8.94,77240.0,9736.0
4,Angola,36,2,11,23,0,5.56,30.56,18.18,32644783.0,1.0


In [22]:
today = full_grouped[full_grouped['Date']==max(full_grouped['Date'])].reset_index(drop=True).drop('Date', axis=1)[['Country/Region', 'Confirmed']]
last_week = full_grouped[full_grouped['Date']==max(full_grouped['Date'])-timedelta(days=7)].reset_index(drop=True).drop('Date', axis=1)[['Country/Region', 'Confirmed']]

temp = pd.merge(today, last_week, on='Country/Region', suffixes=(' today', ' last week'))

# temp = temp[['Country/Region', 'Confirmed last week']]
temp['1 week change'] = temp['Confirmed today'] - temp['Confirmed last week']

temp = temp[['Country/Region', 'Confirmed last week', '1 week change']]

country_wise = pd.merge(country_wise, temp, on='Country/Region')

country_wise['1 week % increase'] = round(country_wise['1 week change']/country_wise['Confirmed last week']*100, 2)

country_wise.head()

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Population,Cases / Million People,Confirmed last week,1 week change,1 week % increase
0,Afghanistan,3563,106,468,2989,171,2.98,13.13,22.65,38742911.0,92.0,2171,1392,64.12
1,Albania,842,31,605,206,10,3.68,71.85,5.12,2878420.0,293.0,773,69,8.93
2,Algeria,5182,483,2323,2376,185,9.32,44.83,20.79,43685618.0,119.0,4006,1176,29.36
3,Andorra,752,47,526,179,1,6.25,69.95,8.94,77240.0,9736.0,745,7,0.94
4,Angola,36,2,11,23,0,5.56,30.56,18.18,32644783.0,1.0,27,9,33.33


In [23]:
country_wise[country_wise['Cases / Million People'].isna()]

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Population,Cases / Million People,Confirmed last week,1 week change,1 week % increase


In [37]:
temp = full_table.groupby('Date')['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()
temp = temp[temp['Date']==max(temp['Date'])].reset_index(drop=True)

tm = temp.melt(id_vars="Date", value_vars=['Active', 'Deaths', 'Recovered'])
fig = px.treemap(tm, path=["variable"], values="value", height=225, width=1200,
                 color_discrete_sequence=["red","green","blue"],
                 )
fig.data[0].textinfo = 'label+text+value'
fig.show()

In [39]:
temp = full_table.groupby('Date')['Recovered', 'Deaths', 'Active'].sum().reset_index()
temp = temp.melt(id_vars="Date", value_vars=['Recovered', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
temp.head()

fig = px.area(temp, x="Date", y="Count", color='Case', height=600,
             title='Cases over time', color_discrete_sequence = ["red","green","blue"])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

# Maps

In [40]:
# World wide

temp = full_table[full_table['Date'] == max(full_table['Date'])]

m = folium.Map(location=[0, 0], tiles='cartodbpositron',
               min_zoom=1, max_zoom=4, zoom_start=1)

for i in range(0, len(temp)):
    folium.Circle(
        location=[temp.iloc[i]['Lat'], temp.iloc[i]['Long']],
        color='crimson', fill='crimson',
        tooltip =   '<li><bold>Country : '+str(temp.iloc[i]['Country/Region'])+
                    '<li><bold>Province : '+str(temp.iloc[i]['Province/State'])+
                    '<li><bold>Confirmed : '+str(temp.iloc[i]['Confirmed'])+
                    '<li><bold>Deaths : '+str(temp.iloc[i]['Deaths']),
        radius=int(temp.iloc[i]['Confirmed'])**0.5).add_to(m)
m

NameError: name 'folium' is not defined