# Coronavirus COVID-19 Worldwide Distribution on March 25th at 11:40pm Beijing Time

### In this project, a choropleth map displaying the worldwide distribution of the new coronavirus disease, referred to as COVID-19, is built using the Folium library.

### The coronavirus outbreak originated in the city of Wuhan, in the Hubei province of China, in late December 2019 and has been spreading at an unprecedented rate since then. So far, more than 440,000 cases have been registered and, unfortunately, more than 19,000 people have lost their lives.

### The data used for this project originates from the following link  <https://www.worldometers.info/coronavirus/>

In [12]:
# import necessary library
from bs4 import BeautifulSoup #use to scrape data online
import requests #use to scrape data online
import numpy as np  # useful for many scientific computing in Python
import pandas as pd # primary data structure library
import folium # to generate choropleth map

### Get the data from the previous link (Web Scraping)

In [13]:
# Download the worldometers COVID-19 page and parse its HTML.

# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get('https://www.worldometers.info/coronavirus/', timeout=30)
response.raise_for_status()  # fail here on an HTTP error instead of parsing an error page
url = response.text  # NOTE: despite its name, `url` holds the HTML text of the page
# Name the parser explicitly so the parse result does not depend on which
# optional parsers happen to be installed on this machine.
soup = BeautifulSoup(url, 'html.parser')

In [14]:
# Locate the first <table> element of the page and collect all of its <td> cells

Corona_tab = soup.find(name='table')
Corona_tag = Corona_tab.find_all(name='td')
cell_count = len(Corona_tag)  # total number of table cells found
print(cell_count)

1980


### Create a dataframe

In [15]:
# One list per table column; each list receives one entry per table row.
Country_Other, Total_Cases, New_Cases, Total_Deaths, New_Deaths = [], [], [], [], []
Total_Recovered, Active_Cases, Serious_Critical = [], [], []
Tot_Cases_1M_pop, Tot_Deaths_1M_pop = [], []

# The <td> tags form one flat sequence in which every group of 10 consecutive
# tags is read as one row, in the column order given below.
column_lists = (
    Country_Other,
    Total_Cases,
    New_Cases,
    Total_Deaths,
    New_Deaths,
    Total_Recovered,
    Active_Cases,
    Serious_Critical,
    Tot_Cases_1M_pop,
    Tot_Deaths_1M_pop,
)

# Position of the first tag of every group of 10.
indexes = np.arange(0, len(Corona_tag), 10)

for idx in indexes:
    # Walk the 10 tags of this group and push each stripped text into its list.
    for offset, column in enumerate(column_lists):
        column.append(Corona_tag[idx + offset].text.strip())

In [16]:
# Stack the ten column lists as the rows of a frame, then flip it so that each
# list becomes a column and each scraped table row becomes a DataFrame row.
scraped_columns = [
    Country_Other,
    Total_Cases,
    New_Cases,
    Total_Deaths,
    New_Deaths,
    Total_Recovered,
    Active_Cases,
    Serious_Critical,
    Tot_Cases_1M_pop,
    Tot_Deaths_1M_pop,
]
df = pd.DataFrame(data=scraped_columns).T
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,China,81218,+47,3281,+4,73650,4287,1399,56,2
1,Italy,69176,,6820,,8326,54030,3393,1144,113
2,USA,55416,+560,789,+9,379,54248,1175,167,2
3,Spain,47610,+5552,3434,+443,5367,38809,2636,1018,73
4,Germany,35714,+2723,181,+22,3540,31993,23,426,2
...,...,...,...,...,...,...,...,...,...,...
193,Somalia,1,,,,,1,,0.06,
194,Syria,1,,,,,1,,0.06,
195,Timor-Leste,1,,,,,1,,0.8,
196,Turks and Caicos,1,,,,,1,,26,


## Data Wrangling

In [17]:
# Label the columns with the names used in the table on the web page
headers = [
    'Countries',
    'Total Cases',
    'New Cases',
    'Total Deaths',
    'New Deaths',
    'Total Recovered',
    'Active Cases',
    'Serious & Critical',
    'Tot Cases/1M pop',
    'Tot Deaths/1M pop',
]
df.columns = headers
df

Unnamed: 0,Countries,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious & Critical,Tot Cases/1M pop,Tot Deaths/1M pop
0,China,81218,+47,3281,+4,73650,4287,1399,56,2
1,Italy,69176,,6820,,8326,54030,3393,1144,113
2,USA,55416,+560,789,+9,379,54248,1175,167,2
3,Spain,47610,+5552,3434,+443,5367,38809,2636,1018,73
4,Germany,35714,+2723,181,+22,3540,31993,23,426,2
...,...,...,...,...,...,...,...,...,...,...
193,Somalia,1,,,,,1,,0.06,
194,Syria,1,,,,,1,,0.06,
195,Timor-Leste,1,,,,,1,,0.8,
196,Turks and Caicos,1,,,,,1,,26,


In [18]:
# Check the dimensions of the data frame as a (rows, columns) tuple
df.shape

(198, 10)

In [19]:
# Display the first 5 rows
df.head()

Unnamed: 0,Countries,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious & Critical,Tot Cases/1M pop,Tot Deaths/1M pop
0,China,81218,47.0,3281,4.0,73650,4287,1399,56,2
1,Italy,69176,,6820,,8326,54030,3393,1144,113
2,USA,55416,560.0,789,9.0,379,54248,1175,167,2
3,Spain,47610,5552.0,3434,443.0,5367,38809,2636,1018,73
4,Germany,35714,2723.0,181,22.0,3540,31993,23,426,2


In [20]:
# Display the last 5 rows
df.tail()

Unnamed: 0,Countries,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious & Critical,Tot Cases/1M pop,Tot Deaths/1M pop
193,Somalia,1,,,,,1,,0.06,
194,Syria,1,,,,,1,,0.06,
195,Timor-Leste,1,,,,,1,,0.8,
196,Turks and Caicos,1,,,,,1,,26.0,
197,Total:,441093,18566.0,19762.0,872.0,112036.0,309295,13425.0,56.6,2.5


In [21]:
# Print a summary of the data frame: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 10 columns):
Countries             198 non-null object
Total Cases           198 non-null object
New Cases             198 non-null object
Total Deaths          198 non-null object
New Deaths            198 non-null object
Total Recovered       198 non-null object
Active Cases          198 non-null object
Serious & Critical    198 non-null object
Tot Cases/1M pop      198 non-null object
Tot Deaths/1M pop     198 non-null object
dtypes: object(10)
memory usage: 15.6+ KB


##### Some columns holding numerical data are stored as strings (dtype `object`), so they need to be converted to a numeric type. Those columns include 'Total Cases', 'New Cases', 'Total Deaths', 'New Deaths', 'Total Recovered', 'Active Cases', 'Serious & Critical', 'Tot Cases/1M pop', 'Tot Deaths/1M pop'

In [22]:
# Remove thousands separators (',') and plus signs ('+') so that the count
# columns can later be converted to numbers.
# Every column gets both replacements, so a value such as '+1,234' becomes
# '1234' no matter which column it appears in.
# regex=False makes ',' and '+' plain characters rather than regex syntax.

numeric_columns = [
    'Total Cases',
    'New Cases',
    'Total Deaths',
    'New Deaths',
    'Total Recovered',
    'Active Cases',
    'Serious & Critical',
    'Tot Cases/1M pop',
    'Tot Deaths/1M pop',
]
for column in numeric_columns:
    df[column] = (
        df[column]
        .str.replace(',', '', regex=False)
        .str.replace('+', '', regex=False)
    )

In [23]:
# Display the data frame to inspect the result of the previous clean-up step
df

Unnamed: 0,Countries,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious & Critical,Tot Cases/1M pop,Tot Deaths/1M pop
0,China,81218,47,3281,4,73650,4287,1399,56,2
1,Italy,69176,,6820,,8326,54030,3393,1144,113
2,USA,55416,560,789,9,379,54248,1175,167,2
3,Spain,47610,5552,3434,443,5367,38809,2636,1018,73
4,Germany,35714,2723,181,22,3540,31993,23,426,2
...,...,...,...,...,...,...,...,...,...,...
193,Somalia,1,,,,,1,,0.06,
194,Syria,1,,,,,1,,0.06,
195,Timor-Leste,1,,,,,1,,0.8,
196,Turks and Caicos,1,,,,,1,,26,


In [24]:
# Turn blank cells into '0' so the columns can be converted to numbers afterwards.
# Only cells whose whole content is the empty string '' or the literal text
# 'NaN' are replaced; every other value is left untouched.

blank_markers = ['', 'NaN']
count_columns = (
    'Total Cases',
    'New Cases',
    'Total Deaths',
    'New Deaths',
    'Total Recovered',
    'Active Cases',
    'Serious & Critical',
    'Tot Cases/1M pop',
    'Tot Deaths/1M pop',
)
for column in count_columns:
    df[column] = df[column].replace(blank_markers, '0')
df

Unnamed: 0,Countries,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious & Critical,Tot Cases/1M pop,Tot Deaths/1M pop
0,China,81218,47,3281,4,73650,4287,1399,56,2
1,Italy,69176,0,6820,0,8326,54030,3393,1144,113
2,USA,55416,560,789,9,379,54248,1175,167,2
3,Spain,47610,5552,3434,443,5367,38809,2636,1018,73
4,Germany,35714,2723,181,22,3540,31993,23,426,2
...,...,...,...,...,...,...,...,...,...,...
193,Somalia,1,0,0,0,0,1,0,0.06,0
194,Syria,1,0,0,0,0,1,0,0.06,0
195,Timor-Leste,1,0,0,0,0,1,0,0.8,0
196,Turks and Caicos,1,0,0,0,0,1,0,26,0


In [25]:
# Convert the six count columns from strings to numbers.
# 'Serious & Critical' and the two per-million columns are not converted here.

columns_to_convert = (
    'Total Cases',
    'New Cases',
    'Total Deaths',
    'New Deaths',
    'Total Recovered',
    'Active Cases',
)
for column in columns_to_convert:
    df[column] = pd.to_numeric(df[column])
df.dtypes

Countries             object
Total Cases            int64
New Cases              int64
Total Deaths           int64
New Deaths             int64
Total Recovered        int64
Active Cases           int64
Serious & Critical    object
Tot Cases/1M pop      object
Tot Deaths/1M pop     object
dtype: object

In [26]:
# List the row index labels of the data frame as a NumPy array
df.index.values

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases
count,198.0,198.0,198.0,198.0,198.0,198.0
mean,4455.484848,187.535354,199.616162,8.808081,1131.676768,3124.191919
std,32672.399575,1395.754039,1525.257277,70.276998,9546.01827,22871.017715
min,1.0,0.0,0.0,0.0,0.0,1.0
25%,7.25,0.0,0.0,0.0,0.0,7.0
50%,79.5,2.0,1.0,0.0,2.0,73.5
75%,418.75,27.0,5.0,0.0,16.0,395.75
max,441093.0,18566.0,19762.0,872.0,112036.0,309295.0


In [28]:
# Retrieve the distinct values of the 'Countries' column
df['Countries'].unique()

array(['China', 'Italy', 'USA', 'Spain', 'Germany', 'Iran', 'France',
       'Switzerland', 'S. Korea', 'UK', 'Netherlands', 'Austria',
       'Belgium', 'Norway', 'Portugal', 'Canada', 'Sweden', 'Australia',
       'Brazil', 'Israel', 'Turkey', 'Malaysia', 'Denmark', 'Czechia',
       'Luxembourg', 'Ireland', 'Japan', 'Ecuador', 'Chile', 'Pakistan',
       'Poland', 'Thailand', 'Romania', 'Saudi Arabia', 'Finland',
       'Indonesia', 'Greece', 'Iceland', 'Diamond Princess',
       'South Africa', 'Russia', 'Philippines', 'Singapore', 'India',
       'Slovenia', 'Qatar', 'Panama', 'Egypt', 'Bahrain', 'Croatia',
       'Peru', 'Hong Kong', 'Mexico', 'Estonia', 'Dominican Republic',
       'Argentina', 'Serbia', 'Colombia', 'Iraq', 'Lebanon', 'UAE',
       'Armenia', 'Algeria', 'Lithuania', 'Bulgaria', 'Taiwan', 'Hungary',
       'Latvia', 'Slovakia', 'New Zealand', 'Kuwait', 'Uruguay',
       'Andorra', 'San Marino', 'Costa Rica', 'North Macedonia',
       'Tunisia', 'Morocco', 'Bosnia

In [29]:
# Count the non-null entries of the 'Countries' column.
# NOTE(review): the 'Total:' summary row presumably is still in the frame at
# this point, so the result is not the number of countries alone - confirm.
df['Countries'].count()

198

In [30]:
# Narrow the data frame to the country name plus the six count columns;
# 'Serious & Critical' and the two per-million columns are dropped.
kept_columns = [
    'Countries',
    'Total Cases',
    'New Cases',
    'Total Deaths',
    'New Deaths',
    'Total Recovered',
    'Active Cases',
]
df = df[kept_columns]
df

Unnamed: 0,Countries,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases
0,China,81218,47,3281,4,73650,4287
1,Italy,69176,0,6820,0,8326,54030
2,USA,55416,560,789,9,379,54248
3,Spain,47610,5552,3434,443,5367,38809
4,Germany,35714,2723,181,22,3540,31993
...,...,...,...,...,...,...,...
193,Somalia,1,0,0,0,0,1
194,Syria,1,0,0,0,0,1
195,Timor-Leste,1,0,0,0,0,1
196,Turks and Caicos,1,0,0,0,0,1


In [31]:
# Drop the summary row labelled 'Total:' so that only per-country rows remain.
# Filtering on the label, rather than keeping a fixed number of rows, stays
# correct when the number of rows scraped from the page changes.

df = df[df['Countries'] != 'Total:']
df

Unnamed: 0,Countries,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases
0,China,81218,47,3281,4,73650,4287
1,Italy,69176,0,6820,0,8326,54030
2,USA,55416,560,789,9,379,54248
3,Spain,47610,5552,3434,443,5367,38809
4,Germany,35714,2723,181,22,3540,31993
...,...,...,...,...,...,...,...
192,St. Vincent Grenadines,1,0,0,0,0,1
193,Somalia,1,0,0,0,0,1
194,Syria,1,0,0,0,0,1
195,Timor-Leste,1,0,0,0,0,1


### let's produce the choropleth map 

In [32]:
# Location of the world GeoJSON file on the local disk.
# The path is only stored here; this cell does not download or open the file.
world_geo = r'C:\Users\HP\Downloads\world.geojson'
print('GeoJSON file downloaded!')

GeoJSON file downloaded!


#### Now that we have the GeoJSON file, let's create a world map, centered around [0, 0] latitude and longitude values, with an initial zoom level of 2, and using the Mapbox Bright style.

In [33]:
# Base world map centred on latitude/longitude [0, 0] with an initial zoom level of 2.
# NOTE(review): the built-in 'Mapbox Bright' tileset may not be available in
# newer folium releases - confirm against the installed version.
Corona_map = folium.Map(location=[0, 0], zoom_start=2, tiles='Mapbox Bright')

In [35]:
# Shade every country by its 'Total Cases' value.
# Rows of df are matched to GeoJSON features through each feature's
# `properties.name`, using the 'Countries' column as the key.
total_cases_layer = dict(
    geo_data=world_geo,
    data=df,
    columns=['Countries', 'Total Cases'],
    key_on='feature.properties.name',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Covid-19 Total Cases Worldwide March 25th',
)
Corona_map.choropleth(**total_cases_layer)

# display map
Corona_map

#### Notice that on the choropleth map, the darker the color of a country, and the closer it is to red, the higher the number of cases in that country.

#### Notice how the legend is displaying a negative boundary or threshold. Let's fix that by defining our own thresholds and starting with 0 instead of -6,918!

In [36]:
# Build six evenly spaced integer thresholds running from the smallest to the
# largest 'Total Cases' value.
threshold_scale = np.linspace(df['Total Cases'].min(),
                              df['Total Cases'].max(),
                              6, dtype=int)
threshold_scale = threshold_scale.tolist() # change the numpy array to a list
threshold_scale[-1] = threshold_scale[-1] + 1 # make sure that the last value of the list is greater than the maximum Total Cases

# Rebuild the map from scratch and colour it with the custom thresholds
# computed above instead of the scale Folium would pick on its own.
Corona_map = folium.Map(location=[0, 0], zoom_start=2, tiles='Mapbox Bright')
Corona_map.choropleth(
    geo_data=world_geo,
    data=df,
    columns=['Countries', 'Total Cases'],
    key_on='feature.properties.name',
    threshold_scale=threshold_scale,
    fill_color='YlOrRd', 
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Covid-19 Total Cases Worldwide March 25th',
    reset=True
)
Corona_map

In [37]:
# Generate a choropleth map using the 'Total Deaths' and 'Countries' columns.
# Start from a fresh map: adding this layer to the existing Corona_map would
# draw it, together with a second legend, on top of the layer already there.
Corona_map = folium.Map(location=[0, 0], zoom_start=2, tiles='Mapbox Bright')
Corona_map.choropleth(
    geo_data=world_geo,
    data=df,
    columns=['Countries', 'Total Deaths'],
    key_on='feature.properties.name',
    fill_color='YlGn', 
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Covid-19 Total Deaths Worldwide March 25th'
)

# display map
Corona_map

In [38]:
# Build six evenly spaced integer thresholds running from the smallest to the
# largest 'Total Deaths' value.
threshold_scale = np.linspace(df['Total Deaths'].min(),
                              df['Total Deaths'].max(),
                              6, dtype=int)
threshold_scale = threshold_scale.tolist() # change the numpy array to a list
threshold_scale[-1] = threshold_scale[-1] + 1 # make sure that the last value of the list is greater than the maximum Total Deaths

# Rebuild the map from scratch and colour it with the custom thresholds
# computed above instead of the scale Folium would pick on its own.
Corona_map = folium.Map(location=[0, 0], zoom_start=2, tiles='Mapbox Bright')
Corona_map.choropleth(
    geo_data=world_geo,
    data=df,
    columns=['Countries', 'Total Deaths'],
    key_on='feature.properties.name',
    threshold_scale=threshold_scale,
    fill_color='YlGn', 
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Covid-19 Total Deaths Worldwide March 25th',
    reset=True
)
Corona_map

In [39]:
# Generate a choropleth map using the 'Active Cases' and 'Countries' columns.
# Start from a fresh map: adding this layer to the existing Corona_map would
# draw it, together with a second legend, on top of the layer already there.
Corona_map = folium.Map(location=[0, 0], zoom_start=2, tiles='Mapbox Bright')
Corona_map.choropleth(
    geo_data=world_geo,
    data=df,
    columns=['Countries', 'Active Cases'],
    key_on='feature.properties.name',
    fill_color='BuPu', 
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Covid-19 Active Cases Worldwide March 25th'
)

# display map
Corona_map

In [40]:
# Build six evenly spaced integer thresholds running from the smallest to the
# largest 'Active Cases' value.
threshold_scale = np.linspace(df['Active Cases'].min(),
                              df['Active Cases'].max(),
                              6, dtype=int)
threshold_scale = threshold_scale.tolist() # change the numpy array to a list
threshold_scale[-1] = threshold_scale[-1] + 1 # make sure that the last value of the list is greater than the maximum Active Cases

# Rebuild the map from scratch and colour it with the custom thresholds
# computed above instead of the scale Folium would pick on its own.
Corona_map = folium.Map(location=[0, 0], zoom_start=2, tiles='Mapbox Bright')
Corona_map.choropleth(
    geo_data=world_geo,
    data=df,
    columns=['Countries', 'Active Cases'],
    key_on='feature.properties.name',
    threshold_scale=threshold_scale,
    fill_color='BuPu', 
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Covid-19 Active Cases Worldwide March 25th',
    reset=True
)
Corona_map