In [395]:
from bs4 import BeautifulSoup
from requests import get

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import math 
import os

In [396]:
# Function to remove commas (thousands separators) within numbers
def removeCommas(string):
    """Return ``string`` with every comma removed.

    Parameters
    ----------
    string : str
        Text such as ``'86,043'``.

    Returns
    -------
    str
        The same text without commas. A value that is not a string (for
        example the integer ``0`` this notebook appends as a placeholder for
        empty cells) is returned unchanged instead of raising AttributeError.
    """
    if not isinstance(string, str):
        return string
    return string.replace(',', '')

# Scrape data from Worldometers

In [397]:
# Test whether worldometers can be reached before scraping.
# The communication with the website is ok if the displayed response is 200.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
worldometers = "https://www.worldometers.info/coronavirus/#countries"
# Without a timeout `get` can wait forever on an unresponsive server and block the notebook.
response = get(worldometers, headers=headers, timeout=30)
response

<Response [200]>

In [398]:
# Parse the downloaded page
html_soup = BeautifulSoup(response.text, 'html.parser')
# After inspecting the website content: data rows are stored inside 'tbody' tags and the
# table header inside 'thead' tags
table_contents = html_soup.find_all('tbody')
table_header = html_soup.find_all('thead')

# Header for the table: the raw child list of each <th>, kept as a string.
# Later cells select columns by position, so these labels are only used for display.
header = [str(head_title.contents) for head_title in table_header[0].find_all('th')]

# One list per output column
CountryName = []
TotalCases = []
NewCases = []
TotalDeaths = []
NewDeaths = []
TotalRecovered = []
ActiveCases = []
SeriousCritical = []

# Numeric columns in the order of the <td> cells that follow the country cell
valueColumns = [TotalCases, NewCases, TotalDeaths, NewDeaths,
                TotalRecovered, ActiveCases, SeriousCritical]

for row in table_contents[0].find_all('tr'):
    cells = row.find_all('td')
    # A row needs the country cell plus one cell per numeric column; indexing a shorter
    # row would raise IndexError, so such rows are skipped.
    if len(cells) <= len(valueColumns):
        continue

    # The country name is the text of a link, else of a span, else of the cell itself
    nameCell = cells[0]
    links = nameCell.find_all('a')
    spans = nameCell.find_all('span')
    if len(links) >= 1:
        CountryName.append(links[0].contents[0])
    elif len(spans) >= 1:
        CountryName.append(spans[0].contents[0])
    elif len(nameCell.contents) >= 1:
        CountryName.append(nameCell.contents[0])
    else:
        # Completely empty name cell: keep the row aligned with an empty name
        CountryName.append('')

    # First child of each numeric cell, or 0 when the cell is empty
    for column, cell in zip(valueColumns, cells[1:]):
        column.append(cell.contents[0] if len(cell.contents) >= 1 else 0)


CaseTable = pd.DataFrame({header[0]: CountryName,
                          header[1]: TotalCases,
                          header[2]: NewCases,
                          header[3]: TotalDeaths,
                          header[4]: NewDeaths,
                          header[5]: TotalRecovered,
                          header[6]: ActiveCases,
                          header[7]: SeriousCritical,
                          })

CaseTable.head(40)

Unnamed: 0,"['Country,', <br>Other</br>]","['Total', <br>Cases</br>]","['New', <br/>, 'Cases']","['Total', <br/>, 'Deaths']","['New', <br/>, 'Deaths']","['Total', <br/>, 'Recovered']","['Active', <br/>, 'Cases']","['Serious,', <br/>, 'Critical']"
0,China,81340,55,3292,5,74588,3460,1034
1,USA,85749,314,1304,9,1868,82577,2122
2,Italy,80589,0,8215,0,10361,62013,3612
3,Spain,57786,0,4365,0,7015,46406,3166
4,Germany,47278,3340,281,14,5673,41324,23
5,Iran,32332,2926,2378,144,11133,18821,2893
6,France,29155,0,1696,0,4948,22511,3375
7,Switzerland,11951,140,197,5,897,10857,203
8,UK,11658,0,578,0,135,10945,163
9,S. Korea,9332,91,139,8,4528,4665,59


In [399]:
# Show the last 40 rows of the scraped table to check the end of the country list
CaseTable.tail(40)

Unnamed: 0,"['Country,', <br>Other</br>]","['Total', <br>Cases</br>]","['New', <br/>, 'Cases']","['Total', <br/>, 'Deaths']","['New', <br/>, 'Deaths']","['Total', <br/>, 'Recovered']","['Active', <br/>, 'Cases']","['Serious,', <br/>, 'Critical']"
160,Mozambique,7,0,,0,0,7,0
161,Seychelles,7,0,,0,0,7,0
162,Benin,6,0,,0,0,6,0
163,Eritrea,6,0,,0,0,6,0
164,Laos,6,0,,0,0,6,0
165,Eswatini,6,0,,0,0,6,0
166,Cabo Verde,5,0,1.0,0,0,4,0
167,Guyana,5,0,1.0,0,0,4,0
168,Zimbabwe,5,2,1.0,0,0,4,0
169,Fiji,5,0,,0,0,5,0


In [400]:
# Keep the country name, total cases, total deaths and total recovered. The columns are
# picked by position because the scraped header labels are raw markup strings.
caseTableSimple = CaseTable.iloc[:, [0, 1, 3, 5]].copy()
caseTableSimple.columns = ['Country/Region', 'Confirmed', 'Deaths', 'Recovered']
# Set data type as string first for manipulation
caseTableSimple = caseTableSimple.astype({'Country/Region':str,'Confirmed':str,'Deaths':str, 'Recovered':str})
# The row of total numbers is no longer removed here (changed on 20200310, worldometers
# moved this row into the next tbody)
#caseTableSimple = caseTableSimple.iloc[:-1,:]
# Remove leading/trailing spaces and commas for each element.
# Series.map is used per column instead of the deprecated DataFrame.applymap.
caseTableSimple = caseTableSimple.apply(lambda column: column.str.strip().map(removeCommas))
# Replace empty str with zero. This includes the row of 'Diamond Princess' (its name is empty)
caseTableSimple = caseTableSimple.replace('', '0')
# After string manipulation, convert data type as correct type
caseTableSimple = caseTableSimple.astype({'Country/Region':'str',
                                          'Confirmed':'int',
                                          'Deaths':'int',
                                          'Recovered':'int',
                                         })
# Data for these countries come from other sources
removeRegion = ['China', 'Canada', 'Australia', 'USA']
caseTableSimple = caseTableSimple[~caseTableSimple['Country/Region'].isin(removeRegion)].copy()

# Add column 'Province/State' with empty value, then fill it for the regions that carry a
# province in my old data. The lookup uses the names as scraped, so it must run before the
# country names are changed below.
provinceByRegion = {'Diamond Princess': 'Yokohama',
                    'Belgium': 'Brussels',
                    'Macao': 'Macau',
                    'Hong Kong': 'Hong Kong',
                    'Taiwan': 'Taiwan',
                   }
caseTableSimple['Province/State'] = ''
for region, province in provinceByRegion.items():
    # .loc with a boolean mask is the supported way to set several rows at once
    # (.at is a scalar accessor); a region that is absent simply matches no row
    caseTableSimple.loc[caseTableSimple['Country/Region'] == region, 'Province/State'] = province

# Change country names to the same spelling as my old data
countryRenames = {'S. Korea': 'South Korea',
                  'Diamond Princess': 'Japan',
                  'Macao': 'Macau',
                  'UAE': 'United Arab Emirates',
                  'Réunion': 'Reunion',
                  'Curaçao': 'Curacao',
                 }
# Assign the result back instead of calling replace(inplace=True) on a column selection,
# which is chained assignment and is not guaranteed to update the frame
caseTableSimple['Country/Region'] = caseTableSimple['Country/Region'].replace(countryRenames)

# Time of this scrape, taken from the local clock
currentTime = datetime.now()
lastUpdateTime = currentTime.strftime('%m/%d/%Y %H:%M')
# My old data has no leading zero on the month. Stripping a leading '0' gives the same
# result as dropping the first character for months 1-9 and keeps months 10-12 intact.
caseTableSimple['Last Update'] = lastUpdateTime.lstrip('0')

# Reorder columns as in all old data
columnList = ['Province/State', 'Country/Region', 'Last Update', 'Confirmed', 'Deaths', 'Recovered']
caseTableSimple = caseTableSimple[columnList]

In [401]:
# Show the last 20 rows of the cleaned country table
caseTableSimple.tail(20)

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
180,,Bhutan,3/27/2020 21:55,3,0,0
181,,CAR,3/27/2020 21:55,3,0,0
182,,Chad,3/27/2020 21:55,3,0,0
183,,Liberia,3/27/2020 21:55,3,0,0
184,,Mauritania,3/27/2020 21:55,3,0,0
185,,St. Barth,3/27/2020 21:55,3,0,0
186,,Saint Lucia,3/27/2020 21:55,3,0,1
187,,Sint Maarten,3/27/2020 21:55,3,0,0
188,,Somalia,3/27/2020 21:55,3,0,0
189,,Nicaragua,3/27/2020 21:55,2,1,0


# Scrape data for the US and Canada

In [402]:
# Test whether the 1point3acres page can be reached before scraping.
# The communication with the website is ok if the displayed response is 200.
US_Canada = "https://coronavirus.1point3acres.com/zh"
# Without a timeout `get` can wait forever on an unresponsive server and block the notebook.
response2 = get(US_Canada, headers=headers, timeout=30)
response2

<Response [200]>

In [403]:
# Parse the body of response2 (the 1point3acres page) with the 'html.parser' backend
html_soup2 = BeautifulSoup(response2.text, 'html.parser')

In [404]:
# The site changes its CSS class names every day, so the class is looked up at run time:
# take every non-empty <span> whose first child is the table heading 'Location' and
# record the first class name of that span.
indexList = [
    span['class'][0]
    for span in html_soup2.find_all('span')
    if len(span.contents) and span.contents[0] == 'Location'
]

In [405]:
# The first index is for the US table and the 2nd index is for the Canada table.
# Any further entries are ignored; slicing keeps the unpacking from raising ValueError
# when more than two 'Location' spans were found.
USindex, CANindex = indexList[:2]

In [406]:
# Check that the class name stored in USindex returns the right data: the listing should
# contain the table headings (Location, Confirmed, Deaths, ...) followed by the values
html_soup2.find_all('span', class_=USindex)

[<span class="jsx-564222390" style="color:#ee7500">🎉 Click here to check the state testing data/location</span>,
 <span class="jsx-564222390">Location</span>,
 <span class="jsx-564222390">Confirmed</span>,
 <span class="jsx-564222390">Deaths</span>,
 <span class="jsx-564222390">Fatality rate</span>,
 <span class="jsx-564222390">Source</span>,
 <span class="jsx-564222390">United States</span>,
 <span class="jsx-564222390">86,043</span>,
 <span class="jsx-564222390">1,304</span>,
 <span class="jsx-564222390">1.5%</span>,
 <span class="jsx-564222390"><a class="jsx-564222390" href="https://www.cdc.gov" target="_blank">CDC</a></span>,
 <span class="jsx-564222390">New York<i aria-label="icon: home" class="anticon anticon-home" tabindex="-1"><svg aria-hidden="true" class="" data-icon="home" fill="currentColor" focusable="false" height="1em" viewbox="64 64 896 896" width="1em"><path d="M946.5 505L560.1 118.8l-25.9-25.9a31.5 31.5 0 0 0-44.4 0L77.5 505a63.9 63.9 0 0 0-18.8 46c.4 35.2 29.7 63.3 6

In [407]:
# Every span carrying the US table's class, fetched once so the page is not searched again
# on each access
usSpans = html_soup2.find_all('span', class_=USindex)
spanTotal = len(usSpans)

# Positions inside usSpans, five spans per table row. In the span listing displayed above,
# position 0 is a banner and the row pattern after it is
# Location, Confirmed, Deaths, Fatality rate, Source.
list1 = range(1, spanTotal-4, 5)  # Location
list2 = range(2, spanTotal-3, 5)  # Confirmed
list3 = range(3, spanTotal-2, 5)  # Deaths
list4 = range(4, spanTotal-1, 5)  # Fatality rate (not used)

Locations = []
Confirmed = []
Deaths = []
# They do not provide recovered cases number anymore, so this stays all zero (and unused)
Recovered = [0] * len(list3)

# Location: first child of the span, or 0 when the span is empty
for index in list1:
    contents = usSpans[index].contents
    Locations.append(contents[0] if len(contents) else 0)

# Confirmed / Deaths: the second child when the span has one, otherwise the first child,
# or 0 when the span is empty. The length check replaces a bare `except:`.
for positions, column in ((list2, Confirmed), (list3, Deaths)):
    for index in positions:
        contents = usSpans[index].contents
        if not len(contents):
            column.append(0)
        elif len(contents) > 1:
            column.append(contents[1])
        else:
            column.append(contents[0])

US_data = pd.DataFrame({'Province/State':Locations,
                        'Confirmed':Confirmed,
                        'Deaths':Deaths,
                        #'Recovered':Recovered,
                            })

# Remove rows that are not data: the table heading row and the national total row
isHeading = US_data['Deaths'] == 'Deaths'
isTotal = US_data['Province/State'] == 'United States'
US_data = US_data[~isHeading & ~isTotal].copy()

# Replace Washington, D.C. as Washington DC and Washington as WA.
# Assigned back instead of replace(inplace=True) on a column selection (chained assignment).
US_data['Province/State'] = US_data['Province/State'].replace(
    {'Washington, D.C.':'Washington DC', 'Washington':'WA'})

# Replace Grand Princess as From Grand Princess
#if 'Grand Princess' in list(US_data['Province/State']):
#    US_data['Province/State'].replace({'Grand Princess':'From Grand Princess'}, inplace=True)

# Replace Diamond Princess as From Diamond Princess cruise
#if 'Diamond Princess' in list(US_data['Province/State']):
#    US_data['Province/State'].replace({'Diamond Princess':'From Diamond Princess cruise'}, inplace=True)

# Assign 0 in column Province/State as unassigned (.loc with a mask; .at is scalar-only)
US_data.loc[US_data['Province/State'] == 0, 'Province/State'] = 'Unassigned'

# Remove commas from both count columns. Values are converted to str first so the 0
# placeholder does not break removeCommas.
for countColumn in ('Confirmed', 'Deaths'):
    US_data[countColumn] = US_data[countColumn].astype(str).apply(removeCommas)

In [408]:
# Display the cleaned US table
US_data

Unnamed: 0,Province/State,Confirmed,Deaths
2,New York,39140,461
3,New Jersey,6876,81
4,California,4040,82
5,WA,3207,150
6,Michigan,2857,61
7,Illinois,2542,26
8,Florida,2484,29
9,Massachusetts,2417,25
10,Louisiana,2305,83
11,Pennsylvania,1813,18


In [409]:
# As the website changed to dynamic content, use selenium to interact with the website virtually
from selenium import webdriver

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [410]:
# Open a virtual Chrome browser
driver = webdriver.Chrome()
try:
    # Direct the driver to open a webpage by calling the 'get' method, with the page we want to visit.
    driver.get("https://coronavirus.1point3acres.com/en")
    # Click the tab button to let the page load new data (US data is the default)
    python_button = driver.find_element(By.XPATH, "//span[text()='Canada']")
    python_button.click()
    # Wait (up to 10 seconds) for the dynamically loaded elements to show up
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.CLASS_NAME, CANindex)))
    # And grab the new page HTML source
    html_page = driver.page_source
finally:
    # Always close the browser, even when a lookup or the wait above raises,
    # so no Chrome process is left running
    driver.quit()

In [411]:
# Now we can use html_page as source for BS4.
# The parser is named explicitly, as in the other parsing cells; without it BeautifulSoup
# picks whichever parser is installed, so results could differ between machines.
html_soup2 = BeautifulSoup(html_page, 'html.parser')

In [412]:
# Every span carrying the Canada table's class, fetched once so the page is not searched
# again on each access
canSpans = html_soup2.find_all('span', class_=CANindex)
spanTotal = len(canSpans)

# Positions inside canSpans, five spans per table row, starting at position 0
list1 = range(0, spanTotal-4, 5)  # read as Location
list2 = range(1, spanTotal-3, 5)  # read as Confirmed
list3 = range(2, spanTotal-2, 5)  # read as Deaths
list4 = range(3, spanTotal-1, 5)  # not used

Locations = []
Confirmed = []
Deaths = []
# They do not provide recovered cases number, so this stays all zero (and unused)
Recovered = [0] * len(list3)

# Location: first child of the span, or 0 when the span is empty
for index in list1:
    contents = canSpans[index].contents
    Locations.append(contents[0] if len(contents) else 0)

# Confirmed / Deaths: the second child when the span has one, otherwise the first child,
# or 0 when the span is empty. The length check replaces a bare `except:`.
for positions, column in ((list2, Confirmed), (list3, Deaths)):
    for index in positions:
        contents = canSpans[index].contents
        if not len(contents):
            column.append(0)
        elif len(contents) > 1:
            column.append(contents[1])
        else:
            column.append(contents[0])

CAN_data = pd.DataFrame({'Province/State':Locations,
                         'Confirmed':Confirmed,
                         'Deaths':Deaths,
                         #'Recovered':Recovered,
                            })

# Remove rows that are not data: the table heading row and the national total row
isHeading = CAN_data['Deaths'] == 'Deaths'
isTotal = CAN_data['Province/State'] == 'Canada'
CAN_data = CAN_data[~isHeading & ~isTotal].copy()

# Remove commas from both count columns. Values are converted to str first so the 0
# placeholder does not break removeCommas.
for countColumn in ('Confirmed', 'Deaths'):
    CAN_data[countColumn] = CAN_data[countColumn].astype(str).apply(removeCommas)

In [413]:
# Display the cleaned Canada table
CAN_data

Unnamed: 0,Province/State,Confirmed,Deaths
2,Quebec,1632,8
3,Ontario,858,15
4,British Columbia,725,14
5,Alberta,486,2
6,Saskatchewan,95,0
7,Newfoundland and Labrador,82,0
8,Nova Scotia,73,0
9,Manitoba,36,0
10,New Brunswick,33,0
11,Grand Princess,13,0


In [414]:
# Stack the US rows and the Canada rows under one fresh 0..n-1 index, then trim
# surrounding whitespace from every column through the .str accessor
US_Can_data = (
    pd.concat([US_data, CAN_data], ignore_index=True)
    .apply(lambda column: column.str.strip())
)
US_Can_data

Unnamed: 0,Province/State,Confirmed,Deaths
0,New York,39140,461
1,New Jersey,6876,81
2,California,4040,82
3,WA,3207,150
4,Michigan,2857,61
...,...,...,...
66,Grand Princess,13,0
67,Prince Edward Island,9,0
68,Yukon,3,0
69,Northwest Territories,1,0


In [415]:
# Lookup table of state/province names; the next cell joins on its 'English' column and
# drops its 'Chinese' and 'Abbr.' columns
nameList = pd.read_csv('./web_data/statesNameTranslation.csv')

In [416]:
# Attach the name-table columns to every scraped row; a left join keeps rows whose
# 'Province/State' has no match in the 'English' column.
# NOTE(review): 'Country/Region' and 'Recovered' are not created by the scraping cells, so
# they presumably come from statesNameTranslation.csv - confirm against that file.
US_Can_data_EN = pd.merge(US_Can_data, nameList, how = 'left',
                          left_on = 'Province/State',
                          right_on = 'English')
US_Can_data_EN = US_Can_data_EN.drop(['Chinese', 'Province/State', 'Abbr.'], axis=1)
# Drop only a leading zero of the month. Cutting the first character unconditionally
# would turn e.g. '10/05/2020' into '0/05/2020' for months 10-12.
US_Can_data_EN['Last Update'] = lastUpdateTime.lstrip('0')
# Use the name from the 'English' column as the province name
US_Can_data_EN = US_Can_data_EN.rename(columns={'English':'Province/State'})
# The 'Wuhan Evacuee' rows are not kept
US_Can_data_EN = US_Can_data_EN[US_Can_data_EN['Province/State'] != 'Wuhan Evacuee']
columnOrder = ['Province/State', 'Country/Region', 'Last Update','Confirmed', 'Deaths', 'Recovered']

US_Can_data_EN = US_Can_data_EN[columnOrder]
US_Can_data_EN

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,New York,US,3/27/2020 21:55,39140,461,0
1,New Jersey,US,3/27/2020 21:55,6876,81,1
2,California,US,3/27/2020 21:55,4040,82,6
3,WA,US,3/27/2020 21:55,3207,150,1
4,Michigan,US,3/27/2020 21:55,2857,61,0
...,...,...,...,...,...,...
66,Grand Princess,US,3/27/2020 21:55,13,0,0
67,Prince Edward Island,Canada,3/27/2020 21:55,9,0,0
68,Yukon,Canada,3/27/2020 21:55,3,0,0
69,Northwest Territories,Canada,3/27/2020 21:55,1,0,0


In [417]:
# Put the US/Canada province rows first and the worldometers country rows after them,
# renumbering the index from 0
finalTable = pd.concat([US_Can_data_EN, caseTableSimple], ignore_index=True)
finalTable

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,New York,US,3/27/2020 21:55,39140,461,0
1,New Jersey,US,3/27/2020 21:55,6876,81,1
2,California,US,3/27/2020 21:55,4040,82,6
3,WA,US,3/27/2020 21:55,3207,150,1
4,Michigan,US,3/27/2020 21:55,2857,61,0
...,...,...,...,...,...,...
261,,Turks and Caicos,3/27/2020 21:55,2,0,0
262,,Libya,3/27/2020 21:55,1,0,0
263,,Papua New Guinea,3/27/2020 21:55,1,0,0
264,,St. Vincent Grenadines,3/27/2020 21:55,1,0,0


In [418]:
# Build the file name from currentTime (month_day_year_hour_minute) and write the combined
# table to ./web_data without the index column
timeStampe = currentTime.strftime('%m_%d_%Y_%H_%M')
finalTable.to_csv('./web_data/{}_webData.csv'.format(timeStampe), index=False)

# Scrape data for China

In [419]:
# (Disabled) Test if we can scrape info from ncov.dxy.cn
# The communication with website is ok if the response is 200
#CHN = "https://ncov.dxy.cn/ncovh5/view/pneumonia?scene=2&clicktime=1579582238&enterid=1579582238&from=singlemessage&isappinstalled=0"
#response3 = get(CHN, headers=headers)
#response3.encoding='utf-8' ## Without this line the text is garbled; with it the text displays correctly. utf-8 is chosen to match the encoding declared in the page source.
#response3

In [420]:
# Scrap all content from the website
#html_soup3 = BeautifulSoup(response3.text, 'html.parser')

In [421]:
#print(html_soup3.prettify())

In [422]:
#html_soup3.find_all('script', id='getAreaStat')[0].contents