In [23]:
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from datetime import datetime
import os

# Page that every later cell scrapes, and the local chromedriver binary used
# to drive Chrome.
URL = 'https://voice.baidu.com/act/newpneumonia/newpneumonia/'
CHROMEDRIVER_PATH = './chromedriver.exe'

# Start a Chrome session and load the page.
driver = webdriver.Chrome(CHROMEDRIVER_PATH)
driver.get(URL)

In [24]:
# Collect the top-level rows of the "nationTable" table.
nation_rows_xpath = '//*[@id="nationTable"]/table/tbody/tr'
all_states = driver.find_elements_by_xpath(nation_rows_xpath)

# Click the first row to close it.
# NOTE(review): presumably the first state is expanded when the page loads,
# so this click collapses it -- confirm against the live page.
first_state = all_states[0]
first_state.click()

In [25]:
# State-level data: query the top-level rows again and keep the visible text
# of each one.
all_states = driver.find_elements_by_xpath('//*[@id="nationTable"]/table/tbody/tr')
states = [state_row.text for state_row in all_states]

In [26]:
# Expand the city view of every state by clicking each state row in order.
for state_row in all_states:
    state_row.click()

In [27]:
# City-level data: each state row holds a nested table; gather the text of
# every row of every nested table, in document order.
all_cities = driver.find_elements_by_xpath(
    '//*[@id="nationTable"]/table/tbody/tr/td/table')
cities = [
    city_row.text
    for city_table in all_cities
    for city_row in city_table.find_elements_by_xpath('./tbody/tr')
]

In [28]:
# Click the control inside "foreignTable" before reading its rows.
# NOTE(review): presumably this expands the full country list -- confirm.
expand_btn = driver.find_element_by_xpath('//*[@id="foreignTable"]/div/span')
expand_btn.click()

# Country-level data: keep the visible text of each row.
country_rows = driver.find_elements_by_xpath(
    '//*[@id="foreignTable"]/table/tbody/tr')
countries = [country_row.text for country_row in country_rows]

In [29]:
# Shut the browser down. `quit()` is used rather than `close()`: `close()`
# only closes the current window, while `quit()` ends the whole WebDriver
# session and stops the chromedriver process, so nothing is left running
# after the scrape.
driver.quit()

In [30]:
# Preview the first three raw rows of each scraped list, one list per line.
print(cities[:3], states[:3], countries[:3], sep='\n')

['武汉\n35991 1922 1036', '孝感\n3009 255 49', '黄冈\n2791 504 59']
['湖北\n51986 3862 1318', '广东\n1261 332 2', '河南\n1184 316 11']
['日本\n252 1 1', '新加坡\n58 15 -', '泰国\n33 10 -']


In [31]:
# post process data
def process_raw(x):
    """Parse the text of one scraped table row into a record.

    Parameters
    ----------
    x : str
        Row text made of a name line followed by a line holding three
        whitespace-separated fields (confirmed, recovered, death).

    Returns
    -------
    dict
        Keys 'name', 'confirmed', 'recovered' and 'death', in that order.
        All values are returned as strings exactly as scraped (a field may
        be a placeholder such as '-'); no numeric conversion is done.

    Raises
    ------
    ValueError
        If the text is not a name line plus exactly three fields.
    """
    # Strip the row and each line so stray leading/trailing whitespace or a
    # trailing newline does not break the two-line unpacking.
    parts = [part.strip() for part in x.strip().split('\n')]
    if len(parts) != 2:
        raise ValueError(
            'expected a name line and a counts line, got {!r}'.format(x))
    name, counts = parts

    # split() with no argument tolerates repeated spaces and tabs, whereas
    # split(' ') would yield empty fields and fail the unpacking.
    fields = counts.split()
    if len(fields) != 3:
        raise ValueError(
            'expected 3 count fields (confirmed, recovered, death), '
            'got {!r}'.format(x))
    confirm, recover, death = fields

    return {'name': name,
            'confirmed': confirm,
            'recovered': recover,
            'death': death}

In [32]:
# Parse the raw city rows into a new list instead of rebinding `cities`:
# the raw strings stay available and the cell can be re-run without
# process_raw being fed already-parsed dicts.
city_records = [process_raw(raw_row) for raw_row in cities]
df_cities = pd.DataFrame(city_records)
df_cities.head()

Unnamed: 0,name,confirmed,recovered,death
0,武汉,35991,1922,1036
1,孝感,3009,255,49
2,黄冈,2791,504,59
3,荆州,1447,102,23
4,随州,1206,62,14


In [33]:
# Parse the raw state rows into a new list instead of rebinding `states`:
# the raw strings stay available and the cell can be re-run without
# process_raw being fed already-parsed dicts.
state_records = [process_raw(raw_row) for raw_row in states]
df_states = pd.DataFrame(state_records)
df_states.head()

Unnamed: 0,name,confirmed,recovered,death
0,湖北,51986,3862,1318
1,广东,1261,332,2
2,河南,1184,316,11
3,浙江,1155,367,-
4,湖南,988,363,2


In [34]:
# Parse the raw country rows into a new list instead of rebinding
# `countries`: the raw strings stay available and the cell can be re-run
# without process_raw being fed already-parsed dicts.
country_records = [process_raw(raw_row) for raw_row in countries]
df_countries = pd.DataFrame(country_records)
df_countries.head()

Unnamed: 0,name,confirmed,recovered,death
0,日本,252,1,1
1,新加坡,58,15,-
2,泰国,33,10,-
3,韩国,28,7,-
4,马来西亚,19,3,-


In [35]:
# save data

# Each run is written to its own dated sub-directory: covid19_data/<YYYYMMDD>.
suffix = datetime.today().strftime('%Y%m%d')
output_dir = 'covid19_data'
output = os.path.join(output_dir, suffix)

# makedirs creates the parent `covid19_data` directory as well (os.mkdir
# raises FileNotFoundError when the parent is missing, e.g. on a first run),
# and exist_ok=True makes re-running the cell on the same day a no-op.
os.makedirs(output, exist_ok=True)

# Write one UTF-8 CSV per table, without the DataFrame index column.
for csv_name, frame in (('cities.csv', df_cities),
                        ('states.csv', df_states),
                        ('countries.csv', df_countries)):
    frame.to_csv(
        os.path.join(output, csv_name),
        index=False, encoding='utf-8')