# Prepare Datasets for Novel Coronavirus (COVID-19) Outbreak

In [1]:
import pandas as pd
import dateutil
import requests
import datetime

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

### Read COVID-19 daily case reports
Data are provided by [Coronavirus COVID-19 Global Cases by Johns Hopkins CSSE](https://gisanddata.maps.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6)

In [3]:
base_url_daily = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/"

#### Find the latest dataset
It will be from today or yesterday in UTC zone

In [4]:
today = datetime.datetime.utcnow().strftime('%m-%d-%Y')
yesterday = (datetime.datetime.utcnow() - datetime.timedelta(days=1)).strftime('%m-%d-%Y')

url = base_url_daily + today + ".csv"
date = today

# if there is no dataset for today, use yesterday's dataset
response = requests.get(url)
if response.status_code >= 400:
    url = base_url_daily + yesterday + ".csv"
    date = yesterday
    
print("Reading latest dataset from " + date + " UTC")
print("URL:", url)

Reading latest dataset from 03-05-2020 UTC
URL: https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/03-05-2020.csv


In [5]:
cases = pd.read_csv(url)

Clean up missing data and update data types

In [6]:
cases['Province/State'] = cases['Province/State'].fillna('')
cases['Last Update'] = cases['Last Update'].apply(dateutil.parser.parse)

In [7]:
cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Province/State  173 non-null    object        
 1   Country/Region  173 non-null    object        
 2   Last Update     173 non-null    datetime64[ns]
 3   Confirmed       173 non-null    int64         
 4   Deaths          173 non-null    int64         
 5   Recovered       173 non-null    int64         
 6   Latitude        173 non-null    float64       
 7   Longitude       173 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(3), object(2)
memory usage: 10.9+ KB


In [8]:
cases

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude
0,Hubei,Mainland China,2020-03-05 14:53:03,67466,2902,40592,30.9756,112.2707
1,,South Korea,2020-03-05 09:03:09,6088,35,41,36.0,128.0
2,,Italy,2020-03-05 17:43:03,3858,148,414,43.0,12.0
3,,Iran,2020-03-05 13:43:04,3513,107,739,32.0,53.0
4,Guangdong,Mainland China,2020-03-05 09:23:03,1351,7,1181,23.3417,113.4244
5,Henan,Mainland China,2020-03-05 01:48:26,1272,22,1239,33.882,113.614
6,Zhejiang,Mainland China,2020-03-05 09:43:03,1215,1,1124,29.1832,120.0934
7,Hunan,Mainland China,2020-03-05 08:43:03,1018,4,938,27.6104,111.7088
8,Anhui,Mainland China,2020-03-05 04:33:02,990,6,970,31.8257,117.2264
9,Jiangxi,Mainland China,2020-03-05 01:16:58,935,1,901,27.614,115.7221


### Read time series file 
We use this file to retrieve the Latitude and Longitude of the outbreak locations.

In [9]:
time_series = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")

Clean up missing data and update data types

In [10]:
time_series['Province/State'] = time_series['Province/State'].fillna('')
time_series.iloc[:,4:] = time_series.iloc[:,4:].fillna(0).astype('int64')

In [11]:
time_series.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 48 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province/State  173 non-null    object 
 1   Country/Region  173 non-null    object 
 2   Lat             173 non-null    float64
 3   Long            173 non-null    float64
 4   1/22/20         173 non-null    int64  
 5   1/23/20         173 non-null    int64  
 6   1/24/20         173 non-null    int64  
 7   1/25/20         173 non-null    int64  
 8   1/26/20         173 non-null    int64  
 9   1/27/20         173 non-null    int64  
 10  1/28/20         173 non-null    int64  
 11  1/29/20         173 non-null    int64  
 12  1/30/20         173 non-null    int64  
 13  1/31/20         173 non-null    int64  
 14  2/1/20          173 non-null    int64  
 15  2/2/20          173 non-null    int64  
 16  2/3/20          173 non-null    int64  
 17  2/4/20          173 non-null    int

In [12]:
time_series

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,2/19/20,2/20/20,2/21/20,2/22/20,2/23/20,2/24/20,2/25/20,2/26/20,2/27/20,2/28/20,2/29/20,3/1/20,3/2/20,3/3/20,3/4/20,3/5/20
0,Anhui,Mainland China,31.8257,117.2264,1,9,15,39,60,70,106,152,200,237,297,340,408,480,530,591,665,733,779,830,860,889,910,934,950,962,973,982,986,987,988,989,989,989,989,989,989,990,990,990,990,990,990,990
1,Beijing,Mainland China,40.1824,116.4142,14,22,36,41,68,80,91,111,114,139,168,191,212,228,253,274,297,315,326,337,342,352,366,372,375,380,381,387,393,395,396,399,399,399,400,400,410,410,411,413,414,414,418,418
2,Chongqing,Mainland China,30.0572,107.874,6,9,27,57,75,110,132,147,182,211,247,300,337,366,389,411,426,428,468,486,505,518,529,537,544,551,553,555,560,567,572,573,575,576,576,576,576,576,576,576,576,576,576,576
3,Fujian,Mainland China,26.0789,117.9874,1,5,10,18,35,59,80,84,101,120,144,159,179,194,205,215,224,239,250,261,267,272,279,281,285,287,290,292,293,293,293,293,293,293,294,294,296,296,296,296,296,296,296,296
4,Gansu,Mainland China,36.0611,103.8343,0,2,2,4,7,14,19,24,26,29,40,51,55,57,62,62,67,79,83,83,86,87,90,90,90,90,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,102
5,Guangdong,Mainland China,23.3417,113.4244,26,32,53,78,111,151,207,277,354,436,535,632,725,813,895,970,1034,1095,1131,1159,1177,1219,1241,1261,1294,1316,1322,1328,1331,1332,1333,1339,1342,1345,1347,1347,1347,1348,1349,1349,1350,1350,1350,1351
6,Guangxi,Mainland China,23.8298,108.7881,2,5,23,23,36,46,51,58,78,87,100,111,127,139,150,168,172,183,195,210,215,222,222,226,235,237,238,242,244,245,246,249,249,251,252,252,252,252,252,252,252,252,252,252
7,Guizhou,Mainland China,26.8154,106.8748,1,3,3,4,5,7,9,9,12,29,29,38,46,58,64,71,81,89,99,109,127,133,135,140,143,144,146,146,146,146,146,146,146,146,146,146,146,146,146,146,146,146,146,146
8,Hainan,Mainland China,19.1959,109.7453,4,5,8,19,22,33,40,43,46,52,62,64,72,80,99,106,117,124,131,138,144,157,157,159,162,162,163,163,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168
9,Hebei,Mainland China,38.0428,114.5149,1,1,2,8,13,18,33,48,65,82,96,104,113,126,135,157,172,195,206,218,239,251,265,283,291,300,301,306,306,307,308,309,311,311,311,312,317,318,318,318,318,318,318,318


### Merge the two dataframes

In [13]:
df = pd.merge(cases, time_series, on=['Province/State','Country/Region'])

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 173 entries, 0 to 172
Data columns (total 54 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Province/State  173 non-null    object        
 1   Country/Region  173 non-null    object        
 2   Last Update     173 non-null    datetime64[ns]
 3   Confirmed       173 non-null    int64         
 4   Deaths          173 non-null    int64         
 5   Recovered       173 non-null    int64         
 6   Latitude        173 non-null    float64       
 7   Longitude       173 non-null    float64       
 8   Lat             173 non-null    float64       
 9   Long            173 non-null    float64       
 10  1/22/20         173 non-null    int64         
 11  1/23/20         173 non-null    int64         
 12  1/24/20         173 non-null    int64         
 13  1/25/20         173 non-null    int64         
 14  1/26/20         173 non-null    int64         
 15  1/27/2

### Split City, State string into separate columns

In [15]:
states = {
        # US
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming',
        # Canada
        'AB': 'Alberta',
        'BC': 'British Columbia',
        'MB': 'Manitoba',
        'NB': 'New Brunswick',
        'NL': 'Newfoundland and Labrador',
        'NT': 'Northwest Territories',
        'NS': 'Nova Scotia',
        'NU': 'Nunavut',
        'ON': 'Ontario',
        'PE': 'Prince Edward Island',
        'QC': 'Quebec',
        'SK': 'Saskatchewan',
        'YT': 'Yukon'
}

In [16]:
def get_country(country):
    if country == 'US':
        return 'USA'
    if country == 'UK':
        return 'United Kingdom'
    if country == 'Mainland China':
        return 'China'
    return country

In [17]:
def get_state(state):
    st = state
    if "," in str(state):
        st = str(state).split(",")[1].strip()
    if st in states:
        return states[st]
    else:
        return state

In [18]:
def get_city(state):
    if "," in str(state):
        return str(state).split(",")[0].strip()
    return ""

In [19]:
df['City'] = df['Province/State'].apply(get_city)
df['State'] = df['Province/State'].apply(get_state)
df['Country'] = df['Country/Region'].apply(get_country)
df['Outbreak'] = 'COVID-19'

### Create City Dataset

In [20]:
df_city = df[df['City'].str.len() > 0]
df_city = df_city[['City', 'State', 'Country', 'Long', 'Lat', 'Outbreak', 'Confirmed', 'Deaths', 'Recovered', 'Last Update']]
df_city.to_csv("../data/city_COVID-19.csv")
df_city

Unnamed: 0,City,State,Country,Long,Lat,Outbreak,Confirmed,Deaths,Recovered,Last Update
46,King County,Washington,USA,-122.3321,47.6062,COVID-19,51,10,1,2020-03-05 22:03:17
59,Toronto,Ontario,Canada,-79.3832,43.6532,COVID-19,21,0,2,2020-03-05 17:53:03
61,Santa Clara,California,USA,-121.9552,37.3541,COVID-19,20,0,1,2020-03-05 22:53:03
63,Snohomish County,Washington,USA,-121.8339,48.033,COVID-19,18,1,0,2020-03-05 20:23:07
64,Westchester County,New York,USA,-73.7949,41.122,COVID-19,18,0,0,2020-03-05 21:33:03
75,Los Angeles,California,USA,-118.2437,34.0522,COVID-19,11,0,0,2020-03-05 21:03:14
90,Cook County,Illinois,USA,-87.6976,41.7377,COVID-19,5,0,2,2020-03-05 18:33:03
97,New York City,New York,USA,-74.006,40.7128,COVID-19,4,0,0,2020-03-05 21:03:13
104,Orange County,California,USA,-117.8531,33.7879,COVID-19,3,0,0,2020-03-04 02:13:11
105,San Diego County,California,USA,-117.1611,32.7157,COVID-19,3,0,1,2020-03-05 20:33:03


### Aggregate data by State
Latitude and longitude are average over all locations in a state

In [21]:
df_state = df[df['State'].str.len() > 0]
df_state = df_state.groupby(['State', 'Country'], as_index=False).agg({'Lat': "mean", 'Long': 'mean', 'Outbreak': 'first', 'Confirmed': "sum", "Deaths": 'sum', 'Recovered': 'sum', 'Last Update': 'max'})
df_state.to_csv("../data/state_COVID-19.csv")
df_state

Unnamed: 0,State,Country,Lat,Long,Outbreak,Confirmed,Deaths,Recovered,Last Update
0,Anhui,China,31.8257,117.2264,COVID-19,990,6,970,2020-03-05 04:33:02
1,Arizona,USA,33.35865,-112.18455,COVID-19,2,0,1,2020-03-03 17:33:02
2,Beijing,China,40.1824,116.4142,COVID-19,418,8,297,2020-03-04 23:23:01
3,British Columbia,Canada,49.2827,-123.1207,COVID-19,13,0,3,2020-03-05 04:43:03
4,California,USA,37.110623,-121.087446,COVID-19,51,1,2,2020-03-05 22:53:03
5,Chongqing,China,30.0572,107.874,COVID-19,576,6,512,2020-03-05 23:23:02
6,Diamond Princess cruise ship,Others,35.4437,139.638,COVID-19,706,6,10,2020-03-03 03:13:06
7,Florida,USA,28.6986,-83.9383,COVID-19,4,0,0,2020-03-05 20:03:03
8,From Diamond Princess,Australia,35.4437,139.638,COVID-19,0,0,0,2020-02-29 02:03:10
9,Fujian,China,26.0789,117.9874,COVID-19,296,1,277,2020-03-05 09:33:03


### Aggregate data by Country
Latitude and longitude are average over all locations in a country

In [22]:
df_country = df.groupby(['Country'], as_index=False).agg({'Lat': 'mean', 'Long': 'mean', 'Outbreak': 'first', 'Confirmed': 'sum', 'Deaths': 'sum', 'Recovered': 'sum', 'Last Update': 'max'})
df_country.to_csv("../data/country_COVID-19.csv")
df_country

Unnamed: 0,Country,Lat,Long,Outbreak,Confirmed,Deaths,Recovered,Last Update
0,Afghanistan,33.0,65.0,COVID-19,1,0,0,2020-02-24 23:33:02
1,Algeria,28.0339,1.6596,COVID-19,12,0,0,2020-03-04 19:33:03
2,Andorra,42.5063,1.5218,COVID-19,1,0,0,2020-03-02 20:23:16
3,Argentina,-38.4161,-63.6167,COVID-19,1,0,0,2020-03-04 01:33:07
4,Armenia,40.0691,45.0382,COVID-19,1,0,0,2020-03-01 19:53:02
5,Australia,-23.131538,140.060987,COVID-19,55,2,21,2020-03-05 18:33:03
6,Austria,47.5162,14.5501,COVID-19,41,0,0,2020-03-05 17:53:02
7,Azerbaijan,40.1431,47.5769,COVID-19,6,0,0,2020-03-05 13:53:03
8,Bahrain,26.0275,50.55,COVID-19,55,0,0,2020-03-05 13:53:03
9,Belarus,53.7098,27.9534,COVID-19,6,0,0,2020-03-04 12:43:03


### Add Strain data from Nextstrain.org
Data are provided by [Nextstrain.org](https://nextstrain.org), a resource forrReal-time tracking of pathogen evolution.

Check this [git repository](https://github.com/nextstrain/ncov) for the latest available dataset.

In [23]:
strains = pd.read_csv("https://github.com/nextstrain/ncov/raw/master/data/metadata.tsv", sep = '\t')

In [24]:
strains

Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,date,region,country,division,location,segment,length,host,age,sex,originating_lab,submitting_lab,authors,url,title
0,Anhui/SZ005/2020,ncov,EPI_ISL_413485,?,2020-01-24,Asia,China,Anhui,Suzhou,genome,29860,Human,58,Male,"Department of microbiology laboratory,Anhui Pr...","Department of microbiology laboratory,Anhui Pr...",Li et al,https://www.gisaid.org,?
1,Australia/NSW01/2020,ncov,EPI_ISL_407893,?,2020-01-24,Oceania,Australia,New South Wales,Sydney,genome,29782,Human,43,Male,Centre for Infectious Diseases and Microbiolog...,NSW Health Pathology - Institute of Clinical P...,Eden et al,https://www.gisaid.org,?
2,Australia/NSW05/2020,ncov,EPI_ISL_412975,?,2020-02-28,Oceania,Australia,New South Wales,Sydney,genome,29782,Human,43,Male,Centre for Infectious Diseases and Microbiolog...,NSW Health Pathology - Institute of Clinical P...,Eden et al,https://www.gisaid.org,?
3,Australia/NSW06/2020,ncov,EPI_ISL_413213,?,2020-02-29,Oceania,Australia,New South Wales,Sydney,genome,29782,Human,51,Female,Centre for Infectious Diseases and Microbiolog...,NSW Health Pathology - Institute of Clinical P...,Eden et al,https://www.gisaid.org,?
4,Australia/NSW07/2020,ncov,EPI_ISL_413214,?,2020-02-29,Oceania,Australia,New South Wales,Sydney,genome,29782,Human,53,Male,Centre for Infectious Diseases and Microbiolog...,NSW Health Pathology - Institute of Clinical P...,Eden et al,https://www.gisaid.org,?
5,Australia/QLD01/2020,ncov,EPI_ISL_407894,?,2020-01-28,Oceania,Australia,Queensland,Gold Coast,genome,29842,Human,44,Male,Pathology Queensland,Public Health Virology Laboratory,Huang et al,https://www.gisaid.org,?
6,Australia/QLD02/2020,ncov,EPI_ISL_407896,?,2020-01-30,Oceania,Australia,Queensland,Gold Coast,genome,29864,Human,42,Female,Pathology Queensland,Public Health Virology Laboratory,Huang et al,https://www.gisaid.org,?
7,Australia/QLD03/2020,ncov,EPI_ISL_410717,?,2020-02-05,Oceania,Australia,Queensland,Gold Coast,genome,29865,Human,8,Male,Pathology Queensland,Public Health Virology Laboratory,Huang et al,https://www.gisaid.org,?
8,Australia/QLD04/2020,ncov,EPI_ISL_410718,?,2020-02-05,Oceania,Australia,Queensland,Gold Coast,genome,29865,Human,37,Male,Pathology Queensland,Public Health Virology Laboratory,Huang et al,https://www.gisaid.org,?
9,Australia/VIC01/2020,ncov,EPI_ISL_406844,MT007544,2020-01-25,Oceania,Australia,Victoria,Clayton,genome,29893,Human,50,Male,Monash Medical Centre,Collaboration between the University of Melbou...,Caly et al,https://www.gisaid.org,?


In [25]:
strains_city = pd.merge(df_city, strains, left_on=['City','State','Country'], right_on=['location','division', 'country'])
strains_city = strains_city[['City','State','Country','strain','genbank_accession','division','location']]
strains_city.to_csv("../data/strains_city_COVID-19.csv")
strains_city

Unnamed: 0,City,State,Country,strain,genbank_accession,division,location
0,King County,Washington,USA,USA/WA-S2/2020,?,Washington,King County
1,Snohomish County,Washington,USA,USA/WA1/2020,MN985325,Washington,Snohomish County
2,Snohomish County,Washington,USA,USA/WA2/2020,?,Washington,Snohomish County
3,Los Angeles,California,USA,USA/CA1/2020,MN994467,California,Los Angeles
4,Orange County,California,USA,USA/CA2/2020,MN994468,California,Orange County
5,Boston,Massachusetts,USA,USA/MA1/2020,MT039888,Massachusetts,Boston


In [26]:
strains_state = pd.merge(df_state, strains, left_on=['State','Country'], right_on=['division', 'country'])
strains_state = strains_state[['State','Country','strain','genbank_accession','division','location']]
strains_state.to_csv("../data/strains_state_COVID-19.csv")
strains_state

Unnamed: 0,State,Country,strain,genbank_accession,division,location
0,Anhui,China,Anhui/SZ005/2020,?,Anhui,Suzhou
1,Anhui,China,Hefei/2/2020,?,Anhui,Hefei
2,Arizona,USA,USA/AZ1/2020,MN997409,Arizona,Phoenix
3,Beijing,China,Beijing/IVDC-BJ-005/2020,?,Beijing,?
4,British Columbia,Canada,Canada/BC_37_0-2/2020,?,British Columbia,?
5,California,USA,USA/CA1/2020,MN994467,California,Los Angeles
6,California,USA,USA/CA2/2020,MN994468,California,Orange County
7,California,USA,USA/CA3/2020,MT027062,California,?
8,California,USA,USA/CA4/2020,MT027063,California,?
9,California,USA,USA/CA5/2020,MT027064,California,?


In [27]:
strains_country = pd.merge(df_country, strains, left_on='Country', right_on='country')
strains_country = strains_country[['Country','strain','genbank_accession','division','location']]
strains_country.to_csv("../data/strains_country_COVID-19.csv")
strains_country

Unnamed: 0,Country,strain,genbank_accession,division,location
0,Australia,Australia/NSW01/2020,?,New South Wales,Sydney
1,Australia,Australia/NSW05/2020,?,New South Wales,Sydney
2,Australia,Australia/NSW06/2020,?,New South Wales,Sydney
3,Australia,Australia/NSW07/2020,?,New South Wales,Sydney
4,Australia,Australia/QLD01/2020,?,Queensland,Gold Coast
5,Australia,Australia/QLD02/2020,?,Queensland,Gold Coast
6,Australia,Australia/QLD03/2020,?,Queensland,Gold Coast
7,Australia,Australia/QLD04/2020,?,Queensland,Gold Coast
8,Australia,Australia/VIC01/2020,MT007544,Victoria,Clayton
9,Australia,Sydney/2/2020,?,New South Wales,Sydney


## Open Access Epidemiological Data from the COVID-19 Outbreak
Provided by the [Open COVID-19 Data Curation Group](https://doi.org/10.1016/S1473-3099(20)30119-5).


In [28]:
epi1 = pd.read_csv("https://docs.google.com/spreadsheets/d/1itaohdPiAeniCXNlntNztZ_oRvjh0HsGuJXUJWET008/export?format=csv")

In [29]:
epi2 = pd.read_csv("https://docs.google.com/spreadsheets/d/1itaohdPiAeniCXNlntNztZ_oRvjh0HsGuJXUJWET008/export?format=csv&gid=429276722")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [30]:
epi = pd.concat([epi1, epi2])
epi = epi.fillna('')

In [31]:
epi.head()

Unnamed: 0,ID,age,sex,city,province,country,wuhan(0)_not_wuhan(1),latitude,longitude,geo_resolution,date_onset_symptoms,date_admission_hospital,date_confirmation,symptoms,lives_in_Wuhan,travel_history_dates,travel_history_location,reported_market_exposure,additional_information,chronic_disease_binary,chronic_disease,source,sequence_available,outcome,date_death_or_discharge,notes_for_discussion,location,admin3,admin2,admin1,country_new,admin_id,data_moderator_initials,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40
0,1,30,male,"Chaohu City, Hefei City",Anhui,China,1,31.647,117.717,admin3,18.01.2020,20.01.2020,22.01.2020,,yes,17.01.2020,Wuhan,,,,,http://ah.people.com.cn/GB/n2/2020/0127/c35826...,,,,,,Chaohu City,Hefei City,Anhui,China,340181,,,,,,,,,
1,2,47,male,"Baohe District, Hefei City",Anhui,China,1,31.7786,117.332,admin3,10.01.2020,21.01.2020,23.01.2020,,no,10.01.2020,"Luzhou Hunan, via Wuhan",,,,,http://ah.people.com.cn/GB/n2/2020/0127/c35826...,,,,,,Baohe District,Hefei City,Anhui,China,340111,,,,,,,,,
2,3,49,male,"High-Tech Zone, Hefei City",Anhui,China,1,31.8283,117.225,point,15.01.2020,20.01.2020,23.01.2020,,no,10.01.2020,"Yinzhou Hunan, via Wuhan",,,,,http://ah.people.com.cn/GB/n2/2020/0127/c35826...,,,,,High-Tech Zone,Shushan District,Hefei City,Anhui,China,340104,,,,,,,,,
3,4,47,female,"High-Tech Zone, Hefei City",Anhui,China,1,31.8283,117.225,point,17.01.2020,20.01.2020,23.01.2020,,no,,,,contacted with confirmed case,,,http://ah.people.com.cn/GB/n2/2020/0127/c35826...,,,,,High-Tech Zone,Shushan District,Hefei City,Anhui,China,340104,,,,,,,,,
4,5,50,female,"Feidong County, Hefei City",Anhui,China,1,32.0012,117.568,admin3,10.01.2020,21.01.2020,23.01.2020,,no,07.01.2020,Wuhan,,"06.01.2020 went to Wuhan, 07.01.2020 returned ...",,,http://ah.people.com.cn/GB/n2/2020/0127/c35826...,,,,,,Feidong County,Hefei City,Anhui,China,340122,,,,,,,,,


In [32]:
epi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35860 entries, 0 to 21240
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ID                        35860 non-null  int64 
 1   age                       35860 non-null  object
 2   sex                       35860 non-null  object
 3   city                      35860 non-null  object
 4   province                  35860 non-null  object
 5   country                   35860 non-null  object
 6   wuhan(0)_not_wuhan(1)     35860 non-null  int64 
 7   latitude                  35860 non-null  object
 8   longitude                 35860 non-null  object
 9   geo_resolution            35860 non-null  object
 10  date_onset_symptoms       35860 non-null  object
 11  date_admission_hospital   35860 non-null  object
 12  date_confirmation         35860 non-null  object
 13  symptoms                  35860 non-null  object
 14  lives_in_Wuhan        

In [33]:
epi['travel_history_location'].unique()

array(['Wuhan', 'Luzhou Hunan, via Wuhan', 'Yinzhou Hunan, via Wuhan', '',
       'Hubei', 'Ezhou City, Hubei', 'Thailand', 'Changsha', 'Hangzhou',
       'Rushan City', 'Xiaogan', 'Bengbu', 'Hongshan District, Wuhan',
       'Qiaokou District, Wuhan', 'Wuchang District, Wuhan', 'Macheng',
       'Xiangtan', 'Wuhan via Shanghai', 'Singapore', 'China', 'Wuchang',
       'from Wuhan , traveled to Johor from Singapore', 'Tonglu County',
       'Hong Kong', 'Shiyan county, Hubei Province', 'Xianyang City',
       'None', 'Wuzhou', 'Shanghai', 'Xiaogan City',
       'Hanuang District, Wuhan', 'Bavaria, Germany',
       'Went to Thailand; had dinner with firedns prior to trip',
       'Beijing, Zhuzhou, Hunan, and Chengdu, Sichuan',
       'Sanxiang, Zhongshan', 'Wuhan via Hong Kong', 'Wuhan via Qingdao',
       'Japan', 'Osaka to Tokyo; on tour bus for chinese tourists',
       'Yokohama', 'Suizhou City, Hubei', 'Wuhan then to Sanya, Hainan',
       "Wuhan to Xi'an", 'Guangzhou', 'Taizhou, 

In [34]:
epi = epi[['ID','age','sex','city','province','country','latitude','longitude','symptoms','date_onset_symptoms','date_admission_hospital','date_confirmation','travel_history_dates','travel_history_location']]
epi.to_csv("../data/epi_data_COVID-19.csv")
epi.head(100)

Unnamed: 0,ID,age,sex,city,province,country,latitude,longitude,symptoms,date_onset_symptoms,date_admission_hospital,date_confirmation,travel_history_dates,travel_history_location
0,1,30.0,male,"Chaohu City, Hefei City",Anhui,China,31.647,117.717,,18.01.2020,20.01.2020,22.01.2020,17.01.2020,Wuhan
1,2,47.0,male,"Baohe District, Hefei City",Anhui,China,31.7786,117.332,,10.01.2020,21.01.2020,23.01.2020,10.01.2020,"Luzhou Hunan, via Wuhan"
2,3,49.0,male,"High-Tech Zone, Hefei City",Anhui,China,31.8283,117.225,,15.01.2020,20.01.2020,23.01.2020,10.01.2020,"Yinzhou Hunan, via Wuhan"
3,4,47.0,female,"High-Tech Zone, Hefei City",Anhui,China,31.8283,117.225,,17.01.2020,20.01.2020,23.01.2020,,
4,5,50.0,female,"Feidong County, Hefei City",Anhui,China,32.0012,117.568,,10.01.2020,21.01.2020,23.01.2020,07.01.2020,Wuhan
5,6,,,Lu'an City,Anhui,China,31.7594,116.315,pneumonia,,,24.01.2020,,
6,7,42.0,female,Fuyang City,Anhui,China,32.9188,115.704,fever,21.01.2020,21.01.2020,22.01.2020,19.01.2020,Wuhan
7,8,,female,Huaibei City,Anhui,China,33.7272,116.742,,,,25.01.2020,13.01.2020,Wuhan
8,9,59.0,female,Huainan City,Anhui,China,32.7574,116.734,fever,19.01.2020,24.01.2020,26.01.2020,22.01.2020,Wuhan
9,10,30.0,male,Hefei City,Anhui,China,31.7944,117.343,,17.01.2020,22.01.2020,23.01.2020,21.01.2020,Wuhan
