This section is used to collect data for other sections <br>
<b><font size=7>Table of contents</font></b><br>

<font size=3.5>1. [Data collection](#1)<br>
&emsp;1.1. [Covid-19 World data](#1.1)<br>
&emsp;&emsp;1.1.1. [Data from WHO website](#1.1.1)<br>
&emsp;&emsp;1.1.2. [Data from Johns Hopkins University](#1.1.2)<br>
&emsp;&emsp;1.1.3. [Data from Ourworldindata](#1.1.3)<br>
&emsp;&emsp;1.1.4. [World census data](#1.1.4)<br>
&emsp;1.2. [Covid-19 VietNam data](#1.2)<br>
&emsp;&emsp;1.2.1. [Population data](#1.2.1)<br>
&emsp;&emsp;1.2.2. [Vaccine import and distribution](#1.2.2)<br>
&emsp;&emsp;&ensp;a. [Distribution data](#1.2.2.a)<br>
&emsp;&emsp;&ensp;b. [Import data](#1.2.2.b)<br>
&emsp;&emsp;1.2.3. [Covid-19 data from Ministry of Health](#1.2.3)<br>
&emsp;&emsp;&ensp;a. [Total case in VietNam](#1.2.3.a)<br>
&emsp;&emsp;&ensp;b. [Case by province](#1.2.3.b)<br>
&emsp;&emsp;&ensp;c. [Case by age/gender](#1.2.3.c)<br>
&emsp;&emsp;&ensp;d. [Death case by province](#1.2.3.d)<br>
&emsp;1.3. [Robust weather condition](#1.3)<br>
&emsp;&emsp;1.3.1. [Average annual wind speed](#1.3.1)<br>
&emsp;&emsp;1.3.2. [Average annual temperature](#1.3.2)<br>
&emsp;1.4. [Updated time](#1.4)<br>
</font>

In [None]:
import numpy as np
import pandas as pd
import datetime
import requests
import json
import re
import time
from IPython.display import display
import os
import traceback
from covid_science.collection import store_dataframe,message_box,browser_request
from covid_science.preparation import read_stored_data
from covid_science.workers import vn_casualty_wrapper_func,temperature_data_wrapper_func
from functools import partial
from multiprocessing import Manager,Pool

In [None]:
# Default settings for the automated data collection.
# Return codes compared below: 2 = cancel, 6 = yes, 7 = no.
continue_collect = True
init_settings = [6] * 8
basic_settings = [7, 6, 7, 7, 6, 7, 7, 7]
auto_process = False

if not os.path.exists("database/Covid-19_raw_data"):
    # First run in this directory: only the default settings are offered.
    answer = message_box("Data collection settings",
                         "This directory is new. Default data collection settings will be applied."
                         " Do you want to continue?",
                         4+48+4096)
    if answer == 7:  # no
        continue_collect = False
    else:
        basic_settings = init_settings
else:
    # Data was collected here before: let the user keep or customise the defaults.
    answer = message_box("Data collection settings",
                         "Do you want to gather data based on default setting?",
                         3+32+4096)
    if answer == 2:  # cancel
        continue_collect = False
    elif answer != 6:  # anything but "yes": ask about every optional step in turn
        setting_questions = [
            "Vaccine metadata for each country might not be updated rapidly. "
            "Do you want to collect vaccine metadata?",
            "Source file from Ourworldindata is large. Do you want to process?",
            "Do you want to collect each country census data again?",
            "Do you want to save VietNam population data?",
            "Do you want to collect VietNam vaccine import data for today?",
            "Do you want to collect VietNam province code again "
            "before processing covid-19 death data for today?",
            "Do you want to collect each country average wind speed again? Please note that data won't have much changes.",
            "Do you want to collect each country average annual temperature again? Please note that data won't have much changes.",
        ]
        # The position of a question is the index of the setting it controls.
        for position, question in enumerate(setting_questions):
            basic_settings[position] = message_box("Data collection settings",
                                                   question,
                                                   4+32+4096)
        auto_setting = message_box("Data collection settings",
                                   "Do you want to auto process all data (new file will override old file when conflicts, "
                                   "file with no new information can still be created when called to process)?",
                                   4+32+4096)
        auto_process = auto_setting == 6

<font size=10>**1.Data collection**</font><a name="1"></a><br>
<font size=6>**1.1.Covid-19 World data**</font><a name="1.1"></a><br>
<ins><font size=5>**1.1.1.Data from WHO website**</font></ins><a name="1.1.1"></a><br>
Reference link: [https://covid19.who.int/info/](https://covid19.who.int/info/)<br>
* <ins>Get request & store infect/death case data</ins>

In [None]:
if continue_collect:
    try:
        # Make sure the WHO raw-data folder exists before anything is written.
        folder_path = "database/Covid-19_raw_data/WHO"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print("New folder created.")

        # Download the global case/death table, preview it and store it.
        df_WHO_statistic = pd.read_csv('https://covid19.who.int/WHO-COVID-19-global-data.csv',
                                       encoding='utf-8-sig')
        display(df_WHO_statistic.head(3))
        df_WHO_statistic.to_csv("{}/statistic.csv".format(folder_path), index=False)
    except Exception:
        # Best effort: report the failure and let the notebook continue.
        traceback.print_exc()

* <ins>Get request & store vaccine type data</ins>

In [None]:
if continue_collect and basic_settings[0] == 6:
    try:
        # Download the WHO vaccination metadata table and preview it.
        df_WHO_vaccine_meta = pd.read_csv('https://covid19.who.int/who-data/vaccination-metadata.csv',
                                          encoding='utf-8')
        display(df_WHO_vaccine_meta.head(3))

        # Store it next to the other WHO raw files.
        folder_path = "database/Covid-19_raw_data/WHO"
        df_WHO_vaccine_meta.to_csv("{}/vaccine_meta.csv".format(folder_path),
                                   encoding='utf-8',
                                   index=False)
    except Exception:
        traceback.print_exc()

* <ins>Get request & store vaccination data</ins>

In [None]:
if continue_collect:
    # Target folder shared by store_dataframe and the fallback dump below.
    vaccination_folder = "database/Covid-19_raw_data/WHO/vaccination"
    try:
        df_WHO_vaccination = pd.read_csv('https://covid19.who.int/who-data/vaccination-data.csv',encoding='utf-8')
        display(df_WHO_vaccination.head(3))
        store_dataframe(df_WHO_vaccination,vaccination_folder,auto_process=auto_process,encoding='utf-8')
    except Exception as error:
        # Compare against str(error) rather than error.args[0]: args may be
        # empty or hold a non-string object (e.g. a wrapped network error),
        # and the membership test would then raise inside this handler and
        # hide the real failure.
        if 'Input dataframe has some errors/format problems. File will not be saved.' in str(error):
            # This means a new column might have been added to the main data:
            # keep today's download as a dated snapshot instead.
            try:
                os.makedirs(vaccination_folder,exist_ok=True)
                df_WHO_vaccination.to_csv(vaccination_folder +"/"+ datetime.date.today().strftime("%Y-%m-%d") +".csv",encoding="utf-8",index=False)
            except Exception:
                traceback.print_exc()
        else:
            traceback.print_exc()

* <ins>Get request & store protection measure data</ins>

In [None]:
if continue_collect:
    try:
        # Download the WHO public-health measure severity table and preview it.
        measure_url = 'https://covid19.who.int/who-data/phsm-severity-data.csv'
        df_WHO_measure = pd.read_csv(measure_url, encoding='utf-8')
        display(df_WHO_measure.head(3))

        # Store it next to the other WHO raw files.
        folder_path = "database/Covid-19_raw_data/WHO"
        df_WHO_measure.to_csv("{}/measure.csv".format(folder_path),
                              encoding='utf-8',
                              index=False)
    except Exception:
        traceback.print_exc()

<ins><font size="5">**1.1.2.Data from Johns Hopkins University**</font></ins><a name="1.1.2"></a><br>
Reference link: [https://github.com/CSSEGISandData/COVID-19](https://github.com/CSSEGISandData/COVID-19)<br>
* <ins>Processing & store daily case data</ins>

In [None]:
if continue_collect:
    try:
        # Walk backwards one day at a time, starting today, until a daily
        # report can be downloaded.
        # NOTE(review): 2020-01-22 is taken to be the first daily report in the
        # JHU repository; it only serves as a stop condition so that the loop
        # cannot run forever when every download fails (e.g. no network).
        first_report_day = datetime.date(2020, 1, 22)
        not_latest_data = True
        day_count = 0
        while not_latest_data:
            report_day = datetime.datetime.now()-datetime.timedelta(day_count)
            if report_day.date() < first_report_day:
                raise RuntimeError("No JHU daily report could be downloaded for any date.")
            today = report_day.strftime('%m-%d-%Y')
            search_string = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/'+ today +'.csv'
            try:
                df_JH_daily = pd.read_csv(search_string)
            except Exception:
                # "except Exception" (not a bare except) keeps Ctrl+C / kernel
                # interrupts working while the loop retries older dates.
                day_count += 1
                print('Data for ' + today + ' isn\'t updated yet!')
            else:
                print('Lastest data is on ' + today + '.')
                not_latest_data = False

        folder_path = "database/Covid-19_raw_data/JHU/daily"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print("New folder created.")

        # The file is stored under the ISO form of the report date that was found.
        csv_name = report_day.strftime("%Y-%m-%d") +".csv"
        display(df_JH_daily.head(3))
        df_JH_daily.to_csv(folder_path + "/" + csv_name,encoding='utf-8',index=False)
        print("Data collected.")
    except Exception:
        traceback.print_exc()

* <ins>Get request & store infected case time series</ins>

In [None]:
if continue_collect:
    try:
        # Download the global confirmed-case time series and preview it.
        confirmed_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
        df_JH_confirm_time_sery = pd.read_csv(confirmed_url)
        display(df_JH_confirm_time_sery.head(3))

        # Store it in the JHU raw-data folder.
        folder_path = "database/Covid-19_raw_data/JHU"
        df_JH_confirm_time_sery.to_csv("{}/time_sery.csv".format(folder_path),
                                       encoding='utf-8',
                                       index=False)
    except Exception:
        traceback.print_exc()

* <ins>Get request & store death case time series</ins>

In [None]:
if continue_collect:
    try:
        # Download the global death time series and preview it.
        deaths_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
        df_JH_death_time_sery = pd.read_csv(deaths_url)
        display(df_JH_death_time_sery.head(3))

        # Store it in the JHU raw-data folder.
        folder_path = "database/Covid-19_raw_data/JHU"
        df_JH_death_time_sery.to_csv("{}/death_time_sery.csv".format(folder_path),
                                     encoding='utf-8',
                                     index=False)
    except Exception:
        traceback.print_exc()

* <ins>Processing & store vaccinated population data</ins>

In [None]:
if continue_collect:
    try:
        # Download the global vaccine time series and preview it.
        vaccine_url = 'https://raw.githubusercontent.com/govex/COVID-19/master/data_tables/vaccine_data/global_data/time_series_covid19_vaccine_global.csv'
        df_JH_vaccinate = pd.read_csv(vaccine_url)
        display(df_JH_vaccinate.head(3))

        # Store it in the JHU raw-data folder.
        folder_path = "database/Covid-19_raw_data/JHU"
        df_JH_vaccinate.to_csv("{}/vaccinate.csv".format(folder_path),
                               encoding='utf-8',
                               index=False)
    except Exception:
        traceback.print_exc()

<ins><font size="5">**1.1.3.Data from Ourworldindata**</font></ins><a name="1.1.3"></a><br>
Reference link:<br>
[https://ourworldindata.org/coronavirus#coronavirus-country-profiles](https://ourworldindata.org/coronavirus#coronavirus-country-profiles)<br>
[https://github.com/owid/covid-19-data/tree/master/public/data/vaccinations](https://github.com/owid/covid-19-data/tree/master/public/data/vaccinations)

In [None]:
if continue_collect and basic_settings[1] == 6:
    try:
        # Make sure the target folder exists.
        folder_path = "database/Covid-19_raw_data/OurWorldinData"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print("New folder created.")

        # Full data set: preview it and store it unchanged.
        df_oxford = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv',
                                encoding='utf-8')
        display(df_oxford.head(3))
        df_oxford.to_csv(folder_path + "/covid_data.csv", encoding='utf-8', index=False)

        # Reduced copy holding only the columns listed here.
        simplified_columns = ['iso_code', 'continent', 'location', 'date',
                              'new_cases', 'new_deaths',
                              'new_vaccinations_smoothed', 'population']
        df_oxford_simplified = df_oxford[simplified_columns]
        df_oxford_simplified.to_csv(folder_path + "/covid_data_simplified.csv",
                                    encoding='utf-8',
                                    index=False)

        # Vaccinations split by manufacturer.
        df_oxford_vac_manufacture = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations-by-manufacturer.csv',
                                                encoding='utf-8')
        display(df_oxford_vac_manufacture.head(3))
        df_oxford_vac_manufacture.to_csv(folder_path + "/vaccine_volume.csv",
                                         encoding='utf-8',
                                         index=False)

        # Vietnam vaccine dose usage, taken from the simplified table.
        is_vietnam = df_oxford_simplified.location == "Vietnam"
        vietnam_doses = df_oxford_simplified.loc[is_vietnam, ["date", "new_vaccinations_smoothed"]]
        vietnam_doses.reset_index(drop=True).to_csv(folder_path + "/vietnam_vaccine.csv",
                                                    encoding='utf-8',
                                                    index=False)
    except Exception:
        traceback.print_exc()

In [None]:
# Backup link for the Ourworldindata data set. It is kept commented out on
# purpose; uncomment the two lines below to load the data from this address.
#df_oxford = pd.read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv',encoding='utf-8')
#df_oxford.head(3)

<ins><font size="5">**1.1.4.World census data**</font></ins><a name="1.1.4"></a><br>
Reference link: [https://population.un.org/wpp/Download/Standard/CSV/](https://population.un.org/wpp/Download/Standard/CSV/)

In [None]:
if basic_settings[2]==6 and continue_collect:
    try:
        folder_path = "database/Covid-19_raw_data/census_data"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print("New folder created.")

        # The header row sits at row index 16 of the sheet and '...' marks a
        # missing value (see the read_excel arguments).
        data_df = pd.read_excel('https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/EXCEL_FILES/1_General/WPP2022_GEN_F01_DEMOGRAPHIC_INDICATORS_COMPACT_REV1.xlsx',
                       header=16,na_values='...')

        # Identification columns shared by the three output files.
        id_columns = ['Region, subregion, country or area *','Location code','ISO3 Alpha-code','Year','Type']
        population_column = 'Total Population, as of 1 January (thousands)'
        birth_column = 'Crude Birth Rate (births per 1,000 population)'
        death_column = 'Crude Death Rate (deaths per 1,000 population)'

        data_df = data_df[id_columns + [population_column,birth_column,death_column]].copy()
        data_df.dropna(subset=['Year'],inplace=True)
        data_df.reset_index(drop=True,inplace=True)
        # Assign by column name: "df.iloc[:, 3] = ..." writes into the existing
        # float column on pandas >= 2.0, so the int64 cast would be lost.
        data_df['Year'] = data_df['Year'].astype(np.int64)

        population_df = data_df[id_columns + [population_column]].copy()
        display(population_df.head(3))
        population_df.to_csv(folder_path + "/population.csv",encoding="utf-8",index=False)

        birth_df = data_df[id_columns + [birth_column]].copy()
        display(birth_df.head(3))
        birth_df.to_csv(folder_path + "/crude_birth_rate.csv",encoding="utf-8",index=False)

        death_df = data_df[id_columns + [death_column]].copy()
        display(death_df.head(3))
        death_df.to_csv(folder_path + "/crude_death_rate.csv",encoding="utf-8",index=False)

        print("Data collected.")
    except Exception:
        traceback.print_exc()

<ins><font size="6">**1.2.Covid-19 VietNam data**</font></ins><a name="1.2"></a><br>
<ins><font size="5">**1.2.1.Population data**</font></ins><a name="1.2.1"></a><br>
Reference link: [https://www.gso.gov.vn/px-web-2/?pxid=V0201&theme=D%C3%A2n%20s%E1%BB%91%20v%C3%A0%20lao%20%C4%91%E1%BB%99ng](https://www.gso.gov.vn/px-web-2/?pxid=V0201&theme=D%C3%A2n%20s%E1%BB%91%20v%C3%A0%20lao%20%C4%91%E1%BB%99ng)


In [None]:
if basic_settings[3]==6 and continue_collect:
    try:
        #request and process data
        url = 'https://pxweb.gso.gov.vn/sq/86d18bc5-2888-413d-b749-44ea803e1c86'
        vietnam_population = pd.read_excel(url,skiprows=2)

        # Every header holding a number above 1990 is treated as a year and
        # listed three times, once per criteria row taken below.
        # NOTE(review): presumably the remaining headers are pandas'
        # "Unnamed: N" placeholders for merged cells -- confirm with the sheet.
        statistical_year = []
        for year in vietnam_population.columns:
            digits = re.findall(r'\d+', str(year))
            if not digits:
                # A header without any digit cannot be a year; skip it instead
                # of failing with IndexError.
                continue
            year_value = int(digits[0])
            if year_value > 1990:
                statistical_year.extend([year_value] * 3)
        # The three criteria labels, repeated once for every year found.
        dummy = vietnam_population.transpose().iloc[1:4,0].to_list()
        year_parameters = dummy * (len(statistical_year)//3)

        #format data
        vietnam_population = vietnam_population.transpose()
        vietnam_population.drop(vietnam_population.columns[0],axis=1,inplace=True)
        vietnam_population.columns=vietnam_population.iloc[0,:]
        vietnam_population.drop(vietnam_population.index[0],axis=0,inplace = True)
        vietnam_population.set_index([statistical_year,year_parameters],inplace=True)
        vietnam_population.rename(columns={'Thanh Hoá':'Thanh Hóa','Khánh Hoà':'Khánh Hòa','Hoà Bình':'Hòa Bình','TP.Hồ Chí Minh':'TP. Hồ Chí Minh'},inplace=True)
        vietnam_population.sort_index(axis=1,inplace=True)
        vietnam_population.reset_index(inplace=True)
        vietnam_population.rename(columns={"level_0":"year","level_1":"criteria"},inplace=True)
        # Assign the result back: an in-place replace on the attribute-accessed
        # column is a chained assignment and is not guaranteed to reach the
        # frame (pandas Copy-on-Write).
        vietnam_population['criteria'] = vietnam_population['criteria'].replace(
            ['Diện tích(Km2)','Dân số trung bình (Nghìn người)','Mật độ dân số (Người/km2)'],
            ['Area(km2)','Average population(thousand)','Population density(person/km2)'])
        display(vietnam_population.head(3))

        #save the data
        folder_path = "database/Covid-19_raw_data/VietNamData"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print("New folder created.")
        vietnam_population.to_csv(folder_path +"/"+"vietnam_population.csv",encoding="utf-8",index=False)
        print("Data collected.")
    except Exception:
        traceback.print_exc()

<ins><font size=5>**1.2.2.Vaccine import and distribution**</font></ins><a name="1.2.2"></a><br>
**a. Distribution data**<a name="1.2.2.a"></a><br>
Reference link: [https://tiemchungcovid19.gov.vn/portal](https://tiemchungcovid19.gov.vn/portal)

In [None]:
if continue_collect:
    try:
        #request data
        # A timeout keeps the cell from hanging forever on an unresponsive
        # server; raise_for_status turns HTTP errors into a clear exception
        # instead of a confusing JSON/column failure further down.
        # (Certificate verification stays enabled; the original left a
        # commented-out "verify=False" here.)
        vaccine_distribution = requests.get('https://tiemchungcovid19.gov.vn/api/public/dashboard/vaccination-statistics/all',timeout=60)
        vaccine_distribution.raise_for_status()

        #format data
        df_vaccine_distribution = pd.json_normalize(vaccine_distribution.json())
        # .copy() so that the assignments below act on an independent frame
        # rather than on a slice of the normalised one.
        df_vaccine_distribution = df_vaccine_distribution[['provinceName','population','popOverEighteen','totalOnceInjected','totalTwiceInjected','totalInjected','totalVaccineAllocated','totalVaccineAllocatedReality']].copy()
        # Align province spellings with the other data sets of this notebook.
        df_vaccine_distribution.loc[df_vaccine_distribution.provinceName=='Hồ Chí Minh','provinceName']='TP. Hồ Chí Minh'
        df_vaccine_distribution.loc[df_vaccine_distribution.provinceName=='Hoà Bình','provinceName']='Hòa Bình'
        df_vaccine_distribution.sort_values('provinceName',inplace=True,ignore_index=True,kind = 'stable')
        display(df_vaccine_distribution.head(3))

        #save data, one file per collection day
        folder_path = "database/Covid-19_raw_data/VietNamData/vaccine_distribution"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print("New folder created.")
        df_vaccine_distribution.to_csv(folder_path +"/"+ datetime.date.today().strftime("%Y-%m-%d") +".csv",encoding="utf-8",index=False)
        print("Data collected.")
    except Exception:
        traceback.print_exc()

**b. Import data**<a name="1.2.2.b"></a><br>
Reference link: [https://en.wikipedia.org/wiki/COVID-19_vaccination_in_Vietnam#cite_note-102](https://en.wikipedia.org/wiki/COVID-19_vaccination_in_Vietnam#cite_note-102)

In [None]:
if continue_collect and basic_settings[4] == 6:
    try:
        # Request data: the table at position 2 of the Wikipedia page.
        page_tables = pd.read_html('https://en.wikipedia.org/wiki/COVID-19_vaccination_in_Vietnam#cite_note-102')
        vaccine_table = page_tables[2]
        display(vaccine_table.head(3))

        # Format data: keep rows that state a dose count, strip bracketed
        # footnote markers and thousands separators, then convert to int.
        doses = vaccine_table['Doses available']
        has_doses = (doses != "None") & doses.notnull()
        vaccine_import_df = vaccine_table.loc[has_doses].copy()
        without_footnotes = vaccine_import_df['Doses available'].apply(
            lambda x: re.sub(r"\[([A-Za-z0-9\s]*)\]", '', x.strip()))
        vaccine_import_df['Doses available'] = without_footnotes.str.replace(",", "").astype(int)
        vaccine_import_df = vaccine_import_df[['Vaccine', 'Doses available']]

        # Save data, one file per collection day.
        folder_path = "database/Covid-19_raw_data/VietNamData/vaccine_imported"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print("New folder created.")
        vaccine_import_df = vaccine_import_df.drop_duplicates(ignore_index=True)
        vaccine_import_df.to_csv(folder_path + "/" + datetime.date.today().strftime("%Y-%m-%d") + ".csv",
                                 encoding="utf-8",
                                 index=False)
        print("Data collected.")
    except Exception:
        traceback.print_exc()

<ins><font size=5>**1.2.3. Covid-19 data from Ministry of Health**</font></ins><a name="1.2.3"></a><br>
Reference link: [https://ncov.vncdc.gov.vn/](https://ncov.vncdc.gov.vn/)<br>
(Since the MoH website isn't very friendly to web scraping, some support functions are used to help collect the data instead of calling requests.get directly.)<br>
**a. Total case in VietNam**<a name="1.2.3.a"></a><br>
* <ins>General input parameters</ins>

In [None]:
# Addresses used for the country-level case request.
search_scope = ['https://ncov.vncdc.gov.vn/v2/']
api_country_case = "https://ncov.vncdc.gov.vn/v2/vietnam/report-epi?"
link = "https://ncov.vncdc.gov.vn/"

# Settings handed to browser_request.
# NOTE(review): units and exact meaning are defined by
# covid_science.collection.browser_request -- confirm there.
wait_time, attempt, interval = 30, 3, 10

request_obj = browser_request(
    wait_time=wait_time,
    attempt=attempt,
    interval=interval,
)

* <ins>Processing & store data</ins>

In [None]:
if continue_collect:
    try:
        # Request the API response.
        json_object = request_obj.get_web_response(website_link=link,
                                                   api=api_country_case,
                                                   process_time=True,
                                                   search_scope=search_scope)

        # Flatten report -> data into rows, keeping each report's name and code.
        df_vncdc_country_case = pd.json_normalize(json_object,
                                                  record_path=['report', 'data'],
                                                  meta=[['report', 'name'], ['report', 'code']])
        # Column 0 holds millisecond timestamps; keep only the calendar date.
        df_vncdc_country_case[0] = pd.to_datetime(df_vncdc_country_case[0], unit='ms').dt.date
        df_vncdc_country_case = df_vncdc_country_case.rename(
            columns={0: "date", 1: "case", "report.name": "case_origin", "report.code": "code"})
        is_average = df_vncdc_country_case.code == "avg_case"
        df_vncdc_country_case.loc[is_average, 'code'] = "7_days_avg_case"

        # For some reason the dates move back by one day when the data is
        # requested late at night; a first row dated 2020-01-22 signals that
        # shift, so every date is moved forward by one day.
        first_date_is_shifted = (df_vncdc_country_case.date.head(1) == datetime.date(2020, 1, 22)).all()
        if first_date_is_shifted:
            df_vncdc_country_case.date = df_vncdc_country_case.date + datetime.timedelta(1)
        df_vncdc_country_case.date = df_vncdc_country_case.date.astype(str)
        display(df_vncdc_country_case.head(3))

        # Save data.
        store_dataframe(df_vncdc_country_case,
                        "database/Covid-19_raw_data/VietNamData/Ministry_of_Health_data/total_case",
                        auto_process=auto_process,
                        encoding='utf-8')
    except Exception:
        traceback.print_exc()

**b. Case by province**<a name="1.2.3.b"></a><br>
* <ins>General input parameters</ins>

In [None]:
# Settings handed to browser_request for the case-by-province request.
wait_time=25
attempt=3
interval=5

# The end-time placeholder belongs after "endTime=". The previous
# "endTime{}=" form only produced a valid query string because the end time
# is empty; with '' the resulting link is unchanged.
link_template = "https://ncov.vncdc.gov.vn/viet-nam-full.html?startTime={}&endTime={}&provinces=&districts=&tabKey=0"
link = link_template.format('2020-01-01','')
api_province_case = "https://ncov.vncdc.gov.vn/v2/vietnam/by-current?"
search_scope = ['https://ncov.vncdc.gov.vn/v2/']

request_obj = browser_request(wait_time=wait_time,
                              attempt=attempt,
                              interval=interval)

* <ins>Processing & store data</ins>

In [None]:
if continue_collect:
    try:
        #request API response
        json_object = request_obj.get_web_response(website_link=link,api=api_province_case,process_time=True,search_scope=search_scope)

        #format data
        df_vncdc_province_case = pd.json_normalize(json_object)
            ##remove path of nested reports from column name
            ##(literal match: with regex=True the '.' in 'data.' matched ANY
            ##character, so a name such as 'dataX...' would have been mangled)
        df_vncdc_province_case.columns = df_vncdc_province_case.columns.str.replace('data.','',regex=False)

            ##Synchronize all province name with other dataframe
        list1 = df_vncdc_province_case.tinh.apply(
            lambda x: x.replace('Thành phố ','').replace('Tỉnh ','').replace('Hồ Chí Minh','TP. Hồ Chí Minh')).to_list()
        df_vncdc_province_case.tinh = list1
        df_vncdc_province_case.loc[df_vncdc_province_case.tinh=='Hoà Bình','tinh']='Hòa Bình'
            ##drop the "Chưa rõ" rows; .copy() makes the filtered frame
            ##independent so the in-place operations below do not act on a slice
        df_vncdc_province_case=df_vncdc_province_case.loc[~(df_vncdc_province_case.tinh=="Chưa rõ")].copy()
        df_vncdc_province_case.sort_values(by='tinh',inplace=True,ignore_index=True,kind = 'stable')
            ##drop some columns to keep data consistently as time sery
            ##NOTE(review): columns are dropped by position, so this relies on
            ##the API returning its fields in a fixed order -- confirm
        df_vncdc_province_case.drop(df_vncdc_province_case.columns[[2,3,4,6,7,8,9,10,11]],axis=1,inplace=True)
        df_vncdc_province_case.rename(columns={"tinh":"provinceName"},inplace=True)
        display(df_vncdc_province_case.head(3))

        #save data
        store_dataframe(df_vncdc_province_case,"database/Covid-19_raw_data/VietNamData/Ministry_of_Health_data/case_by_province",horizontal_sery=True,auto_process=auto_process,encoding='utf-8')
    except Exception:
        traceback.print_exc()

**c. Case by age/gender**<a name="1.2.3.c"></a><br>
* <ins>General input parameters</ins>

In [None]:
# Settings handed to browser_request for the case-by-age/gender request.
wait_time=20
attempt=3
interval=5

# The end-time placeholder belongs after "endTime=". The previous
# "endTime{}=" form only produced a valid query string because the end time
# is empty; with '' the resulting link is unchanged.
link_template = "https://ncov.vncdc.gov.vn/viet-nam-full.html?startTime={}&endTime={}&provinces=&districts=&tabKey=0"
link = link_template.format('2020-01-01','')
api_age_case = "https://ncov.vncdc.gov.vn/v2/vietnam/dotuoi-2?"
search_scope = ['https://ncov.vncdc.gov.vn/v2/']

request_obj = browser_request(wait_time=wait_time,
                              attempt=attempt,
                              interval=interval)

* <ins>Processing & store data</ins>

In [None]:
if continue_collect:
    try:
        #request API response
        json_object = request_obj.get_web_response(website_link=link,api=api_age_case,process_time=True,search_scope=search_scope)

        #format data
            ##change record format for json normalize: each inner
            ##{age_range: case} mapping becomes a list of [age_range, case] pairs
        for item in json_object:
            for data in item['data']:
                data['data'] = [[age_range, case] for age_range, case in data['data'].items()]

            ##flatten data -> data into rows, keeping the group name and gender
        df_vncdc_age_case = pd.json_normalize(json_object,record_path=['data','data'],meta=[['name'],['data','type']])
        df_vncdc_age_case.rename(columns={0:"age_range",1:"case","name":"distribution","data.type":"gender"},inplace=True)
            ##translate the labels ("Nhóm 2" was previously mapped to "Group 3",
            ##which merged two different groups under one label)
        df_vncdc_age_case['distribution'] = df_vncdc_age_case['distribution'].replace(
            {"Nhóm 1":"Group 1","Nhóm 2":"Group 2","Nhóm 3":"Group 3"})
        df_vncdc_age_case['gender'] = df_vncdc_age_case['gender'].replace({"Nam":"Male","Nữ":"Female"})
        display(df_vncdc_age_case.head(3))

        #save data
        store_dataframe(df_vncdc_age_case,"database/Covid-19_raw_data/VietNamData/Ministry_of_Health_data/case_by_age",auto_process=auto_process,encoding='utf-8')
    except Exception:
        traceback.print_exc()

**d. Death case by province**<a name="1.2.3.d"></a><br>
* <ins>Input parameters for province code</ins>

In [None]:
# Settings handed to browser_request for the province-code request.
wait_time=20
attempt=3
interval=5

# The end-time placeholder belongs after "endTime=". The previous
# "endTime{}=" form only produced a valid query string because the end time
# is empty; with '' the resulting link is unchanged.
link_template = "https://ncov.vncdc.gov.vn/viet-nam-full.html?startTime={}&endTime={}&provinces=&districts=&tabKey=0"
link = link_template.format('2020-01-01','')
api_province_code = "https://ncov.vncdc.gov.vn/v2/vietnam/provinces?"
search_scope = ['https://ncov.vncdc.gov.vn/v2/']

request_obj = browser_request(wait_time=wait_time,
                              attempt=attempt,
                              interval=interval)

* <ins>Processing and store province code</ins>

In [None]:
if basic_settings[5]==6 and continue_collect:
    try:
        #request API response
        json_object = request_obj.get_web_response(website_link=link,api=api_province_code,process_time=True,search_scope=search_scope)

        #format data
        df_province_code = pd.json_normalize(json_object)
            ##.copy() so the renaming/assignments below act on an independent
            ##frame instead of a column slice
        df_province_code = df_province_code[['label','value']].copy()
        df_province_code.columns = ['province','id']
            ##Synchronize all province name with other dataframe
        list1 = df_province_code.province.apply(
            lambda x: x.replace('Thành phố ','').replace('Tỉnh ','').replace('Hồ Chí Minh','TP. Hồ Chí Minh')).to_list()
        df_province_code.province = list1
        df_province_code.loc[df_province_code.province=='Hoà Bình','province']='Hòa Bình'
            ##drop the "Chưa rõ" rows; .copy() again so the in-place sort does
            ##not run on a filtered slice (chained-assignment warning)
        df_province_code=df_province_code.loc[~(df_province_code.province=="Chưa rõ")].copy()
        df_province_code.sort_values(by='province',inplace=True,ignore_index=True,kind = 'stable')
        display(df_province_code.head(3))

        #save data
        folder_path = "database/Covid-19_raw_data/VietNamData/Ministry_of_Health_data"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print("New folder created.")
        df_province_code.to_csv(folder_path +"/"+ "province_code.csv",encoding="utf-8",index=False)
        print("Data collected.")
    except Exception:
        traceback.print_exc()

* <ins>Input parameters for total death data</ins>

In [None]:
# The end-time placeholder belongs after "endTime=". The previous
# "endTime{}=" form only produced a valid query string because the end time
# is empty; with '' the resulting link is unchanged.
link_template = "https://ncov.vncdc.gov.vn/viet-nam-full.html?startTime={}&endTime={}&provinces=&districts=&tabKey=0"
link = link_template.format('2020-01-01','')
api_total_death = "https://ncov.vncdc.gov.vn/v2/vietnam/report-epi-5?"
search_scope = ['https://ncov.vncdc.gov.vn/v2/']

* <ins>Processing</ins>

In [None]:
if continue_collect:
    try:
        # Request the total casualty API response.
        json_object = request_obj.get_web_response(website_link=link,
                                                   api=api_total_death,
                                                   process_time=True,
                                                   search_scope=search_scope)

        # Format data. The report at position 2 becomes the "VN_death_case"
        # column; column 0 holds millisecond timestamps.
        # NOTE(review): the positions 2 and 4 are hard-coded and rely on the
        # order of the API's report list -- confirm.
        death_case_frame = pd.json_normalize(json_object['report'][2], record_path='data')
        death_case_frame[0] = pd.to_datetime(death_case_frame[0], unit='ms').dt.date
        df_cumulative_death = death_case_frame.rename(columns={0: "date", 1: "VN_death_case"})

        # The report at position 4 becomes the "VN_days_death_avg" column.
        death_avg_frame = pd.json_normalize(json_object['report'][4], record_path='data')
        death_avg_frame[0] = pd.to_datetime(death_avg_frame[0], unit='ms').dt.date
        df_cumulative_death2 = death_avg_frame.rename(columns={0: "date", 1: "VN_days_death_avg"})

        # Outer join on the date so that no day from either report is lost.
        df_cumulative_death = pd.merge(df_cumulative_death,
                                       df_cumulative_death2,
                                       on='date',
                                       how='outer')
        display(df_cumulative_death.head(3))
    except Exception:
        traceback.print_exc()

* <ins>Input parameters for death data by province</ins>

In [None]:
#general input parameters
wait_time=15
attempt=3
interval=5

link = "https://ncov.vncdc.gov.vn/viet-nam-full.html?startTime=2020-01-01&endTime=&provinces={}&districts=&tabKey=0"
api_province_death = "https://ncov.vncdc.gov.vn/v2/vietnam/report-epi-5?"
search_scope = ['https://ncov.vncdc.gov.vn/v2/']

request_obj = browser_request(wait_time=wait_time,
                              attempt=attempt,
                              interval=interval)

vn_max_worker = 5 #dont set too high, will slow things down

* <ins>Processing</ins>

In [None]:
start_cell = time.perf_counter()

# Collect casualty data by province with a pool of worker processes.
# In case of failure due to network connection of host, user can re-run the cell to try collecting data again.
if continue_collect:
    # Manager-backed objects handed to every worker through functools.partial.
    vn_manager = Manager()
    vn_counter = vn_manager.Array('i',[0,0])   # read below as [provinces processed, workers with error]
    vn_time_list = vn_manager.list([])         # summed below as the total worker processing time
    vn_error_dict = vn_manager.dict()
    vn_shared_output = vn_manager.Namespace()
    vn_shared_output.shared_df = pd.DataFrame({'date':[]})
    vn_lock = vn_manager.Lock()
    executor = None   # stays None unless the pool is actually started (checked in `finally`)
    try:
        run_cell = True
        # Reuse the province codes already in memory, otherwise load the stored copy.
        if 'df_province_code' not in globals():
            df_province_code = pd.read_csv("database/Covid-19_raw_data/VietNamData/Ministry_of_Health_data/province_code.csv",encoding='utf-8')
        province_list = df_province_code.values.tolist()

        # df_dummy is the working frame that receives the per-province columns.
        # When it already exists with a size different from the nationwide frame,
        # ask the user before collecting again (presumably a previous run already merged data into it).
        try:
            if df_dummy.size != df_cumulative_death.size:
                answer = message_box("Data collection request",
                                     "Are you sure that you want to try to collect VietNam casualty data by province, "\
                                     "since the Dataframe was just collected?",
                                     4+48+4096)
                if answer==6:
                    df_dummy = df_cumulative_death.copy()
                    run_cell = True
                else:
                    run_cell = False
        except NameError:
            # df_dummy does not exist yet: start from the nationwide frame.
            # (If df_cumulative_death is missing too, the NameError reaches the outer handler.)
            df_dummy = df_cumulative_death.copy()

        if run_cell:
            worker_func = partial(vn_casualty_wrapper_func,
                                  shared_count=vn_counter,
                                  time_list=vn_time_list,
                                  error_dict=vn_error_dict,
                                  shared_output=vn_shared_output,
                                  lock=vn_lock,
                                  request_object=request_obj,
                                  link = link,
                                  api = api_province_death,
                                  search_scope = search_scope)
            executor = Pool(processes=vn_max_worker)
            start_worker = time.perf_counter()
            pool_result = executor.map_async(worker_func,province_list)

            # Defined up front so the line-clearing print below works even if the loop never runs.
            a_string = ""
            max_province = len(province_list)
            # Poll the shared counter once per second. Also stop as soon as every task has
            # returned, so a worker that never updates the counter cannot stall the cell forever.
            while vn_counter[0]<max_province and not pool_result.ready():
                time.sleep(1)
                n_done = vn_counter[0]
                total_time = round(sum(vn_time_list),2)

                if n_done!=0:
                    avg_time = round(total_time/n_done,2)
                    remain_province = max_province - n_done
                    remain_time = round(remain_province*avg_time,2)
                    # workers run in parallel, so wall-clock time is estimated per worker
                    real_remain = round(remain_time/vn_max_worker,2)
                else:
                    remain_province = "estimating..."
                    remain_time = "-- "
                    real_remain = "-- "

                elapsed_time = round(time.perf_counter()-start_worker,2)
                a_string = (f"Elapsed time: {elapsed_time}s    " +
                            f"Total workers processing time: {total_time}s    " +
                            f" Remaining provinces: {remain_province}    " +
                            f" Estimated remaining worker time: {remain_time}s    " +
                            f" Estimated remaining real time: {real_remain}s")

                # overwrite the previous status line in place
                print(" "*(len(a_string)+10),end="\r")
                print(a_string,end="\r")

            if vn_counter[0]<max_province:
                # All tasks returned but not all were counted: surface the worker exception, if any.
                pool_result.get()

            print(" "*(len(a_string)+10),end="\r")
            print("---Casualty data collecting completed---")
            print(f"Number of workers with error happened: {vn_counter[1]}")
            print(f"Total worker processing time taken: {round(sum(vn_time_list),2)}s")

            # Copy the shared results out of the manager before it is shut down.
            result_df = vn_shared_output.shared_df.copy()
            sorted_col = np.append(['date'],np.sort(result_df.columns[1:],kind='stable'))
            result_df = result_df[sorted_col]
            df_dummy = df_dummy.merge(result_df,on="date",how='outer')

            vn_casualty_error_result = dict(vn_error_dict)
            vn_casualty_time = list(vn_time_list)

            executor.close()
            executor.join()
    except Exception:
        print(f"Error was found. Data for {vn_counter[0]} provinces was collected before the error.")
        traceback.print_exc()
    finally:
        if executor is not None:
            # After a clean close()/join() this has nothing left to stop; after an error
            # or an interrupt it ends the remaining workers before the manager goes away.
            executor.terminate()
            executor.join()
        vn_manager.shutdown()

end_cell = time.perf_counter()
print(f"Total elapsed time on this code block: {round(end_cell-start_cell,2)}s")

* <ins>Store death data</ins>

In [None]:
if continue_collect:
    try:
        # Format data: missing values become 0.
        df_province_death = df_dummy.fillna(0)

        # For some reason the dates move back by one day when the data is requested late at night;
        # a first row dated 2020-01-22 is taken as the sign of that shift and every date is moved forward one day.
        first_date = df_province_death["date"].head(1)
        if (first_date == datetime.date(2020,1,22)).all():
            df_province_death["date"] = df_province_death["date"] + datetime.timedelta(1)
        df_province_death["date"] = df_province_death["date"].astype(str)

        # Every column after the first one is stored as an integer count.
        value_columns = df_province_death.columns[1:]
        df_province_death[value_columns] = df_province_death[value_columns].astype(int)

        # Drop duplication if any
        # (use subset=["date","VN_death_case","VN_days_death_avg"] if an error in duplicates is found).
        df_province_death = df_province_death.drop_duplicates(ignore_index=True)

        display(df_province_death.head(3))

        # Save data
        store_dataframe(
            df_province_death,
            "database/Covid-19_raw_data/VietNamData/Ministry_of_Health_data/death_data",
            auto_process=auto_process,
            encoding='utf-8',
            subset=["date","VN_death_case","VN_days_death_avg"],
        )
    except Exception:
        # Notebook-level boundary: report the failure and carry on.
        traceback.print_exc()

**b. Case by summary**

<font size=6>**1.3.Robust weather condition**</font><a name="1.3"></a><br>
Reference link:<br> 
[https://worldweather.wmo.int/en/dataguide.html](https://worldweather.wmo.int/en/dataguide.html)<br>
[http://data.un.org/Data.aspx?d=CLINO&f=ElementCode%3A16](http://data.un.org/Data.aspx?d=CLINO&f=ElementCode%3A16)<br>
[https://www.ncdc.noaa.gov/cdo-web/webservices/v2#gettingStarted](https://www.ncdc.noaa.gov/cdo-web/webservices/v2#gettingStarted)<br>
<font size=5><ins>**1.3.1. Average annual wind speed**</ins></font><a name="1.3.1"></a><br>
Reference link:<br> 
[http://data.un.org/Data.aspx?d=CLINO&f=ElementCode%3A16](http://data.un.org/Data.aspx?d=CLINO&f=ElementCode%3A16)<br>
The purpose is to check whether weather conditions (average temperature, wind speed) have an effect on the Covid-19 transmission rate. Robust weather conditions are collected first to check whether there is a correlation between them and the transmission rate before any further analysis.

* <ins>Processing & store wind data</ins>

In [None]:
if continue_collect and basic_settings[6]==6:
    try:
        # Request data: zipped CSV export from the UN data portal.
        wind_csv_url = "http://data.un.org/Handlers/DownloadHandler.ashx?DataFilter=ElementCode:16;StatisticCode:01&DataMartId=CLINO&Format=csv&c=1,2,10,18,19,20,22,24,26,28,30,32,34,36,38,40,42,44,46&s=CountryName:asc,WmoStationNumber:asc,StatisticCode:asc"
        df_wind_data = pd.read_csv(wind_csv_url, compression="zip")

        # Format data: remove the columns at the listed positions and the last two rows.
        wind_unused_columns = df_wind_data.columns[[6,8,10,12,14,16,18,20,22,24,26,28,30,32]]
        df_wind_data = df_wind_data.drop(columns=wind_unused_columns)
        df_wind_data = df_wind_data.iloc[:-2]

        # Rows whose annual value equals -9999.9 are excluded before averaging.
        df_wind_data = df_wind_data[df_wind_data["Annual NCDC Computed Value"] != -9999.9]

        # Mean annual value per country/unit, rounded to two decimals.
        wind_group_keys = ["Country or Territory Code","Country or Territory","Unit"]
        df_wind_data = (
            df_wind_data[wind_group_keys + ["Annual NCDC Computed Value"]]
            .groupby(wind_group_keys)
            .mean()
            .round(2)
        )

        # Save the data (the target folder is created on first use).
        folder_path = "database/Covid-19_raw_data/robust_weather_condition"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print("New folder created.")
        display(df_wind_data.head(3))
        df_wind_data.to_csv(f"{folder_path}/average_wind_speed.csv", encoding="utf-8")
        print("Data collected.")
    except Exception:
        # Notebook-level boundary: report the failure and carry on.
        traceback.print_exc()

<font size=5><ins>**1.3.2. Average annual temperature**</ins></font><a name="1.3.2"></a><br>
Reference link:<br> 
[https://datahelpdesk.worldbank.org/knowledgebase/articles/902061-climate-data-api](https://datahelpdesk.worldbank.org/knowledgebase/articles/902061-climate-data-api)<br>
[https://climateknowledgeportal.worldbank.org/download-data](https://climateknowledgeportal.worldbank.org/download-data)<br>
* <ins>General input parameters</ins>

In [None]:
# Size of the worker pool used by the temperature collection cell.
temp_max_worker = 5

# URL template handed to the temperature workers; "{}" is the last path segment after "country/".
link = (
    "https://climateknowledgeportal.worldbank.org/"
    "cru/cru/timeseries/tas/monthly/climatology/historical/country/{}"
)

* <ins>Processing</ins>

In [None]:
start_cell = time.perf_counter()

# Collect temperature data per country with a pool of worker processes.
if continue_collect and basic_settings[7]==6:

    # Manager-backed objects handed to every worker through functools.partial.
    temp_manager = Manager()
    temp_counter = temp_manager.Array('i',[0,0])   # read below as [countries processed, workers with error]
    temp_time_list = temp_manager.list([])         # summed below as the total worker processing time
    temp_error_dict = temp_manager.dict()
    temp_shared_output = temp_manager.Namespace()
    temp_shared_output.shared_df = pd.DataFrame()
    temp_lock = temp_manager.Lock()
    executor = None   # stays None unless the pool is actually started (checked in `finally`)

    try:
        # Get country ISO3 code for API request: reuse the frame in memory, otherwise read the stored data.
        if 'df_oxford' in globals():
            country_source = df_oxford
        else:
            country_source = read_stored_data("database/Covid-19_raw_data/OurWorldinData/covid_data")
        code_list = country_source[["iso_code","location"]].drop_duplicates().values.tolist()
        # entries whose code starts with "OWID_" are skipped
        code_list = [code for code in code_list if not re.match(r"OWID_",code[0])]

        worker_func = partial(temperature_data_wrapper_func,
                              shared_count=temp_counter,
                              time_list=temp_time_list,
                              error_dict=temp_error_dict,
                              shared_output=temp_shared_output,
                              lock=temp_lock,
                              link = link)

        executor = Pool(processes=temp_max_worker)
        start_worker = time.perf_counter()
        pool_result = executor.map_async(worker_func,code_list)

        # Defined up front so the line-clearing print below works even if the loop never runs.
        a_string = ""
        max_country = len(code_list)
        # Poll the shared counter once per second. Also stop as soon as every task has
        # returned, so a worker that never updates the counter cannot stall the cell forever.
        while temp_counter[0]<max_country and not pool_result.ready():
            time.sleep(1)
            n_done = temp_counter[0]
            total_time = round(sum(temp_time_list),2)

            if n_done!=0:
                avg_time = round(total_time/n_done,2)
                remain_country = max_country - n_done
                remain_time = round(remain_country*avg_time,2)
                # workers run in parallel, so wall-clock time is estimated per worker
                real_remain = round(remain_time/temp_max_worker,2)
            else:
                remain_country = "estimating..."
                remain_time = "-- "
                real_remain = "-- "

            elapsed_time = round(time.perf_counter()-start_worker,2)
            a_string = (f"Elapsed time: {elapsed_time}s    " +
                        f"Total workers processing time: {total_time}s    " +
                        f" Remaining countries: {remain_country}    " +
                        f" Estimated remaining worker time: {remain_time}s    " +
                        f" Estimated remaining real time: {real_remain}s")

            # overwrite the previous status line in place
            print(" "*(len(a_string)+10),end="\r")
            print(a_string,end="\r")

        if temp_counter[0]<max_country:
            # All tasks returned but not all were counted: surface the worker exception, if any.
            pool_result.get()

        print(" "*(len(a_string)+10),end="\r")
        print("---Temperature data collecting completed---")
        print(f"Number of workers with error happened: {temp_counter[1]}")
        print(f"Total worker processing time taken: {round(sum(temp_time_list),2)}s")

        # Copy the shared results out of the manager before it is shut down.
        df_temp_data = temp_shared_output.shared_df.copy()
        df_temp_data.sort_values(['iso_code','Year'],inplace=True,ignore_index=True)
        temp_error_result = dict(temp_error_dict)
        temp_time = list(temp_time_list)

        executor.close()
        executor.join()
    except Exception:
        print(f"Error was found. Temperature data for {temp_counter[0]} countries were collected before the error.")
        traceback.print_exc()
    finally:
        if executor is not None:
            # After a clean close()/join() this has nothing left to stop; after an error
            # or an interrupt it ends the remaining workers before the manager goes away.
            executor.terminate()
            executor.join()
        temp_manager.shutdown()

end_cell = time.perf_counter()
print(f"Total elapsed time on this code block: {round(end_cell-start_cell,2)}s")

* <ins>Store temperature data</ins>

In [None]:
if continue_collect and basic_settings[7]==6:
    try:
        # Save temperature data (the target folder is created on first use).
        folder_path = "database/Covid-19_raw_data/robust_weather_condition"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print("New folder created.")

        display(df_temp_data.head(3))
        df_temp_data.to_csv(
            f"{folder_path}/average_temperature.csv",
            encoding="utf-8",
            index=False,
        )
        print("Data collected.")
    except Exception:
        # Notebook-level boundary: report the failure (e.g. df_temp_data missing) and carry on.
        traceback.print_exc()

<font size=6>**1.4.Updated time**</font><a name="1.4"></a><br>
Latest update time of data

In [None]:
if continue_collect:
    try:
        # Save updated time (the target folder is created on first use).
        folder_path = "database/Covid-19_raw_data"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print("New folder created.")

        # One-row frame holding the current local time as a POSIX timestamp.
        current_stamp = datetime.datetime.now().timestamp()
        time_df = pd.DataFrame({'last_modified': [current_stamp]})
        display(time_df)

        time_df.to_csv(f"{folder_path}/modified_model_data_time.csv", encoding="utf-8", index=False)
        # NOTE: comment out the next line if census data is not refreshed or changed in subsequent runs.
        time_df.to_csv(f"{folder_path}/modified_census_data_time.csv", encoding="utf-8", index=False)
        print("Modification date updated.")
    except Exception:
        # Notebook-level boundary: report the failure and carry on.
        traceback.print_exc()