# DOH Covid 19 Tracker Scraper

This data was scraped from the DOH COVID-19 tracker (https://www.doh.gov.ph/covid19tracker), specifically from its line graph.

Ultimately, we decided to scrape another site that has the data we need (see **Template-Scraper.ipynb**).

## I. Import Modules

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
import numpy as np
from datetime import datetime

## II. Perform GET request from Tableau site

Extract all the necessary data, such as the Tableau SheetId and the request's X-Session-Id, to obtain the actual data being portrayed.

In [2]:
# Public Tableau view that is requested in embedded mode below.
url = "https://public.tableau.com/views/COVID-19CasesandDeathsinthePhilippines_15866705872710/Home"

# Request the embedded view. Both the response body (parsed into `soup`) and
# the response object itself (`r`) are kept for later use.
# NOTE(review): "br" is advertised in Accept-Encoding; confirm the environment
# can decode Brotli responses, otherwise `r.text` may not be readable HTML.
r = requests.get(
    url,
    headers={
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,fil;q=0.8,af;q=0.7",
        "Cache-Control": "max-age=0",
    },
    params={
        ":embed": "y",
        ":showVizHome": "no",
        ":host_url": "https://public.tableau.com/",
        ":embed_code_version": 3,
        ":tabs": "no",
        ":toolbar": "yes",
        ":animate_transition": "yes",
        ":display_static_image": "no",
        ":display_spinner": "no",
        ":display_overlay": "yes",
        ":display_count": "yes",
        ":loadOrderID": 0,
        "publish": "yes",
    },
    # Without a timeout the call can block forever on an unresponsive server.
    timeout=30,
)
# Stop here with a clear HTTP error rather than parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")



# The Tableau configuration is read as JSON from the text of the
# <textarea id="tsConfigContainer"> element of the parsed page.
config_node = soup.find("textarea", {"id": "tsConfigContainer"})
if config_node is None:
    # Fail with a descriptive message instead of "'NoneType' has no attribute 'text'".
    raise ValueError("Could not find the 'tsConfigContainer' textarea in the Tableau page")
tableauData = json.loads(config_node.text)

# `r` is still the earlier response at this point; its X-Session-Id header
# identifies the session whose bootstrap data is requested.
dataUrl = f"https://public.tableau.com/vizql/w/COVID-19CasesandDeathsinthePhilippines_15866705872710/v/Home/bootstrapSession/sessions/{r.headers['X-Session-Id']}"

# Ask for the sheet named in the configuration; `r` is rebound to this response.
r = requests.post(
    dataUrl,
    data={
        "sheet_id": tableauData["sheetId"],
    },
    # Without a timeout the call can block forever on an unresponsive server.
    timeout=30,
)
# Surface HTTP failures immediately instead of during payload parsing.
r.raise_for_status()

print(tableauData["sheetId"])

# The response text is expected to contain two JSON objects, each preceded by a
# run of digits and a semicolon. The pattern is a raw string so that "\d" is
# passed to the regex engine as-is instead of being an invalid string escape.
dataReg = re.search(r'\d+;({.*})\d+;({.*})', r.text, re.MULTILINE)
if dataReg is None:
    # Fail with a descriptive message instead of "'NoneType' has no attribute 'group'".
    raise ValueError("Unexpected response format: could not locate the two JSON payloads")
info = json.loads(dataReg.group(1))  # first JSON object
data = json.loads(dataReg.group(2))  # second JSON object; holds "secondaryInfo"


# 9th

# print(data)
# print(data["secondaryInfo"]["presModelMap"]["dataDictionary"]["presModelHolder"]["genDataDictionaryPresModel"]["dataSegments"]["0"]["dataColumns"])

Home


## III. Extract Entries from Line Graph Data

In [3]:
def generate_line_data(data: dict) -> pd.DataFrame:
    '''
    Generates Line Data

    Pairs the date strings found in the scraped data dictionary with the
    values referenced by the "Epi_Curve" pane's value indices.

    Parameters:
    data - scraped tableau data

    Returns pandas.DataFrame with a "date" column (date strings such as
    "March 3, 2020") and a "values" column.

    Raises KeyError/IndexError if the payload lacks the expected structure,
    and ValueError if a month-bearing string is compared but does not match
    the "%B %d, %Y" format, or if the number of dates and values differ.
    '''
    month_names = (
        "January", "February", "March", "April", "May", "June", "July",
        "August", "September", "October", "November", "December",
    )
    date_format = "%B %d, %Y"

    pres_model_map = data["secondaryInfo"]["presModelMap"]
    data_columns = pres_model_map["dataDictionary"]["presModelHolder"]["genDataDictionaryPresModel"]["dataSegments"]["0"]["dataColumns"]
    # Column 0 supplies the values that are looked up by index; column 2
    # supplies the strings that are scanned for dates.
    line_dat = data_columns[0]['dataValues']
    cstring_dat = data_columns[2]['dataValues']
    viz_pane_columns = pres_model_map['vizData']["presModelHolder"]['genPresModelMapPresModel']['presModelMap']['Epi_Curve']['presModelHolder']['genVizDataPresModel']['paneColumnsData']['paneColumnsList'][0]['vizPaneColumns']
    line_dat_indices = viz_pane_columns[7]['valueIndices']

    def does_str_have_month(cstring):
        # A month name followed by a space marks the string as a date candidate.
        return any(f"{month} " in cstring for month in month_names)

    date_cstrings = []
    for cstring in cstring_dat:
        if not does_str_have_month(cstring):
            continue
        # Once two dates are collected, keep a candidate only if it is strictly
        # earlier than the last kept date.
        # NOTE(review): the first two dates are accepted without comparison;
        # confirm whether the threshold was meant to be "> 0".
        if len(date_cstrings) > 1:
            curr = datetime.strptime(cstring, date_format)
            prev = datetime.strptime(date_cstrings[-1], date_format)
            if curr >= prev:
                continue
        date_cstrings.append(cstring)

    values = [line_dat[index] for index in line_dat_indices]
    return pd.DataFrame(data={"date": date_cstrings, "values": values})


## IV. Perform Wrangling

In [4]:
# Build the line-graph table and cast the "values" column to integers.
all_regions_dict = generate_line_data(data).astype({'values': 'int'})
# Parse the date strings into datetime values.
all_regions_dict['date'] = pd.to_datetime(all_regions_dict['date'])
# Display the resulting table as the cell output.
all_regions_dict

Unnamed: 0,date,values
0,2020-09-16,230
1,2020-09-15,441
2,2020-09-14,616
3,2020-09-13,1398
4,2020-09-12,2336
...,...,...
197,2020-03-03,46
198,2020-03-02,33
199,2020-03-01,48
200,2020-01-21,2


## V. Store Data

In [5]:
from pathlib import Path

# Write the table without the index column. The target directory is created
# first because to_csv does not create missing parent directories.
output_path = Path('datasets/covid_spread_from_doh_tableau.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
all_regions_dict.to_csv(output_path, index=False)