# Covid-19 Template Scraper

Scrapes data from https://en.wikipedia.org/wiki/Template:COVID-19_pandemic_data/Philippines_medical_cases.

Contains Daily Reported Covid-19 Cases.

## 1. Import Modules

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
import numpy as np
from datetime import datetime

## II. Perform GET Request

In [2]:
url = "https://en.wikipedia.org/wiki/Template:COVID-19_pandemic_data/Philippines_medical_cases"

r = requests.get(
    url
)

soup = BeautifulSoup(r.text, "html.parser")


# print(soup.prettify())


## III. Extract table rows

In [3]:
markers = soup.findAll("table", attrs={'class':'wikitable'})

for marker in markers:
    caption = marker.find("caption")
    if caption:
        if 'Daily COVID-19 cases' in caption.text:
            elem_marker = marker
            break
            
t_rows = elem_marker.find('tbody').find_all(recursive=False)


## IV. Parse Table Rows

In [4]:
headers1 = []
headers2 = []

for _ in range(31):
    headers1.append(None)
    headers2.append(None)

actual_index = 0
for index2, th in enumerate(t_rows[0].find_all('th')):
    if th.has_attr('rowspan'):
        for index3, col in enumerate(range(int(th['rowspan']))):
            # print(actual_index)
            if index3 == 0:
                headers1[actual_index] = th.text
            headers2[actual_index] = ''
            actual_index = actual_index + 1
    elif th.has_attr('colspan'):
        for index3, col in enumerate(range(int(th['colspan']))):
            headers1[actual_index] = th.text
            actual_index = actual_index + 1
    else:
        headers1[actual_index] = th.text
        actual_index = actual_index + 1
    
for th in t_rows[1].find_all('th'):
    actual_index = 0
    for h in headers2:
        # print(h)
        if h == None:
            break;
        actual_index = actual_index + 1
    # print(actual_index)
    headers2[actual_index] = th.text
        
full_headers = []
for index in range(30):
    if headers2[index] != '':
        if headers1[index] is not None:
            full_headers.append(headers1[index] + "-" + headers2[index])
        else:
            full_headers.append(headers2[index])
    else:
        full_headers.append(headers1[index])
full_headers = [i.replace('\n', '').replace('-', '_').lower() for i in full_headers if i] 
print(f"Extracted Headers: {full_headers}")

Extracted Headers: ['date', 'regions_ncr', 'regions_car', 'regions_i', 'regions_ii', 'regions_iii', 'regions_iv_a', 'regions_iv_b', 'regions_v', 'regions_vi', 'regions_vii', 'regions_viii', 'regions_ix', 'regions_x', 'regions_xi', 'regions_xii', 'regions_xiii', 'regions_bar', 'confirmed_new', 'confirmed_total', 'deaths_new', 'deaths_total', 'active_total', 'recov._total', 'tested_new', 'tested_total', 'ref.', 'notes']


## VI. Remove Unnecessary Rows

In [5]:
row_length = len(t_rows)
print(row_length)
t_rows.pop(0)
t_rows.pop(0)
t_rows.pop()
t_rows.pop()
t_rows.pop()
t_rows.pop()
t_rows.pop()
print(len(t_rows))

209
202


## VII. Place data in a DataFrame and store into a CSV

In [6]:


data_dict = {}
for h in full_headers:
    data_dict[h] = []

for tr in t_rows:
    for index, td in enumerate(tr.find_all('td')):
        cell_val = td.text.replace('\n', '').strip()
        # print('0' if cell_val == '' or cell_val == '–' else cell_val)
        data_dict[full_headers[index]].append('0' if cell_val == '' or cell_val == '–' else cell_val)

del data_dict['ref.']
del data_dict['notes']
        
df = pd.DataFrame(data=data_dict)
df['date'] = pd.to_datetime(df['date'], format="%B %d, %Y")
df['regions_ncr'] = df['regions_ncr'].astype('uint16')
df['regions_car'] = df['regions_car'].astype('uint16')
df['regions_i'] = df['regions_i'].astype('uint16')
df['regions_ii'] = df['regions_ii'].astype('uint16')
df['regions_iv_a'] = df['regions_iv_a'].astype('uint16')
df['regions_iv_b'] = df['regions_iv_b'].astype('uint16')
df['regions_v'] = df['regions_v'].astype('uint16')
df['regions_vi'] = df['regions_vi'].astype('uint16')
df['regions_vii'] = df['regions_vii'].astype('uint16')
df['regions_viii'] = df['regions_viii'].astype('uint16')
df['regions_ix'] = df['regions_ix'].astype('uint16')
df['regions_x'] = df['regions_x'].astype('uint16')
df['regions_xi'] = df['regions_xi'].astype('uint16')
df['regions_xii'] = df['regions_xii'].astype('uint16')
df['regions_xiii'] = df['regions_xiii'].astype('uint16')
df['regions_bar'] = df['regions_bar'].astype('uint16')
df['confirmed_new'] = df['confirmed_new'].astype('uint16')
df['confirmed_total'] = df['confirmed_total'].astype('uint16')
df['deaths_new'] = df['deaths_new'].astype('uint16')
df['deaths_total'] = df['deaths_total'].astype('uint16')
df['active_total'] = df['active_total'].astype('uint16')
df['recov._total'] = df['recov._total'].astype('uint16')
df['tested_total'] = df['tested_total'].astype('uint16')

df.to_csv('datasets/covid_spread_from_daily_summary.csv', index=False)
df

Unnamed: 0,date,regions_ncr,regions_car,regions_i,regions_ii,regions_iii,regions_iv_a,regions_iv_b,regions_v,regions_vi,...,regions_xiii,regions_bar,confirmed_new,confirmed_total,deaths_new,deaths_total,active_total,recov._total,tested_new,tested_total
0,2020-01-30,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,2,0,0,0
1,2020-01-31,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,2,1,0,0
2,2020-02-01,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,1,0,0
3,2020-02-03,0,0,0,0,0,0,0,0,0,...,0,0,1,2,0,1,1,1,0,0
4,2020-02-05,0,0,0,0,0,0,0,0,0,...,0,0,1,3,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,2020-09-13,1307,14,38,45,327,739,43,28,299,...,44,12,3372,64608,79,4371,49277,10960,23634,48383
198,2020-09-14,1498,30,45,46,517,936,42,35,134,...,4,20,4699,3744,259,4630,53754,10896,24670,28678
199,2020-09-15,690,9,48,81,281,761,18,30,291,...,6,16,3544,7263,34,4663,57392,10744,31919,63894
200,2020-09-16,0,0,0,0,0,0,0,0,0,...,0,0,3550,10790,69,4732,60344,11250,30643,30767
