# Scraping the Philippine Standard Geographic Code (PSGC) website

In [1]:
# Print the Python version of the running kernel.
# NOTE: this only reports the version, it does not enforce one. The notebook was
# written against 3.12.3; the run recorded below printed 3.12.5.
from platform import python_version

print(python_version())

3.12.5


In [10]:
# load libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [5]:
# website to scrape: the Philippine Statistics Authority (PSA) page that lists
# municipalities under the Philippine Standard Geographic Code (PSGC)
philippines_stat_web = "https://psa.gov.ph/classification/psgc/municipalities"

In [5]:
# url request
# A timeout keeps the cell from hanging forever if the server never answers
# (requests applies no timeout by default), and raise_for_status() stops here on
# a 4xx/5xx reply instead of silently parsing an error page.
page = requests.get(philippines_stat_web, timeout=30)
page.raise_for_status()

# 'html' names a markup type rather than a specific parser: BeautifulSoup picks
# the best HTML parser installed in the environment.
soup = BeautifulSoup(page.text, 'html')

In [6]:
# Preview only the start of the parsed page: printing the whole document floods
# the notebook with the full HTML source.
print(str(soup)[:2000])

<!DOCTYPE html>

<html dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="index" name="robots"/>
<link href="https://psa.gov.ph/classification/psgc/municipalities" rel="canonical"/>
<meta content="width" name="MobileOptimized"/>
<meta content="true" name="HandheldFriendly"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="/sites/default/files/favicon.ico" rel="icon" type="image/vnd.microsoft.icon"/>
<title>PHILIPPINE STANDARD GEOGRAPHIC CODE | Philippine Statistics Authority | Republic of the Philippines</title>
<link href="//fonts.googleapis.com/css?family=Hind:wght@400;500;600;700&amp;display=swap" media="all" rel="stylesheet"/>
<link href="/sites/default/files/css/A.css_9U6l_6abldFIOyLEsWKddz9-2Mj7WnY8K3C-BUCIdEM.css+css_SRyVyu2J6yzw4ATKvL0pyHVtJ3yMBDU2-UdqWj0YPqY.css+css_yp0aUXq9PF9dLdyzWv0XMmL2n1V8G_kM3VFdZgZQ3DI.css+css_3zRRXpkjC_yo1dLina_q0m7IFr8_A-dH4F4eocJr5Z0.css+css_ueOUe9Og6qCX1iRj5X-8IgC3ezwr-LxugE0CUW_1ZKA.css,Mcc.BYUAXA

In [7]:
# The page is expected to contain a single <table> (id="tbl_allmun" when this
# notebook was last run), so the first match is the municipalities table.
table = soup.find('table')
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError in a later cell.
    raise ValueError("no <table> element found in the downloaded page")

# Preview only the start of the table; the full markup holds one row per municipality.
print(str(table)[:1500])

<table class="table table-striped table-bordered responsive-enabled" data-striping="1" id="tbl_allmun">
<thead>
<tr>
<th>Municipality</th>
<th>10 Digit Code</th>
<th>Correspondence Code</th>
<th>Income Class</th>
<th>Population(2020 Census)</th>
</tr>
</thead>
<tbody>
<tr>
<td><a class="psgc" href="/classification/psgc/barangays/1381701000">Pateros</a></td>
<td>1381701000</td>
<td>137606000</td>
<td>1st</td>
<td> 65,227 </td>
</tr>
<tr>
<td><a class="psgc" href="/classification/psgc/barangays/1400101000">Bangued </a></td>
<td>1400101000</td>
<td>140101000</td>
<td>1st</td>
<td> 50,382 </td>
</tr>
<tr>
<td><a class="psgc" href="/classification/psgc/barangays/1400102000">Boliney</a></td>
<td>1400102000</td>
<td>140102000</td>
<td>5th</td>
<td> 4,551 </td>
</tr>
<tr>
<td><a class="psgc" href="/classification/psgc/barangays/1400103000">Bucay</a></td>
<td>1400103000</td>
<td>140103000</td>
<td>5th</td>
<td> 17,953 </td>
</tr>
<tr>
<td><a class="psgc" href="/classification/psgc/barangays/140

In [22]:
# header cells (<th>) of the municipalities table found above
table_titles = table.find_all(name='th')
table_titles

[<th>Municipality</th>,
 <th>10 Digit Code</th>,
 <th>Correspondence Code</th>,
 <th>Income Class</th>,
 <th>Population(2020 Census)</th>]

In [23]:
# plain-text label of every header cell, in table order
header_titles = [header_cell.get_text() for header_cell in table_titles]

header_titles

['Municipality',
 '10 Digit Code',
 'Correspondence Code',
 'Income Class',
 'Population(2020 Census)']

In [24]:
# keep only the first five header names
# (the table exposed exactly five when this was run, so the list is unchanged)
header_titles = header_titles[:5]

header_titles

['Municipality',
 '10 Digit Code',
 'Correspondence Code',
 'Income Class',
 'Population(2020 Census)']

In [21]:
# empty DataFrame with one column per scraped header name
ph_mun_inc_class = pd.DataFrame(columns = header_titles)

ph_mun_inc_class

Unnamed: 0,Municipality,10 Digit Code,Correspondence Code,Income Class,Population(2020 Census)


In [25]:
# get every <tr> row of the table; the <td> cells are extracted from each row
# in the next cell (the first row is the header row, which holds <th> cells only)
column_data = table.find_all('tr')

In [31]:
# Build all rows first and create the DataFrame once. Appending with
# .loc[len(df)] one row at a time is slow, and re-running the cell would append
# every row a second time; rebuilding the frame keeps the cell idempotent.
scraped_rows = [
    # get_text(strip=True) removes the padding around cell values
    # (e.g. 'Bangued ' and ' 50,382 ' in the raw HTML)
    [cell.get_text(strip=True) for cell in row.find_all('td')]
    for row in column_data[1:]  # index 0 is the header row: <th> cells only, no <td>
]
ph_mun_inc_class = pd.DataFrame(scraped_rows, columns=header_titles)

In [32]:
# display the scraped table (long frames are abbreviated with "..." in the output)
ph_mun_inc_class

Unnamed: 0,Municipality,10 Digit Code,Correspondence Code,Income Class,Population(2020 Census)
0,Pateros,1381701000,137606000,1st,65227
1,Bangued,1400101000,140101000,1st,50382
2,Boliney,1400102000,140102000,5th,4551
3,Bucay,1400103000,140103000,5th,17953
4,Bucloc,1400104000,140104000,6th,2395
...,...,...,...,...,...
1488,Nabalawag,1999904000,,-,25723
1489,Pahamuddin,1999905000,,-,19627
1490,Malidegao,1999906000,,-,36438
1491,Ligawasan,1999907000,,-,29784


In [33]:
# write the scraped table to disk as CSV
# NOTE(review): absolute path on the author's machine — adjust before running elsewhere
phl_income_cls_csv = r'/Users/masinde/Documents/phd/causal fairness/data/extra_data/phl_income_cls.csv'
ph_mun_inc_class.to_csv(phl_income_cls_csv)

# Using the function `ph_sgc_web_scraper` (from `philippines_sgc_web_scraper_fxn.py`)

In [37]:
# run the helper script in this notebook's namespace; it presumably defines
# ph_sgc_web_scraper, which is called in the next cell
# NOTE(review): absolute path on the author's machine — adjust before running elsewhere
%run '/Users/masinde/Projects/causal-fairness-Philippines-drrm/src/philippines_sgc_web_scraper_fxn.py'

In [39]:
# scrape the same page through the helper function; this rebinds
# ph_mun_inc_class, replacing the frame built by hand above
ph_mun_inc_class = ph_sgc_web_scraper(philippines_stat_web)

table headers:['Municipality', '10 Digit Code', 'Correspondence Code', 'Income Class', 'Population(2020 Census)']


In [41]:
# first rows of the frame returned by the helper function
ph_mun_inc_class.head()

Unnamed: 0,Municipality,10 Digit Code,Correspondence Code,Income Class,Population(2020 Census)
0,Pateros,1381701000,137606000,1st,65227
1,Bangued,1400101000,140101000,1st,50382
2,Boliney,1400102000,140102000,5th,4551
3,Bucay,1400103000,140103000,5th,17953
4,Bucloc,1400104000,140104000,6th,2395


In [43]:
# number of rows returned by the helper function
# NOTE(review): the hand-built frame displayed earlier ended at index 1491
# (1492 rows) while this run reports 1493 — confirm where the extra row comes from
len(ph_mun_inc_class)

1493