# Scraping the Good Governance Index data

This script opens the site, reads all options of the drop-down menu, then iterates over them. On each iteration, the script submits the current option through the form, navigates to the resulting page, selects the table and processes it.

In [1]:
## load library
library(rvest)

## initiate session on site
site <- "http://nap.psa.gov.ph/ggi/default.asp"
session <- html_session(site)

## look at nodes (exploratory: prints the forms and options found on the page)
html_nodes(session, "form")
html_nodes(session, "option")

## get the drop-down options of the municipality select;
## stringsAsFactors = FALSE keeps the values as character strings
## independent of the default of the R version in use
options <- html_nodes(session, "select[name='strMunicipality2'] > option")
municipalities <- data.frame(
    value  = html_attr(options, "value"),
    option = html_text(options),
    stringsAsFactors = FALSE)
head(municipalities)
dim(municipalities)

## column names of the final data frame (six columns, as asserted below)
header <- c("indicator", "value2005", "rank2005", "value2008", "rank2008",
            "municipality")

## number of municipalities to scrape: restricted to the first two for test,
## set to nrow(municipalities) to scrape the whole list
n_scrape <- min(2, nrow(municipalities))

## one list slot per municipality; the tables are combined once after the
## loop instead of growing a data frame with rbind on every iteration
tables <- vector("list", n_scrape)

## the form on the start page is the same for every iteration, parse it once
blank_form <- html_form(html_node(session, "#form2"))

## get ggi data for the selected municipalities
for (i in seq_len(n_scrape)) {
    op <- municipalities$value[[i]]

    ## display some information: position in the list and current option
    print(paste0(i, "/", nrow(municipalities), " ", op))

    ## set option and submit form
    form <- set_values(blank_form, "strMunicipality2" = op)
    newpage <- submit_form(session, form)

    ## get the first table on the result page and tag it with the option
    ggi_table <- html_table(html_node(newpage, "table"), fill = TRUE)
    ggi_table$municipality <- op

    ## drop the first two rows (presumably title/header rows of the html
    ## table -- verify against the page); a negative index is safe even
    ## when the table has fewer than three rows
    ggi_table <- ggi_table[-(1:2), , drop = FALSE]

    ## assert correct dimension before naming the columns
    stopifnot(ncol(ggi_table) == 6)
    names(ggi_table) <- header
    ## print(ggi_table)

    ## store for later combination
    tables[[i]] <- ggi_table
}

## combine all scraped tables into one data frame
data <- do.call(rbind, tables)

## look at data
dim(data)
data

Loading required package: xml2


{xml_nodeset (5)}
[1] <form action="http://psa.gov.ph" method="post" id="search-block-form" acc ...
[2] <form id="form1" name="form1" method="post" action="/ggi/search.asp">\r\n ...
[3] <form id="form2" name="form2" method="post" action="/ggi/details.asp">\r\ ...
[4] <form id="form4" name="form2" method="post" action="/ggi/details_prov.asp ...
[5] <form id="form3" name="form2" method="post" action="/ggi/rankresults.asp" ...

{xml_nodeset (1634)}
 [1] <option value="Navotas+Metro Manila">NCR - Metro Manila, Navotas</option>\n
 [2] <option value="Pateros+Metro Manila">NCR - Metro Manila, Pateros</option>\n
 [3] <option value="San Juan+Metro Manila">NCR - Metro Manila, San Juan</opti ...
 [4] <option value="Bangued+Abra">CAR - Abra, Bangued</option>\n
 [5] <option value="Boliney+Abra">CAR - Abra, Boliney</option>\n
 [6] <option value="Bucay+Abra">CAR - Abra, Bucay</option>\n
 [7] <option value="Bucloc+Abra">CAR - Abra, Bucloc</option>\n
 [8] <option value="Daguioman+Abra">CAR - Abra, Daguioman</option>\n
 [9] <option value="Danglas+Abra">CAR - Abra, Danglas</option>\n
[10] <option value="Dolores+Abra">CAR - Abra, Dolores</option>\n
[11] <option value="La Paz+Abra">CAR - Abra, La Paz</option>\n
[12] <option value="Lacub+Abra">CAR - Abra, Lacub</option>\n
[13] <option value="Lagangilang+Abra">CAR - Abra, Lagangilang</option>\n
[14] <option value="Lagayan+Abra">CAR - Abra, Lagayan</option>\n
[15] <option value="

value,option
Navotas+Metro Manila,"NCR - Metro Manila, Navotas"
Pateros+Metro Manila,"NCR - Metro Manila, Pateros"
San Juan+Metro Manila,"NCR - Metro Manila, San Juan"
Bangued+Abra,"CAR - Abra, Bangued"
Boliney+Abra,"CAR - Abra, Boliney"
Bucay+Abra,"CAR - Abra, Bucay"


[1] "Navotas+Metro Manila"
[1] "1/1511 Navotas+Metro Manila"


Submitting with 'strMunicipality2'


[1] "Pateros+Metro Manila"
[1] "2/1511 Pateros+Metro Manila"


Submitting with 'strMunicipality2'


Unnamed: 0,indicator,value2005,rank2005,value2008,rank2008,municipality
3,Income Index,321.91,62,became a city,,Navotas+Metro Manila
4,Total Per Capita Income Index,159.74,617,,,Navotas+Metro Manila
5,Total Per Capita Income from Local Sources Index,484.09,25,,,Navotas+Metro Manila
6,Expenditure Index,262.3,102,became a city,,Navotas+Metro Manila
7,"Per Capita Expenditure on Education, Culture Sports/Manpower Development Index",535.83,30,,,Navotas+Metro Manila
8,"Per Capita Expenditure on Health, Nutrition and Population Control Index",212.61,340,,,Navotas+Metro Manila
9,Per Capita Expenditure on Economic Services Index,38.44,1427,,,Navotas+Metro Manila
10,GOOD GOVERNANCE INDEX,292.1,73,became a city,,Navotas+Metro Manila
31,Income Index,274.24,96,316.94,101.0,Pateros+Metro Manila
41,Total Per Capita Income Index,164.8,573,206.08,673.0,Pateros+Metro Manila


## CSV output

With larger projects, it may be infeasible to accumulate all data in memory in objects like data frames. A very simple alternative is to append the scraped rows to a CSV file on each iteration.

In [2]:
## load library
library(rvest)

## initiate session on site
site <- "http://nap.psa.gov.ph/ggi/default.asp"
session <- html_session(site)

## look at nodes (exploratory: prints the forms and options found on the page)
html_nodes(session, "form")
html_nodes(session, "option")

## get the drop-down options of the municipality select;
## stringsAsFactors = FALSE keeps the values as character strings
## independent of the default of the R version in use
options <- html_nodes(session, "select[name='strMunicipality2'] > option")
municipalities <- data.frame(
    value  = html_attr(options, "value"),
    option = html_text(options),
    stringsAsFactors = FALSE)
## head(municipalities)
dim(municipalities)

## output file; create its directory first, because write.table cannot
## open the file when the directory does not exist
outfile <- file.path("Data", "ggi.csv")
dir.create(dirname(outfile), showWarnings = FALSE, recursive = TRUE)

## build header and write to file (this overwrites an existing file)
header <- c("indicator", "value2005", "rank2005", "value2008", "rank2008",
            "municipality")
write.table(t(header), outfile, sep = ";",
            col.names = FALSE, row.names = FALSE)

## number of municipalities to scrape: restricted to the first two for test,
## set to nrow(municipalities) to scrape the whole list
n_scrape <- min(2, nrow(municipalities))

## the form on the start page is the same for every iteration, parse it once
blank_form <- html_form(html_node(session, "#form2"))

## get ggi data for the selected municipalities
for (i in seq_len(n_scrape)) {
    op <- municipalities$value[[i]]

    ## display some information: current option and position in the list
    print(op)
    print(paste0(i, "/", nrow(municipalities), " ", op))

    ## set option and submit form
    form <- set_values(blank_form, "strMunicipality2" = op)
    newpage <- submit_form(session, form)

    ## get the first table on the result page and tag it with the option
    ggi_table <- html_table(html_node(newpage, "table"), fill = TRUE)
    ggi_table$municipality <- op

    ## drop the first two rows (presumably title/header rows of the html
    ## table -- verify against the page); a negative index is safe even
    ## when the table has fewer than three rows
    ggi_table <- ggi_table[-(1:2), , drop = FALSE]

    ## assert correct dimension, it has to match the header written above
    stopifnot(ncol(ggi_table) == 6)

    ## append to file
    write.table(ggi_table, outfile, sep = ";", append = TRUE,
                col.names = FALSE, row.names = FALSE)
}

{xml_nodeset (5)}
[1] <form action="http://psa.gov.ph" method="post" id="search-block-form" acc ...
[2] <form id="form1" name="form1" method="post" action="/ggi/search.asp">\r\n ...
[3] <form id="form2" name="form2" method="post" action="/ggi/details.asp">\r\ ...
[4] <form id="form4" name="form2" method="post" action="/ggi/details_prov.asp ...
[5] <form id="form3" name="form2" method="post" action="/ggi/rankresults.asp" ...

{xml_nodeset (1634)}
 [1] <option value="Navotas+Metro Manila">NCR - Metro Manila, Navotas</option>\n
 [2] <option value="Pateros+Metro Manila">NCR - Metro Manila, Pateros</option>\n
 [3] <option value="San Juan+Metro Manila">NCR - Metro Manila, San Juan</opti ...
 [4] <option value="Bangued+Abra">CAR - Abra, Bangued</option>\n
 [5] <option value="Boliney+Abra">CAR - Abra, Boliney</option>\n
 [6] <option value="Bucay+Abra">CAR - Abra, Bucay</option>\n
 [7] <option value="Bucloc+Abra">CAR - Abra, Bucloc</option>\n
 [8] <option value="Daguioman+Abra">CAR - Abra, Daguioman</option>\n
 [9] <option value="Danglas+Abra">CAR - Abra, Danglas</option>\n
[10] <option value="Dolores+Abra">CAR - Abra, Dolores</option>\n
[11] <option value="La Paz+Abra">CAR - Abra, La Paz</option>\n
[12] <option value="Lacub+Abra">CAR - Abra, Lacub</option>\n
[13] <option value="Lagangilang+Abra">CAR - Abra, Lagangilang</option>\n
[14] <option value="Lagayan+Abra">CAR - Abra, Lagayan</option>\n
[15] <option value="

[1] "Navotas+Metro Manila"
[1] "1/1511 Navotas+Metro Manila"


Submitting with 'strMunicipality2'


[1] "Pateros+Metro Manila"
[1] "2/1511 Pateros+Metro Manila"


Submitting with 'strMunicipality2'
