#  Wikipedia examples


## Requirements

In [3]:
## install and load required library
## install.packages("rvest", dependencies = TRUE)
library(rvest)

## Scraping a wiki table

In [4]:
## Download the Infant mortality article and parse it into an HTML document.
page <- read_html("https://en.wikipedia.org/wiki/Infant_mortality")
## Locate the table node with an XPath expression: the second <table> child
## of the div directly below the element with id "mw-content-text".
table <- page %>%
  html_node(xpath = "//*[@id='mw-content-text']/div/table[2]")
## Convert the table node to a data frame and print it.
mrates <- table %>% html_table()
mrates

Rank,Country,"Infant mortality rate (deaths/1,000 live births)"
1,Afghanistan,121.63
2,Niger,109.98
3,Mali,109.08
4,Somalia,103.72
5,Central African Republic,97.17
218,Sweden,2.74
219,Singapore,2.65
220,Bermuda,2.47
221,Japan,2.21
222,Monaco,1.8


## Investigating page elements and navigation

In [8]:
## The same table can be selected with either an XPath or a CSS selector.
## table <- html_node(page, xpath = "//*[@id='mw-content-text']/div/table[2]")
## `table:nth-of-type(2)` is the CSS counterpart of XPath `table[2]`: both
## count only <table> siblings. `:nth-child(121)` would instead count every
## sibling element, so it stops matching as soon as the article gains or
## loses an element above the table.
table <- html_node(page, css = "#mw-content-text > div > table:nth-of-type(2)")
## Convert the selected node to a data frame and print it.
mrates <- html_table(table)
mrates

Rank,Country,"Infant mortality rate (deaths/1,000 live births)"
1,Afghanistan,121.63
2,Niger,109.98
3,Mali,109.08
4,Somalia,103.72
5,Central African Republic,97.17
218,Sweden,2.74
219,Singapore,2.65
220,Bermuda,2.47
221,Japan,2.21
222,Monaco,1.8


In [7]:
## Enumerate every <table> element found in the parsed page
page %>% html_nodes("table")

{xml_nodeset (10)}
 [1] <table class="wikitable" style="float:right; margin: 0.5em 0 0.5em 1em;  ...
 [2] <table class="wikitable" style="text-align:left"><tbody>\n<tr>\n<th>Rank ...
 [3] <table role="presentation" class="mbox-small plainlinks sistersitebox" s ...
 [4] <table class="nowraplinks hlist collapsible collapsed navbox-inner" styl ...
 [5] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbo ...
 [6] <table class="nowraplinks collapsible autocollapse navbox-inner" style=" ...
 [7] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbo ...
 [8] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbo ...
 [9] <table class="nowraplinks hlist collapsible autocollapse navbox-inner" s ...
[10] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbo ...

In [11]:
## Inspect the links inside the selected table: print every href first ...
html_attr(html_nodes(table, "a"), "href")
## ... then store them and keep only the ones that mention Somalia
tablelinks <- table %>%
  html_nodes("a") %>%
  html_attr("href")
link <- tablelinks[grepl("Somalia", tablelinks)]
link

In [14]:
## Inspect HTML elements and their attributes
# page %>% html_nodes("link")
# html_attr(html_nodes(page, "a"), "href")
## all anchor nodes of the page
page %>% html_nodes("a")
## href attribute of each anchor inside the selected table
table %>%
  html_nodes("a") %>%
  html_attr("href")

{xml_nodeset (1661)}
 [1] <a id="top"></a>
 [2] <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>
 [3] <a class="mw-jump-link" href="#p-search">Jump to search</a>
 [4] <a href="/wiki/File:Infant_mortality_map_of_the_world.svg" class="image" ...
 [5] <a href="/wiki/File:Infant_mortality_map_of_the_world.svg" class="intern ...
 [6] <a href="#cite_note-1">[1]</a>
 [7] <a href="#cite_note-2">[2]</a>
 [8] <a href="#cite_note-3">[3]</a>
 [9] <a href="/wiki/Neonatal_infection" title="Neonatal infection">neonatal i ...
[10] <a href="#cite_note-4">[4]</a>
[11] <a href="#cite_note-pmid27467563-5">[5]</a>
[12] <a href="#cite_note-pmid26195213-6">[6]</a>
[13] <a href="/wiki/Sanitation" title="Sanitation">sanitation</a>
[14] <a href="/wiki/Immunization" title="Immunization">immunization</a>
[15] <a href="/wiki/Infectious_diseases" class="mw-redirect" title="Infectiou ...
[16] <a href="/wiki/Public_health" title="Public health">public health</a>
[17] <a href="/wiki/Child_mortality" titl

In [16]:
## Following a link to another page and fetching a table from that page.
## Open a browsing session on the article and follow the "Somalia" link.
session <- follow_link(
  html_session("https://en.wikipedia.org/wiki/Infant_mortality"),
  "Somalia"
)
## Parse the page the session now points to.
page <- read_html(session)
## Fourth <table> child of the content div, converted to a data frame.
table <- page %>%
  html_node(xpath = "//*[@id='mw-content-text']/div/table[4]")
regions <- table %>% html_table()
regions

Navigating to /wiki/Somalia


Region,Area (km2),Population,Capital
Awdal,21374,673263,Borama
Woqooyi Galbeed,28836,1242003,Hargeisa
Togdheer,38663,721363,Burao
Sanaag,53374,544123,Erigavo
Sool,25036,327428,Las Anod
Bari,70088,719512,Bosaso
Nugal,26180,392697,Garowe
Mudug,72933,717863,Galkayo
Galguduud,46126,569434,Dusmareb
Hiran,31510,520685,Beledweyne


## Regex filtering

In [21]:
## Collect the href attribute of every anchor on the article page;
## these are the links filtered in the regex examples.
page <- read_html("https://en.wikipedia.org/wiki/Infant_mortality")
wikilinks <- page %>%
  html_nodes("a") %>%
  html_attr("href")
wikilinks

In [23]:
## regex examples (each pattern overwrites the previous result in `links`)
## hrefs that start with /wiki
links <- grep("^/wiki", wikilinks, value = TRUE)
## hrefs that start with /wiki and end in two digits
links <- grep("^/wiki.*[0-9][0-9]$", wikilinks, value = TRUE)
## hrefs that start with /wiki and contain "File:"
links <- grep("^/wiki.*File:.*", wikilinks, value = TRUE)
## /wiki/ hrefs containing "Mortality"; the negative lookahead (PCRE, hence
## perl = TRUE) rejects any href that contains a colon
links <- grep("^(?!.*:)/wiki/.*Mortality", wikilinks, value = TRUE, perl = TRUE)
## "Mortality" or "Somalia". The alternation is grouped after the shared
## prefix so the ^ anchor and the no-colon lookahead apply to both words;
## in the form "^(?!.*:)(A)|(B)" they only guard the first alternative and
## the second one matches anywhere, including File:/Category: hrefs.
links <- grep("^(?!.*:)/wiki/.*(Mortality|Somalia)", wikilinks,
              value = TRUE, perl = TRUE)
links

In [26]:
# The same filter is often easier to read as a chain of simple steps:
# keep /wiki/ hrefs, keep those naming Mortality or Somalia, then drop
# every href that contains a colon.
links <- wikilinks %>%
  grep(pattern = "^/wiki/", value = TRUE) %>%
  grep(pattern = "Mortality|Somalia", value = TRUE) %>%
  grep(pattern = ":", value = TRUE, invert = TRUE)
links

In [28]:
# Select only internal /wiki/ links that mention mortality or somalia (case
# is ignored), excluding namespaced pages such as File: or Category:.
# The alternation is grouped after the shared prefix so that the ^ anchor and
# the no-colon lookahead cover both words; in the form "^(?!.*:)(A)|(B)" the
# second alternative is unanchored and lets File:/Category: hrefs through.
links <- grep("^(?!.*:)/wiki/.*(Mortality|Somalia)", wikilinks,
              ignore.case = TRUE, value = TRUE, perl = TRUE)
# The same href can appear several times on the page; keep one copy of each.
links <- unique(links)
links

In [30]:
# Move the session to the first filtered link and show the page title
session <- session %>% jump_to(links[1])
page <- read_html(session)
page %>% html_nodes("title")

{xml_nodeset (1)}
[1] <title>Child mortality - Wikipedia</title>\n

In [31]:
# Move the session to the fifth filtered link and show the page title
session <- session %>% jump_to(links[5])
page <- read_html(session)
page %>% html_nodes("title")

{xml_nodeset (1)}
[1] <title>Somalia - Wikipedia</title>\n