# programmableweb-Scraper.R
# Scraping the API directory listing from programmableweb.com
## ---------------------------------------------------------------------- ##
# Install/load packages
## ---------------------------------------------------------------------- ##
# Bootstrap pacman, then let it install/attach the packages used below.
# require() returns FALSE (instead of erroring) when pacman is missing; in
# that case it has to be installed AND attached, otherwise the unqualified
# p_load() call below is not found on a fresh machine.
if (!require("pacman")) {
  install.packages("pacman")
  library(pacman)
}
p_load(tidyverse, lubridate, rvest)
# Scrape
## ---------------------------------------------------------------------- ##
# URLs of all listing pages of the API directory (page index 0 to 726).
# paste0() is vectorised over the page index, so no explicit map is needed;
# the result is an unnamed character vector with one URL per page.
all.urls <- paste0(
  "https://www.programmableweb.com/category/all/apis?page=",
  0:726
)
# Fetch one listing page and return its tables.
#   x: URL of the page to read.
# Returns the result of html_table() applied to every node matching the CSS
# selector ".views-table" (a list with one table per matched node), using the
# first row as header and filling ragged rows.
api_fun <- function(x) {
  page <- read_html(x)
  table.nodes <- html_nodes(page, ".views-table")
  html_table(table.nodes, header = TRUE, fill = TRUE)
}
# Map function over all pages
# (1) Allocate space: one (initially NULL) slot per listing page
api.list <- vector(mode = "list", length = length(all.urls))
# (2) Run
# A single failed request must not abort the whole crawl, so errors raised by
# api_fun() are downgraded to a warning and the page's slot is left as NULL.
for (i in seq_along(all.urls)) {
  page.tables <- tryCatch(
    api_fun(all.urls[[i]]),
    error = function(e) {
      warning(
        "Skipping ", all.urls[[i]], ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  # api_fun() returns a list of tables. Store the first one; when the page
  # yields no table the slot stays NULL (assigning a zero-length list with
  # `[<-` would raise "replacement has length zero").
  if (length(page.tables) > 0) {
    api.list[[i]] <- page.tables[[1]]
  }
  # Random pause of 0 to 3 seconds (0.5 s steps) between requests
  Sys.sleep(sample(seq(0, 3, 0.5), 1))
}
# (3) Combine the per-page tables into one data frame, then replace the
# Submitted column by a number built from the first run of four digits that
# is preceded by at least one character (NA where no such run exists).
# NOTE(review): presumably this is the year of a date string -- confirm
# against the scraped table.
api.df <- bind_rows(api.list)
api.df <- mutate(
  api.df,
  Submitted = as.numeric(str_extract(Submitted, "(?<=.)[:digit:]{4}"))
)
# Data wrangling and visualization
## ---------------------------------------------------------------------- ##
# Prepare data for plotting: number of rows per Submitted value plus the
# running total, ordered by Submitted. Rows without a Submitted value are
# dropped before counting, so they do not enter the cumulative sum.
api.plot.df <- api.df %>%
  filter(!is.na(Submitted)) %>%
  group_by(Submitted) %>%
  # tally() counts rows per group and drops the (single) grouping level
  tally() %>%
  arrange(Submitted) %>%
  mutate(cumulative.n = cumsum(n))
# Plot: cumulative count (cumulative.n) against Submitted as a line chart.
# NOTE(review): the title reads "Anzahl and APIs" (probably meant
# "Anzahl an APIs") and the caption cites programmableweb.org while the
# scraped site is programmableweb.com -- confirm before publishing.
ggplot(api.plot.df, aes(x = Submitted, y = cumulative.n)) +
  geom_line(stat = "identity") +
  theme_minimal() +
  labs(title = "Anzahl and APIs auf programmableweb.com",
       subtitle = "Eine steigende Anzahl an Unternehmen bietet APIs an.",
       caption = "Quelle: programmableweb.org",
       x = "",
       y = "") +
  # theme() has to be chained with `+`; as a separate statement it is
  # evaluated on its own and never applied to the plot. It comes after
  # theme_minimal() so the complete theme does not override these settings.
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.major.x = element_blank(),
        text = element_text(size = 14),
        axis.ticks = element_line(size = .5))
# Save data
## ---------------------------------------------------------------------- ##
# Persist the raw per-page list and the aggregated plotting data.
# Both files are written in RDS format (read back with readRDS()) even though
# the file names end in ".R".
saveRDS(
  object = api.list,
  file = "./Slides/Figures/programmableweb_data.R"
)
saveRDS(
  object = api.plot.df,
  file = "./Slides/Figures/api_plot_df.R"
)