/
main.moon
61 lines (44 loc) · 1.52 KB
/
main.moon
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import Scraper from require "moonscrape"
import is_relative_url, decode_html_entities from require "moonscrape.util"
import query_all from require "web_sanitize.query"
-- Crawl leafo.net, starting from the Lapis page and following every relative
-- link found on successfully fetched pages. Queued links are tagged "posts"
-- and/or "guides" when their href contains the matching path segment.
-- Returns whatever the scraper's run method returns.
leafonet = ->
  -- {tag, Lua pattern} pairs, checked in order against each discovered href
  tag_rules = {
    {"posts", "/posts/"}
    {"guides", "/guides/"}
  }

  crawler = Scraper {
    project: "leafo.net"
    -- NOTE(review): presumably a min/max delay between requests; confirm
    -- against the moonscrape documentation
    sleep: {0.2, 5.0}
  }

  -- Page callback: `parent` is the object new links are queued on, `page`
  -- carries the response `status` and `body`.
  on_page = (parent, page) =>
    return unless page.status == 200
    -- directory listings are recognised by the server banner and skipped
    return if page.body\match "Proudly Served by LiteSpeed Web Server"

    for anchor in *query_all page.body, "a"
      target = anchor.attr and anchor.attr.href
      target = decode_html_entities target if target
      -- only relative links are followed
      continue unless target and is_relative_url target

      tags = {}
      for {tag, pattern} in *tag_rules
        table.insert tags, tag if target\match pattern

      parent\queue { url: target, tags: tags }, on_page

  crawler\queue "http://leafo.net/lapis", on_page
  -- crawler\queue "http://localhost/blog2/www/", on_page
  crawler\run!
-- Crawl luarocks.org from its front page, following every relative link.
-- URLs matching any of the skip patterns below are rejected by the scraper's
-- `filter_url` hook. Returns whatever the scraper's run method returns.
moonrocks = ->
  -- Lua patterns for URLs that must not be crawled: the register and login
  -- pages, plus anything ending in "rockspec" or "rock"
  skip_patterns = {
    "/register"
    "/login"
    "rockspec$"
    "rock$"
  }

  crawler = Scraper {
    project: "moonrocks"
    -- false rejects the URL, true accepts it
    filter_url: (url) =>
      for pattern in *skip_patterns
        return false if url\match pattern
      true
  }

  -- Page callback: queue every relative link found in the page body.
  -- NOTE(review): unlike the leafo.net crawler, there is no status check here.
  on_page = (parent, page) =>
    for anchor in *query_all page.body, "a"
      target = anchor.attr and anchor.attr.href
      target = decode_html_entities target if target
      continue unless target and is_relative_url target
      parent\queue target, on_page

  crawler\queue "https://luarocks.org", on_page
  crawler\run!
-- Entry point: run the leafo.net crawl. The luarocks.org crawl is disabled;
-- uncomment the call below to run it as well.
leafonet()
-- moonrocks()