-
Notifications
You must be signed in to change notification settings - Fork 3
/
outlinks.go
91 lines (74 loc) · 2.48 KB
/
outlinks.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
package crawl
import (
"net/url"
"strings"
"sync"
"github.com/CorentinB/Zeno/internal/pkg/frontier"
"github.com/CorentinB/Zeno/internal/pkg/utils"
"github.com/PuerkitoBio/goquery"
)
// extractOutlinks gathers the candidate outlinks of a parsed HTML document.
//
// Links are collected from three places, in this order:
//   - the href attribute of every <a> element,
//   - the src attribute of every <iframe> element,
//   - the text content of <body> (with <script> elements left out), which is
//     handed to extractLinksFromText.
//
// The combined list is then passed through utils.MakeAbsolute (using base),
// utils.RemoveFragments and utils.DedupeURLs before being returned.
//
// The returned error is always nil in the current implementation; it is kept
// in the signature so existing callers keep compiling.
func extractOutlinks(base *url.URL, doc *goquery.Document) (outlinks []*url.URL, err error) {
	var rawOutlinks []string

	// Extract outlinks from anchors.
	doc.Find("a").Each(func(_ int, item *goquery.Selection) {
		if link, exists := item.Attr("href"); exists {
			rawOutlinks = append(rawOutlinks, link)
		}
	})

	// Extract iframes as 'outlinks' as they usually can be treated as entirely
	// separate pages with entirely separate assets.
	doc.Find("iframe").Each(func(_ int, item *goquery.Selection) {
		if link, exists := item.Attr("src"); exists {
			rawOutlinks = append(rawOutlinks, link)
		}
	})

	// Turn strings into url.URL
	outlinks = utils.StringSliceToURLSlice(rawOutlinks)

	// Extract all text on the page and extract the outlinks from it.
	//
	// Selection.RemoveFiltered filters the selection itself (here the <body>
	// node, which never matches "script") and returns the *removed* nodes, so
	// calling Text() on its result always produced an empty string. Instead,
	// work on a detached deep copy of <body>, drop its <script> descendants,
	// and read the remaining text. Cloning keeps the caller's document intact.
	body := doc.Find("body").Clone()
	body.Find("script").Remove()
	textOutlinks := extractLinksFromText(body.Text())
	outlinks = append(outlinks, textOutlinks...)

	// Go over all outlinks and make sure they are absolute links
	outlinks = utils.MakeAbsolute(base, outlinks)

	// Hash (or fragment) URLs are navigational links pointing to the exact same
	// page; as such, they should not be treated as new outlinks.
	outlinks = utils.RemoveFragments(outlinks)

	return utils.DedupeURLs(outlinks), nil
}
// queueOutlinks filters the given outlinks and enqueues the surviving ones as
// new "seed" items whose parent is item.
//
// An outlink is dropped when its host is listed in c.ExcludedHosts or when its
// string form contains any entry of c.ExcludedStrings. A remaining outlink is
// enqueued with hop 0 when c.DomainsCrawl is set, item.Host contains the
// outlink's host and item itself is at hop 0; otherwise it is enqueued with
// hop item.Hop+1, provided that value does not exceed c.MaxHops. Items are
// sent to c.HQProducerChannel when c.UseHQ is set, else to c.Frontier.PushChan.
//
// wg.Done is called when the function returns.
func (c *Crawl) queueOutlinks(outlinks []*url.URL, item *frontier.Item, wg *sync.WaitGroup) {
	defer wg.Done()

candidates:
	for _, target := range outlinks {
		// Skip outlinks whose host is in the host exclusion list.
		if utils.StringInSlice(target.Host, c.ExcludedHosts) {
			continue
		}

		// Skip outlinks matching any excluded string.
		for _, needle := range c.ExcludedStrings {
			if strings.Contains(utils.URLToString(target), needle) {
				continue candidates
			}
		}

		// Decide which hop the new item gets, or skip it when it would go
		// past the hop limit.
		hop := item.Hop + 1
		switch {
		case c.DomainsCrawl && strings.Contains(item.Host, target.Host) && item.Hop == 0:
			// Same-domain link found on a hop-0 item: keep it at hop 0.
			hop = 0
		case c.MaxHops >= hop:
			// Within the hop budget: keep hop = item.Hop+1.
		default:
			continue
		}

		// Send the new item to the pool of workers.
		newItem := frontier.NewItem(target, item, "seed", hop, "")
		if c.UseHQ {
			c.HQProducerChannel <- newItem
			continue
		}
		c.Frontier.PushChan <- newItem
	}
}