utils.go
package crawl

import (
	"net/url"
	"regexp"
	"strconv"
	"time"

	"github.com/CorentinB/Zeno/internal/pkg/utils"
	"github.com/sirupsen/logrus"
	"github.com/zeebo/xxh3"
)
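
// regexOutlinks is expected to be compiled elsewhere in the package
// before extractLinksFromText is called.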
var regexOutlinks *regexp.Regexp
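
// writeFrontierToDisk periodically persists the frontier to disk,
// once per minute, until the crawl is marked as finished.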
func (c *Crawl) writeFrontierToDisk() {
	for !c.Finished.Get() {
		c.Frontier.Save()
		time.Sleep(time.Minute)
	}
}
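
// crawlSpeedLimiter adapts the crawl to the number of in-flight requests:
// past 8x the worker count it pauses the crawl entirely, past 4x it
// throttles asset capture to a single concurrent asset, and otherwise it
// restores the configured concurrency and resumes normal operation.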
func (c *Crawl) crawlSpeedLimiter() {
	maxConcurrentAssets := c.MaxConcurrentAssets

	for {
		if c.Client.WaitGroup.Size() > c.Workers*8 {
			c.Paused.Set(true)
			c.Frontier.Paused.Set(true)
		} else if c.Client.WaitGroup.Size() > c.Workers*4 {
			c.MaxConcurrentAssets = 1
			c.Paused.Set(false)
			c.Frontier.Paused.Set(false)
		} else {
			c.MaxConcurrentAssets = maxConcurrentAssets
			c.Paused.Set(false)
			c.Frontier.Paused.Set(false)
		}

		time.Sleep(time.Second / 4)
	}
}
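
// handleCrawlPause checks free disk space at the job path every second,
// pausing the crawl whenever it drops to 20 GB or below and resuming it
// once space has been freed.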
func (c *Crawl) handleCrawlPause() {
	for {
		if float64(utils.GetFreeDiskSpace(c.JobPath).Avail)/float64(GB) <= 20 {
			logrus.Errorln("Not enough disk space. Please free some space and restart the crawler.")
			c.Paused.Set(true)
			c.Frontier.Paused.Set(true)
		} else {
			c.Paused.Set(false)
			c.Frontier.Paused.Set(false)
		}

		time.Sleep(time.Second)
	}
}
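
// seencheckURL returns true if the URL was already seen during this crawl;
// otherwise it records the URL's xxh3 hash as seen and returns false.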
func (c *Crawl) seencheckURL(URL string, URLType string) bool {
	hash := strconv.FormatUint(xxh3.HashString(URL), 10)

	found, _ := c.Frontier.Seencheck.IsSeen(hash)
	if found {
		return true
	}

	c.Frontier.Seencheck.Seen(hash, URLType)
	return false
}
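
// excludeHosts filters out URLs whose host appears in the crawl's
// list of excluded hosts.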
func (c *Crawl) excludeHosts(URLs []*url.URL) (output []*url.URL) {
	for _, URL := range URLs {
		if utils.StringInSlice(URL.Host, c.ExcludedHosts) {
			continue
		}

		output = append(output, URL)
	}

	return output
}
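
// extractLinksFromText extracts, dedupes, and validates the outlinks
// found in a text document.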
func extractLinksFromText(source string) (links []*url.URL) {
	// Extract links and dedupe them
	rawLinks := utils.DedupeStrings(regexOutlinks.FindAllString(source, -1))

	// Validate links
	for _, link := range rawLinks {
		URL, err := url.Parse(link)
		if err != nil {
			continue
		}

		err = utils.ValidateURL(URL)
		if err != nil {
			continue
		}

		links = append(links, URL)
	}

	return links
}
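
// shouldPause reports whether the given host already has as many active
// requests as the per-domain concurrency limit allows.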
func (c *Crawl) shouldPause(host string) bool {
	return c.Frontier.GetActiveHostCount(host) >= c.MaxConcurrentRequestsPerDomain
}
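
// isStatusCodeRedirect reports whether the given HTTP status code denotes
// a redirect (300, 301, 302, 307, or 308).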
func isStatusCodeRedirect(statusCode int) bool {
	if statusCode == 300 || statusCode == 301 ||
		statusCode == 302 || statusCode == 307 ||
		statusCode == 308 {
		return true
	}

	return false
}