/
worker.go
44 lines (36 loc) 路 933 Bytes
/
worker.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
package crawl
import (
"time"
"github.com/CorentinB/Zeno/internal/pkg/utils"
)
const (
// B represent a Byte
B = 1
// KB represent a Kilobyte
KB = 1024 * B
// MB represent a MegaByte
MB = 1024 * KB
// GB represent a GigaByte
GB = 1024 * MB
)
// Worker is the key component of a crawl, it's a background processed dispatched
// when the crawl starts, it listens on a channel to get new URLs to archive,
// and eventually push newly discovered URLs back in the frontier.
func (c *Crawl) Worker() {
defer c.WorkerPool.Done()
// Start archiving the URLs!
for item := range c.Frontier.PullChan {
item := item
// Check if the crawl is paused
for c.Paused.Get() {
time.Sleep(time.Second)
}
// If the host of the item is in the host exclusion list, we skip it
if utils.StringInSlice(item.Host, c.ExcludedHosts) {
continue
}
c.ActiveWorkers.Incr(1)
c.Capture(item)
c.ActiveWorkers.Incr(-1)
}
}