colly.go
package search_clients

import (
	"github.com/PuerkitoBio/goquery"
	colly "github.com/gocolly/colly/v2"
)

// Aliases for the underlying colly types, so callers of this package do not
// need to import colly directly.
type (
	HTMLElement      = colly.HTMLElement
	Request          = colly.Request
	HTMLCallback     = colly.HTMLCallback
	RequestCallback  = colly.RequestCallback
	ResponseCallback = colly.ResponseCallback
	ErrorCallback    = colly.ErrorCallback
)

// Website holds the data collected from a single scraped page.
type Website struct {
	Title       string
	URL         string
	Links       []string
	MIME        string
	HTMLContent string
	TextContent string
}

// GetTitle returns the page title.
func (w *Website) GetTitle() string {
	return w.Title
}

// GetURL returns the page URL.
func (w *Website) GetURL() string {
	return w.URL
}

// GetLinks returns the links found on the page.
func (w *Website) GetLinks() []string {
	return w.Links
}

// GetMIME returns the Content-Type reported for the page.
func (w *Website) GetMIME() string {
	return w.MIME
}

// GetHTMLContent returns the raw HTML of the page.
func (w *Website) GetHTMLContent() string {
	return w.HTMLContent
}

// GetTextContent returns the visible text of the page.
func (w *Website) GetTextContent() string {
	return w.TextContent
}

// Scraper wraps the Colly collector used to fetch pages.
type Scraper struct {
	c *colly.Collector
}

// NewScraper creates a new Scraper.
func NewScraper() *Scraper {
	return &Scraper{
		c: colly.NewCollector(),
	}
}

// NewScraperWithCache creates a new Scraper that caches responses in cacheDirectory.
func NewScraperWithCache(cacheDirectory string) *Scraper {
	return &Scraper{
		c: colly.NewCollector(colly.CacheDir(cacheDirectory)),
	}
}

// Scrape recursively scrapes pages starting from entryURL, following the
// links found on each page for up to maxDepth levels. Pages that fail to
// load are skipped, so the returned error is currently always nil.
func (s *Scraper) Scrape(entryURL string, maxDepth int) ([]Website, error) {
	websites := s.recursiveScrape(entryURL, []Website{}, maxDepth)
	return websites, nil
}

// recursiveScrape scrapes url, appends the result to websites, then recurses
// into every link on the page with a decremented depth.
func (s *Scraper) recursiveScrape(url string, websites []Website, maxDepth int) []Website {
	if maxDepth <= 0 {
		return websites
	}
	// Pages that cannot be scraped are skipped rather than aborting the crawl.
	website, _ := s.ScrapePage(url)
	if website == nil {
		return websites
	}
	websites = append(websites, *website)
	for _, link := range website.Links {
		websites = s.recursiveScrape(link, websites, maxDepth-1)
	}
	return websites
}
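
// exampleDepth is an illustrative sketch, not part of the original file: it
// shows how maxDepth bounds the recursion above. With a depth of 1 only the
// entry page is scraped; with a depth of 2 every page it links to is scraped
// as well. The URL is a placeholder.
func exampleDepth() {
	s := NewScraper()
	// Entry page plus one level of outgoing links.
	pages, _ := s.Scrape("https://example.com", 2)
	_ = pages
}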

// ScrapePages scrapes every URL in urls. Pages that fail to load are skipped;
// the last error encountered, if any, is returned alongside the successful
// results.
func (s *Scraper) ScrapePages(urls []string) ([]Website, error) {
	var err error
	var websites []Website
	for _, url := range urls {
		website, pageErr := s.ScrapePage(url)
		if pageErr != nil {
			// Record the failure but keep scraping the remaining URLs.
			err = pageErr
			continue
		}
		websites = append(websites, *website)
	}
	return websites, err
}

// ScrapePage scrapes a single page and returns its title, links, MIME type,
// raw HTML and text content. Callbacks are registered on the collector shared
// by this Scraper.
func (s *Scraper) ScrapePage(url string) (*Website, error) {
	var pageErr error
	var w Website
	w.URL = url

	// Collect every link found on the page.
	s.onHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		w.Links = append(w.Links, link)
	})

	// Capture the title, raw HTML and visible text of the document.
	s.onHTML("html", func(e *colly.HTMLElement) {
		w.Title = e.DOM.Find("title").Text()
		w.HTMLContent, pageErr = e.DOM.Html()
		w.TextContent = s.getTextContent(*e)
	})

	// Record the MIME type reported by the server.
	s.onResponse(func(r *colly.Response) {
		w.MIME = r.Headers.Get("Content-Type")
	})

	// Remember any error raised while fetching the page.
	s.onError(func(r *colly.Response, err error) {
		pageErr = err
	})

	// The callbacks above only run during the visit, so errors are checked
	// after it completes.
	if err := s.visit(url); err != nil {
		return nil, err
	}
	if pageErr != nil {
		return nil, pageErr
	}
	return &w, nil
}

// getTextContent extracts the visible text from an HTML element by stripping
// elements that do not contribute readable content.
func (s *Scraper) getTextContent(e HTMLElement) string {
	doc := goquery.NewDocumentFromNode(e.DOM.Nodes[0])
	doc.Find("*").Each(func(i int, sel *goquery.Selection) {
		// Drop scripts, styles and other non-content elements.
		if sel.Is("script, style, head, iframe, input, textarea") {
			sel.Remove()
		}
	})
	return doc.Text()
}

// visit asks the underlying collector to fetch the given URL, which triggers
// the registered callbacks.
func (s *Scraper) visit(url string) error {
	return s.c.Visit(url)
}

// onHTML registers a callback that runs for every element matching selector.
func (s *Scraper) onHTML(selector string, f HTMLCallback) {
	s.c.OnHTML(selector, f)
}

// onError registers a callback that runs when a request fails.
func (s *Scraper) onError(f ErrorCallback) {
	s.c.OnError(f)
}

// onResponse registers a callback that runs after a response is received.
func (s *Scraper) onResponse(f ResponseCallback) {
	s.c.OnResponse(f)
}
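
// exampleUsage is an illustrative sketch, not part of the original file: it
// shows the intended call pattern for Scraper. The URLs and cache directory
// are placeholders.
func exampleUsage() {
	// A scraper that caches responses on disk between runs.
	s := NewScraperWithCache("./colly-cache")

	// Scrape a single page and inspect what was collected.
	page, err := s.ScrapePage("https://example.com")
	if err == nil {
		_ = page.GetTitle()
		_ = page.GetLinks()
		_ = page.GetTextContent()
	}

	// Scrape several pages; pages that fail to load are skipped.
	pages, _ := s.ScrapePages([]string{"https://example.com", "https://example.org"})
	_ = pages
}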