forked from michenriksen/aquatone
/
url_page_title_extractor.go
56 lines (45 loc) · 1.36 KB
/
url_page_title_extractor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
package agents
import (
"bytes"
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/michenriksen/aquatone/core"
)
// URLPageTitleExtractor is an agent that fills in the PageTitle of a page
// from the HTML body file stored for that page in the session.
type URLPageTitleExtractor struct {
	// session is the session passed to Register; the event handler uses it
	// for logging, page lookup, and reading the HTML body file.
	session *core.Session
}
// NewURLPageTitleExtractor returns a pointer to a zero-valued
// URLPageTitleExtractor. The agent has no session until Register is called.
func NewURLPageTitleExtractor() *URLPageTitleExtractor {
	return new(URLPageTitleExtractor)
}
// ID returns the fixed identifier string of this agent. The handler uses it
// as the prefix of its debug log messages.
func (a *URLPageTitleExtractor) ID() string {
	const agentID = "agent:url_page_title_extractor"
	return agentID
}
// Register attaches the agent to session s and subscribes OnURLResponsive
// to core.URLResponsive events on the session's event bus. It always
// returns nil.
func (a *URLPageTitleExtractor) Register(s *core.Session) error {
	// Store the session before subscribing: OnURLResponsive dereferences
	// a.session, so it must be set before any event can be delivered.
	a.session = s
	s.EventBus.SubscribeAsync(core.URLResponsive, a.OnURLResponsive, false)
	return nil
}
// OnURLResponsive handles a core.URLResponsive event for url. It looks up
// the page for url in the session and, in a separate goroutine tracked by
// the session's WaitGroup, parses the page's HTML body file and stores the
// trimmed text of its first <title> element in page.PageTitle.
//
// If the session has no page for url, an error is logged and nothing else
// happens. Failures to read or parse the HTML body file are logged at debug
// level and leave page.PageTitle unchanged.
func (a *URLPageTitleExtractor) OnURLResponsive(url string) {
	a.session.Out.Debug("[%s] Received new responsive URL %s\n", a.ID(), url)

	page := a.session.GetPage(url)
	if page == nil {
		a.session.Out.Error("Unable to find page for URL: %s\n", url)
		return
	}

	a.session.WaitGroup.Add()
	go func(page *core.Page) {
		defer a.session.WaitGroup.Done()

		body, err := a.session.ReadFile(fmt.Sprintf("html/%s.html", page.BaseFilename()))
		if err != nil {
			a.session.Out.Debug("[%s] Error reading HTML body file for %s: %s\n", a.ID(), page.URL, err)
			return
		}

		doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
		if err != nil {
			a.session.Out.Debug("[%s] Error when parsing HTML body file for %s: %s\n", a.ID(), page.URL, err)
			return
		}

		// Text() concatenates the text of every matched element, so a page
		// with additional <title> elements (for example inside inline SVG)
		// would otherwise yield a run-together title. Only the first match
		// in document order is used.
		page.PageTitle = strings.TrimSpace(doc.Find("Title").First().Text())
	}(page)
}