forked from esrrhs/go-engine
/
simplecrawl.go
101 lines (87 loc) · 2.28 KB
/
simplecrawl.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
package spider
import (
"crypto/tls"
"github.com/3t2ugg1e/go-engine/src/loggo"
"github.com/PuerkitoBio/goquery"
"github.com/axgle/mahonia"
"net/http"
"strings"
"time"
)
// simplecrawl fetches ui.Url over HTTP(S), parses the returned HTML and
// builds a PageInfo containing the page <title> and one PageLinkInfo per
// <a href> found, each at depth ui.Deps+1. crawlTimeout is the whole-request
// timeout in seconds. TLS certificate verification is deliberately disabled
// so sites with self-signed/broken certs can still be crawled. Pages that
// declare a gb2312 charset are transcoded from GBK to UTF-8. Returns nil on
// any network, HTTP-status or parse failure. The result is passed through
// ctx.Crawl before returning (project hook; exact semantics defined
// elsewhere — presumably caller-specific filtering/enrichment).
func simplecrawl(ui *URLInfo, crawlTimeout int, ctx *Content) *PageInfo {
	url := ui.Url
	loggo.Info("start simple crawl %v", url)

	// InsecureSkipVerify on purpose: the crawler must follow links into
	// sites whose certificates would not pass verification.
	tr := &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}
	client := &http.Client{
		Transport: tr,
		Timeout:   time.Duration(crawlTimeout) * time.Second,
	}
	defer client.CloseIdleConnections()

	res, err := client.Get(url)
	if err != nil {
		loggo.Info("simple crawl http Get fail %v %v", url, err)
		return nil
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		loggo.Info("simple crawl http StatusCode fail %v %v", url, res.StatusCode)
		return nil
	}

	// Load the HTML document.
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		loggo.Info("simple crawl http NewDocumentFromReader fail %v %v", url, err)
		return nil
	}

	// Detect a gb2312 charset declaration in any <meta> content attribute.
	// Charset names in HTML are case-insensitive, so lowercase before
	// matching (the original missed pages declaring "GB2312").
	gb2312 := false
	doc.Find("META").Each(func(i int, s *goquery.Selection) {
		if content, ok := s.Attr("content"); ok {
			if strings.Contains(strings.ToLower(content), "gb2312") {
				gb2312 = true
			}
		}
	})

	// Build the GBK decoder once instead of once per title/link.
	var dec mahonia.Decoder
	if gb2312 {
		dec = mahonia.NewDecoder("gbk")
	}

	pg := &PageInfo{}
	pg.UI = *ui

	// Take the first <title> element as the page title.
	doc.Find("title").Each(func(i int, s *goquery.Selection) {
		if pg.Title == "" {
			pg.Title = strings.TrimSpace(s.Text())
			if gb2312 {
				pg.Title = dec.ConvertString(pg.Title)
			}
			//loggo.Info("simple simple crawl title %v", pg.Title)
		}
	})

	// Collect every non-empty <a href> as a child link one level deeper.
	doc.Find("a").Each(func(i int, s *goquery.Selection) {
		href, ok := s.Attr("href")
		if !ok {
			return
		}
		href = strings.TrimSpace(href)
		name := strings.TrimSpace(s.Text())
		name = strings.ReplaceAll(name, "\n", " ")
		if gb2312 {
			href = dec.ConvertString(href)
			name = dec.ConvertString(name)
		}
		//loggo.Info("simple simple crawl link %v %v %v %v", i, pg.Title, name, href)
		if len(href) > 0 {
			pgl := PageLinkInfo{URLInfo{href, ui.Deps + 1}, name}
			pg.Son = append(pg.Son, pgl)
		}
	})

	// Let the project hook post-process the crawled page.
	pg = ctx.Crawl(pg, doc)
	//if len(pg.Son) == 0 {
	//	html, _ := doc.Html()
	//	loggo.Info("simple simple crawl no link %v html:\n%v", url, html)
	//}
	return pg
}