forked from markusmobius/go-trafilatura
/
main.go
136 lines (113 loc) · 3.6 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
package main
import (
"fmt"
"io/fs"
"os"
fp "path/filepath"
"time"
"github.com/go-shiori/dom"
"github.com/go-shiori/go-readability"
distiller "github.com/AlirezaNeGe/go-domdistiller"
"github.com/AlirezaNeGe/go-htmldate"
"github.com/AlirezaNeGe/go-trafilatura"
"golang.org/x/net/html"
)
// main benchmarks each processing stage (HTML parsing, readability,
// dom-distiller, trafilatura and htmldate) over every file returned by
// getFileList, then prints the accumulated time and throughput per stage.
func main() {
	// Find file names
	filePaths, err := getFileList()
	checkError(err)

	// Accumulated wall-clock time per stage, summed over all documents.
	var (
		nDocument        int
		parseTime        time.Duration
		readabilityTime  time.Duration
		domDistillerTime time.Duration
		trafilaturaTime  time.Duration
		dateTime         time.Duration
	)

	// Process each path. Only parse failures abort the run; extractor errors
	// are deliberately not fatal so a single bad document does not stop the
	// benchmark.
	for _, path := range filePaths {
		nDocument++

		// Parse file
		start := time.Now()
		doc, err := parseFile(path)
		checkError(err)
		parseTime += time.Since(start)

		// Use readability. The error is ignored on purpose: only the elapsed
		// time and the resulting node (whatever it is) are needed below.
		start = time.Now()
		readabilityResult, _ := readability.FromDocument(doc, nil)
		readabilityTime += time.Since(start)

		// Use dom distiller
		start = time.Now()
		distillerOpts := &distiller.Options{SkipPagination: true}
		distillerResult, distillerErr := distiller.Apply(doc, distillerOpts)
		domDistillerTime += time.Since(start)

		// Use trafilatura, handing it the fallback candidates computed above
		// so they are not counted again in its timing.
		start = time.Now()
		fallback := &trafilatura.FallbackConfig{
			HasReadability:      true,
			ReadabilityFallback: readabilityResult.Node,
			HasDistiller:        true,
		}
		// Only touch the distiller result when the call succeeded; reading
		// its Node after a failure could dereference a nil result.
		if distillerErr == nil {
			fallback.DistillerFallback = distillerResult.Node
		}
		trafilaturaOpts := trafilatura.Options{FallbackCandidates: fallback}
		trafilatura.ExtractDocument(doc, trafilaturaOpts)
		trafilaturaTime += time.Since(start)

		// Use html date
		start = time.Now()

		// Last modified date
		dateOpts := htmldate.Options{}
		htmldate.FromDocument(doc, dateOpts)

		// Publish date
		dateOpts.UseOriginalDate = true
		htmldate.FromDocument(doc, dateOpts)
		dateTime += time.Since(start)
	}

	// Print message
	parseDuration := parseTime.Seconds()
	readabilityDuration := readabilityTime.Seconds()
	domDistillerDuration := domDistillerTime.Seconds()
	trafilaturaDuration := trafilaturaTime.Seconds()
	dateDuration := dateTime.Seconds()
	totalDuration := parseDuration + readabilityDuration +
		domDistillerDuration + trafilaturaDuration + dateDuration

	// docsPerSecond reports throughput for a stage, returning 0 instead of
	// NaN/Inf when no time was recorded (e.g. no documents were found).
	docsPerSecond := func(seconds float64) float64 {
		if seconds <= 0 {
			return 0
		}
		return float64(nDocument) / seconds
	}

	fmt.Printf("N document : %d\n", nDocument)
	fmt.Printf("Parsing : %.3f s (%.3f doc/s)\n", parseDuration, docsPerSecond(parseDuration))
	fmt.Printf("Readability: %.3f s (%.3f doc/s)\n", readabilityDuration, docsPerSecond(readabilityDuration))
	fmt.Printf("Distiller : %.3f s (%.3f doc/s)\n", domDistillerDuration, docsPerSecond(domDistillerDuration))
	fmt.Printf("Trafilatura: %.3f s (%.3f doc/s)\n", trafilaturaDuration, docsPerSecond(trafilaturaDuration))
	fmt.Printf("HtmlDate : %.3f s (%.3f doc/s)\n", dateDuration, docsPerSecond(dateDuration))
	fmt.Printf("Total : %.3f s (%.3f doc/s)\n", totalDuration, docsPerSecond(totalDuration))
}
// getFileList walks the "test-files" directory tree and returns the paths of
// all non-directory entries whose extension is ".html", in walk order.
//
// Any error reported while walking (for example a missing or unreadable
// directory) stops the walk and is returned with a nil slice.
func getFileList() ([]string, error) {
	var filePaths []string

	err := fp.WalkDir("test-files", func(path string, d fs.DirEntry, err error) error {
		// When the walk reports an error the entry may be nil, so the error
		// must be handled before the entry is inspected.
		if err != nil {
			return err
		}

		if !d.IsDir() && fp.Ext(path) == ".html" {
			filePaths = append(filePaths, path)
		}
		return nil
	})
	if err != nil {
		return nil, err
	}

	return filePaths, nil
}
// parseFile opens the file at path and hands it to dom.Parse, returning
// whatever document and error that call produces. The file is closed before
// the function returns. An error from opening the file is returned as-is
// with a nil document.
func parseFile(path string) (*html.Node, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}
	defer file.Close()

	doc, parseErr := dom.Parse(file)
	return doc, parseErr
}
// checkError panics with err when it is non-nil and does nothing otherwise.
func checkError(err error) {
	if err == nil {
		return
	}
	panic(err)
}