// Forked from markusmobius/go-trafilatura (utils.go).
// This file is part of go-trafilatura, Go package for extracting readable
// content, comments and metadata from a web page. Source available in
// <https://github.com/AlirezaNeGe/go-trafilatura>.
// Copyright (C) 2021 Markus Mobius
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by the
// Free Software Foundation, either version 3 of the License, or (at your
// option) any later version.
//
// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program. If not, see <https://www.gnu.org/licenses/>.
package main
import (
nurl "net/url"
"os"
"path"
"regexp"
"strings"
)
// fileExists reports whether something exists at filePath.
//
// Note: any os.Stat error (including permission errors) is treated as
// "does not exist"; callers that must distinguish those cases should
// call os.Stat directly and inspect the error.
func fileExists(filePath string) bool {
	// The parameter is named filePath rather than path so that it does
	// not shadow the imported "path" package.
	_, err := os.Stat(filePath)
	return err == nil
}
// validateURL parses url and reports whether it is a valid absolute
// HTTP(S) URL. On success it returns the parsed URL and true; on any
// parse failure or non-HTTP(S) scheme it returns nil and false.
func validateURL(url string) (*nurl.URL, bool) {
	parsed, err := nurl.ParseRequestURI(url)
	if err != nil {
		return nil, false
	}

	// Only web URLs are accepted.
	switch parsed.Scheme {
	case "http", "https":
		return parsed, true
	default:
		return nil, false
	}
}
// isValidURL reports whether url is a valid absolute HTTP(S) URL.
func isValidURL(url string) bool {
	if _, ok := validateURL(url); ok {
		return true
	}
	return false
}
// sliceToMap converts a list of strings into a set, i.e. a map whose
// keys are the given items and whose values are empty structs.
// Duplicate items collapse into a single key.
func sliceToMap(items ...string) map[string]struct{} {
	// The parameter is named items rather than strings so that it does
	// not shadow the imported "strings" package; the map is pre-sized
	// since the final length is bounded by len(items).
	result := make(map[string]struct{}, len(items))
	for _, item := range items {
		result[item] = struct{}{}
	}
	return result
}
// rxFromString compiles str into a regular expression. An empty string
// is treated as "no pattern" and yields a nil *regexp.Regexp with a nil
// error; an invalid pattern returns the compile error.
func rxFromString(str string) (*regexp.Regexp, error) {
	if len(str) == 0 {
		return nil, nil
	}

	rx, err := regexp.Compile(str)
	if err != nil {
		return nil, err
	}
	return rx, nil
}
func nameFromURL(url *nurl.URL) string {
urlPath := strings.Trim(url.Path, "/")
domain := strings.TrimPrefix(url.Hostname(), "www.")
newName := strings.ReplaceAll(domain, ".", "-")
if urlPath != "" {
urlPath = path.Base(urlPath)
urlPath = strings.ReplaceAll(urlPath, "/", "-")
urlPath = strings.ReplaceAll(urlPath, ".", "-")
newName += "-" + urlPath
}
return newName
}
// createAbsoluteURL convert url to absolute path based on base.
// However, if url is prefixed with hash (#), the url won't be changed.
func createAbsoluteURL(url string, base *nurl.URL) string {
if url == "" || base == nil {
return url
}
// If it is hash tag, return as it is
if strings.HasPrefix(url, "#") {
return url
}
// If it is data URI, return as it is
if strings.HasPrefix(url, "data:") {
return url
}
// If it is javascript URI, return as it is
if strings.HasPrefix(url, "javascript:") {
return url
}
// If it is already an absolute URL, return as it is
tmp, err := nurl.ParseRequestURI(url)
if err == nil && tmp.Scheme != "" && tmp.Hostname() != "" {
return url
}
// Otherwise, resolve against base URI.
// Normalize URL first.
if !strings.HasPrefix(url, "/") {
url = path.Join(base.Path, url)
}
tmp, err = nurl.Parse(url)
if err != nil {
return url
}
return base.ResolveReference(tmp).String()
}