# frozen_string_literal: true
require "net/http"
require "jekyll/hooks"
require "jekyll/document"
require "json"
require "set"
require "uri"
require "pathname"
##
# This singleton checks links during build to warn or fail upon finding dead links.
#
# `JEKYLL_LINK_CHECKER`, set on the environment, enables link verification during the build.
# Valid values: internal, forced, all, retry.
# Usage: `JEKYLL_LINK_CHECKER=internal bundle exec jekyll build --trace`
#
# `JEKYLL_FATAL_LINK_CHECKER`, set on the environment, is the same as `JEKYLL_LINK_CHECKER`
# except that it fails the build if broken links are found. It takes the same valid values.
# Usage: `JEKYLL_FATAL_LINK_CHECKER=internal bundle exec jekyll build --trace`
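#
# For example (an assumed invocation, mirroring the usage lines above), to check every link,
# honor Retry-After back-off, and fail the build on dead links:
# `JEKYLL_FATAL_LINK_CHECKER=retry bundle exec jekyll build --trace`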
module Jekyll::LinkChecker
##
# The collection of gathered links; maps each URL to the set of pages that reference it
@urls
##
# Pattern to identify documents that should be excluded based on their URL
@excluded_paths = /(\/_faqs\/|\.(css|js|json|map|xml|txt|yml|svg)$)/i.freeze
##
# Pattern to extract the href target from anchor tags
@href_matcher = /<a[^>]+href=(['"])(.+?)\1/im.freeze
##
# Pattern to check for external URLs
@external_matcher = /^https?:\/\//.freeze
##
# Pattern to identify absolute URLs that point back to opensearch.org (internal links written as external)
@forced_external_matcher = /^https?:\/\/.*(?=opensearch\.org\/)/.freeze
##
# List of domains to ignore
@ignored_domains = %w[localhost]
##
# Pattern of local paths to ignore
@ignored_paths = /(^\/docs$|^mailto:|^\/javadocs\/)/.freeze
##
# Valid response codes for successful links
@success_codes = %w[200 302]
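# Note: 302 responses are counted as successful without following the redirect target (see check_external)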
##
# Response codes that are treated as successful but flagged as questionable
@@questionable_codes = %w[301 403]
##
# Response codes that indicate the request should be retried (rate limiting)
@@retry_codes = %w[429]
##
# Holds the list of failures
@failures
##
# Build flags driven by environment variables
@@LINK_CHECKER_STATES = ['internal', 'forced', 'all', 'retry']
@check_links # Enables the link checker
@check_forced_external # Enables checking internal links marked as external e.g. /docs
@check_external_links # Enables checking external links
@retry_external_links # Enables retrying external links
@should_build_fatally # Indicates that the build should fail when dead links are found
##
# Back-off deadlines, keyed by host, for hosts that have asked us to retry
@retry_timeouts_dict = {}
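# Illustrative entry shape (hostname is a placeholder), matching what check_external stores:
#   "example.org" => { :code => "429", :retry_timestamp => <monotonic clock + Retry-After seconds> }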
@retry_iteration = 0
@@retry_buffer = 10
@@max_retry_iterations = 10
##
# Initializes the singleton by recording the site and reading the link-checker environment flags
def self.init(site)
@site = site
@urls = {}
@failures = []
@retry_timeouts_dict = {}
begin
@should_build_fatally = true if ENV.key?('JEKYLL_FATAL_LINK_CHECKER')
check_flag = @should_build_fatally ? ENV['JEKYLL_FATAL_LINK_CHECKER'] : ENV['JEKYLL_LINK_CHECKER']
return unless check_flag
unless @@LINK_CHECKER_STATES.include?(check_flag)
Jekyll.logger.info "LinkChecker: [Notice] Could not initialize. Valid values for #{@should_build_fatally ? 'JEKYLL_FATAL_LINK_CHECKER' : 'JEKYLL_LINK_CHECKER'} are #{@@LINK_CHECKER_STATES}"
return
end
@check_links = true if @@LINK_CHECKER_STATES.include?(check_flag)
@check_forced_external = true if @@LINK_CHECKER_STATES[1..3].include?(check_flag)
@check_external_links = true if @@LINK_CHECKER_STATES[2..3].include?(check_flag)
@retry_external_links = true if @@LINK_CHECKER_STATES[3..3].include?(check_flag)
msg = {
'internal' => 'internal links',
'forced' => 'internal and forced external links',
'all' => 'all links',
'retry' => 'all links with retry',
}
Jekyll.logger.info "LinkChecker: [Notice] Initialized successfully and will check #{msg[check_flag]}" if @check_links
Jekyll.logger.info "LinkChecker: [Notice] The build will fail if a dead link is found" if @should_build_fatally
rescue => exception
Jekyll.logger.error "LinkChecker: [Error] Failed to initialize Link Checker"
raise
end
end
##
# Processes a Document or Page and adds the links to a collection
# It also checks for anchors to parts of the same page/doc
def self.process(page)
return unless @check_links
return if @excluded_paths.match(page.path)
hrefs = page.content.scan(@href_matcher)
hrefs.each do |(_, href)|
relative_path = page.path[0] == '/' ? Pathname.new(page.path).relative_path_from(Dir.getwd) : page.path
if href.eql? '#'
next
elsif href.start_with? '#'
if (page.content =~ /<[a-z0-9-]+[^>]+(?:id|name)="#{href[1..]}"/i).nil?
Jekyll.logger.info relative_path
@failures << "Process:: ##{href[1..]}, linked in ./#{relative_path}"
end
else
@urls[href] = Set[] unless @urls.key?(href)
@urls[href] << relative_path
end
end
end
##
# Verifies every collected link once the site has been written
def self.verify(site)
return unless @check_links
@base_url_matcher = /^#{@site.config["url"]}#{@site.baseurl}(\/.*)$/.freeze
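# Absolute URLs that point at this site itself are reduced to their path in check() and verified as internal links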
retry_hosts = {}
# Run at least once
loop do
urls = @urls
# If this is a retry pass
unless retry_hosts.empty?
# Find the host with the earliest retry deadline and sleep until that deadline (plus a buffer) has passed
host_name, min_timeout_obj = @retry_timeouts_dict.min_by { |k,v| v[:retry_timestamp] }
now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
sleep_time = min_timeout_obj[:retry_timestamp] - now + @@retry_buffer
if sleep_time > 0
Jekyll.logger.info "LinkChecker: [Info] Going to sleep for #{sleep_time}s".cyan()
sleep(sleep_time)
end
# Get the URLs to retry and clear them from the retry hashes
Jekyll.logger.info "LinkChecker: [Info] Retrying links for host #{host_name}".cyan()
urls = retry_hosts[host_name].clone
@retry_timeouts_dict.delete(host_name)
retry_hosts.delete(host_name)
end
# Check each URL:
# - valid URL: recorded in neither @failures nor the retry_hosts hash
# - invalid URL: recorded in @failures but not in the retry_hosts hash
# - retry URL: not recorded in @failures, only in the retry_hosts hash
urls.each do |url, pages|
valid_or_retry, metadata = check(url)
@failures << "Verify:: #{url}, linked to in ./#{pages.to_a.join(", ./")}" unless valid_or_retry
if @retry_external_links and metadata&.key?(:retry_host_name)
retry_host_name = metadata[:retry_host_name]
retry_hosts[retry_host_name] = [] unless retry_hosts.key?(retry_host_name)
retry_hosts[retry_host_name] << url
end
end
@retry_iteration += 1
break if !@retry_external_links or (@retry_iteration >= @@max_retry_iterations) or retry_hosts.empty?
end
msg = "Found #{@failures.size} dead link#{@failures.size > 1 ? 's' : ''}:\n#{@failures.join("\n")}" unless @failures.empty?
unless retry_hosts.empty?
retry_msg = retry_hosts.map {|host, urls|
"Host:#{host}\n#{urls.map {|url| "- #{url}"}.join("\n")}\n"
}.join("\n")
msg = "Links we could not retry: \n#{retry_msg} \n#{msg}"
end
if !@failures.empty?
if @should_build_fatally
raise msg
else
Jekyll.logger.warn "\nLinkChecker: [Warning] #{msg}\n"
end
else
Jekyll.logger.info "\nLinkChecker: [Success] No broken links!\n".green()
end
end
##
# Check if URL is accessible
def self.check(url)
match = @base_url_matcher.match(url)
unless match.nil?
url = match[1]
end
url = @site.config["url"] + url if url.start_with? '/docs/'
if @forced_external_matcher =~ url
return true unless @check_forced_external
return self.check_external(url)
end
if @external_matcher =~ url
return true unless @check_external_links
return self.check_external(url)
end
return self.check_internal(url)
end
##
# Check if an external URL is accessible by making a GET request
def self.check_external(url)
uri = URI(url)
return true if @ignored_domains.include? uri.host
begin
Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == "https") do |http|
request = Net::HTTP::Get.new(uri)
http.request(request) do |response|
return true if @success_codes.include? response.code
if @@retry_codes.include? response.code
retry_after = response.header['retry-after']
if retry_after.nil?
Jekyll.logger.warn "LinkChecker: [Warning] Got #{response.code} from #{url}, cannot retry due to missing retry header"
return true
end
if @retry_external_links
now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
retry_timestamp = retry_after.to_i + now # TODO: This could also be a timestamp
@retry_timeouts_dict[uri.host] = {
:code => response.code,
:retry_timestamp => retry_timestamp
}
Jekyll.logger.warn "LinkChecker: [Warning] Got #{response.code} from #{url}, will retry after #{retry_after}s"
return true, { retry_host_name: uri.host }
end
Jekyll.logger.warn "LinkChecker: [Warning] Got #{response.code} from #{url}, will not retry"
return true
elsif @@questionable_codes.include? response.code
Jekyll.logger.warn "LinkChecker: [Warning] Got #{response.code} from #{url}"
return true
end
Jekyll.logger.error "LinkChecker: [Error] Got #{response.code} from #{url}"
return false
end
end
rescue OpenSSL::SSL::SSLError, Net::OpenTimeout, Errno::ETIMEDOUT, Errno::ECONNREFUSED => exception
Jekyll.logger.error "LinkChecker: [Error] Exception Occurred for URL #{url} #{exception.class}. Message: #{exception.message}."
return false
rescue => exception
# TODO: This should not return false, but instead re raise. We should not have unknown exceptions
Jekyll.logger.error "LinkChecker: [Error] Unknown Error::URL: #{url}\nError: #{exception.class}. Message: #{exception.message}."
return false
end
end
##
# Check if an internal link is accessible
def self.check_internal(url)
return true if @ignored_paths =~ url
path, hash = url.split('#')
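# e.g. an href like "/about/#team" (illustrative) splits into path "/about/" and anchor hash "team";
# extensionless paths are then mapped to the index.html that Jekyll generates for them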
unless path =~ /\.[^\/]{2,}$/
path << '/' unless path.end_with? '/'
path << 'index.html' unless path.end_with? 'index.html'
end
filename = File.join(@site.config["destination"], path)
return false unless File.file?(filename)
content = File.read(filename)
unless content.include? "<title>Redirecting"
return true if hash.nil? || hash.empty?
return !(content =~ /<[a-z0-9-]+[^>]+id="#{hash}"/i).nil?
end
match = content.match(@href_matcher)
if match.nil?
Jekyll.logger.warn "LinkChecker: [Warning] Cannot check #{url} due to an unfollowable redirect"
return true
end
redirect = match[2]
redirect << '#' + hash unless hash.nil? || hash.empty?
return self.check(redirect)
end
end
# Before any Document or Page is processed, initialize the LinkChecker
Jekyll::Hooks.register :site, :pre_render do |site|
Jekyll::LinkChecker.init(site)
end
# Process a Page as soon as its content is ready
Jekyll::Hooks.register :pages, :post_convert do |page|
Jekyll::LinkChecker.process(page)
end
# Process a Document as soon as its content is ready
Jekyll::Hooks.register :documents, :post_convert do |document|
Jekyll::LinkChecker.process(document)
end
# Verify gathered links after Jekyll has finished writing the site
Jekyll::Hooks.register :site, :post_write do |site|
Jekyll::LinkChecker.verify(site)
end