Skip to content

Commit

Permalink
linkchecker: Add support for <base> tag
Browse files Browse the repository at this point in the history
Add support for the HTML <base> tag as used by mdBook so The Unstable
Book can be checked.

Also cleanup a few things:
* Stop checking the name attribute. It should never have been used and
mdBook has since been fixed not to use it.
* Make sure we only check html files.
* Remove a few unnecessary allocations.

Finally, dead links in The Unstable Book have been fixed.
  • Loading branch information
ollie27 committed May 14, 2017
1 parent 21ca9ca commit d4f20eb
Showing 1 changed file with 33 additions and 56 deletions.
89 changes: 33 additions & 56 deletions src/tools/linkchecker/main.rs
Expand Up @@ -41,7 +41,7 @@ macro_rules! t {
}

fn main() {
let docs = env::args().nth(1).unwrap();
let docs = env::args_os().nth(1).unwrap();
let docs = env::current_dir().unwrap().join(docs);
let mut errors = false;
walk(&mut HashMap::new(), &docs, &docs, &mut errors);
Expand All @@ -65,15 +65,14 @@ enum Redirect {
struct FileEntry {
source: String,
ids: HashSet<String>,
names: HashSet<String>,
}

type Cache = HashMap<PathBuf, FileEntry>;

impl FileEntry {
fn parse_ids(&mut self, file: &Path, contents: &str, errors: &mut bool) {
if self.ids.is_empty() {
with_attrs_in_source(contents, " id", |fragment, i| {
with_attrs_in_source(contents, " id", |fragment, i, _| {
let frag = fragment.trim_left_matches("#").to_owned();
if !self.ids.insert(frag) {
*errors = true;
Expand All @@ -82,15 +81,6 @@ impl FileEntry {
});
}
}

fn parse_names(&mut self, contents: &str) {
if self.names.is_empty() {
with_attrs_in_source(contents, " name", |fragment, _| {
let frag = fragment.trim_left_matches("#").to_owned();
self.names.insert(frag);
});
}
}
}

fn walk(cache: &mut Cache, root: &Path, dir: &Path, errors: &mut bool) {
Expand All @@ -116,15 +106,8 @@ fn check(cache: &mut Cache,
file: &Path,
errors: &mut bool)
-> Option<PathBuf> {
// ignore js files as they are not prone to errors as the rest of the
// documentation is and they otherwise bring up false positives.
if file.extension().and_then(|s| s.to_str()) == Some("js") {
return None;
}

// ignore handlebars files as they use {{}} to build links, we only
// want to test the generated files
if file.extension().and_then(|s| s.to_str()) == Some("hbs") {
// Ignore none HTML files.
if file.extension().and_then(|s| s.to_str()) != Some("html") {
return None;
}

Expand All @@ -147,13 +130,7 @@ fn check(cache: &mut Cache,
return None;
}

// mdbook uses the HTML <base> tag to handle links for subdirectories, which
// linkchecker doesn't support
if file.to_str().unwrap().contains("unstable-book") {
return None;
}

let res = load_file(cache, root, PathBuf::from(file), SkipRedirect);
let res = load_file(cache, root, file, SkipRedirect);
let (pretty_file, contents) = match res {
Ok(res) => res,
Err(_) => return None,
Expand All @@ -162,13 +139,10 @@ fn check(cache: &mut Cache,
cache.get_mut(&pretty_file)
.unwrap()
.parse_ids(&pretty_file, &contents, errors);
cache.get_mut(&pretty_file)
.unwrap()
.parse_names(&contents);
}

// Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
with_attrs_in_source(&contents, " href", |url, i| {
with_attrs_in_source(&contents, " href", |url, i, base| {
// Ignore external URLs
if url.starts_with("http:") || url.starts_with("https:") ||
url.starts_with("javascript:") || url.starts_with("ftp:") ||
Expand All @@ -184,9 +158,9 @@ fn check(cache: &mut Cache,
// Once we've plucked out the URL, parse it using our base url and
// then try to extract a file path.
let mut path = file.to_path_buf();
if !url.is_empty() {
if !base.is_empty() || !url.is_empty() {
path.pop();
for part in Path::new(url).components() {
for part in Path::new(base).join(url).components() {
match part {
Component::Prefix(_) |
Component::RootDir => panic!(),
Expand All @@ -197,13 +171,6 @@ fn check(cache: &mut Cache,
}
}

if let Some(extension) = path.extension() {
// don't check these files
if extension == "png" {
return;
}
}

// Alright, if we've found a file name then this file had better
// exist! If it doesn't then we register and print an error.
if path.exists() {
Expand All @@ -218,11 +185,17 @@ fn check(cache: &mut Cache,
pretty_path.display());
return;
}
let res = load_file(cache, root, path.clone(), FromRedirect(false));
if let Some(extension) = path.extension() {
// Ignore none HTML files.
if extension != "html" {
return;
}
}
let res = load_file(cache, root, &path, FromRedirect(false));
let (pretty_path, contents) = match res {
Ok(res) => res,
Err(LoadError::IOError(err)) => {
panic!(format!("error loading {}: {}", path.display(), err));
panic!("error loading {}: {}", path.display(), err);
}
Err(LoadError::BrokenRedirect(target, _)) => {
*errors = true;
Expand All @@ -245,11 +218,10 @@ fn check(cache: &mut Cache,

let entry = &mut cache.get_mut(&pretty_path).unwrap();
entry.parse_ids(&pretty_path, &contents, errors);
entry.parse_names(&contents);

if !(entry.ids.contains(*fragment) || entry.names.contains(*fragment)) {
if !entry.ids.contains(*fragment) {
*errors = true;
print!("{}:{}: broken link fragment ",
print!("{}:{}: broken link fragment ",
pretty_file.display(),
i + 1);
println!("`#{}` pointing to `{}`", fragment, pretty_path.display());
Expand All @@ -267,7 +239,7 @@ fn check(cache: &mut Cache,

fn load_file(cache: &mut Cache,
root: &Path,
mut file: PathBuf,
file: &Path,
redirect: Redirect)
-> Result<(PathBuf, String), LoadError> {
let mut contents = String::new();
Expand All @@ -279,9 +251,9 @@ fn load_file(cache: &mut Cache,
None
}
Entry::Vacant(entry) => {
let mut fp = File::open(file.clone()).map_err(|err| {
let mut fp = File::open(file).map_err(|err| {
if let FromRedirect(true) = redirect {
LoadError::BrokenRedirect(file.clone(), err)
LoadError::BrokenRedirect(file.to_path_buf(), err)
} else {
LoadError::IOError(err)
}
Expand All @@ -297,17 +269,14 @@ fn load_file(cache: &mut Cache,
entry.insert(FileEntry {
source: contents.clone(),
ids: HashSet::new(),
names: HashSet::new(),
});
}
maybe
}
};
file.pop();
match maybe_redirect.map(|url| file.join(url)) {
match maybe_redirect.map(|url| file.parent().unwrap().join(url)) {
Some(redirect_file) => {
let path = PathBuf::from(redirect_file);
load_file(cache, root, path, FromRedirect(true))
load_file(cache, root, &redirect_file, FromRedirect(true))
}
None => Ok((pretty_file, contents)),
}
Expand All @@ -329,10 +298,14 @@ fn maybe_redirect(source: &str) -> Option<String> {
})
}

fn with_attrs_in_source<F: FnMut(&str, usize)>(contents: &str, attr: &str, mut f: F) {
fn with_attrs_in_source<F: FnMut(&str, usize, &str)>(contents: &str, attr: &str, mut f: F) {
let mut base = "";
for (i, mut line) in contents.lines().enumerate() {
while let Some(j) = line.find(attr) {
let rest = &line[j + attr.len()..];
// The base tag should always be the first link in the document so
// we can get away with using one pass.
let is_base = line[..j].ends_with("<base");
line = rest;
let pos_equals = match rest.find("=") {
Some(i) => i,
Expand All @@ -358,7 +331,11 @@ fn with_attrs_in_source<F: FnMut(&str, usize)>(contents: &str, attr: &str, mut f
Some(i) => &rest[..i],
None => continue,
};
f(url, i)
if is_base {
base = url;
continue;
}
f(url, i, base)
}
}
}

0 comments on commit d4f20eb

Please sign in to comment.