lexer: lex WS/COMMENT/SHEBANG rather than skipping
Now the lexer categorizes every byte of its input according to the
grammar. The parser skips over the new WS/COMMENT/SHEBANG tokens while
parsing, keeping them out of the input that syntax extensions see.
emberian committed Jul 9, 2014
1 parent cc42134 · commit f512779
Showing 6 changed files with 134 additions and 87 deletions.
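The new contract is simple: the lexer emits a token for every byte of its input, including whitespace, comments, and a shebang line, and the parser discards that trivia with the `real_token` loop added to `parser.rs` below. A minimal sketch of the idea in modern Rust, with a toy token type rather than the compiler's own `token::Token`:

#[derive(Clone, Debug, PartialEq)]
enum Tok {
    Ws,            // a run of whitespace
    Comment,       // a non-doc comment
    Shebang,       // `#!...` on the first line of a file
    Ident(String), // stand-in for the tokens the parser actually consumes
    Eof,
}

// Parser-side filter: pull tokens until one that isn't trivia.
fn real_token(toks: &mut impl Iterator<Item = Tok>) -> Tok {
    loop {
        match toks.next().unwrap_or(Tok::Eof) {
            Tok::Ws | Tok::Comment | Tok::Shebang => continue, // skip trivia
            t => return t,
        }
    }
}

fn main() {
    // The lexer accounts for every byte of the input...
    let mut toks = [Tok::Shebang, Tok::Ws, Tok::Comment, Tok::Ws,
                    Tok::Ident("fn".into()), Tok::Eof].into_iter();
    // ...but the parser (and any syntax extension) only ever sees `fn`.
    assert_eq!(real_token(&mut toks), Tok::Ident("fn".into()));
}
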
41 changes: 16 additions & 25 deletions src/librustdoc/html/highlight.rs
@@ -18,7 +18,6 @@ use std::io;
 
 use syntax::parse;
 use syntax::parse::lexer;
-use syntax::codemap::{BytePos, Span};
 
 use html::escape::Escape;
 
@@ -59,38 +58,30 @@ fn doit(sess: &parse::ParseSess, mut lexer: lexer::StringReader,
         None => {}
     }
     try!(write!(out, "class='rust {}'>\n", class.unwrap_or("")));
-    let mut last = BytePos(0);
     let mut is_attribute = false;
     let mut is_macro = false;
     let mut is_macro_nonterminal = false;
     loop {
         let next = lexer.next_token();
-        let test = if next.tok == t::EOF {lexer.pos} else {next.sp.lo};
 
-        // The lexer consumes all whitespace and non-doc-comments when iterating
-        // between tokens. If this token isn't directly adjacent to our last
-        // token, then we need to emit the whitespace/comment.
-        //
-        // If the gap has any '/' characters then we consider the whole thing a
-        // comment. This will classify some whitespace as a comment, but that
-        // doesn't matter too much for syntax highlighting purposes.
-        if test > last {
-            let snip = sess.span_diagnostic.cm.span_to_snippet(Span {
-                lo: last,
-                hi: test,
-                expn_info: None,
-            }).unwrap();
-            if snip.as_slice().contains("/") {
-                try!(write!(out, "<span class='comment'>{}</span>",
-                            Escape(snip.as_slice())));
-            } else {
-                try!(write!(out, "{}", Escape(snip.as_slice())));
-            }
-        }
-        last = next.sp.hi;
+
+        let snip = |sp| sess.span_diagnostic.cm.span_to_snippet(sp).unwrap();
 
         if next.tok == t::EOF { break }
 
         let klass = match next.tok {
+            t::WS => {
+                try!(write!(out, "{}", Escape(snip(next.sp).as_slice())));
+                continue
+            },
+            t::COMMENT => {
+                try!(write!(out, "<span class='comment'>{}</span>",
+                            Escape(snip(next.sp).as_slice())));
+                continue
+            },
+            t::SHEBANG(s) => {
+                try!(write!(out, "{}", Escape(s.as_str())));
+                continue
+            },
             // If this '&' token is directly adjacent to another token, assume
             // that it's the address-of operator instead of the and-operator.
             // This allows us to give all pointers their own class (`Box` and
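With trivia arriving as ordinary tokens, the highlighter above no longer has to diff byte positions against a `last` marker to recover skipped text; each token class maps directly to output. A toy version of that shape (illustrative only: no HTML escaping, and `Tok` is not the real token type):

enum Tok<'a> {
    Ws(&'a str),
    Comment(&'a str),
    Code(&'a str),
}

// One pass, one match arm per token class: no gap bookkeeping required.
fn highlight(toks: &[Tok]) -> String {
    let mut out = String::new();
    for t in toks {
        match t {
            Tok::Ws(s) | Tok::Code(s) => out.push_str(s),
            Tok::Comment(s) => {
                out.push_str("<span class='comment'>");
                out.push_str(s);
                out.push_str("</span>");
            }
        }
    }
    out
}

fn main() {
    let toks = [Tok::Code("let x = 1;"), Tok::Ws(" "), Tok::Comment("// one")];
    assert_eq!(highlight(&toks), "let x = 1; <span class='comment'>// one</span>");
}
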
2 changes: 1 addition & 1 deletion src/libsyntax/parse/attr.rs
@@ -34,7 +34,7 @@ impl<'a> ParserAttr for Parser<'a> {
     fn parse_outer_attributes(&mut self) -> Vec<ast::Attribute> {
         let mut attrs: Vec<ast::Attribute> = Vec::new();
         loop {
-            debug!("parse_outer_attributes: self.token={:?}",
+            debug!("parse_outer_attributes: self.token={}",
                    self.token);
             match self.token {
                 token::POUND => {
8 changes: 4 additions & 4 deletions src/libsyntax/parse/lexer/comments.rs
@@ -13,7 +13,7 @@ use codemap::{BytePos, CharPos, CodeMap, Pos};
 use diagnostic;
 use parse::lexer::{is_whitespace, Reader};
 use parse::lexer::{StringReader, TokenAndSpan};
-use parse::lexer::{is_line_non_doc_comment, is_block_non_doc_comment};
+use parse::lexer::is_block_doc_comment;
 use parse::lexer;
 use parse::token;
 
@@ -42,9 +42,9 @@ pub struct Comment {
 }
 
 pub fn is_doc_comment(s: &str) -> bool {
-    (s.starts_with("///") && !is_line_non_doc_comment(s)) ||
+    (s.starts_with("///") && super::is_doc_comment(s)) ||
     s.starts_with("//!") ||
-    (s.starts_with("/**") && !is_block_non_doc_comment(s)) ||
+    (s.starts_with("/**") && is_block_doc_comment(s)) ||
     s.starts_with("/*!")
 }

@@ -260,7 +260,7 @@ fn read_block_comment(rdr: &mut StringReader,
             rdr.bump();
             rdr.bump();
         }
-        if !is_block_non_doc_comment(curr_line.as_slice()) {
+        if is_block_doc_comment(curr_line.as_slice()) {
             return
         }
         assert!(!curr_line.as_slice().contains_char('\n'));
134 changes: 81 additions & 53 deletions src/libsyntax/parse/lexer/mod.rs
@@ -187,7 +187,7 @@ impl<'a> StringReader<'a> {
     /// Advance peek_tok and peek_span to refer to the next token, and
     /// possibly update the interner.
     fn advance_token(&mut self) {
-        match self.consume_whitespace_and_comments() {
+        match self.scan_whitespace_or_comment() {
             Some(comment) => {
                 self.peek_span = comment.sp;
                 self.peek_tok = comment.tok;
@@ -339,8 +339,7 @@ impl<'a> StringReader<'a> {
 
     /// PRECONDITION: self.curr is not whitespace
     /// Eats any kind of comment.
-    /// Returns a Some(sugared-doc-attr) if one exists, None otherwise
-    fn consume_any_line_comment(&mut self) -> Option<TokenAndSpan> {
+    fn scan_comment(&mut self) -> Option<TokenAndSpan> {
         match self.curr {
             Some(c) => {
                 if c.is_whitespace() {
@@ -375,28 +374,32 @@ impl<'a> StringReader<'a> {
                             }
                             self.bump();
                         }
-                        let ret = self.with_str_from(start_bpos, |string| {
+                        return self.with_str_from(start_bpos, |string| {
                             // but comments with only more "/"s are not
-                            if !is_line_non_doc_comment(string) {
-                                Some(TokenAndSpan{
-                                    tok: token::DOC_COMMENT(str_to_ident(string)),
-                                    sp: codemap::mk_sp(start_bpos, self.last_pos)
-                                })
+                            let tok = if is_doc_comment(string) {
+                                token::DOC_COMMENT(str_to_ident(string))
                             } else {
-                                None
-                            }
-                        });
+                                token::COMMENT
+                            };
 
-                        if ret.is_some() {
-                            return ret;
-                        }
+                            return Some(TokenAndSpan{
+                                tok: tok,
+                                sp: codemap::mk_sp(start_bpos, self.last_pos)
+                            });
+                        });
+                    } else {
+                        let start_bpos = self.last_pos - BytePos(2);
+                        while !self.curr_is('\n') && !self.is_eof() { self.bump(); }
+                        return Some(TokenAndSpan {
+                            tok: token::COMMENT,
+                            sp: codemap::mk_sp(start_bpos, self.last_pos)
+                        });
                     }
-                    // Restart whitespace munch.
-                    self.consume_whitespace_and_comments()
                 }
-                Some('*') => { self.bump(); self.bump(); self.consume_block_comment() }
+                Some('*') => {
+                    self.bump(); self.bump();
+                    self.scan_block_comment()
+                }
                 _ => None
             }
         } else if self.curr_is('#') {
@@ -412,9 +415,15 @@ impl<'a> StringReader<'a> {
                 let cmap = CodeMap::new();
                 cmap.files.borrow_mut().push(self.filemap.clone());
                 let loc = cmap.lookup_char_pos_adj(self.last_pos);
-                debug!("Skipping a shebang");
                 if loc.line == 1u && loc.col == CharPos(0u) {
-                    // FIXME: Add shebang "token", return it
+                    let start = self.last_pos;
                     while !self.curr_is('\n') && !self.is_eof() { self.bump(); }
-                    return self.consume_whitespace_and_comments();
+                    return Some(TokenAndSpan {
+                        tok: token::SHEBANG(self.ident_from(start)),
+                        sp: codemap::mk_sp(start, self.last_pos)
+                    });
                 }
             }
             None
Expand All @@ -423,15 +432,33 @@ impl<'a> StringReader<'a> {
}
}

/// EFFECT: eats whitespace and comments.
/// Returns a Some(sugared-doc-attr) if one exists, None otherwise.
fn consume_whitespace_and_comments(&mut self) -> Option<TokenAndSpan> {
while is_whitespace(self.curr) { self.bump(); }
return self.consume_any_line_comment();
/// If there is whitespace, shebang, or a comment, scan it. Otherwise,
/// return None.
fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
match self.curr.unwrap_or('\0') {
// # to handle shebang at start of file -- this is the entry point
// for skipping over all "junk"
'/' | '#' => {
let c = self.scan_comment();
debug!("scanning a comment {}", c);
c
},
c if is_whitespace(Some(c)) => {
let start_bpos = self.last_pos;
while is_whitespace(self.curr) { self.bump(); }
let c = Some(TokenAndSpan {
tok: token::WS,
sp: codemap::mk_sp(start_bpos, self.last_pos)
});
debug!("scanning whitespace: {}", c);
c
},
_ => None
}
}

/// Might return a sugared-doc-attr
fn consume_block_comment(&mut self) -> Option<TokenAndSpan> {
fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
// block comments starting with "/**" or "/*!" are doc-comments
let is_doc_comment = self.curr_is('*') || self.curr_is('!');
let start_bpos = self.last_pos - BytePos(2);
@@ -466,28 +493,23 @@ impl<'a> StringReader<'a> {
             self.bump();
         }
 
-        let res = if is_doc_comment {
-            self.with_str_from(start_bpos, |string| {
-                // but comments with only "*"s between two "/"s are not
-                if !is_block_non_doc_comment(string) {
-                    let string = if has_cr {
-                        self.translate_crlf(start_bpos, string,
-                                            "bare CR not allowed in block doc-comment")
-                    } else { string.into_maybe_owned() };
-                    Some(TokenAndSpan{
-                        tok: token::DOC_COMMENT(str_to_ident(string.as_slice())),
-                        sp: codemap::mk_sp(start_bpos, self.last_pos)
-                    })
-                } else {
-                    None
-                }
-            })
-        } else {
-            None
-        };
+        self.with_str_from(start_bpos, |string| {
+            // but comments with only "*"s between two "/"s are not
+            let tok = if is_block_doc_comment(string) {
+                let string = if has_cr {
+                    self.translate_crlf(start_bpos, string,
+                                        "bare CR not allowed in block doc-comment")
+                } else { string.into_maybe_owned() };
+                token::DOC_COMMENT(str_to_ident(string.as_slice()))
+            } else {
+                token::COMMENT
+            };
 
-        // restart whitespace munch.
-        if res.is_some() { res } else { self.consume_whitespace_and_comments() }
+            Some(TokenAndSpan{
+                tok: tok,
+                sp: codemap::mk_sp(start_bpos, self.last_pos)
+            })
+        })
     }
 
     /// Scan through any digits (base `radix`) or underscores, and return how
@@ -1242,12 +1264,18 @@ fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
 
 fn is_dec_digit(c: Option<char>) -> bool { return in_range(c, '0', '9'); }
 
-pub fn is_line_non_doc_comment(s: &str) -> bool {
-    s.starts_with("////")
+pub fn is_doc_comment(s: &str) -> bool {
+    let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/')
+              || s.starts_with("//!");
+    debug!("is `{}` a doc comment? {}", s, res);
+    res
 }
 
-pub fn is_block_non_doc_comment(s: &str) -> bool {
-    s.starts_with("/***")
+pub fn is_block_doc_comment(s: &str) -> bool {
+    let res = (s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*')
+              || s.starts_with("/*!");
+    debug!("is `{}` a doc comment? {}", s, res);
+    res
 }
 
 fn ident_start(c: Option<char>) -> bool {
@@ -1383,9 +1411,9 @@ mod test {
     }
 
     #[test] fn line_doc_comments() {
-        assert!(!is_line_non_doc_comment("///"));
-        assert!(!is_line_non_doc_comment("/// blah"));
-        assert!(is_line_non_doc_comment("////"));
+        assert!(is_doc_comment("///"));
+        assert!(is_doc_comment("/// blah"));
+        assert!(!is_doc_comment("////"));
     }
 
     #[test] fn nested_block_comments() {
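The doc-comment predicates also flip polarity here: instead of naming the exceptions (`is_line_non_doc_comment`, `is_block_non_doc_comment`), the lexer now asks the positive question. A standalone restatement of the same corner cases, in modern Rust for illustration only:

// `////...` and `/***...` are ordinary comments; `//!` and `/*!` are
// inner doc comments. Same logic as the new lexer predicates above.
fn is_doc_comment(s: &str) -> bool {
    (s.starts_with("///") && s.as_bytes().get(3) != Some(&b'/')) || s.starts_with("//!")
}

fn is_block_doc_comment(s: &str) -> bool {
    (s.starts_with("/**") && s.as_bytes().get(3) != Some(&b'*')) || s.starts_with("/*!")
}

fn main() {
    assert!(is_doc_comment("/// outer doc"));
    assert!(is_doc_comment("//! inner doc"));
    assert!(!is_doc_comment("//// still just a comment"));
    assert!(is_block_doc_comment("/*! inner docs */"));
    assert!(!is_block_doc_comment("/*** banner ***/"));
}
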
20 changes: 17 additions & 3 deletions src/libsyntax/parse/parser.rs
@@ -325,10 +325,24 @@ fn is_plain_ident_or_underscore(t: &token::Token) -> bool {
     is_plain_ident(t) || *t == token::UNDERSCORE
 }
 
+/// Get a token the parser cares about
+fn real_token(rdr: &mut Reader) -> TokenAndSpan {
+    let mut t = rdr.next_token();
+    loop {
+        match t.tok {
+            token::WS | token::COMMENT | token::SHEBANG(_) => {
+                t = rdr.next_token();
+            },
+            _ => break
+        }
+    }
+    t
+}
+
 impl<'a> Parser<'a> {
     pub fn new(sess: &'a ParseSess, cfg: ast::CrateConfig,
                mut rdr: Box<Reader>) -> Parser<'a> {
-        let tok0 = rdr.next_token();
+        let tok0 = real_token(rdr);
         let span = tok0.sp;
         let placeholder = TokenAndSpan {
             tok: token::UNDERSCORE,
@@ -864,7 +878,7 @@ impl<'a> Parser<'a> {
             None
         };
         let next = if self.buffer_start == self.buffer_end {
-            self.reader.next_token()
+            real_token(self.reader)
         } else {
             // Avoid token copies with `replace`.
             let buffer_start = self.buffer_start as uint;
@@ -908,7 +922,7 @@ impl<'a> Parser<'a> {
                    -> R {
         let dist = distance as int;
         while self.buffer_length() < dist {
-            self.buffer[self.buffer_end as uint] = self.reader.next_token();
+            self.buffer[self.buffer_end as uint] = real_token(self.reader);
             self.buffer_end = (self.buffer_end + 1) & 3;
         }
         f(&self.buffer[((self.buffer_start + dist - 1) & 3) as uint].tok)
16 changes: 15 additions & 1 deletion src/libsyntax/parse/token.rs
@@ -97,8 +97,18 @@ pub enum Token {
 
     /* For interpolation */
     INTERPOLATED(Nonterminal),
 
     DOC_COMMENT(Ident),
+
+    // Junk. These carry no data because we don't really care about the data
+    // they *would* carry, and don't really want to allocate a new ident for
+    // them. Instead, users could extract that from the associated span.
+
+    /// Whitespace
+    WS,
+    /// Comment
+    COMMENT,
+    SHEBANG(Ident),
 
     EOF,
 }

@@ -231,6 +241,10 @@ pub fn to_string(t: &Token) -> String {
         /* Other */
         DOC_COMMENT(s) => get_ident(s).get().to_string(),
         EOF => "<eof>".to_string(),
+        WS => " ".to_string(),
+        COMMENT => "/* */".to_string(),
+        SHEBANG(s) => format!("/* shebang: {}*/", s.as_str()),
+
         INTERPOLATED(ref nt) => {
             match nt {
                 &NtExpr(ref e) => ::print::pprust::expr_to_string(&**e),
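The enum comment above is why `WS` and `COMMENT` carry no payload: a consumer that wants the underlying text recovers it from the token's span, exactly as the rewritten `highlight.rs` loop does via `span_to_snippet`. A toy illustration of that recovery, with a hypothetical simplified `Span` standing in for the real `syntax::codemap` types:

// Hypothetical stand-in: the real Span lives in syntax::codemap and
// stores BytePos offsets plus expansion info.
struct Span { lo: usize, hi: usize }

// Recover the text a payload-less token covered, from its span alone.
fn snippet<'a>(src: &'a str, sp: &Span) -> &'a str {
    &src[sp.lo..sp.hi]
}

fn main() {
    let src = "let x = 1; // answer";
    let comment = Span { lo: 11, hi: 20 }; // span of the COMMENT token
    assert_eq!(snippet(src, &comment), "// answer");
}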
