Model lexer: Fix remaining issues
pczarn committed Apr 21, 2015
1 parent e5e343a commit 13bc8af
Showing 57 changed files with 102 additions and 179 deletions.
2 changes: 1 addition & 1 deletion src/grammar/README.md
@@ -12,7 +12,7 @@ javac *.java
rustc -O verify.rs
for file in ../*/**.rs; do
echo $file;
-grun RustLexer tokens -tokens < $file | ./verify $file RustLexer.tokens || break
+grun RustLexer tokens -tokens < "$file" | ./verify "$file" RustLexer.tokens || break
done
```

112 changes: 47 additions & 65 deletions src/grammar/RustLexer.g4
@@ -1,5 +1,12 @@
lexer grammar RustLexer;

+@lexer::members {
+public boolean is_at(int pos) {
+return _input.index() == pos;
+}
+}


tokens {
EQ, LT, LE, EQEQ, NE, GE, GT, ANDAND, OROR, NOT, TILDE, PLUT,
MINUS, STAR, SLASH, PERCENT, CARET, AND, OR, SHL, SHR, BINOP,
@@ -8,7 +15,7 @@ tokens {
LBRACE, RBRACE, POUND, DOLLAR, UNDERSCORE, LIT_CHAR,
LIT_INTEGER, LIT_FLOAT, LIT_STR, LIT_STR_RAW, LIT_BINARY,
LIT_BINARY_RAW, IDENT, LIFETIME, WHITESPACE, DOC_COMMENT,
-COMMENT
+COMMENT, SHEBANG
}

import xidstart , xidcontinue;
@@ -86,94 +93,63 @@ fragment CHAR_ESCAPE
| [xX] HEXIT HEXIT
| 'u' HEXIT HEXIT HEXIT HEXIT
| 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
+| 'u{' HEXIT '}'
+| 'u{' HEXIT HEXIT '}'
+| 'u{' HEXIT HEXIT HEXIT '}'
+| 'u{' HEXIT HEXIT HEXIT HEXIT '}'
+| 'u{' HEXIT HEXIT HEXIT HEXIT HEXIT '}'
+| 'u{' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT '}'
;
fragment SUFFIX
: IDENT
;
+fragment INTEGER_SUFFIX
+: { _input.LA(1) != 'e' && _input.LA(1) != 'E' }? SUFFIX
+;
LIT_CHAR
-: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] | '\ud800' .. '\udbff' '\udc00' .. '\udfff' ) '\'' SUFFIX?
+: '\'' ( '\\' CHAR_ESCAPE
+| ~[\\'\n\t\r]
+| '\ud800' .. '\udbff' '\udc00' .. '\udfff'
+)
+'\'' SUFFIX?
;

LIT_BYTE
-: 'b\'' ( '\\' ( [xX] HEXIT HEXIT | [nrt\\'"0] ) | ~[\\'\n\t\r] ) '\'' SUFFIX?
+: 'b\'' ( '\\' ( [xX] HEXIT HEXIT
+| [nrt\\'"0] )
+| ~[\\'\n\t\r] '\udc00'..'\udfff'?
+)
+'\'' SUFFIX?
;

LIT_INTEGER
-: [0-9][0-9_]* SUFFIX?
-| '0b' [01][01_]* SUFFIX?
-| '0o' [0-7][0-7_]* SUFFIX?
-| '0x' [0-9a-fA-F][0-9a-fA-F_]* SUFFIX?
+: [0-9][0-9_]* INTEGER_SUFFIX?
+| '0b' [01_]+ INTEGER_SUFFIX?
+| '0o' [0-7_]+ INTEGER_SUFFIX?
+| '0x' [0-9a-fA-F_]+ INTEGER_SUFFIX?
;

LIT_FLOAT
: [0-9][0-9_]* ('.' {
-/* dot followed by another dot is a range, no float */
+/* dot followed by another dot is a range, not a float */
_input.LA(1) != '.' &&
-/* dot followed by an identifier is an integer with a function call, no float */
+/* dot followed by an identifier is an integer with a function call, not a float */
_input.LA(1) != '_' &&
-_input.LA(1) != 'a' &&
-_input.LA(1) != 'b' &&
-_input.LA(1) != 'c' &&
-_input.LA(1) != 'd' &&
-_input.LA(1) != 'e' &&
-_input.LA(1) != 'f' &&
-_input.LA(1) != 'g' &&
-_input.LA(1) != 'h' &&
-_input.LA(1) != 'i' &&
-_input.LA(1) != 'j' &&
-_input.LA(1) != 'k' &&
-_input.LA(1) != 'l' &&
-_input.LA(1) != 'm' &&
-_input.LA(1) != 'n' &&
-_input.LA(1) != 'o' &&
-_input.LA(1) != 'p' &&
-_input.LA(1) != 'q' &&
-_input.LA(1) != 'r' &&
-_input.LA(1) != 's' &&
-_input.LA(1) != 't' &&
-_input.LA(1) != 'u' &&
-_input.LA(1) != 'v' &&
-_input.LA(1) != 'w' &&
-_input.LA(1) != 'x' &&
-_input.LA(1) != 'y' &&
-_input.LA(1) != 'z' &&
-_input.LA(1) != 'A' &&
-_input.LA(1) != 'B' &&
-_input.LA(1) != 'C' &&
-_input.LA(1) != 'D' &&
-_input.LA(1) != 'E' &&
-_input.LA(1) != 'F' &&
-_input.LA(1) != 'G' &&
-_input.LA(1) != 'H' &&
-_input.LA(1) != 'I' &&
-_input.LA(1) != 'J' &&
-_input.LA(1) != 'K' &&
-_input.LA(1) != 'L' &&
-_input.LA(1) != 'M' &&
-_input.LA(1) != 'N' &&
-_input.LA(1) != 'O' &&
-_input.LA(1) != 'P' &&
-_input.LA(1) != 'Q' &&
-_input.LA(1) != 'R' &&
-_input.LA(1) != 'S' &&
-_input.LA(1) != 'T' &&
-_input.LA(1) != 'U' &&
-_input.LA(1) != 'V' &&
-_input.LA(1) != 'W' &&
-_input.LA(1) != 'X' &&
-_input.LA(1) != 'Y' &&
-_input.LA(1) != 'Z'
+!(_input.LA(1) >= 'a' && _input.LA(1) <= 'z') &&
+!(_input.LA(1) >= 'A' && _input.LA(1) <= 'Z')
}? | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? SUFFIX?)
;

LIT_STR
: '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"' SUFFIX?
;

-LIT_BINARY : 'b' LIT_STR SUFFIX?;
-LIT_BINARY_RAW : 'rb' LIT_STR_RAW SUFFIX?;
+LIT_BINARY : 'b' LIT_STR ;
+LIT_BINARY_RAW : 'b' LIT_STR_RAW ;

/* this is a bit messy */

@@ -201,13 +177,19 @@ LIFETIME : '\'' IDENT ;

WHITESPACE : [ \r\n\t]+ ;

-UNDOC_COMMENT : '////' ~[\r\n]* -> type(COMMENT) ;
+UNDOC_COMMENT : '////' ~[\n]* -> type(COMMENT) ;
YESDOC_COMMENT : '///' ~[\r\n]* -> type(DOC_COMMENT) ;
OUTER_DOC_COMMENT : '//!' ~[\r\n]* -> type(DOC_COMMENT) ;
-LINE_COMMENT : '//' ~[\r\n]* -> type(COMMENT) ;
+LINE_COMMENT : '//' ( ~[/\n] ~[\n]* )? -> type(COMMENT) ;

DOC_BLOCK_COMMENT
: ('/**' ~[*] | '/*!') (DOC_BLOCK_COMMENT | .)*? '*/' -> type(DOC_COMMENT)
;

BLOCK_COMMENT : '/*' (BLOCK_COMMENT | .)*? '*/' -> type(COMMENT) ;

+/* these appear at the beginning of a file */
+
+SHEBANG : '#!' { is_at(2) && _input.LA(1) != '[' }? ~[\r\n]* -> type(SHEBANG) ;
+
+UTF8_BOM : '\ufeff' { is_at(1) }? -> skip ;
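
The two semantic predicates added above carry most of this change, so here is a minimal Rust sketch of what each one checks. The names `dot_starts_float` and `is_shebang` are invented for illustration; in the grammar these run as embedded Java actions.

```rust
/// LIT_FLOAT: after `[0-9][0-9_]*`, a '.' continues a float only when the
/// next character is not another '.' (that would be a range like `1..2`)
/// and not '_' or an ASCII letter (that would be a field access or method
/// call like `1.foo()`). Two range checks replace the 52 comparisons the
/// old grammar spelled out letter by letter.
fn dot_starts_float(next: char) -> bool {
    next != '.'
        && next != '_'
        && !(next >= 'a' && next <= 'z')
        && !(next >= 'A' && next <= 'Z')
}

/// SHEBANG: `#!` is a shebang only at the very start of the file (two
/// characters consumed, hence `is_at(2)`) and only when not followed by
/// '[', which would instead begin an inner attribute like `#![feature(...)]`.
fn is_shebang(chars_consumed: usize, next: char) -> bool {
    chars_consumed == 2 && next != '['
}

fn main() {
    assert!(dot_starts_float('5'));   // `1.5` is a float
    assert!(!dot_starts_float('.'));  // `1..2` is a range
    assert!(!dot_starts_float('f'));  // `1.foo()` is a method call
    assert!(is_shebang(2, '/'));      // `#!/usr/bin/env ...`
    assert!(!is_shebang(2, '['));     // `#![feature(...)]` is an attribute
}
```

In the same spirit, `INTEGER_SUFFIX` rejects a suffix starting with `e` or `E`, so that `1e5` lexes as a float with an exponent rather than an integer followed by a suffix.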
8 changes: 4 additions & 4 deletions src/grammar/check.sh
@@ -18,13 +18,13 @@ failed=0
skipped=0

check() {
-grep --silent "// ignore-lexer-test" $1;
+grep --silent "// ignore-lexer-test" "$1";

# if it's *not* found...
if [ $? -eq 1 ]; then
cd $2 # This `cd` is so java will pick up RustLexer.class. I couldn't
-# figure out how to wrangle the CLASSPATH, just adding build/grammr didn't
-# seem to have anny effect.
+# figure out how to wrangle the CLASSPATH, just adding build/grammar
+# didn't seem to have any effect.
if $3 RustLexer tokens -tokens < $1 | $4 $1 $5; then
echo "pass: $1"
passed=`expr $passed + 1`
@@ -39,7 +39,7 @@ check() {
}

for file in $(find $1 -iname '*.rs' ! -path '*/test/compile-fail*'); do
-check $file $2 $3 $4 $5
+check "$file" $2 $3 $4 $5
done

printf "\ntest result: "
84 changes: 50 additions & 34 deletions src/grammar/verify.rs
@@ -8,9 +8,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

-#![feature(plugin)]
-
-#![allow(unstable)]
+#![feature(plugin, rustc_private, str_char, collections)]

extern crate syntax;
extern crate rustc;
@@ -19,14 +17,18 @@ extern crate rustc;
extern crate log;

use std::collections::HashMap;
-use std::io::File;
+use std::env;
+use std::fs::File;
+use std::io::{BufRead, Read};
+use std::path::Path;

use syntax::parse;
use syntax::parse::lexer;
use rustc::session::{self, config};

use syntax::ast;
use syntax::ast::Name;
+use syntax::codemap;
use syntax::codemap::Pos;
use syntax::parse::token;
use syntax::parse::lexer::TokenAndSpan;
@@ -108,6 +110,7 @@ fn parse_token_list(file: &str) -> HashMap<String, token::Token> {
"LIT_BINARY" => token::Literal(token::Binary(Name(0)), None),
"LIT_BINARY_RAW" => token::Literal(token::BinaryRaw(Name(0), 0), None),
"QUESTION" => token::Question,
"SHEBANG" => token::Shebang(Name(0)),
_ => continue,
};

@@ -166,24 +169,26 @@ fn count(lit: &str) -> usize {
lit.chars().take_while(|c| *c == '#').count()
}

-fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize])
+fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize],
+has_bom: bool)
-> TokenAndSpan {
// old regex:
// \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]
-let start = s.find_str("[@").unwrap();
-let comma = start + s[start..].find_str(",").unwrap();
-let colon = comma + s[comma..].find_str(":").unwrap();
-let content_start = colon + s[colon..].find_str("='").unwrap();
-let content_end = content_start + s[content_start..].find_str("',<").unwrap();
-let toknum_end = content_end + s[content_end..].find_str(">,").unwrap();
+let start = s.find("[@").unwrap();
+let comma = start + s[start..].find(",").unwrap();
+let colon = comma + s[comma..].find(":").unwrap();
+let content_start = colon + s[colon..].find("='").unwrap();
+// Use rfind instead of find, because we don't want to stop at the content
+let content_end = content_start + s[content_start..].rfind("',<").unwrap();
+let toknum_end = content_end + s[content_end..].find(">,").unwrap();

let start = &s[comma + 1 .. colon];
let end = &s[colon + 1 .. content_start];
let content = &s[content_start + 2 .. content_end];
let toknum = &s[content_end + 3 .. toknum_end];

-let proto_tok = tokens.get(toknum).expect(format!("didn't find token {:?} in the map",
-toknum));
+let not_found = format!("didn't find token {:?} in the map", toknum);
+let proto_tok = tokens.get(toknum).expect(&not_found[..]);

let nm = parse::token::intern(content);

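The switch from `find` to `rfind` when locating the `',<` delimiter matters because the token's own content can contain `',<`. A standalone Rust demonstration (the input line below is invented, shaped like the `[@seq,start:end='content',<toknum>,line:col]` format documented in the old regex comment above):

```rust
fn main() {
    // A string-literal token whose *content* itself contains `',<`.
    let line = r#"[@3,10:14='"',<"',<14>,1:10]"#;
    let content_start = line.find("='").unwrap();
    // Scanning forward stops at the `',<` inside the content...
    let bad = content_start + line[content_start..].find("',<").unwrap();
    // ...while scanning backward finds the real delimiter before the token id.
    let good = content_start + line[content_start..].rfind("',<").unwrap();
    assert!(bad < good);
    assert_eq!(&line[content_start + 2..good], r#""',<""#);
}
```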
@@ -209,24 +214,25 @@ fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_
ref t => t.clone()
};

-let offset = if real_tok == token::Eof
-{
+let start_offset = if real_tok == token::Eof {
1
} else {
0
};

-let mut lo = start.parse::<u32>().unwrap() - offset;
-let mut hi = end.parse::<u32>().unwrap() + 1;
+let offset = if has_bom { 1 } else { 0 };
+
+let mut lo = start.parse::<u32>().unwrap() - start_offset - offset;
+let mut hi = end.parse::<u32>().unwrap() + 1 - offset;

// Adjust the span: For each surrogate pair already encountered, subtract one position.
lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32;
hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32;

-let sp = syntax::codemap::Span {
-lo: syntax::codemap::BytePos(lo),
-hi: syntax::codemap::BytePos(hi),
-expn_id: syntax::codemap::NO_EXPANSION
+let sp = codemap::Span {
+lo: codemap::BytePos(lo),
+hi: codemap::BytePos(hi),
+expn_id: codemap::NO_EXPANSION
};

TokenAndSpan {
Expand All @@ -245,10 +251,10 @@ fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {
}
}

-fn span_cmp(antlr_sp: syntax::codemap::Span, rust_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool {
+fn span_cmp(antlr_sp: codemap::Span, rust_sp: codemap::Span, cm: &codemap::CodeMap) -> bool {
antlr_sp.expn_id == rust_sp.expn_id &&
-antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() &&
-antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint()
+antlr_sp.lo.to_usize() == cm.bytepos_to_file_charpos(rust_sp.lo).to_usize() &&
+antlr_sp.hi.to_usize() == cm.bytepos_to_file_charpos(rust_sp.hi).to_usize()
}

fn main() {
@@ -257,10 +263,15 @@ fn main() {
r.next_token()
}

-let args = std::os::args();
+let mut args = env::args().skip(1);
+let filename = args.next().unwrap();
+if filename.find("parse-fail").is_some() {
+return;
+}

// Rust's lexer
-let code = File::open(&Path::new(args[1])).unwrap().read_to_string().unwrap();
+let mut code = String::new();
+File::open(&Path::new(&filename)).unwrap().read_to_string(&mut code).unwrap();

let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate()
.filter(|&(_, c)| c as usize > 0xFFFF)
@@ -269,6 +280,8 @@ fn main() {
.map(|(x, n)| x + n)
.collect();

+let has_bom = code.starts_with("\u{feff}");

debug!("Pairs: {:?}", surrogate_pairs_pos);

let options = config::basic_options();
@@ -281,15 +294,18 @@ fn main() {
let ref cm = lexer.span_diagnostic.cm;

// ANTLR
-let mut token_file = File::open(&Path::new(args[2]));
-let token_map = parse_token_list(token_file.read_to_string().unwrap());
+let mut token_file = File::open(&Path::new(&args.next().unwrap())).unwrap();
+let mut token_list = String::new();
+token_file.read_to_string(&mut token_list).unwrap();
+let token_map = parse_token_list(&token_list[..]);

-let mut stdin = std::io::stdin();
-let mut lock = stdin.lock();
+let stdin = std::io::stdin();
+let lock = stdin.lock();
let lines = lock.lines();
-let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
-&token_map,
-&surrogate_pairs_pos[]));
+let antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
+&token_map,
+&surrogate_pairs_pos[..],
+has_bom));

for antlr_tok in antlr_tokens {
let rustc_tok = next(&mut lexer);
@@ -314,7 +330,7 @@ fn main() {
}
_ => panic!("{:?} is not {:?}", antlr_tok, rustc_tok)
},)*
-ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", rustc_tok, antlr_tok)
+ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", antlr_tok, rustc_tok)
}
)
}
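Stepping back, the span arithmetic in `parse_antlr_token` now reconciles three mismatches between ANTLR's positions and rustc's: the EOF token is reported one position late, a leading BOM counts as input in ANTLR but not in rustc's `CodeMap` character positions, and Java counts characters above U+FFFF as two UTF-16 code units. A hypothetical sketch of the combined adjustment (`adjust` is an invented name; it mirrors the logic in the diff above):

```rust
// `surrogate_pairs_pos` holds, in ascending order, the positions at which a
// supplementary character (> U+FFFF) has already been seen.
fn adjust(antlr_pos: u32, eof_fixup: u32, has_bom: bool, surrogate_pairs_pos: &[usize]) -> u32 {
    let bom = if has_bom { 1 } else { 0 };
    let mut p = antlr_pos - eof_fixup - bom;
    // Each surrogate pair before this point inflated ANTLR's count by one
    // extra position; binary_search yields how many pairs precede `p`.
    p -= surrogate_pairs_pos
        .binary_search(&(p as usize))
        .unwrap_or_else(|i| i) as u32;
    p
}

fn main() {
    // With a BOM and one supplementary character earlier in the file,
    // ANTLR's position 10 maps to rustc's character position 8.
    assert_eq!(adjust(10, 0, true, &[4]), 8);
}
```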
2 changes: 0 additions & 2 deletions src/libcollections/fmt.rs
@@ -7,8 +7,6 @@
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
-//
-// ignore-lexer-test FIXME #15679

//! Utilities for formatting and printing strings
//!
