Skip to content

Commit

Permalink
Prohibit bare CRs in raw byte strings
Browse files Browse the repository at this point in the history
  • Loading branch information
Xanewok committed Jun 8, 2019
1 parent cab7e7f commit 49d62e8
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 74 deletions.
94 changes: 24 additions & 70 deletions src/libsyntax/parse/lexer/mod.rs
Expand Up @@ -292,15 +292,6 @@ impl<'a> StringReader<'a> {
self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
}

/// Reports a lexical error over the half-open range [`from_pos`, `to_pos`).
/// The emitted message is `m`, followed by `": "` and the escaped form of `c`.
fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
    // Build "<m>: " in one step, then let the shared helper append the escaped char.
    let mut message = format!("{}: ", m);
    push_escaped_char(&mut message, c);
    self.err_span_(from_pos, to_pos, &message);
}

/// Advance peek_token to refer to the next token, and
/// possibly update the interner.
fn advance_token(&mut self) -> Result<(), ()> {
Expand Down Expand Up @@ -1070,7 +1061,13 @@ impl<'a> StringReader<'a> {
self.validate_byte_str_escape(start_with_quote);
(token::ByteStr, symbol)
},
Some('r') => self.scan_raw_byte_string(),
Some('r') => {
let (start, end, hash_count) = self.scan_raw_string();
let symbol = self.name_from_to(start, end);
self.validate_raw_byte_str_escape(start, end);

(token::ByteStrRaw(hash_count), symbol)
}
_ => unreachable!(), // Should have been a token::Ident above.
};
let suffix = self.scan_optional_raw_name();
Expand Down Expand Up @@ -1300,66 +1297,6 @@ impl<'a> StringReader<'a> {
(content_start_bpos, content_end_bpos, hash_count)
}

/// Scans a raw byte string literal starting at the current position and
/// returns `token::ByteStrRaw(hash_count)` together with the symbol for the
/// text between the opening and closing quotes.
///
/// Fatal errors are raised when more than 65535 `#` delimiters are used, when
/// a character other than `#` or `"` appears in the opening delimiter, and
/// when the literal is unterminated. A non-ASCII character in the contents is
/// reported through `err_span_char` and scanning continues.
fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) {
    let start_bpos = self.pos;
    // Step over the current character (the caller dispatches here on `r`).
    self.bump();
    let mut hash_count = 0;
    while self.ch_is('#') {
        if hash_count == 65535 {
            let bpos = self.next_pos;
            self.fatal_span_(start_bpos,
                             bpos,
                             "too many `#` symbols: raw strings may be \
                             delimited by up to 65535 `#` symbols").raise();
        }
        self.bump();
        hash_count += 1;
    }

    if self.is_eof() {
        self.fail_unterminated_raw_string(start_bpos, hash_count);
    } else if !self.ch_is('"') {
        // The span runs from the start of the literal up to the offending
        // character, which is also appended to the message.
        let last_bpos = self.pos;
        let curr_char = self.ch.unwrap();
        self.fatal_span_char(start_bpos,
                             last_bpos,
                             "found invalid character; only `#` is allowed in raw \
                             string delimitation",
                             curr_char).raise();
    }
    // Step over the opening quote.
    self.bump();
    let content_start_bpos = self.pos;
    let mut content_end_bpos;
    'outer: loop {
        match self.ch {
            None => {
                self.fail_unterminated_raw_string(start_bpos, hash_count);
            }
            Some('"') => {
                // Tentatively treat this quote as the terminator; it only
                // counts if it is followed by `hash_count` `#` characters.
                content_end_bpos = self.pos;
                for _ in 0..hash_count {
                    self.bump();
                    if !self.ch_is('#') {
                        // Not the terminator: re-examine the current
                        // character as ordinary content without bumping.
                        continue 'outer;
                    }
                }
                break;
            }
            Some(c) => {
                if c > '\x7F' {
                    let pos = self.pos;
                    self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
                }
            }
        }
        self.bump();
    }

    // Step over the last delimiter character (the closing quote, or the final `#`).
    self.bump();

    (token::ByteStrRaw(hash_count), self.name_from_to(content_start_bpos, content_end_bpos))
}

fn validate_char_escape(&self, start_with_quote: BytePos) {
self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
if let Err((off, err)) = unescape::unescape_char(lit) {
Expand Down Expand Up @@ -1424,6 +1361,23 @@ impl<'a> StringReader<'a> {
});
}

/// Validates the contents of a raw byte string literal located between
/// `content_start` and `content_end`.
///
/// Every error produced by `unescape::unescape_raw_byte_str` is forwarded to
/// `emit_unescape_error` in `ByteStr` mode. The span handed over is widened by
/// one byte on each side of the contents (presumably to cover the surrounding
/// quotes; confirm against `emit_unescape_error`).
fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
    self.with_str_from_to(content_start, content_end, |lit: &str| {
        unescape::unescape_raw_byte_str(lit, &mut |range, unescaped| match unescaped {
            // Valid characters need no diagnostics.
            Ok(_) => {}
            Err(err) => {
                let widened_span =
                    self.mk_sp(content_start - BytePos(1), content_end + BytePos(1));
                emit_unescape_error(
                    &self.sess.span_diagnostic,
                    lit,
                    widened_span,
                    unescape::Mode::ByteStr,
                    range,
                    err,
                )
            }
        })
    });
}

fn validate_byte_str_escape(&self, start_with_quote: BytePos) {
self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
unescape::unescape_byte_str(lit, &mut |range, c| {
Expand Down
24 changes: 24 additions & 0 deletions src/libsyntax/parse/unescape.rs
Expand Up @@ -29,6 +29,7 @@ pub(crate) enum EscapeError {

UnicodeEscapeInByte,
NonAsciiCharInByte,
NonAsciiCharInByteString,
}

/// Takes the contents of a char literal (without quotes), and returns an
Expand Down Expand Up @@ -88,6 +89,29 @@ where
}
}

/// Walks the contents of a raw byte string literal (without quotes) and
/// invokes `callback` once per character with that character's byte range in
/// `literal_text` and either the character itself or an error.
/// NOTE: Raw strings perform no escape processing. A `\r` immediately followed
/// by `\n` is passed through as-is, any other `\r` yields
/// `BareCarriageReturn`, and any character above `\x7F` yields
/// `NonAsciiCharInByteString`.
pub(crate) fn unescape_raw_byte_str<F>(literal_text: &str, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
{
    let mut pending = literal_text.char_indices().peekable();
    while let Some((start, ch)) = pending.next() {
        // Only a line feed directly after a carriage return makes the CR acceptable.
        let followed_by_lf = match pending.peek() {
            Some(&(_, '\n')) => true,
            _ => false,
        };
        let outcome = if ch == '\r' {
            if followed_by_lf {
                Ok(ch)
            } else {
                Err(EscapeError::BareCarriageReturn)
            }
        } else if ch > '\x7F' {
            Err(EscapeError::NonAsciiCharInByteString)
        } else {
            Ok(ch)
        };
        callback(start..start + ch.len_utf8(), outcome);
    }
}

#[derive(Debug, Clone, Copy)]
pub(crate) enum Mode {
Char,
Expand Down
5 changes: 5 additions & 0 deletions src/libsyntax/parse/unescape_error_reporting.rs
Expand Up @@ -124,6 +124,11 @@ pub(crate) fn emit_unescape_error(
handler.span_err(span, "byte constant must be ASCII. \
Use a \\xHH escape for a non-ASCII byte")
}
EscapeError::NonAsciiCharInByteString => {
assert!(mode.is_bytes());
let (_c, span) = last_char();
handler.span_err(span, "raw byte string must be ASCII")
}
EscapeError::OutOfRangeHexEscape => {
handler.span_err(span, "this form of character escape may only be used \
with characters in the range [\\x00-\\x7f]")
Expand Down
3 changes: 3 additions & 0 deletions src/test/ui/parser/raw-byte-string-literals.rs
@@ -1,4 +1,7 @@
// ignore-tidy-cr
// compile-flags: -Z continue-parse-after-error
pub fn main() { // Each statement below is a deliberately invalid raw byte string literal; the expected stderr records their line numbers.
br"a"; //~ ERROR bare CR not allowed in string
br"é"; //~ ERROR raw byte string must be ASCII
br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation
}
Expand Down
14 changes: 10 additions & 4 deletions src/test/ui/parser/raw-byte-string-literals.stderr
@@ -1,14 +1,20 @@
error: raw byte string must be ASCII: \u{e9}
--> $DIR/raw-byte-string-literals.rs:2:8
error: bare CR not allowed in string, use \r instead
--> $DIR/raw-byte-string-literals.rs:4:9
|
LL | br"a";
| ^

error: raw byte string must be ASCII
--> $DIR/raw-byte-string-literals.rs:5:8
|
LL | br"é";
| ^

error: found invalid character; only `#` is allowed in raw string delimitation: ~
--> $DIR/raw-byte-string-literals.rs:3:6
--> $DIR/raw-byte-string-literals.rs:6:6
|
LL | br##~"a"~##;
| ^^^

error: aborting due to 2 previous errors
error: aborting due to 3 previous errors

Expand Down

0 comments on commit 49d62e8

Please sign in to comment.