From 612bbaf7a07fe247e5e2d057cc4f10742918ead0 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 17 Jun 2014 17:58:13 +0200 Subject: [PATCH] Refactor backslash-escape parsing to share similar code. Move into a new syntax::parse::lexer::StringReader method the code that was almost duplicated for parsing backslash-escapes in byte, byte string, char, and string literals. --- src/libsyntax/parse/lexer/mod.rs | 220 ++++++++++--------------------- 1 file changed, 69 insertions(+), 151 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 31f15fd7495a4..f67b77d64dda5 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -636,6 +636,67 @@ impl<'a> StringReader<'a> { } } + /// Scan for a single (possibly escaped) byte or char + /// in a byte, (non-raw) byte string, char, or (non-raw) string literal. + /// `start` is the position of `first_source_char`, which is already consumed. + fn scan_char_or_byte(&mut self, start: BytePos, first_source_char: char, + ascii_only: bool, delim: char) -> Option { + match first_source_char { + '\\' => { + // '\X' for some X must be a character constant: + let escaped = self.curr; + let escaped_pos = self.last_pos; + self.bump(); + match escaped { + None => {}, // EOF here is an error that will be checked later. + Some(e) => { + return Some(match e { + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '"' => '"', + '0' => '\x00', + 'x' => self.scan_numeric_escape(2u, delim), + 'u' if !ascii_only => self.scan_numeric_escape(4u, delim), + 'U' if !ascii_only => self.scan_numeric_escape(8u, delim), + '\n' if delim == '"' => { + self.consume_whitespace(); + return None + }, + c => { + let last_pos = self.last_pos; + self.err_span_char( + escaped_pos, last_pos, + if ascii_only { "unknown byte escape" } + else { "unknown character escape" }, + c); + c + } + }) + } + } + } + '\t' | '\n' | '\r' | '\'' if delim == '\'' => { + let last_pos = self.last_pos; + self.err_span_char( + start, last_pos, + if ascii_only { "byte constant must be escaped" } + else { "character constant must be escaped" }, + first_source_char); + } + _ => if ascii_only && first_source_char > '\x7F' { + let last_pos = self.last_pos; + self.err_span_char( + start, last_pos, + "byte constant must be ASCII. \ + Use a \\xHH escape for a non-ASCII byte", first_source_char); + } + } + Some(first_source_char) + } + fn binop(&mut self, op: token::BinOp) -> token::Token { self.bump(); if self.curr_is('=') { @@ -810,43 +871,7 @@ impl<'a> StringReader<'a> { } // Otherwise it is a character constant: - match c2 { - '\\' => { - // '\X' for some X must be a character constant: - let escaped = self.curr; - let escaped_pos = self.last_pos; - self.bump(); - match escaped { - None => {} - Some(e) => { - c2 = match e { - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '"' => '"', - '0' => '\x00', - 'x' => self.scan_numeric_escape(2u, '\''), - 'u' => self.scan_numeric_escape(4u, '\''), - 'U' => self.scan_numeric_escape(8u, '\''), - c2 => { - let last_bpos = self.last_pos; - self.err_span_char(escaped_pos, last_bpos, - "unknown character escape", c2); - c2 - } - } - } - } - } - '\t' | '\n' | '\r' | '\'' => { - let last_bpos = self.last_pos; - self.err_span_char( start, last_bpos, - "character constant must be escaped", c2); - } - _ => {} - } + c2 = self.scan_char_or_byte(start, c2, /* ascii_only = */ false, '\'').unwrap(); if !self.curr_is('\'') { let last_bpos = self.last_pos; self.fatal_span_verbose( @@ -876,44 +901,7 @@ impl<'a> StringReader<'a> { let mut c2 = self_.curr.unwrap_or('\x00'); self_.bump(); - match c2 { - '\\' => { - // '\X' for some X must be a character constant: - let escaped = self_.curr; - let escaped_pos = self_.last_pos; - self_.bump(); - match escaped { - None => {} - Some(e) => { - c2 = match e { - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '"' => '"', - '0' => '\x00', - 'x' => self_.scan_numeric_escape(2u, '\''), - c2 => { - self_.err_span_char( - escaped_pos, self_.last_pos, - "unknown byte escape", c2); - c2 - } - } - } - } - } - '\t' | '\n' | '\r' | '\'' => { - self_.err_span_char( start, self_.last_pos, - "byte constant must be escaped", c2); - } - _ => if c2 > '\x7F' { - self_.err_span_char( start, self_.last_pos, - "byte constant must be ASCII. \ - Use a \\xHH escape for a non-ASCII byte", c2); - } - } + c2 = self_.scan_char_or_byte(start, c2, /* ascii_only = */ true, '\'').unwrap(); if !self_.curr_is('\'') { // Byte offsetting here is okay because the // character before position `start` are an @@ -936,46 +924,11 @@ impl<'a> StringReader<'a> { "unterminated double quote byte string"); } + let ch_start = self_.last_pos; let ch = self_.curr.unwrap(); self_.bump(); - match ch { - '\\' => { - if self_.is_eof() { - self_.fatal_span(start, self_.last_pos, - "unterminated double quote byte string"); - } - - let escaped = self_.curr.unwrap(); - let escaped_pos = self_.last_pos; - self_.bump(); - match escaped { - 'n' => value.push('\n' as u8), - 'r' => value.push('\r' as u8), - 't' => value.push('\t' as u8), - '\\' => value.push('\\' as u8), - '\'' => value.push('\'' as u8), - '"' => value.push('"' as u8), - '\n' => self_.consume_whitespace(), - '0' => value.push(0), - 'x' => { - value.push(self_.scan_numeric_escape(2u, '"') as u8); - } - c2 => { - self_.err_span_char(escaped_pos, self_.last_pos, - "unknown byte string escape", c2); - } - } - } - _ => { - if ch <= '\x7F' { - value.push(ch as u8) - } else { - self_.err_span_char(self_.last_pos, self_.last_pos, - "byte string must be ASCII. \ - Use a \\xHH escape for a non-ASCII byte", ch); - } - } - } + self_.scan_char_or_byte(ch_start, ch, /* ascii_only = */ true, '"') + .map(|ch| value.push(ch as u8)); } self_.bump(); return token::LIT_BINARY(Rc::new(value)); @@ -1039,46 +992,11 @@ impl<'a> StringReader<'a> { self.fatal_span(start_bpos, last_bpos, "unterminated double quote string"); } + let ch_start = self.last_pos; let ch = self.curr.unwrap(); self.bump(); - match ch { - '\\' => { - if self.is_eof() { - let last_bpos = self.last_pos; - self.fatal_span(start_bpos, last_bpos, - "unterminated double quote string"); - } - - let escaped = self.curr.unwrap(); - let escaped_pos = self.last_pos; - self.bump(); - match escaped { - 'n' => accum_str.push_char('\n'), - 'r' => accum_str.push_char('\r'), - 't' => accum_str.push_char('\t'), - '\\' => accum_str.push_char('\\'), - '\'' => accum_str.push_char('\''), - '"' => accum_str.push_char('"'), - '\n' => self.consume_whitespace(), - '0' => accum_str.push_char('\x00'), - 'x' => { - accum_str.push_char(self.scan_numeric_escape(2u, '"')); - } - 'u' => { - accum_str.push_char(self.scan_numeric_escape(4u, '"')); - } - 'U' => { - accum_str.push_char(self.scan_numeric_escape(8u, '"')); - } - c2 => { - let last_bpos = self.last_pos; - self.err_span_char(escaped_pos, last_bpos, - "unknown string escape", c2); - } - } - } - _ => accum_str.push_char(ch) - } + self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ false, '"') + .map(|ch| accum_str.push_char(ch)); } self.bump(); return token::LIT_STR(str_to_ident(accum_str.as_slice()));