Skip to content

Commit

Permalink
Simple impl of \uxxxx for 1.9 mode. Basic stuff seems to work and no …
Browse files Browse the repository at this point in the history
…tests break. More later
  • Loading branch information
enebo committed Sep 4, 2010
1 parent 5289b37 commit cae02d5
Show file tree
Hide file tree
Showing 2 changed files with 190 additions and 31 deletions.
189 changes: 163 additions & 26 deletions src/org/jruby/lexer/yacc/RubyYaccLexer.java
Expand Up @@ -58,6 +58,8 @@
/** This is a port of the MRI lexer to Java; it is compatible with Ruby 1.8.1.
*/
public class RubyYaccLexer {
public static final Encoding UTF8_ENCODING = Encoding.load("UTF8");

private static ByteList END_MARKER = new ByteList(new byte[] {'_', 'E', 'N', 'D', '_', '_'});
private static ByteList BEGIN_DOC_MARKER = new ByteList(new byte[] {'b', 'e', 'g', 'i', 'n'});
private static ByteList END_DOC_MARKER = new ByteList(new byte[] {'e', 'n', 'd'});
Expand Down Expand Up @@ -391,6 +393,10 @@ public StackState getCmdArgumentState() {
return cmdArgumentState;
}

/**
 * Exposes the lexer's {@code isOneEight} flag to collaborators.
 * NOTE(review): presumably true when lexing in Ruby 1.8 compatibility mode — confirm
 * against where the field is assigned.
 *
 * @return the current value of the {@code isOneEight} field
 */
public boolean isOneEight() {
    return this.isOneEight;
}

/**
 * Accessor for the lexer's condition-state stack.
 *
 * @return the {@code conditionState} field held by this lexer
 */
public StackState getConditionState() {
    return this.conditionState;
}
Expand Down Expand Up @@ -825,6 +831,7 @@ private int yylex() throws IOException {
lex_strterm = null;
lex_state = LexState.EXPR_END;
}

return tok;
}

Expand Down Expand Up @@ -1843,13 +1850,17 @@ private int questionMark() throws IOException {
yaccValue = new Token("?", getPosition());
return '?';
} else if (c == '\\') {
// FIXME: peek('u') utf8 stuff for 1.9
c = readEscape();
if (!isOneEight && src.peek('u')) {
src.read(); // Eat 'u'
c = readUTFEscape(null /* Not String literal so no buffer setting */, false, false);
} else {
c = readEscape();
}
}

c &= 0xff;
lex_state = LexState.EXPR_END;
if (isOneEight) {
c &= 0xff;
yaccValue = new FixnumNode(getPosition(), c);
} else {
// TODO: this isn't handling multibyte yet
Expand Down Expand Up @@ -2214,6 +2225,92 @@ private int getNumberToken(String number, boolean isFloat, int nondigit) {
yaccValue = getInteger(number, 10);
return Tokens.tINTEGER;
}

/**
 * MRI: parser_tokadd_utf8 variant used only for regexp literal parsing.
 *
 * Copies a Unicode escape verbatim (escape text, not the decoded character) into
 * {@code tokenBuffer}. The leading backslash and 'u' are expected to have been consumed
 * already; this method re-emits them and then copies either the {@code \\u{h.. h..}} form
 * or the fixed four digit {@code \\uhhhh} form.
 *
 * @throws SyntaxException if no hex digits are present, a codepoint is larger than
 *                         0x10ffff, the braced form is not closed by '}', or the short
 *                         form does not have exactly four hex digits
 * @throws IOException if the underlying source cannot be read
 */
public void readUTFEscapeRegexpLiteral() throws IOException {
    tokenBuffer.append("\\u");

    if (src.peek('{')) { // handle \\u{...}
        do {
            // peek() does not consume, so actually read the '{' (first pass) or the
            // separating space/tab (later passes) and copy what was really read.
            tokenBuffer.append((char) src.read());

            if (scanHexLiteral(6, false, "invalid Unicode escape") > 0x10ffff) {
                throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
                        getCurrentLine(), "invalid Unicode codepoint (too large)");
            }
        } while (src.peek(' ') || src.peek('\t'));

        int c = src.read();
        if (c != '}') {
            throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
                    getCurrentLine(), "unterminated Unicode escape");
        }
        tokenBuffer.append((char) c);
    } else { // handle \\uxxxx
        // The short form is exactly four hex digits.
        scanHexLiteral(4, true, "Invalid Unicode escape");
    }
}

private byte[] mbcBuf = new byte[6];

//FIXME: This seems like it could be more efficient to ensure size in bytelist and then pass
// in bytelists byte backing store. This method would look ugly since realSize would need
// to be tweaked and I don't know how many bytes this codepoint has up front so I would need
// to grow by 6 (which may be wasteful). Another idea is to make Encoding accept an interface
// for populating bytes and then make ByteList implement that interface. I like this last idea
// since it would not leak bytelist impl details all over the place.
/**
 * Encodes a single codepoint with the supplied encoding and appends the resulting bytes
 * to the buffer. The shared {@code mbcBuf} scratch array is used as the staging area.
 *
 * @param codepoint the codepoint to encode
 * @param buffer    destination that receives the encoded bytes
 * @param encoding  encoding used to turn the codepoint into bytes
 */
private void tokenAddMBC(int codepoint, ByteList buffer, Encoding encoding) {
    // codeToMbc fills mbcBuf from offset 0 and reports how many bytes it wrote.
    buffer.append(mbcBuf, 0, encoding.codeToMbc(codepoint, mbcBuf, 0));
}

/**
 * MRI: parser_tokadd_utf8 sans regexp literal parsing.
 *
 * Decodes a Unicode escape whose leading backslash and 'u' have already been consumed.
 * Both the braced form ({@code \\u{h.. h..}}, one or more codepoints of up to six hex
 * digits separated by a space or tab) and the short form ({@code \\uhhhh}, exactly four
 * hex digits) are handled.
 *
 * @param buffer        destination for decoded characters; may be {@code null} when
 *                      {@code stringLiteral} is false (character literals pass null)
 * @param stringLiteral when true the decoded codepoint(s) are appended to {@code buffer}
 * @param symbolLiteral when true (and {@code stringLiteral} is true) a zero codepoint is
 *                      rejected
 * @return the decoded codepoint; for the braced form, the last codepoint read
 * @throws SyntaxException on malformed escapes, codepoints above 0x10ffff, a missing
 *                         closing '}', or a NUL inside a symbol
 * @throws IOException if the underlying source cannot be read
 */
public int readUTFEscape(ByteList buffer, boolean stringLiteral, boolean symbolLiteral) throws IOException {
    int codepoint;

    if (src.peek('{')) { // handle \\u{...}
        do {
            src.read(); // Eat curly or whitespace
            codepoint = scanHex(6, false, "invalid Unicode escape");
            if (codepoint > 0x10ffff) {
                throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
                        getCurrentLine(), "invalid Unicode codepoint (too large)");
            }
            addUTFCodepoint(codepoint, buffer, stringLiteral, symbolLiteral);
        } while (src.peek(' ') || src.peek('\t'));

        int c = src.read();
        if (c != '}') {
            throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
                    getCurrentLine(), "unterminated Unicode escape");
        }
    } else { // handle \\uxxxx
        codepoint = scanHex(4, true, "Invalid Unicode escape");
        addUTFCodepoint(codepoint, buffer, stringLiteral, symbolLiteral);
    }

    return codepoint;
}

// Records one decoded codepoint: marks the buffer as UTF-8 for non-ASCII values and, for
// string literals, appends the codepoint's bytes.
private void addUTFCodepoint(int codepoint, ByteList buffer, boolean stringLiteral, boolean symbolLiteral) {
    if (codepoint >= 0x80) {
        // Callers that only want the codepoint value pass a null buffer; without this
        // guard any non-ASCII escape in that mode would fail with a NullPointerException.
        if (buffer != null) buffer.setEncoding(UTF8_ENCODING);
        if (stringLiteral) tokenAddMBC(codepoint, buffer, UTF8_ENCODING);
    } else if (stringLiteral) {
        if (codepoint == 0 && symbolLiteral) {
            throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
                    getCurrentLine(), "symbol cannot contain '\\u0000'");
        }

        buffer.append((char) codepoint);
    }
}

public int readEscape() throws IOException {
int c = src.read();
Expand All @@ -2240,29 +2337,7 @@ public int readEscape() throws IOException {
src.unread(c);
return scanOct(3);
case 'x' : // hex constant
int i = 0;
//char hexValue = scanHex(2);

char hexValue = '\0';

for (; i < 2; i++) {
int h1 = src.read();

if (!RubyYaccLexer.isHexChar(h1)) {
src.unread(h1);
break;
}

hexValue <<= 4;
hexValue |= Integer.parseInt(""+(char)h1, 16) & 15;
}

// No hex value after the 'x'.
if (i == 0) {
throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
getCurrentLine(), "Invalid escape character syntax");
}
return hexValue;
return scanHex(2, false, "Invalid escape character syntax");
case 'b' : // backspace
return '\010';
case 's' : // space
Expand Down Expand Up @@ -2301,6 +2376,68 @@ public int readEscape() throws IOException {
}
}

/**
 * Read up to count hexadecimal digits and store those digits in the token buffer. If strict
 * is provided then exactly count hex digits must be present. If no digits can be read a
 * syntax exception will be thrown. The first non-hex character encountered is pushed back
 * onto the source.
 *
 * The accumulated value is returned as an int (not a 16-bit char) so that six digit
 * escapes are not truncated and callers can range-check the codepoint.
 *
 * @param count        maximum number of hex digits to read
 * @param strict       when true, fewer than count digits is an error
 * @param errorMessage message used for the syntax exception
 * @return the numeric value of the digits that were read
 * @throws SyntaxException if no digit was read, or strict is set and fewer than count
 *                         digits were read
 * @throws IOException if the underlying source cannot be read
 */
private int scanHexLiteral(int count, boolean strict, String errorMessage) throws IOException {
    int i = 0;
    int hexValue = 0;

    for (; i < count; i++) {
        int h1 = src.read();

        if (!RubyYaccLexer.isHexChar(h1)) {
            src.unread(h1);
            break;
        }

        // Cast so the digit character itself is appended rather than the decimal
        // rendering of its numeric code.
        tokenBuffer.append((char) h1);

        hexValue <<= 4;
        hexValue |= Integer.parseInt("" + (char) h1, 16) & 15;
    }

    // No hex digits at all, or too few of them in strict mode.
    if (i == 0 || (strict && count != i)) {
        throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
                getCurrentLine(), errorMessage);
    }

    return hexValue;
}

/**
 * Consumes at most {@code count} hexadecimal digits from the source and returns their
 * combined value. Scanning stops at the first non-hex character, which is pushed back.
 *
 * @param count        upper bound on the number of digits consumed
 * @param strict       if true, exactly {@code count} digits are required
 * @param errorMessage text of the syntax exception raised on failure
 * @return the value formed by the digits read, most significant digit first
 * @throws SyntaxException when no digit could be read, or when strict is set and fewer
 *                         than {@code count} digits were available
 * @throws IOException if the underlying source cannot be read
 */
private int scanHex(int count, boolean strict, String errorMessage) throws IOException {
    int value = 0;
    int digitsRead = 0;

    while (digitsRead < count) {
        int next = src.read();

        if (!RubyYaccLexer.isHexChar(next)) {
            // Not part of the number: hand it back and stop scanning.
            src.unread(next);
            break;
        }

        int nibble = Integer.parseInt(String.valueOf((char) next), 16) & 0xf;
        value = (value << 4) | nibble;
        digitsRead++;
    }

    boolean nothingRead = digitsRead == 0;
    boolean tooFewDigits = strict && digitsRead != count;
    if (nothingRead || tooFewDigits) {
        throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
                getCurrentLine(), errorMessage);
    }

    return value;
}

private char scanOct(int count) throws IOException {
char value = '\0';

Expand Down
32 changes: 27 additions & 5 deletions src/org/jruby/lexer/yacc/StringTerm.java
Expand Up @@ -99,7 +99,7 @@ public int parseString(RubyYaccLexer lexer, LexerSource src) throws java.io.IOEx
if (begin == '\0' && flags == 0) {
ByteList buffer = new ByteList();
src.unread(c);
if (parseSimpleStringIntoBuffer(src, buffer) == RubyYaccLexer.EOF) {
if (parseSimpleStringIntoBuffer(lexer, src, buffer) == RubyYaccLexer.EOF) {
throw new SyntaxException(PID.STRING_HITS_EOF, src.getPosition(),
src.getCurrentLine(), "unterminated string meets end of file");
}
Expand Down Expand Up @@ -192,19 +192,28 @@ private int parseRegexpFlags(final LexerSource src) throws java.io.IOException {
return options | kcode;
}

public int parseSimpleStringIntoBuffer(LexerSource src, ByteList buffer) throws java.io.IOException {
public int parseSimpleStringIntoBuffer(RubyYaccLexer lexer, LexerSource src, ByteList buffer) throws java.io.IOException {
int c;


while ((c = src.read()) != RubyYaccLexer.EOF) {
if (c == end) {
src.unread(c);
break;
} else if (c == '\\') {
c = src.read();
if ((c == '\n' || c != end) && c != '\\') buffer.append('\\');
}
if (!lexer.isOneEight() && c == 'u') {
lexer.readUTFEscape(buffer, true, false);

buffer.append(c);
// ENEBO: Mixed escape via non-ascii magic missing
} else if((c == '\n' || c != end) && c != '\\') {
buffer.append('\\').append(c);
} else {
buffer.append(c);
}
} else {
buffer.append(c);
}
}

return c;
Expand All @@ -215,6 +224,7 @@ public int parseStringIntoBuffer(RubyYaccLexer lexer, LexerSource src, ByteList
boolean expand = (flags & RubyYaccLexer.STR_FUNC_EXPAND) != 0;
boolean escape = (flags & RubyYaccLexer.STR_FUNC_ESCAPE) != 0;
boolean regexp = (flags & RubyYaccLexer.STR_FUNC_REGEXP) != 0;
boolean symbol = (flags & RubyYaccLexer.STR_FUNC_SYMBOL) != 0;
int c;

while ((c = src.read()) != RubyYaccLexer.EOF) {
Expand Down Expand Up @@ -248,6 +258,18 @@ public int parseStringIntoBuffer(RubyYaccLexer lexer, LexerSource src, ByteList
if (escape) buffer.append(c);
break;

case 'u':
if (!lexer.isOneEight()) {
if (!expand) {
buffer.append('\\');
break;
}
lexer.readUTFEscape(buffer, true, symbol);

// ENEBO: Mixed escape via non-ascii magic missing

continue;
}
default:
if (regexp) {
src.unread(c);
Expand Down

0 comments on commit cae02d5

Please sign in to comment.