Simple impl of \uxxxx for 1.9 mode. Basic stuff seems to work and no …

…tests break. More later
jruby · Sep 4, 2010 · cae02d5 · cae02d5
1 parent 5289b37
commit cae02d5
Show file tree

Hide file tree

Showing 2 changed files with 190 additions and 31 deletions.
diff --git a/src/org/jruby/lexer/yacc/RubyYaccLexer.java b/src/org/jruby/lexer/yacc/RubyYaccLexer.java
@@ -58,6 +58,8 @@
 /** This is a port of the MRI lexer to Java it is compatible to Ruby 1.8.1.
  */
 public class RubyYaccLexer {
+    public static final Encoding UTF8_ENCODING = Encoding.load("UTF8");
+
     private static ByteList END_MARKER = new ByteList(new byte[] {'_', 'E', 'N', 'D', '_', '_'});
     private static ByteList BEGIN_DOC_MARKER = new ByteList(new byte[] {'b', 'e', 'g', 'i', 'n'});
     private static ByteList END_DOC_MARKER = new ByteList(new byte[] {'e', 'n', 'd'});
@@ -391,6 +393,10 @@ public StackState getCmdArgumentState() {
         return cmdArgumentState;
     }
 
+    public boolean isOneEight() {
+        return isOneEight;
+    }
+
     public StackState getConditionState() {
         return conditionState;
     }
@@ -825,6 +831,7 @@ private int yylex() throws IOException {
                 lex_strterm = null;
                 lex_state = LexState.EXPR_END;
             }
+
             return tok;
         }
 
@@ -1843,13 +1850,17 @@ private int questionMark() throws IOException {
             yaccValue = new Token("?", getPosition());
             return '?';
         } else if (c == '\\') {
-            // FIXME: peek('u') utf8 stuff for 1.9
-            c = readEscape();
+            if (!isOneEight && src.peek('u')) {
+                src.read(); // Eat 'u'
+                c = readUTFEscape(null /* Not String literal so no buffer setting */, false, false);
+            } else {
+                c = readEscape();
+            }
         }
 
-        c &= 0xff;
         lex_state = LexState.EXPR_END;
         if (isOneEight) {
+            c &= 0xff;
             yaccValue = new FixnumNode(getPosition(), c);
         } else {
             // TODO: this isn't handling multibyte yet
@@ -2214,6 +2225,92 @@ private int getNumberToken(String number, boolean isFloat, int nondigit) {
         yaccValue = getInteger(number, 10);
         return Tokens.tINTEGER;
     }
+
+    // Note: parser_tokadd_utf8 variant just for regexp literal parsing.  This variant is to be
+    // called when string_literal and regexp_literal.
+    public void readUTFEscapeRegexpLiteral() throws IOException {
+        tokenBuffer.append("\\u");
+        if (src.peek('{')) { // handle \\u{...}
+            do {
+                tokenBuffer.append("{");
+                if (scanHexLiteral(6, false, "invalid Unicode escape") > 0x10ffff) {
+                    throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
+                            getCurrentLine(), "invalid Unicode codepoint (too large)");
+                }
+            } while (src.peek(' ') || src.peek('\t'));
+
+            int c = src.read();
+            if (c != '}') {
+                throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
+                        getCurrentLine(), "unterminated Unicode escape");
+            }
+            tokenBuffer.append((char) c);
+        } else { // handle \\uxxxx
+            scanHexLiteral(token, true, "Invalid Unicode escape");
+        }
+    }
+
+    private byte[] mbcBuf = new byte[6];
+
+    //FIXME: This seems like it could be more efficient to ensure size in bytelist and then pass
+    // in bytelists byte backing store.  This method would look ugly since realSize would need
+    // to be tweaked and I don't know how many bytes this codepoint has up front so I would need
+    // to grow by 6 (which may be wasteful).  Another idea is to make Encoding accept an interface
+    // for populating bytes and then make ByteList implement that interface.  I like this last idea
+    // since it would not leak bytelist impl details all over the place.
+    private void tokenAddMBC(int codepoint, ByteList buffer, Encoding encoding) {
+        int length = encoding.codeToMbc(codepoint, mbcBuf, 0);
+        buffer.append(mbcBuf, 0, length);
+    }
+
+    // MRI: parser_tokadd_utf8 sans regexp literal parsing
+    public int readUTFEscape(ByteList buffer, boolean stringLiteral, boolean symbolLiteral) throws IOException {
+        int codepoint;
+        int c;
+
+        if (src.peek('{')) { // handle \\u{...}
+            do {
+                src.read(); // Eat curly or whitespace
+                codepoint = scanHex(6, false, "invalid Unicode escape");
+                if (codepoint > 0x10ffff) {
+                    throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
+                            getCurrentLine(), "invalid Unicode codepoint (too large)");
+                }
+                if (codepoint >= 0x80) {
+                    buffer.setEncoding(UTF8_ENCODING);
+                    if (stringLiteral) tokenAddMBC(codepoint, buffer, UTF8_ENCODING);
+                } else if (stringLiteral) {
+                    if (codepoint == 0 && symbolLiteral) {
+                        throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
+                            getCurrentLine(), "symbol cannot contain '\\u0000'");
+                    }
+
+                    buffer.append((char) codepoint);
+                }
+            } while (src.peek(' ') || src.peek('\t'));
+
+            c = src.read();
+            if (c != '}') {
+                throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
+                        getCurrentLine(), "unterminated Unicode escape");
+            }
+        } else { // handle \\uxxxx
+            codepoint = scanHex(4, true, "Invalid Unicode escape");
+            if (codepoint >= 0x80) {
+                buffer.setEncoding(UTF8_ENCODING);
+                if (stringLiteral) tokenAddMBC(codepoint, buffer, UTF8_ENCODING);
+            } else if (stringLiteral) {
+                if (codepoint == 0 && symbolLiteral) {
+                    throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
+                        getCurrentLine(), "symbol cannot contain '\\u0000'");
+                }
+
+                buffer.append((char) codepoint);
+            }
+        }
+
+        return codepoint;
+    }
 
     public int readEscape() throws IOException {
         int c = src.read();
@@ -2240,29 +2337,7 @@ public int readEscape() throws IOException {
                 src.unread(c);
                 return scanOct(3);
             case 'x' : // hex constant
-                int i = 0;
-                //char hexValue = scanHex(2);
-
-                char hexValue = '\0';
-
-                for (; i < 2; i++) {
-                    int h1 = src.read();
-
-                    if (!RubyYaccLexer.isHexChar(h1)) {
-                        src.unread(h1);
-                        break;
-                    }
-
-                    hexValue <<= 4;
-                    hexValue |= Integer.parseInt(""+(char)h1, 16) & 15;
-                }
-
-                // No hex value after the 'x'.
-                if (i == 0) {
-                    throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
-                            getCurrentLine(), "Invalid escape character syntax");
-                }
-                return hexValue;
+                return scanHex(2, false, "Invalid escape character syntax");
             case 'b' : // backspace
                 return '\010';
             case 's' : // space
@@ -2301,6 +2376,68 @@ public int readEscape() throws IOException {
         }
     }
 
+    /**
+     * Read up to count hexadecimal digits and store those digits in a token buffer.  If strict is
+     * provided then count number of hex digits must be present. If no digits can be read a syntax
+     * exception will be thrown.  This will also return the codepoint as a value so codepoint
+     * ranges can be checked.
+     */
+    private char scanHexLiteral(int count, boolean strict, String errorMessage) throws IOException {
+        int i = 0;
+        char hexValue = '\0';
+
+        for (; i < count; i++) {
+            int h1 = src.read();
+
+            if (!RubyYaccLexer.isHexChar(h1)) {
+                src.unread(h1);
+                break;
+            }
+
+            tokenBuffer.append(h1);
+
+            hexValue <<= 4;
+            hexValue |= Integer.parseInt("" + (char) h1, 16) & 15;
+        }
+
+        // No hex value after the 'x'.
+        if (i == 0 || strict && count != i) {
+            throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
+                    getCurrentLine(), errorMessage);
+        }
+
+        return hexValue;
+    }
+
+    /**
+     * Read up to count hexadecimal digits.  If strict is provided then count number of hex
+     * digits must be present. If no digits can be read a syntax exception will be thrown.
+     */
+    private int scanHex(int count, boolean strict, String errorMessage) throws IOException {
+        int i = 0;
+        int hexValue = '\0';
+
+        for (; i < count; i++) {
+            int h1 = src.read();
+
+            if (!RubyYaccLexer.isHexChar(h1)) {
+                src.unread(h1);
+                break;
+            }
+
+            hexValue <<= 4;
+            hexValue |= Integer.parseInt("" + (char) h1, 16) & 15;
+        }
+
+        // No hex value after the 'x'.
+        if (i == 0 || (strict && count != i)) {
+            throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
+                    getCurrentLine(), errorMessage);
+        }
+
+        return hexValue;
+    }
+
     private char scanOct(int count) throws IOException {
         char value = '\0';
 

diff --git a/src/org/jruby/lexer/yacc/StringTerm.java b/src/org/jruby/lexer/yacc/StringTerm.java
@@ -99,7 +99,7 @@ public int parseString(RubyYaccLexer lexer, LexerSource src) throws java.io.IOEx
         if (begin == '\0' && flags == 0) {
             ByteList buffer = new ByteList();
             src.unread(c);
-            if (parseSimpleStringIntoBuffer(src, buffer) == RubyYaccLexer.EOF) {
+            if (parseSimpleStringIntoBuffer(lexer, src, buffer) == RubyYaccLexer.EOF) {
                 throw new SyntaxException(PID.STRING_HITS_EOF, src.getPosition(), 
                         src.getCurrentLine(), "unterminated string meets end of file");
             }
@@ -192,19 +192,28 @@ private int parseRegexpFlags(final LexerSource src) throws java.io.IOException {
         return options | kcode;
     }
 
-    public int parseSimpleStringIntoBuffer(LexerSource src, ByteList buffer) throws java.io.IOException {
+    public int parseSimpleStringIntoBuffer(RubyYaccLexer lexer, LexerSource src, ByteList buffer) throws java.io.IOException {
         int c;
 
+
         while ((c = src.read()) != RubyYaccLexer.EOF) {
             if (c == end) {
                 src.unread(c);
                 break;
             } else if (c == '\\') {
                 c = src.read();
-                if ((c == '\n' || c != end) && c != '\\') buffer.append('\\');
-            } 
+                if (!lexer.isOneEight() && c == 'u') {
+                    lexer.readUTFEscape(buffer, true, false);
 
-            buffer.append(c);
+                    // ENEBO: Mixed escape via non-ascii magic missing
+                } else if((c == '\n' || c != end) && c != '\\') {
+                    buffer.append('\\').append(c);
+                } else {
+                    buffer.append(c);
+                }
+            } else {
+                buffer.append(c);
+            }
         }
 
         return c;
@@ -215,6 +224,7 @@ public int parseStringIntoBuffer(RubyYaccLexer lexer, LexerSource src, ByteList
         boolean expand = (flags & RubyYaccLexer.STR_FUNC_EXPAND) != 0;
         boolean escape = (flags & RubyYaccLexer.STR_FUNC_ESCAPE) != 0;
         boolean regexp = (flags & RubyYaccLexer.STR_FUNC_REGEXP) != 0;
+        boolean symbol = (flags & RubyYaccLexer.STR_FUNC_SYMBOL) != 0;
         int c;
 
         while ((c = src.read()) != RubyYaccLexer.EOF) {
@@ -248,6 +258,18 @@ public int parseStringIntoBuffer(RubyYaccLexer lexer, LexerSource src, ByteList
                     if (escape) buffer.append(c);
                     break;
 
+                case 'u':
+                    if (!lexer.isOneEight()) {
+                        if (!expand) {
+                            buffer.append('\\');
+                            break;
+                        }
+                        lexer.readUTFEscape(buffer, true, symbol);
+
+                        // ENEBO: Mixed escape via non-ascii magic missing
+
+                        continue;
+                    }
                 default:
                     if (regexp) {
                         src.unread(c);