Update scanner handling of numeric literal suffixes (Fixes #3281)

tcare · tcare · commit 19bf015170bc · 2017-09-07T13:31:59.000-07:00
The scanner was missing logic to raise an error in some cases where a forbidden character immediately follows a numeric literal. The spec states that the character immediately following a numeric literal must not be an identifier start or a decimal digit.

- Changed Scanner::FScanNumber to check for forbidden characters following a valid literal.
- Added an error message to convey the identifier after literal case.
- Removed the unused oFScanNumber function.
- Created a scanner test dir since we don't have one.
- Added a comprehensive test covering both the identifier-after-literal case and the invalid-digit-after-literal case (the latter for octal and binary literals).
diff --git a/lib/Parser/Scan.cpp b/lib/Parser/Scan.cpp
@@ -565,9 +565,25 @@ typename Scanner<EncodingPolicy>::EncodedCharPtr Scanner<EncodingPolicy>::FScanN
 {
     EncodedCharPtr last = m_pchLast;
     EncodedCharPtr pchT = nullptr;
+    bool baseSpecified = false;
     likelyInt = true;
     // Reset
     m_OctOrLeadingZeroOnLastTKNumber = false;
+
+    auto baseSpecifierCheck = [&pchT, &pdbl, p, &baseSpecified]()
+    {
+        if (pchT == p + 2)
+        {
+            // An octal token '0' was followed by a base specifier: /0[xXoObB]/
+            // This literal can no longer be a double
+            *pdbl = 0;
+            // Advance the character pointer to the base specifier
+            pchT = p + 1;
+            // Set the flag so we know to offset the potential identifier search after the literal
+            baseSpecified = true;
+        }
+    };
+
     if ('0' == this->PeekFirst(p, last))
     {
         switch(this->PeekFirst(p + 1, last))
@@ -583,37 +599,21 @@ typename Scanner<EncodingPolicy>::EncodedCharPtr Scanner<EncodingPolicy>::FScanN
         case 'X':
             // Hex
             *pdbl = Js::NumberUtilities::DblFromHex(p + 2, &pchT);
-            if (pchT == p + 2)
-            {
-                // "Octal zero token "0" followed by an identifier token beginning with character 'x'/'X'
-                *pdbl = 0;
-                return p + 1;
-            }
-            else
-                return pchT;
+            baseSpecifierCheck();
+            goto LIdCheck;
         case 'o':
         case 'O':
             // Octal
             *pdbl = Js::NumberUtilities::DblFromOctal(p + 2, &pchT);
-            if (pchT == p + 2)
-            {
-                // "Octal zero token "0" followed by an identifier token beginning with character 'o'/'O'
-                *pdbl = 0;
-                return p + 1;
-            }
-            return pchT;
+            baseSpecifierCheck();
+            goto LIdCheck;
 
         case 'b':
         case 'B':
             // Binary
             *pdbl = Js::NumberUtilities::DblFromBinary(p + 2, &pchT);
-            if (pchT == p + 2)
-            {
-                // "Octal zero token "0" followed by an identifier token beginning with character 'b'/'B'
-                *pdbl = 0;
-                return p + 1;
-            }
-            return  pchT;
+            baseSpecifierCheck();
+            goto LIdCheck;
 
         default:
             // Octal
@@ -636,113 +636,45 @@ typename Scanner<EncodingPolicy>::EncodedCharPtr Scanner<EncodingPolicy>::FScanN
                 m_OctOrLeadingZeroOnLastTKNumber = false;  //08...  or 09....
                 goto LFloat;
             }
-            return pchT;
+            goto LIdCheck;
         }
     }
     else
     {
 LFloat:
         *pdbl = Js::NumberUtilities::StrToDbl(p, &pchT, likelyInt);
         Assert(pchT == p || !Js::NumberUtilities::IsNan(*pdbl));
-        return pchT;
+        // fall through to LIdCheck
     }
-}
 
-template <typename EncodingPolicy>
-BOOL Scanner<EncodingPolicy>::oFScanNumber(double *pdbl, bool& likelyInt)
-{
-    EncodedCharPtr pchT;
-    m_OctOrLeadingZeroOnLastTKNumber = false;
-    likelyInt = true;
-    if  ('0' == *m_currentCharacter)
+LIdCheck:
+    // https://tc39.github.io/ecma262/#sec-literals-numeric-literals
+    // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
+    // For example : 3in is an error and not the two input elements 3 and in
+    codepoint_t outChar = 0;
+    // If a base was specified, start the check at the character after the base specifier. In this case, pchT is pointing to the base specifier.
+    EncodedCharPtr startingLocation = baseSpecified ? pchT + 1 : pchT;
+    if (this->charClassifier->IsIdStart(*startingLocation))
     {
-        switch (m_currentCharacter[1])
-        {
-        case '.':
-        case 'e':
-        case 'E':
-            likelyInt = false;
-            // Floating point.
-            goto LFloat;
-
-        case 'x':
-        case 'X':
-            // Hex.
-            *pdbl = Js::NumberUtilities::DblFromHex<EncodedChar>(m_currentCharacter + 2, &pchT);
-            if (pchT == m_currentCharacter + 2)
-            {
-                // "Octal zero token "0" followed by an identifier token beginning with character 'x'/'X'
-                *pdbl = 0;
-                m_currentCharacter++;
-            }
-            else
-                m_currentCharacter = pchT;
-            break;
-        case 'o':
-        case 'O':
-            *pdbl = Js::NumberUtilities::DblFromOctal(m_currentCharacter + 2, &pchT);
-            if (pchT == m_currentCharacter + 2)
-            {
-                // "Octal zero token "0" followed by an identifier token beginning with character 'o'/'O'
-                *pdbl = 0;
-                m_currentCharacter++;
-            }
-            else
-                m_currentCharacter = pchT;
-            break;
-
-        case 'b':
-        case 'B':
-            *pdbl = Js::NumberUtilities::DblFromBinary(m_currentCharacter + 2, &pchT);
-            if (pchT == m_currentCharacter + 2)
-            {
-                // "Octal zero token "0" followed by an identifier token beginning with character 'b'/'B'
-                *pdbl = 0;
-                m_currentCharacter++;
-            }
-            else
-                m_currentCharacter = pchT;
-            break;
-
-        default:
-            // Octal.
-            *pdbl = Js::NumberUtilities::DblFromOctal(m_currentCharacter, &pchT);
-            Assert(pchT > m_currentCharacter);
-
-
-#if !SOURCERELEASE
-            // If an octal literal is malformed then it is in fact a decimal literal.
-#endif // !SOURCERELEASE
-            if(*pdbl != 0 || pchT > m_currentCharacter + 1)
-                m_OctOrLeadingZeroOnLastTKNumber = true; //report as an octal or hex for JSON when leading 0. Just '0' is ok
-            switch (*pchT)
-            {
-            case '8':
-            case '9':
-                //            case 'e':
-                //            case 'E':
-                //            case '.':
-                m_OctOrLeadingZeroOnLastTKNumber = false;  //08...  or 09....
-                goto LFloat;
-            }
+        Error(ERRIdAfterLit);
+    }
 
-            m_currentCharacter = pchT;
-            break;
+    // IsIdStart does not cover the unicode escape case. Try to read a unicode escape from the 'u' char.
+    if (*pchT == '\\')
+    {
+        startingLocation++; // TryReadEscape expects us to point to the 'u', and since it is by reference we need to do it beforehand.
+        if (TryReadEscape(startingLocation, m_pchLast, &outChar))
+        {
+            Error(ERRIdAfterLit);
         }
     }
-    else
-    {
-LFloat:
-        // Let StrToDbl do all the work.
 
-        *pdbl = Js::NumberUtilities::StrToDbl(m_currentCharacter, &pchT, likelyInt);
-        if (pchT == m_currentCharacter)
-            return FALSE;
-        m_currentCharacter = pchT;
-        Assert(!Js::NumberUtilities::IsNan(*pdbl));
+    if (Js::NumberUtilities::IsDigit(*startingLocation))
+    {
+        Error(ERRbadNumber);
     }
 
-    return TRUE;
+    return pchT;
 }
 
 template <typename EncodingPolicy>
diff --git a/lib/Parser/Scan.h b/lib/Parser/Scan.h
@@ -767,7 +767,6 @@ class Scanner : public IScanner, public EncodingPolicy
     tokens SkipComment(EncodedCharPtr *pp, /* out */ bool* containTypeDef);
     tokens ScanRegExpConstant(ArenaAllocator* alloc);
     tokens ScanRegExpConstantNoAST(ArenaAllocator* alloc);
-    BOOL oFScanNumber(double *pdbl, bool& likelyInt);
     EncodedCharPtr FScanNumber(EncodedCharPtr p, double *pdbl, bool& likelyInt);
     IdentPtr PidOfIdentiferAt(EncodedCharPtr p, EncodedCharPtr last, bool fHadEscape, bool fHasMultiChar);
     IdentPtr PidOfIdentiferAt(EncodedCharPtr p, EncodedCharPtr last);
diff --git a/lib/Parser/perrors.h b/lib/Parser/perrors.h
@@ -22,6 +22,7 @@ LSC_ERROR_MSG( 1013, ERRbadNumber     , "Invalid number")
 LSC_ERROR_MSG( 1014, ERRillegalChar   , "Invalid character")
 LSC_ERROR_MSG( 1015, ERRnoStrEnd      , "Unterminated string constant")
 LSC_ERROR_MSG( 1016, ERRnoCmtEnd      , "Unterminated comment")
+LSC_ERROR_MSG( 1017, ERRIdAfterLit    , "Unexpected identifier after numeric literal")
 
 LSC_ERROR_MSG( 1018, ERRbadReturn     , "'return' statement outside of function")
 LSC_ERROR_MSG( 1019, ERRbadBreak      , "Can't have 'break' outside of loop")
diff --git a/test/Scanner/NumericLiteralSuffix.js b/test/Scanner/NumericLiteralSuffix.js
@@ -0,0 +1,80 @@
+//-------------------------------------------------------------------------------------------------------
+// Copyright (C) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
+//-------------------------------------------------------------------------------------------------------
+
+WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
+
+// https://tc39.github.io/ecma262/#sec-reserved-words
+let keywords = ['await', 'break', 'case', 'catch', 'class', 'const', 'continue', 'debugger', 'default', 'delete', 'do', 'else', 'export', 'extends', 'finally', 'for', 'function', 'if', 'import', 'in', 'instanceof', 'new', 'return', 'super', 'switch', 'this', 'throw', 'try', 'typeof', 'var', 'void', 'while', 'with', 'yield'];
+let futureReservedWords = ['enum', 'implements', 'package', 'protected ', 'interface', 'private', 'public'];
+
+// https://tc39.github.io/ecma262/#sec-names-and-keywords
+let idStarts = ["\u{50}", '$', '_', "\\u{50}"];
+
+// https://tc39.github.io/ecma262/#sec-literals-numeric-literals
+let literalClasses = {
+    'Decimal Integer Literal': [
+        '0', '1', '123',
+        '0.1', '1.1', '123.1', '123.123',
+        '0e1', '1e1', '1e+1', '1e-1',
+        '0E1', '1E1', '1E+1', '1E-1',
+        '123e123', '123e+123', '123e-123',
+        '123E123', '123E+123', '123E-123'
+     ],
+     'Binary Integer Literal': [
+        '0b0', '0b1', '0b010101',
+        '0B0', '0B1', '0B010101',
+     ],
+     'Octal Integer Literal': [
+        '0o0', '0o1', '0o123',
+        '0O0', '0O1', '0O123'
+     ],
+     'Hex Integer Literal': [
+        '0x0', '0x1', '0x123', '0xabc', '0xABC', '0x123abc', '0x123ABC',
+        '0X0', '0X1', '0X123', '0Xabc', '0XABC', '0X123abc', '0X123ABC'
+     ]
+};
+
+var tests = [
+    {
+        name: "Numeric literal followed by an identifier start throws",
+        body: function () {
+            for (let literalClass in literalClasses) {
+                for (let literal of literalClasses[literalClass]) {
+                    for (let idStart of idStarts) {
+                        for (let keyword of keywords) {
+                          assert.throws(function () { eval(`${literal}${keyword}`); },            SyntaxError, `Keyword '${keyword}' directly after ${literalClass} '${literal}' throws`, "Unexpected identifier after numeric literal");
+                        }
+                        for (let futureReservedWord of futureReservedWords) {
+                          assert.throws(function () { eval(`${literal}${futureReservedWord}`); }, SyntaxError, `Future reserved word '${futureReservedWord}' directly after ${literalClass} '${literal}' throws`, "Unexpected identifier after numeric literal");
+                        }
+                        for (let idStart of idStarts) {
+                          assert.throws(function () { eval(`${literal}${idStart}`); },            SyntaxError, `Identifier start '${idStart}' directly after ${literalClass} '${literal}' throws`, "Unexpected identifier after numeric literal");
+                        }
+                    }
+                }
+            }
+        }
+    },
+    {
+        name: "Numeric literal followed by invalid digit throws",
+        body: function () {
+            let nonOctalDigits = ['8', '9'];
+            for (let literal of literalClasses['Octal Integer Literal']) {
+                for (let nonOctalDigit of nonOctalDigits) {
+                    assert.throws(function () { eval(`${literal}${nonOctalDigit}`); },            SyntaxError, `Non-octal digit '${nonOctalDigit}' directly after Octal Integer Literal '${literal}' throws`, "Invalid number");
+                }
+            }
+
+            let nonBinaryDigits = ['2', '3', '4', '5', '6', '7', '8', '9'];
+            for (let literal of literalClasses['Binary Integer Literal']) {
+                for (let nonBinaryDigit of nonBinaryDigits) {
+                    assert.throws(function () { eval(`${literal}${nonBinaryDigit}`); },            SyntaxError, `Non-binary digit '${nonBinaryDigit}' directly after Binary Integer Literal '${literal}' throws`, "Invalid number");
+                }
+            }
+        }
+    }
+];
+
+testRunner.runTests(tests, { verbose: WScript.Arguments[0] != "summary" });
diff --git a/test/Scanner/rlexe.xml b/test/Scanner/rlexe.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="utf-8"?>
+<regress-exe>
+  <test>
+    <default>
+      <files>NumericLiteralSuffix.js</files>
+      <compile-flags>-args summary -endargs</compile-flags>
+    </default>
+  </test>
+</regress-exe>
diff --git a/test/es6/bug_OS12095746.baseline b/test/es6/bug_OS12095746.baseline
@@ -1,6 +1,6 @@
 NotifyModuleReadyCallback(exception) bug_OS12095746_mod0.js
 NotifyModuleReadyCallback(exception) bug_OS12095746_mod1.js
 mod0 catch:Syntax error
-mod1 catch:Expected ';'
+mod1 catch:Unexpected identifier after numeric literal
 NotifyModuleReadyCallback(exception) bug_OS12095746_mod2.js
-mod2 catch:Expected ';'
+mod2 catch:Unexpected identifier after numeric literal
diff --git a/test/rlexedirs.xml b/test/rlexedirs.xml
@@ -321,4 +321,9 @@
     <tags>sequential,exclude_dynapogo,exclude_jshost,exclude_snap,exclude_serialized,require_debugger</tags>
   </default>
 </dir>
+<dir>
+  <default>
+    <files>Scanner</files>
+  </default>
+</dir>
 </regress-exe>