CR Fixes

Cellule · Cellule · commit 68344c944cfd · 2017-08-22T17:24:45.000-07:00
Use the new utf8 DecodeOptions flag doThrowOnInvalidWCHARs to throw when an invalid character is decoded, instead of rechecking the whole decoded buffer
diff --git a/lib/Common/Codex/Utf8Codex.cpp b/lib/Common/Codex/Utf8Codex.cpp
@@ -70,12 +70,20 @@ namespace utf8
         return ((0x5B >> (((prefix ^ 0xF0) >> 3) & 0x1E)) & 0x03) + 1;
     }
 
-    const char16 g_chUnknown = char16(UNICODE_UNKNOWN_CHAR_MARK);
     const char16 WCH_UTF16_HIGH_FIRST  =  char16(0xd800);
     const char16 WCH_UTF16_HIGH_LAST   =  char16(0xdbff);
     const char16 WCH_UTF16_LOW_FIRST   =  char16(0xdc00);
     const char16 WCH_UTF16_LOW_LAST    =  char16(0xdfff);
 
+    char16 GetUnknownCharacter(DecodeOptions options = doDefault)
+    {
+        if ((options & doThrowOnInvalidWCHARs) != 0)
+        {
+            throw InvalidWideCharException();
+        }
+        return char16(UNICODE_UNKNOWN_CHAR_MARK);
+    }
+
     inline BOOL InRange(const char16 ch, const char16 chMin, const char16 chMax)
     {
         return (unsigned)(ch - chMin) <= (unsigned)(chMax - chMin);
@@ -122,7 +130,7 @@ namespace utf8
             }
 
             // 10xxxxxx (trail byte appearing in a lead byte position
-            return g_chUnknown;
+            return GetUnknownCharacter(options);
 
         case 2:
             // Look for an overlong utf-8 sequence.
@@ -138,7 +146,7 @@ namespace utf8
                         *chunkEndsAtTruncatedSequence = true;
                     }
                 }
-                return g_chUnknown;
+                return GetUnknownCharacter(options);
             }
             c2 = *ptr++;
             // 110XXXXx 10xxxxxx
@@ -152,12 +160,14 @@ namespace utf8
                 ch |= WCHAR(c1 & 0x1f) << 6;     // 0x0080 - 0x07ff
                 ch |= WCHAR(c2 & 0x3f);
                 if (!IsValidWideChar(ch) && ((options & doAllowInvalidWCHARs) == 0))
-                    ch = g_chUnknown;
+                {
+                    ch = GetUnknownCharacter(options);
+                }
             }
             else
             {
                 ptr--;
-                ch = g_chUnknown;
+                ch = GetUnknownCharacter(options);
             }
             break;
 
@@ -177,7 +187,7 @@ namespace utf8
                     }
                 }
 
-                return g_chUnknown;
+                return GetUnknownCharacter(options);
             }
 
             //      UTF16       |   UTF8 1st byte  2nd byte 3rd byte
@@ -217,12 +227,14 @@ namespace utf8
                 ch |= WCHAR(c2 & 0x3f) << 6;     // 0x0080 - 0x07ff
                 ch |= WCHAR(c3 & 0x3f);
                 if (!IsValidWideChar(ch) && ((options & (doAllowThreeByteSurrogates | doAllowInvalidWCHARs)) == 0))
-                    ch = g_chUnknown;
+                {
+                    ch = GetUnknownCharacter(options);
+                }
                 ptr += 2;
             }
             else
             {
-                ch = g_chUnknown;
+                ch = GetUnknownCharacter(options);
                 // Windows OS 1713952. Only drop the illegal leading byte
                 // Retry next byte.
                 // ptr is already advanced.
@@ -246,7 +258,7 @@ namespace utf8
                     }
                 }
 
-                ch = g_chUnknown;
+                ch = GetUnknownCharacter(options);
                 break;
             }
 
@@ -281,7 +293,7 @@ namespace utf8
                 // Windows OS 1713952. Only drop the illegal leading byte.
                 // Retry next byte.
                 // ptr is already advanced 1.
-                ch = g_chUnknown;
+                ch = GetUnknownCharacter(options);
                 break;
             }
 
diff --git a/lib/Common/Codex/Utf8Codex.h b/lib/Common/Codex/Utf8Codex.h
@@ -95,6 +95,7 @@ typedef const utf8char_t *LPCUTF8;
 
 namespace utf8
 {
+    class InvalidWideCharException {};
 
     // Terminology -
     //   Code point      - A ordinal value mapped to a standard ideograph as defined by ISO/IEC 10646-1. Here
@@ -138,6 +139,7 @@ namespace utf8
                                             // surrogate pair. The second call will return the second word and reset
                                             // this 'option'.
         doAllowInvalidWCHARs        = 0x08, // Don't replace invalid wide chars with 0xFFFD
+        doThrowOnInvalidWCHARs      = 0x10, // throw InvalidWideCharException if an invalid wide char is seen. Incompatible with doAllowInvalidWCHARs
     };
     DEFINE_ENUM_FLAG_OPERATORS(DecodeOptions);
 
diff --git a/lib/WasmReader/WasmBinaryReader.cpp b/lib/WasmReader/WasmBinaryReader.cpp
@@ -1012,24 +1012,23 @@ const char16* WasmBinaryReader::ReadInlineName(uint32& length, uint32& nameLengt
     m_pc += rawNameLength;
     length += rawNameLength;
 
-    utf8::DecodeOptions decodeOptions = utf8::doDefault;
-    nameLength = (uint32)utf8::ByteIndexIntoCharacterIndex(rawName, rawNameLength, decodeOptions);
-    char16* contents = AnewArray(m_alloc, char16, nameLength + 1);
-    size_t decodedLength = utf8::DecodeUnitsIntoAndNullTerminate(contents, rawName, rawName + rawNameLength, decodeOptions);
-    if (decodedLength != nameLength)
+    utf8::DecodeOptions decodeOptions = utf8::doThrowOnInvalidWCHARs;
+    try
     {
-        AssertMsg(UNREACHED, "We calculated the length before decoding, what happened ?");
-        ThrowDecodingError(_u("Error while decoding utf8 string"));
-    }
-    for (size_t i = 0; i < decodedLength; ++i)
-    {
-        const char16 c = contents[i];
-        if (!utf8::IsValidWideChar(c) || c == UNICODE_UNKNOWN_CHAR_MARK)
+        nameLength = (uint32)utf8::ByteIndexIntoCharacterIndex(rawName, rawNameLength, decodeOptions);
+        char16* contents = AnewArray(m_alloc, char16, nameLength + 1);
+        size_t decodedLength = utf8::DecodeUnitsIntoAndNullTerminate(contents, rawName, rawName + rawNameLength, decodeOptions);
+        if (decodedLength != nameLength)
         {
-            ThrowDecodingError(_u("Invalid UTF-8 encoding"));
+            AssertMsg(UNREACHED, "We calculated the length before decoding, what happened ?");
+            ThrowDecodingError(_u("Error while decoding utf8 string"));
         }
+        return contents;
+    }
+    catch (utf8::InvalidWideCharException)
+    {
+        ThrowDecodingError(_u("Invalid UTF-8 encoding"));
     }
-    return contents;
 }
 
 void WasmBinaryReader::ReadImportSection()
diff --git a/lib/WasmReader/WasmByteCodeGenerator.cpp b/lib/WasmReader/WasmByteCodeGenerator.cpp
@@ -1181,18 +1181,18 @@ void WasmBytecodeGenerator::EmitBrTable()
     EmitInfo scrutineeInfo = PopEvalStack(WasmTypes::I32, _u("br_table expression must be of type i32"));
 
     m_writer->AsmReg2(Js::OpCodeAsmJs::BeginSwitch_Int, scrutineeInfo.location, scrutineeInfo.location);
-    EmitInfo yieldvalue;
+    EmitInfo yieldValue;
     BlockInfo defaultBlockInfo = GetBlockInfo(defaultEntry);
     if (defaultBlockInfo.HasYield())
     {
         // If the scrutinee is any then check the stack before popping
         if (scrutineeInfo.type == WasmTypes::Any && m_evalStack.Peek().type == WasmTypes::Limit)
         {
-            yieldvalue = scrutineeInfo;
+            yieldValue = scrutineeInfo;
         }
         else
         {
-            yieldvalue = PopEvalStack();
+            yieldValue = PopEvalStack();
         }
     }
 
@@ -1207,14 +1207,14 @@ void WasmBytecodeGenerator::EmitBrTable()
             WasmTypes::WasmType type = blockInfo.yieldInfo ? blockInfo.yieldInfo->info.type : WasmTypes::Void;
             throw WasmCompilationException(_u("br_table target %u signature mismatch. Expected ()->%s, got ()->%s"), target, GetTypeName(defaultType), GetTypeName(type));
         }
-        YieldToBlock(blockInfo, yieldvalue);
+        YieldToBlock(blockInfo, yieldValue);
         m_writer->AsmBrReg1Const1(Js::OpCodeAsmJs::Case_IntConst, blockInfo.label, scrutineeInfo.location, i);
     }
 
-    YieldToBlock(defaultBlockInfo, yieldvalue);
+    YieldToBlock(defaultBlockInfo, yieldValue);
     m_writer->AsmBr(defaultBlockInfo.label, Js::OpCodeAsmJs::EndSwitch_Int);
     ReleaseLocation(&scrutineeInfo);
-    ReleaseLocation(&yieldvalue);
+    ReleaseLocation(&yieldValue);
 
     SetUnreachableState(true);
 }

Original file line number	Diff line number	Diff line change
`@@ -70,12 +70,20 @@ namespace utf8`
`70`	`70`	`return ((0x5B >> (((prefix ^ 0xF0) >> 3) & 0x1E)) & 0x03) + 1;`
`71`	`71`	`}`
`72`	`72`
`73`		`- const char16 g_chUnknown = char16(UNICODE_UNKNOWN_CHAR_MARK);`
`74`	`73`	`const char16 WCH_UTF16_HIGH_FIRST = char16(0xd800);`
`75`	`74`	`const char16 WCH_UTF16_HIGH_LAST = char16(0xdbff);`
`76`	`75`	`const char16 WCH_UTF16_LOW_FIRST = char16(0xdc00);`
`77`	`76`	`const char16 WCH_UTF16_LOW_LAST = char16(0xdfff);`
`78`	`77`
	`78`	`+ char16 GetUnknownCharacter(DecodeOptions options = doDefault)`
	`79`	`+ {`
	`80`	`+ if ((options & doThrowOnInvalidWCHARs) != 0)`
	`81`	`+ {`
	`82`	`+ throw InvalidWideCharException();`
	`83`	`+ }`
	`84`	`+ return char16(UNICODE_UNKNOWN_CHAR_MARK);`
	`85`	`+ }`
	`86`	`+`
`79`	`87`	`inline BOOL InRange(const char16 ch, const char16 chMin, const char16 chMax)`
`80`	`88`	`{`
`81`	`89`	`return (unsigned)(ch - chMin) <= (unsigned)(chMax - chMin);`
`@@ -122,7 +130,7 @@ namespace utf8`
`122`	`130`	`}`
`123`	`131`
`124`	`132`	`// 10xxxxxx (trail byte appearing in a lead byte position`
`125`		`- return g_chUnknown;`
	`133`	`+ return GetUnknownCharacter(options);`
`126`	`134`
`127`	`135`	`case 2:`
`128`	`136`	`// Look for an overlong utf-8 sequence.`
`@@ -138,7 +146,7 @@ namespace utf8`
`138`	`146`	`*chunkEndsAtTruncatedSequence = true;`
`139`	`147`	`}`
`140`	`148`	`}`
`141`		`- return g_chUnknown;`
	`149`	`+ return GetUnknownCharacter(options);`
`142`	`150`	`}`
`143`	`151`	`c2 = *ptr++;`
`144`	`152`	`// 110XXXXx 10xxxxxx`
`@@ -152,12 +160,14 @@ namespace utf8`
`152`	`160`	`ch \|= WCHAR(c1 & 0x1f) << 6; // 0x0080 - 0x07ff`
`153`	`161`	`ch \|= WCHAR(c2 & 0x3f);`
`154`	`162`	`if (!IsValidWideChar(ch) && ((options & doAllowInvalidWCHARs) == 0))`
`155`		`- ch = g_chUnknown;`
	`163`	`+ {`
	`164`	`+ ch = GetUnknownCharacter(options);`
	`165`	`+ }`
`156`	`166`	`}`
`157`	`167`	`else`
`158`	`168`	`{`
`159`	`169`	`ptr--;`
`160`		`- ch = g_chUnknown;`
	`170`	`+ ch = GetUnknownCharacter(options);`
`161`	`171`	`}`
`162`	`172`	`break;`
`163`	`173`
`@@ -177,7 +187,7 @@ namespace utf8`
`177`	`187`	`}`
`178`	`188`	`}`
`179`	`189`
`180`		`- return g_chUnknown;`
	`190`	`+ return GetUnknownCharacter(options);`
`181`	`191`	`}`
`182`	`192`
`183`	`193`	`// UTF16 \| UTF8 1st byte 2nd byte 3rd byte`
`@@ -217,12 +227,14 @@ namespace utf8`
`217`	`227`	`ch \|= WCHAR(c2 & 0x3f) << 6; // 0x0080 - 0x07ff`
`218`	`228`	`ch \|= WCHAR(c3 & 0x3f);`
`219`	`229`	`if (!IsValidWideChar(ch) && ((options & (doAllowThreeByteSurrogates \| doAllowInvalidWCHARs)) == 0))`
`220`		`- ch = g_chUnknown;`
	`230`	`+ {`
	`231`	`+ ch = GetUnknownCharacter(options);`
	`232`	`+ }`
`221`	`233`	`ptr += 2;`
`222`	`234`	`}`
`223`	`235`	`else`
`224`	`236`	`{`
`225`		`- ch = g_chUnknown;`
	`237`	`+ ch = GetUnknownCharacter(options);`
`226`	`238`	`// Windows OS 1713952. Only drop the illegal leading byte`
`227`	`239`	`// Retry next byte.`
`228`	`240`	`// ptr is already advanced.`
`@@ -246,7 +258,7 @@ namespace utf8`
`246`	`258`	`}`
`247`	`259`	`}`
`248`	`260`
`249`		`- ch = g_chUnknown;`
	`261`	`+ ch = GetUnknownCharacter(options);`
`250`	`262`	`break;`
`251`	`263`	`}`
`252`	`264`
`@@ -281,7 +293,7 @@ namespace utf8`
`281`	`293`	`// Windows OS 1713952. Only drop the illegal leading byte.`
`282`	`294`	`// Retry next byte.`
`283`	`295`	`// ptr is already advanced 1.`
`284`		`- ch = g_chUnknown;`
	`296`	`+ ch = GetUnknownCharacter(options);`
`285`	`297`	`break;`
`286`	`298`	`}`
`287`	`299`
Original file line number	Diff line number	Diff line change
`@@ -1181,18 +1181,18 @@ void WasmBytecodeGenerator::EmitBrTable()`
`1181`	`1181`	`EmitInfo scrutineeInfo = PopEvalStack(WasmTypes::I32, _u("br_table expression must be of type i32"));`
`1182`	`1182`
`1183`	`1183`	`m_writer->AsmReg2(Js::OpCodeAsmJs::BeginSwitch_Int, scrutineeInfo.location, scrutineeInfo.location);`
`1184`		`- EmitInfo yieldvalue;`
	`1184`	`+ EmitInfo yieldValue;`
`1185`	`1185`	`BlockInfo defaultBlockInfo = GetBlockInfo(defaultEntry);`
`1186`	`1186`	`if (defaultBlockInfo.HasYield())`
`1187`	`1187`	`{`
`1188`	`1188`	`// If the scrutinee is any then check the stack before popping`
`1189`	`1189`	`if (scrutineeInfo.type == WasmTypes::Any && m_evalStack.Peek().type == WasmTypes::Limit)`
`1190`	`1190`	`{`
`1191`		`- yieldvalue = scrutineeInfo;`
	`1191`	`+ yieldValue = scrutineeInfo;`
`1192`	`1192`	`}`
`1193`	`1193`	`else`
`1194`	`1194`	`{`
`1195`		`- yieldvalue = PopEvalStack();`
	`1195`	`+ yieldValue = PopEvalStack();`
`1196`	`1196`	`}`
`1197`	`1197`	`}`
`1198`	`1198`
`@@ -1207,14 +1207,14 @@ void WasmBytecodeGenerator::EmitBrTable()`
`1207`	`1207`	`WasmTypes::WasmType type = blockInfo.yieldInfo ? blockInfo.yieldInfo->info.type : WasmTypes::Void;`
`1208`	`1208`	`throw WasmCompilationException(_u("br_table target %u signature mismatch. Expected ()->%s, got ()->%s"), target, GetTypeName(defaultType), GetTypeName(type));`
`1209`	`1209`	`}`
`1210`		`- YieldToBlock(blockInfo, yieldvalue);`
	`1210`	`+ YieldToBlock(blockInfo, yieldValue);`
`1211`	`1211`	`m_writer->AsmBrReg1Const1(Js::OpCodeAsmJs::Case_IntConst, blockInfo.label, scrutineeInfo.location, i);`
`1212`	`1212`	`}`
`1213`	`1213`
`1214`		`- YieldToBlock(defaultBlockInfo, yieldvalue);`
	`1214`	`+ YieldToBlock(defaultBlockInfo, yieldValue);`
`1215`	`1215`	`m_writer->AsmBr(defaultBlockInfo.label, Js::OpCodeAsmJs::EndSwitch_Int);`
`1216`	`1216`	`ReleaseLocation(&scrutineeInfo);`
`1217`		`- ReleaseLocation(&yieldvalue);`
	`1217`	`+ ReleaseLocation(&yieldValue);`
`1218`	`1218`
`1219`	`1219`	`SetUnreachableState(true);`
`1220`	`1220`	`}`