Skip to content

Commit 68344c9

Browse files
committed
CR Fixes
Use new utf8 DecodingOption to throw when using invalid character instead of rechecking the whole buffer
1 parent 8bdded8 commit 68344c9

File tree

4 files changed

+43
-30
lines changed

4 files changed

+43
-30
lines changed

lib/Common/Codex/Utf8Codex.cpp

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,20 @@ namespace utf8
7070
return ((0x5B >> (((prefix ^ 0xF0) >> 3) & 0x1E)) & 0x03) + 1;
7171
}
7272

73-
const char16 g_chUnknown = char16(UNICODE_UNKNOWN_CHAR_MARK);
7473
const char16 WCH_UTF16_HIGH_FIRST = char16(0xd800);
7574
const char16 WCH_UTF16_HIGH_LAST = char16(0xdbff);
7675
const char16 WCH_UTF16_LOW_FIRST = char16(0xdc00);
7776
const char16 WCH_UTF16_LOW_LAST = char16(0xdfff);
7877

78+
char16 GetUnknownCharacter(DecodeOptions options = doDefault)
79+
{
80+
if ((options & doThrowOnInvalidWCHARs) != 0)
81+
{
82+
throw InvalidWideCharException();
83+
}
84+
return char16(UNICODE_UNKNOWN_CHAR_MARK);
85+
}
86+
7987
inline BOOL InRange(const char16 ch, const char16 chMin, const char16 chMax)
8088
{
8189
return (unsigned)(ch - chMin) <= (unsigned)(chMax - chMin);
@@ -122,7 +130,7 @@ namespace utf8
122130
}
123131

124132
// 10xxxxxx (trail byte appearing in a lead byte position
125-
return g_chUnknown;
133+
return GetUnknownCharacter(options);
126134

127135
case 2:
128136
// Look for an overlong utf-8 sequence.
@@ -138,7 +146,7 @@ namespace utf8
138146
*chunkEndsAtTruncatedSequence = true;
139147
}
140148
}
141-
return g_chUnknown;
149+
return GetUnknownCharacter(options);
142150
}
143151
c2 = *ptr++;
144152
// 110XXXXx 10xxxxxx
@@ -152,12 +160,14 @@ namespace utf8
152160
ch |= WCHAR(c1 & 0x1f) << 6; // 0x0080 - 0x07ff
153161
ch |= WCHAR(c2 & 0x3f);
154162
if (!IsValidWideChar(ch) && ((options & doAllowInvalidWCHARs) == 0))
155-
ch = g_chUnknown;
163+
{
164+
ch = GetUnknownCharacter(options);
165+
}
156166
}
157167
else
158168
{
159169
ptr--;
160-
ch = g_chUnknown;
170+
ch = GetUnknownCharacter(options);
161171
}
162172
break;
163173

@@ -177,7 +187,7 @@ namespace utf8
177187
}
178188
}
179189

180-
return g_chUnknown;
190+
return GetUnknownCharacter(options);
181191
}
182192

183193
// UTF16 | UTF8 1st byte 2nd byte 3rd byte
@@ -217,12 +227,14 @@ namespace utf8
217227
ch |= WCHAR(c2 & 0x3f) << 6; // 0x0080 - 0x07ff
218228
ch |= WCHAR(c3 & 0x3f);
219229
if (!IsValidWideChar(ch) && ((options & (doAllowThreeByteSurrogates | doAllowInvalidWCHARs)) == 0))
220-
ch = g_chUnknown;
230+
{
231+
ch = GetUnknownCharacter(options);
232+
}
221233
ptr += 2;
222234
}
223235
else
224236
{
225-
ch = g_chUnknown;
237+
ch = GetUnknownCharacter(options);
226238
// Windows OS 1713952. Only drop the illegal leading byte
227239
// Retry next byte.
228240
// ptr is already advanced.
@@ -246,7 +258,7 @@ namespace utf8
246258
}
247259
}
248260

249-
ch = g_chUnknown;
261+
ch = GetUnknownCharacter(options);
250262
break;
251263
}
252264

@@ -281,7 +293,7 @@ namespace utf8
281293
// Windows OS 1713952. Only drop the illegal leading byte.
282294
// Retry next byte.
283295
// ptr is already advanced 1.
284-
ch = g_chUnknown;
296+
ch = GetUnknownCharacter(options);
285297
break;
286298
}
287299

lib/Common/Codex/Utf8Codex.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ typedef const utf8char_t *LPCUTF8;
9595

9696
namespace utf8
9797
{
98+
class InvalidWideCharException {};
9899

99100
// Terminology -
100101
// Code point - A ordinal value mapped to a standard ideograph as defined by ISO/IEC 10646-1. Here
@@ -138,6 +139,7 @@ namespace utf8
138139
// surrogate pair. The second call will return the second word and reset
139140
// this 'option'.
140141
doAllowInvalidWCHARs = 0x08, // Don't replace invalid wide chars with 0xFFFD
142+
doThrowOnInvalidWCHARs = 0x10, // throw InvalidWideCharException if an invalid wide char is seen. Incompatible with doAllowInvalidWCHARs
141143
};
142144
DEFINE_ENUM_FLAG_OPERATORS(DecodeOptions);
143145

lib/WasmReader/WasmBinaryReader.cpp

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,24 +1012,23 @@ const char16* WasmBinaryReader::ReadInlineName(uint32& length, uint32& nameLengt
10121012
m_pc += rawNameLength;
10131013
length += rawNameLength;
10141014

1015-
utf8::DecodeOptions decodeOptions = utf8::doDefault;
1016-
nameLength = (uint32)utf8::ByteIndexIntoCharacterIndex(rawName, rawNameLength, decodeOptions);
1017-
char16* contents = AnewArray(m_alloc, char16, nameLength + 1);
1018-
size_t decodedLength = utf8::DecodeUnitsIntoAndNullTerminate(contents, rawName, rawName + rawNameLength, decodeOptions);
1019-
if (decodedLength != nameLength)
1015+
utf8::DecodeOptions decodeOptions = utf8::doThrowOnInvalidWCHARs;
1016+
try
10201017
{
1021-
AssertMsg(UNREACHED, "We calculated the length before decoding, what happened ?");
1022-
ThrowDecodingError(_u("Error while decoding utf8 string"));
1023-
}
1024-
for (size_t i = 0; i < decodedLength; ++i)
1025-
{
1026-
const char16 c = contents[i];
1027-
if (!utf8::IsValidWideChar(c) || c == UNICODE_UNKNOWN_CHAR_MARK)
1018+
nameLength = (uint32)utf8::ByteIndexIntoCharacterIndex(rawName, rawNameLength, decodeOptions);
1019+
char16* contents = AnewArray(m_alloc, char16, nameLength + 1);
1020+
size_t decodedLength = utf8::DecodeUnitsIntoAndNullTerminate(contents, rawName, rawName + rawNameLength, decodeOptions);
1021+
if (decodedLength != nameLength)
10281022
{
1029-
ThrowDecodingError(_u("Invalid UTF-8 encoding"));
1023+
AssertMsg(UNREACHED, "We calculated the length before decoding, what happened ?");
1024+
ThrowDecodingError(_u("Error while decoding utf8 string"));
10301025
}
1026+
return contents;
1027+
}
1028+
catch (utf8::InvalidWideCharException)
1029+
{
1030+
ThrowDecodingError(_u("Invalid UTF-8 encoding"));
10311031
}
1032-
return contents;
10331032
}
10341033

10351034
void WasmBinaryReader::ReadImportSection()

lib/WasmReader/WasmByteCodeGenerator.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,18 +1181,18 @@ void WasmBytecodeGenerator::EmitBrTable()
11811181
EmitInfo scrutineeInfo = PopEvalStack(WasmTypes::I32, _u("br_table expression must be of type i32"));
11821182

11831183
m_writer->AsmReg2(Js::OpCodeAsmJs::BeginSwitch_Int, scrutineeInfo.location, scrutineeInfo.location);
1184-
EmitInfo yieldvalue;
1184+
EmitInfo yieldValue;
11851185
BlockInfo defaultBlockInfo = GetBlockInfo(defaultEntry);
11861186
if (defaultBlockInfo.HasYield())
11871187
{
11881188
// If the scrutinee is any then check the stack before popping
11891189
if (scrutineeInfo.type == WasmTypes::Any && m_evalStack.Peek().type == WasmTypes::Limit)
11901190
{
1191-
yieldvalue = scrutineeInfo;
1191+
yieldValue = scrutineeInfo;
11921192
}
11931193
else
11941194
{
1195-
yieldvalue = PopEvalStack();
1195+
yieldValue = PopEvalStack();
11961196
}
11971197
}
11981198

@@ -1207,14 +1207,14 @@ void WasmBytecodeGenerator::EmitBrTable()
12071207
WasmTypes::WasmType type = blockInfo.yieldInfo ? blockInfo.yieldInfo->info.type : WasmTypes::Void;
12081208
throw WasmCompilationException(_u("br_table target %u signature mismatch. Expected ()->%s, got ()->%s"), target, GetTypeName(defaultType), GetTypeName(type));
12091209
}
1210-
YieldToBlock(blockInfo, yieldvalue);
1210+
YieldToBlock(blockInfo, yieldValue);
12111211
m_writer->AsmBrReg1Const1(Js::OpCodeAsmJs::Case_IntConst, blockInfo.label, scrutineeInfo.location, i);
12121212
}
12131213

1214-
YieldToBlock(defaultBlockInfo, yieldvalue);
1214+
YieldToBlock(defaultBlockInfo, yieldValue);
12151215
m_writer->AsmBr(defaultBlockInfo.label, Js::OpCodeAsmJs::EndSwitch_Int);
12161216
ReleaseLocation(&scrutineeInfo);
1217-
ReleaseLocation(&yieldvalue);
1217+
ReleaseLocation(&yieldValue);
12181218

12191219
SetUnreachableState(true);
12201220
}

0 commit comments

Comments
 (0)