Skip to content

Commit 37e79d8

Browse files
committed
Add a flag to DecodeUnitsInto to detect the case where the buffer ends at a truncated unicode sequence
This is only enabled if the option to do chunked decoding is enabled.
1 parent 2622a27 commit 37e79d8

File tree

2 files changed

+34
-7
lines changed

2 files changed

+34
-7
lines changed

lib/Common/Codex/Utf8Codex.cpp

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ namespace utf8
9797
}
9898

9999
_At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) - 1 && ptr <= end))
100-
inline char16 DecodeTail(char16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options)
100+
inline char16 DecodeTail(char16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options, bool *chunkEndsAtTruncatedSequence)
101101
{
102102
char16 ch = 0;
103103
BYTE c2, c3, c4;
@@ -129,8 +129,15 @@ namespace utf8
129129
if (ptr >= end)
130130
{
131131
if ((options & doChunkedEncoding) != 0)
132+
{
132133
// This is a sequence that spans a chunk; push ptr back to the beginning of the sequence.
133134
ptr--;
135+
136+
if (chunkEndsAtTruncatedSequence)
137+
{
138+
*chunkEndsAtTruncatedSequence = true;
139+
}
140+
}
134141
return g_chUnknown;
135142
}
136143
c2 = *ptr++;
@@ -160,8 +167,16 @@ namespace utf8
160167
if (ptr + 1 >= end)
161168
{
162169
if ((options & doChunkedEncoding) != 0)
170+
{
163171
// This is a sequence that spans a chunk; push ptr back to the beginning of the sequence.
164172
ptr--;
173+
174+
if (chunkEndsAtTruncatedSequence)
175+
{
176+
*chunkEndsAtTruncatedSequence = true;
177+
}
178+
}
179+
165180
return g_chUnknown;
166181
}
167182

@@ -221,9 +236,16 @@ namespace utf8
221236
if (ptr + 2 >= end)
222237
{
223238
if ((options & doChunkedEncoding) != 0)
239+
{
224240
// This is a sequence that spans a chunk; push ptr back to the beginning of the sequence.
225241
ptr--;
226242

243+
if (chunkEndsAtTruncatedSequence)
244+
{
245+
*chunkEndsAtTruncatedSequence = true;
246+
}
247+
}
248+
227249
ch = g_chUnknown;
228250
break;
229251
}
@@ -378,10 +400,15 @@ namespace utf8
378400
}
379401

380402
_Use_decl_annotations_
381-
size_t DecodeUnitsInto(char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
403+
size_t DecodeUnitsInto(char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options, bool *chunkEndsAtTruncatedSequence)
382404
{
383405
DecodeOptions localOptions = options;
384406

407+
if (chunkEndsAtTruncatedSequence)
408+
{
409+
*chunkEndsAtTruncatedSequence = false;
410+
}
411+
385412
LPCUTF8 p = pbUtf8;
386413
char16 *dest = buffer;
387414

@@ -402,7 +429,7 @@ namespace utf8
402429
while (p < pbEnd)
403430
{
404431
LPCUTF8 s = p;
405-
char16 chDest = Decode(p, pbEnd, localOptions);
432+
char16 chDest = Decode(p, pbEnd, localOptions, chunkEndsAtTruncatedSequence);
406433

407434
if (s < p)
408435
{

lib/Common/Codex/Utf8Codex.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -143,19 +143,19 @@ namespace utf8
143143

144144
// Decode the trail bytes after the UTF8 lead byte c1 but returning 0xFFFD if trail bytes are expected after end.
145145
_At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) - 1 && ptr <= end))
146-
char16 DecodeTail(char16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options);
146+
char16 DecodeTail(char16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options, bool *chunkEndsAtTruncatedSequence = nullptr);
147147

148148
// Decode the UTF8 sequence into a UTF16 encoding. Code points outside the Unicode Basic Multilingual Plane will generate
149149
// surrogate pairs, using the 'doSecondSurrogatePair' option to remember the first word has already been returned.
150150
// If ptr == end 0x0000 is emitted. If ptr < end but the lead byte of the UTF8 sequence
151151
// expects trail bytes past end then 0xFFFD are emitted until ptr == end.
152152
_At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) && ptr <= end))
153-
inline char16 Decode(LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options)
153+
inline char16 Decode(LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options, bool *chunkEndsAtTruncatedSequence = nullptr)
154154
{
155155
if (ptr >= end) return 0;
156156
utf8char_t c1 = *ptr++;
157157
if (c1 < 0x80) return static_cast<char16>(c1);
158-
return DecodeTail(c1, ptr, end, options);
158+
return DecodeTail(c1, ptr, end, options, chunkEndsAtTruncatedSequence);
159159
}
160160

161161
// Encode ch into a UTF8 sequence ignoring surrogate pairs (which are encoded as two
@@ -274,7 +274,7 @@ namespace utf8
274274

275275
// Decode cb bytes from ptr into buffer returning the number of characters converted and written to buffer
276276
_Ret_range_(0, pbEnd - _Old_(pbUtf8))
277-
size_t DecodeUnitsInto(_Out_writes_(pbEnd - pbUtf8) char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault);
277+
size_t DecodeUnitsInto(_Out_writes_(pbEnd - pbUtf8) char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault, bool *chunkEndsAtTruncatedSequence = nullptr);
278278

279279
// Decode cb bytes from ptr into buffer returning the number of characters converted and written to buffer (excluding the null terminator)
280280
size_t DecodeUnitsIntoAndNullTerminate(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault);

0 commit comments

Comments
 (0)