Skip to content

Commit 0526bc9

Browse files
Clean up JSRT utf8 APIs
1. Change the utf8 APIs to expect only UTF-8 output, not CESU-8 output. 2. Change ch to support only ANSI and UTF-8 files (not UTF-16 LE). 3. Change the existing tests that use UTF-16 LE to use UTF-8. 4. Fix a bug in the scanner's decoding of UTF-8 code points that result in UTF-16 surrogate pairs.
1 parent 655edb8 commit 0526bc9

19 files changed

+1575
-111
lines changed

bin/ch/Helpers.cpp

Lines changed: 39 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,13 @@
44
//-------------------------------------------------------------------------------------------------------
55
#include "stdafx.h"
66

7-
HRESULT Helpers::LoadScriptFromFile(LPCSTR filename, LPCSTR& contents, bool* isUtf8Out /*= nullptr*/, UINT* lengthBytesOut /*= nullptr*/)
7+
HRESULT Helpers::LoadScriptFromFile(LPCSTR filename, LPCSTR& contents, UINT* lengthBytesOut /*= nullptr*/)
88
{
99
HRESULT hr = S_OK;
1010
BYTE * pRawBytes = nullptr;
1111
UINT lengthBytes = 0;
12-
bool isUtf8 = false;
1312
contents = nullptr;
14-
FILE * file;
13+
FILE * file = nullptr;
1514

1615
//
1716
// Open the file as a binary file to prevent CRT from handling encoding, line-break conversions,
@@ -42,81 +41,60 @@ HRESULT Helpers::LoadScriptFromFile(LPCSTR filename, LPCSTR& contents, bool* isU
4241
IfFailGo(E_FAIL);
4342
}
4443

45-
//
46-
// Determine the file length, in bytes.
47-
//
48-
fseek(file, 0, SEEK_END);
49-
lengthBytes = ftell(file);
50-
fseek(file, 0, SEEK_SET);
51-
pRawBytes = (LPBYTE)malloc(lengthBytes + sizeof(WCHAR));
52-
if (nullptr == pRawBytes)
44+
if (file != nullptr)
5345
{
54-
fwprintf(stderr, _u("out of memory"));
55-
IfFailGo(E_OUTOFMEMORY);
56-
}
57-
58-
//
59-
// Read the entire content as a binary block.
60-
//
61-
fread(pRawBytes, sizeof(BYTE), lengthBytes, file);
62-
fclose(file);
63-
*reinterpret_cast<WCHAR*>(pRawBytes + lengthBytes) = 0; // Null terminate it. Could be UTF16
64-
65-
//
66-
// Read encoding, handling any conversion to Unicode.
67-
//
68-
// Warning: The UNICODE buffer for parsing is supposed to be provided by the host.
69-
// This is not a complete read of the encoding. Some encodings like UTF7, UTF1, EBCDIC, SCSU, BOCU could be
70-
// wrongly classified as ANSI
71-
//
72-
{
73-
LPCWSTR contentsRaw = reinterpret_cast<LPCWSTR>(pRawBytes);
74-
if ((0xEF == *pRawBytes && 0xBB == *(pRawBytes + 1) && 0xBF == *(pRawBytes + 2)))
75-
{
76-
isUtf8 = true;
77-
}
78-
else if (0xFFFE == *contentsRaw || (0x0000 == *contentsRaw && 0xFEFF == *(contentsRaw + 1)))
46+
// Determine the file length, in bytes.
47+
fseek(file, 0, SEEK_END);
48+
lengthBytes = ftell(file);
49+
fseek(file, 0, SEEK_SET);
50+
pRawBytes = (LPBYTE)malloc(lengthBytes + sizeof(WCHAR));
51+
if (nullptr == pRawBytes)
7952
{
80-
// unicode unsupported
81-
fwprintf(stderr, _u("unsupported file encoding"));
82-
IfFailGo(E_UNEXPECTED);
53+
fwprintf(stderr, _u("out of memory"));
54+
IfFailGo(E_OUTOFMEMORY);
8355
}
84-
else if (0xFEFF == *contentsRaw)
85-
{
86-
// unicode LE
87-
isUtf8 = false;
88-
}
89-
else
56+
57+
//
58+
// Read the entire content as a binary block.
59+
//
60+
fread(pRawBytes, sizeof(BYTE), lengthBytes, file);
61+
*reinterpret_cast<WCHAR*>(pRawBytes + lengthBytes) = 0; // Null terminate it. Could be UTF16
62+
63+
//
64+
// Read encoding, handling any conversion to Unicode.
65+
//
66+
// Warning: The UNICODE buffer for parsing is supposed to be provided by the host.
67+
// This is not a complete read of the encoding. Some encodings like UTF7, UTF1, EBCDIC, SCSU, BOCU could be
68+
// wrongly classified as ANSI
69+
//
9070
{
91-
// Assume UTF8
92-
isUtf8 = true;
71+
LPCWSTR contentsRaw = reinterpret_cast<LPCWSTR>(pRawBytes);
72+
if (0xFFFE == *contentsRaw || (0x0000 == *contentsRaw && 0xFEFF == *(contentsRaw + 1)) ||
73+
0xFEFF == *contentsRaw)
74+
{
75+
// unicode unsupported
76+
fwprintf(stderr, _u("unsupported file encoding. Only ANSI and UTF8 supported"));
77+
IfFailGo(E_UNEXPECTED);
78+
}
9379
}
9480
}
9581

96-
if (isUtf8)
97-
{
98-
contents = reinterpret_cast<LPCSTR>(pRawBytes);
99-
}
100-
else
101-
{
102-
LPSTR pNarrow = nullptr;
103-
IfFailGo(WideStringToNarrowDynamic(reinterpret_cast<LPCWSTR>(pRawBytes), &pNarrow));
104-
contents = pNarrow;
105-
}
82+
contents = reinterpret_cast<LPCSTR>(pRawBytes);
10683

10784
Error:
10885
if (SUCCEEDED(hr))
10986
{
110-
if (isUtf8Out)
111-
{
112-
*isUtf8Out = isUtf8;
113-
}
11487
if (lengthBytesOut)
11588
{
11689
*lengthBytesOut = lengthBytes;
11790
}
11891
}
11992

93+
if (file != nullptr)
94+
{
95+
fclose(file);
96+
}
97+
12098
if (pRawBytes && reinterpret_cast<LPCSTR>(pRawBytes) != contents)
12199
{
122100
free(pRawBytes);

bin/ch/Helpers.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
class Helpers
88
{
99
public :
10-
static HRESULT LoadScriptFromFile(LPCSTR filename, LPCSTR& contents, bool* isUtf8Out = nullptr, UINT* lengthBytesOut = nullptr);
10+
static HRESULT LoadScriptFromFile(LPCSTR filename, LPCSTR& contents, UINT* lengthBytesOut = nullptr);
1111
static LPCWSTR JsErrorCodeToString(JsErrorCode jsErrorCode);
1212
static void LogError(__in __nullterminated const char16 *msg, ...);
1313
};

bin/ch/ch.cpp

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -309,15 +309,14 @@ HRESULT ExecuteTest(const char* fileName)
309309
HRESULT hr = S_OK;
310310
LPCSTR fileContents = nullptr;
311311
JsRuntimeHandle runtime = JS_INVALID_RUNTIME_HANDLE;
312-
bool isUtf8 = false;
313312
UINT lengthBytes = 0;
314313

315314
JsContextRef context = JS_INVALID_REFERENCE;
316315

317316
char fullPath[_MAX_PATH];
318317
size_t len = 0;
319318

320-
hr = Helpers::LoadScriptFromFile(fileName, fileContents, &isUtf8, &lengthBytes);
319+
hr = Helpers::LoadScriptFromFile(fileName, fileContents, &lengthBytes);
321320

322321
IfFailGo(hr);
323322
if (HostConfigFlags::flags.GenerateLibraryByteCodeHeaderIsEnabled)
@@ -359,39 +358,23 @@ HRESULT ExecuteTest(const char* fileName)
359358

360359
if (HostConfigFlags::flags.GenerateLibraryByteCodeHeaderIsEnabled)
361360
{
362-
if (isUtf8)
361+
if (HostConfigFlags::flags.GenerateLibraryByteCodeHeader != nullptr && *HostConfigFlags::flags.GenerateLibraryByteCodeHeader != _u('\0'))
363362
{
364-
if (HostConfigFlags::flags.GenerateLibraryByteCodeHeader != nullptr && *HostConfigFlags::flags.GenerateLibraryByteCodeHeader != _u('\0'))
365-
{
366-
CHAR libraryName[_MAX_PATH];
367-
CHAR ext[_MAX_EXT];
368-
_splitpath_s(fullPath, NULL, 0, NULL, 0, libraryName, _countof(libraryName), ext, _countof(ext));
369-
370-
IfFailGo(CreateLibraryByteCodeHeader(fileContents, lengthBytes, HostConfigFlags::flags.GenerateLibraryByteCodeHeader, libraryName));
371-
}
372-
else
373-
{
374-
fwprintf(stderr, _u("FATAL ERROR: -GenerateLibraryByteCodeHeader must provide the file name, i.e., -GenerateLibraryByteCodeHeader:<bytecode file name>, exiting\n"));
375-
IfFailGo(E_FAIL);
376-
}
363+
CHAR libraryName[_MAX_PATH];
364+
CHAR ext[_MAX_EXT];
365+
_splitpath_s(fullPath, NULL, 0, NULL, 0, libraryName, _countof(libraryName), ext, _countof(ext));
366+
367+
IfFailGo(CreateLibraryByteCodeHeader(fileContents, lengthBytes, HostConfigFlags::flags.GenerateLibraryByteCodeHeader, libraryName));
377368
}
378369
else
379370
{
380-
fwprintf(stderr, _u("FATAL ERROR: GenerateLibraryByteCodeHeader flag can only be used on UTF8 file, exiting\n"));
371+
fwprintf(stderr, _u("FATAL ERROR: -GenerateLibraryByteCodeHeader must provide the file name, i.e., -GenerateLibraryByteCodeHeader:<bytecode file name>, exiting\n"));
381372
IfFailGo(E_FAIL);
382373
}
383374
}
384375
else if (HostConfigFlags::flags.SerializedIsEnabled)
385376
{
386-
if (isUtf8)
387-
{
388-
CreateAndRunSerializedScript(fileName, fileContents, fullPath);
389-
}
390-
else
391-
{
392-
fwprintf(stderr, _u("FATAL ERROR: Serialized flag can only be used on UTF8 file, exiting\n"));
393-
IfFailGo(E_FAIL);
394-
}
377+
CreateAndRunSerializedScript(fileName, fileContents, fullPath);
395378
}
396379
else
397380
{

lib/Parser/Scan.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1831,8 +1831,10 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
18311831

18321832
if (Js::NumberUtilities::IsSurrogateUpperPart(upper))
18331833
{
1834+
// Consume the rest of the utf8 bytes for the codepoint
1835+
OLECHAR decodedUpper = this->ReadSurrogatePairUpper(p, last);
1836+
Assert(decodedUpper == (OLECHAR) upper);
18341837
ch = Js::NumberUtilities::SurrogatePairAsCodePoint(ch, upper);
1835-
this->template ReadFull<true>(p, last);
18361838
}
18371839
}
18381840

lib/Parser/Scan.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,13 @@ class NullTerminatedUnicodeEncodingPolicy
163163
static OLECHAR ReadFull(EncodedCharPtr &p, EncodedCharPtr last) { return *p++; }
164164
static OLECHAR PeekFirst(EncodedCharPtr p, EncodedCharPtr last) { return *p; }
165165
static OLECHAR PeekFull(EncodedCharPtr p, EncodedCharPtr last) { return *p; }
166+
167+
static OLECHAR ReadSurrogatePairUpper(const EncodedCharPtr&, const EncodedCharPtr& last)
168+
{
169+
AssertMsg(false, "method should not be called while scanning UTF16 string");
170+
return 0xfffe;
171+
}
172+
166173
static void RestoreMultiUnits(size_t multiUnits) { }
167174
static size_t CharacterOffsetToUnitOffset(EncodedCharPtr start, EncodedCharPtr current, EncodedCharPtr last, charcount_t offset) { return offset; }
168175

@@ -205,6 +212,14 @@ class UTF8EncodingPolicyBase
205212
return !IsMultiUnitChar(ch) ? static_cast< OLECHAR >(ch) : ReadRest<bScan>(ch, p, last);
206213
}
207214

215+
OLECHAR ReadSurrogatePairUpper(EncodedCharPtr &p, EncodedCharPtr last)
216+
{
217+
EncodedChar ch = (nullTerminated || p < last) ? *p++ : (p++, 0);
218+
Assert(IsMultiUnitChar(ch));
219+
this->m_decodeOptions |= utf8::DecodeOptions::doSecondSurrogatePair;
220+
return ReadRest<true>(ch, p, last);
221+
}
222+
208223
static OLECHAR PeekFirst(EncodedCharPtr p, EncodedCharPtr last) { return (nullTerminated || p < last) ? static_cast< OLECHAR >(*p) : 0; }
209224

210225
OLECHAR PeekFull(EncodedCharPtr p, EncodedCharPtr last)

lib/Runtime/Base/ScriptContext.cpp

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1591,7 +1591,10 @@ if (!sourceList)
15911591

15921592
bool isLibraryCode = ((loadScriptFlag & LoadScriptFlag_LibraryCode) == LoadScriptFlag_LibraryCode);
15931593

1594-
bool utf8FromExternal = false;
1594+
// We assume that any buffer that wasn't passed in with LoadScriptFlag_Utf8Source is UTF16-LE
1595+
// We convert this into CESU-8 using our codex library
1596+
// isRealUtf8 tracks whether the buffer is truly UTF8 or if it's CESU-8, for the purpose of parsing later
1597+
bool isRealUtf8 = false;
15951598
if ((loadScriptFlag & LoadScriptFlag_Utf8Source) != LoadScriptFlag_Utf8Source)
15961599
{
15971600
// Convert to UTF8 and then load that
@@ -1629,16 +1632,12 @@ if (!sourceList)
16291632
}
16301633
else
16311634
{
1632-
// xplat-todo: This is temporary. How to tell if utf8 fromExternal?
1633-
if (cb >= 3 && script[0] == 0xEF && script[1] == 0xBB && script[2] == 0xBF) // ef bb bf
1634-
{
1635-
utf8Script = (LPUTF8)script;
1636-
cbNeeded = cb;
1637-
}
1638-
else
1639-
{
1640-
utf8FromExternal = true;
1641-
}
1635+
// If LoadScriptFlag_Utf8Source was passed in, then the source buffer is guaranteed to actually be
1636+
// UTF8, and not CESU-8. JSRT APIs expect real UTF8 buffers to be passed in, not CESU-8, and ch.exe
1637+
// only supports ANSI (utf8-compatible) and actual UTF8 files, not UTF16/UTF16-LE encoded files.
1638+
isRealUtf8 = true;
1639+
utf8Script = (LPUTF8)script;
1640+
cbNeeded = cb;
16421641

16431642
// We do not own the memory passed into DefaultLoadScriptUtf8. We need to save it so we copy the memory.
16441643
if (*ppSourceInfo == nullptr)
@@ -1693,7 +1692,7 @@ if (!sourceList)
16931692
}
16941693

16951694
ParseNodePtr parseTree;
1696-
if (utf8FromExternal)
1695+
if (isRealUtf8)
16971696
{
16981697
Assert((loadScriptFlag & LoadScriptFlag_Utf8Source)
16991698
== LoadScriptFlag_Utf8Source);

test/es6/rlexe.xml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -505,8 +505,8 @@
505505
</test>
506506
<test>
507507
<default>
508-
<files>unicode_6_identifiers.js</files>
509-
<baseline>unicode_6_identifiers.baseline</baseline>
508+
<files>unicode_6_identifiers_utf8.js</files>
509+
<baseline>unicode_6_identifiers_utf8.baseline</baseline>
510510
<compile-flags> -ES6Unicode</compile-flags>
511511
<tags>exclude_win7,exclude_ship</tags>
512512
</default>
@@ -584,7 +584,7 @@
584584
</test>
585585
<test>
586586
<default>
587-
<files>unicode_idDeferParseFunctions.js</files>
587+
<files>unicode_idDeferParseFunctions_utf8.js</files>
588588
<compile-flags>-ES6Unicode</compile-flags>
589589
</default>
590590
</test>
@@ -627,17 +627,17 @@
627627
</test>
628628
<test>
629629
<default>
630-
<files>unicode_regex_surrogate.js</files>
631-
<baseline>unicode_regex_surrogate.baseline</baseline>
630+
<files>unicode_regex_surrogate_utf8.js</files>
631+
<baseline>unicode_regex_surrogate_utf8.baseline</baseline>
632632
<compile-flags> -ES6Unicode -ES6RegExSticky</compile-flags>
633633
<tags>exclude_ship,Slow</tags>
634634
<timeout>300</timeout>
635635
</default>
636636
</test>
637637
<test>
638638
<default>
639-
<files>unicode_blue_533163.js</files>
640-
<baseline>unicode_blue_533163.baseline</baseline>
639+
<files>unicode_blue_533163_utf8.js</files>
640+
<baseline>unicode_blue_533163_utf8.baseline</baseline>
641641
<compile-flags> -ES6Unicode</compile-flags>
642642
<tags>exclude_ship</tags>
643643
</default>

test/es6/unicode_6_identifiers.js

-13 KB
Binary file not shown.

0 commit comments

Comments
 (0)