Skip to content

Commit

Permalink
Add string conversion error modes (#1902)
Browse files Browse the repository at this point in the history
  • Loading branch information
dcodeIO committed Jun 12, 2021
1 parent 3a76daf commit 3564848
Show file tree
Hide file tree
Showing 23 changed files with 5,214 additions and 4,087 deletions.
23 changes: 15 additions & 8 deletions lib/loader/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,24 @@ const ARRAY_SIZE = 16;
const BIGINT = typeof BigUint64Array !== "undefined";
const THIS = Symbol();

const STRING_DECODE_THRESHOLD = 32;
const decoder = new TextDecoder("utf-16le");
const STRING_SMALLSIZE = 192; // break-even point in V8
const STRING_CHUNKSIZE = 1024; // mitigate stack overflow
const utf16 = new TextDecoder("utf-16le", { fatal: true }); // != wtf16

/** Gets a string from an U32 and an U16 view on a memory. */
/** Gets a string from memory. */
function getStringImpl(buffer, ptr) {
const len = new Uint32Array(buffer)[ptr + SIZE_OFFSET >>> 2] >>> 1;
const arr = new Uint16Array(buffer, ptr, len);
if (len <= STRING_DECODE_THRESHOLD) {
return String.fromCharCode.apply(String, arr);
let len = new Uint32Array(buffer)[ptr + SIZE_OFFSET >>> 2] >>> 1;
const wtf16 = new Uint16Array(buffer, ptr, len);
if (len <= STRING_SMALLSIZE) return String.fromCharCode(...wtf16);
try {
return utf16.decode(wtf16);
} catch {
let str = "", off = 0;
while (len - off > STRING_CHUNKSIZE) {
str += String.fromCharCode(...wtf16.subarray(off, off += STRING_CHUNKSIZE));
}
return str + String.fromCharCode(...wtf16.subarray(off));
}
return decoder.decode(arr);
}

/** Prepares the base module prior to instantiation. */
Expand Down
7 changes: 5 additions & 2 deletions lib/loader/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,8 @@
"umd/index.js",
"umd/package.json",
"README.md"
]
}
],
"devDependencies": {
"esm2umd": "^0.1.2"
}
}
Binary file modified lib/loader/tests/build/default.wasm
Binary file not shown.
Binary file modified lib/loader/tests/build/legacy.wasm
Binary file not shown.
45 changes: 30 additions & 15 deletions lib/loader/umd/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,19 +44,33 @@ var loader = (function(exports) {
const ARRAY_SIZE = 16;
const BIGINT = typeof BigUint64Array !== "undefined";
const THIS = Symbol();
const STRING_DECODE_THRESHOLD = 32;
const decoder = new TextDecoder("utf-16le");
/** Gets a string from an U32 and an U16 view on a memory. */
const STRING_SMALLSIZE = 192; // break-even point in V8

const STRING_CHUNKSIZE = 1024; // mitigate stack overflow

const utf16 = new TextDecoder("utf-16le", {
fatal: true
}); // != wtf16

/** Gets a string from memory. */

function getStringImpl(buffer, ptr) {
const len = new Uint32Array(buffer)[ptr + SIZE_OFFSET >>> 2] >>> 1;
const arr = new Uint16Array(buffer, ptr, len);
let len = new Uint32Array(buffer)[ptr + SIZE_OFFSET >>> 2] >>> 1;
const wtf16 = new Uint16Array(buffer, ptr, len);
if (len <= STRING_SMALLSIZE) return String.fromCharCode(...wtf16);

try {
return utf16.decode(wtf16);
} catch {
let str = "",
off = 0;

while (len - off > STRING_CHUNKSIZE) {
str += String.fromCharCode(...wtf16.subarray(off, off += STRING_CHUNKSIZE));
}

if (len <= STRING_DECODE_THRESHOLD) {
return String.fromCharCode.apply(String, arr);
return str + String.fromCharCode(...wtf16.subarray(off));
}

return decoder.decode(arr);
}
/** Prepares the base module prior to instantiation. */

Expand Down Expand Up @@ -110,9 +124,10 @@ var loader = (function(exports) {

const __collect = exports.__collect || F_NOEXPORTRUNTIME;

const __rtti_base = exports.__rtti_base || ~0; // oob if not present


const __rtti_base = exports.__rtti_base;
// Returns the 32-bit value stored at `__rtti_base` in the given Uint32Array view of memory;
// callers use it as the upper bound when validating ids. When `__rtti_base` is not exported
// (falsy), F_NOEXPORTRUNTIME is used in its place.
// NOTE(review): F_NOEXPORTRUNTIME is defined outside this hunk — presumably it throws; confirm.
const getRttiCount = __rtti_base ? function (arr) {
return arr[__rtti_base >>> 2];
} : F_NOEXPORTRUNTIME;
extendedExports.__new = __new;
extendedExports.__pin = __pin;
extendedExports.__unpin = __unpin;
Expand All @@ -121,7 +136,7 @@ var loader = (function(exports) {

function getInfo(id) {
const U32 = new Uint32Array(memory.buffer);
const count = U32[__rtti_base >>> 2];
const count = getRttiCount(U32);
if ((id >>>= 0) >= count) throw Error(`invalid id: ${id}`);
return U32[(__rtti_base + 4 >>> 2) + id * 2];
}
Expand All @@ -138,7 +153,7 @@ var loader = (function(exports) {

function getBase(id) {
const U32 = new Uint32Array(memory.buffer);
const count = U32[__rtti_base >>> 2];
const count = getRttiCount(U32);
if ((id >>>= 0) >= count) throw Error(`invalid id: ${id}`);
return U32[(__rtti_base + 4 >>> 2) + id * 2 + 1];
}
Expand Down Expand Up @@ -330,7 +345,7 @@ var loader = (function(exports) {
const U32 = new Uint32Array(memory.buffer);
let id = U32[ptr + ID_OFFSET >>> 2];

if (id <= U32[__rtti_base >>> 2]) {
if (id <= getRttiCount(U32)) {
do {
if (id == baseId) return true;
id = getBase(id);
Expand Down
17 changes: 13 additions & 4 deletions std/assembly/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1773,12 +1773,21 @@ declare class String {
declare namespace String {
/** Encoding helpers for UTF-8. */
export namespace UTF8 {
/** UTF-8 encoding error modes. */
export const enum ErrorMode {
/** Keeps unpaired surrogates as of WTF-8. This is the default. */
WTF8,
/** Replaces unpaired surrogates with the replacement character (U+FFFD). */
REPLACE,
/** Throws an error on unpaired surrogates. */
ERROR
}
/** Calculates the byte length of the specified string when encoded as UTF-8, optionally null terminated. */
export function byteLength(str: string, nullTerminated?: bool): i32;
/** Encodes the specified string to UTF-8 bytes, optionally null terminated. */
export function encode(str: string, nullTerminated?: bool): ArrayBuffer;
/** Encodes the specified raw string to UTF-8 bytes, optionally null terminated. Returns the number of bytes written. */
export function encodeUnsafe(str: usize, len: i32, buf: usize, nullTerminated?: bool): usize;
/** Encodes the specified string to UTF-8 bytes, optionally null terminated. ErrorMode defaults to WTF-8. */
export function encode(str: string, nullTerminated?: bool, errorMode?: ErrorMode): ArrayBuffer;
/** Encodes the specified raw string to UTF-8 bytes, optionally null terminated. ErrorMode defaults to WTF-8. Returns the number of bytes written. */
export function encodeUnsafe(str: usize, len: i32, buf: usize, nullTerminated?: bool, errorMode?: ErrorMode): usize;
/** Decodes the specified buffer from UTF-8 bytes to a string, optionally null terminated. */
export function decode(buf: ArrayBuffer, nullTerminated?: bool): string;
/** Decodes raw UTF-8 bytes to a string, optionally null terminated. */
Expand Down
48 changes: 33 additions & 15 deletions std/assembly/string.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import { OBJECT, BLOCK_MAXSIZE, TOTAL_OVERHEAD } from "./rt/common";
import { compareImpl, strtol, strtod, isSpace, isAscii, isFinalSigma, toLower8, toUpper8 } from "./util/string";
import { SPECIALS_UPPER, casemap, bsearch } from "./util/casemap";
import { E_INDEXOUTOFRANGE, E_INVALIDLENGTH } from "./util/error";
import { E_INDEXOUTOFRANGE, E_INVALIDLENGTH, E_UNPAIRED_SURROGATE } from "./util/error";
import { idof } from "./builtins";
import { Array } from "./array";

Expand Down Expand Up @@ -661,6 +661,12 @@ export namespace String {

export namespace UTF8 {

/** UTF-8 encoding error modes, selecting how unpaired surrogates are handled when encoding. */
export const enum ErrorMode {
/** Keeps unpaired surrogates as of WTF-8. This is the default of `encode` and `encodeUnsafe`. */
WTF8,
/** Replaces unpaired surrogates with the replacement character (U+FFFD). */
REPLACE,
/** Throws an error on unpaired surrogates. */
ERROR
}

export function byteLength(str: string, nullTerminated: bool = false): i32 {
var strOff = changetype<usize>(str);
var strEnd = strOff + <usize>changetype<OBJECT>(changetype<usize>(str) - TOTAL_OVERHEAD).rtSize;
Expand All @@ -687,15 +693,15 @@ export namespace String {
return bufLen;
}

export function encode(str: string, nullTerminated: bool = false): ArrayBuffer {
export function encode(str: string, nullTerminated: bool = false, errorMode: ErrorMode = ErrorMode.WTF8): ArrayBuffer {
var buf = changetype<ArrayBuffer>(__new(<usize>byteLength(str, nullTerminated), idof<ArrayBuffer>()));
encodeUnsafe(changetype<usize>(str), str.length, changetype<usize>(buf), nullTerminated);
encodeUnsafe(changetype<usize>(str), str.length, changetype<usize>(buf), nullTerminated, errorMode);
return buf;
}

// @ts-ignore: decorator
@unsafe
export function encodeUnsafe(str: usize, len: i32, buf: usize, nullTerminated: bool = false): usize {
export function encodeUnsafe(str: usize, len: i32, buf: usize, nullTerminated: bool = false, errorMode: ErrorMode = ErrorMode.WTF8): usize {
var strEnd = str + (<usize>len << 1);
var bufOff = buf;
while (str < strEnd) {
Expand All @@ -709,17 +715,29 @@ export namespace String {
store<u16>(bufOff, b1 << 8 | b0);
bufOff += 2;
} else {
if ((c1 & 0xFC00) == 0xD800 && str + 2 < strEnd) {
let c2 = <u32>load<u16>(str, 2);
if ((c2 & 0xFC00) == 0xDC00) {
c1 = 0x10000 + ((c1 & 0x03FF) << 10) | (c2 & 0x03FF);
let b0 = c1 >> 18 | 240;
let b1 = c1 >> 12 & 63 | 128;
let b2 = c1 >> 6 & 63 | 128;
let b3 = c1 & 63 | 128;
store<u32>(bufOff, b3 << 24 | b2 << 16 | b1 << 8 | b0);
bufOff += 4; str += 4;
continue;
// D800: 11011 0 0000000000 Lead
// DBFF: 11011 0 1111111111
// DC00: 11011 1 0000000000 Trail
// DFFF: 11011 1 1111111111
// F800: 11111 0 0000000000 Mask
// FC00: 11111 1 0000000000
if ((c1 & 0xF800) == 0xD800) {
if (c1 < 0xDC00 && str + 2 < strEnd) {
let c2 = <u32>load<u16>(str, 2);
if ((c2 & 0xFC00) == 0xDC00) {
c1 = 0x10000 + ((c1 & 0x03FF) << 10) | (c2 & 0x03FF);
let b0 = c1 >> 18 | 240;
let b1 = c1 >> 12 & 63 | 128;
let b2 = c1 >> 6 & 63 | 128;
let b3 = c1 & 63 | 128;
store<u32>(bufOff, b3 << 24 | b2 << 16 | b1 << 8 | b0);
bufOff += 4; str += 4;
continue;
}
}
if (errorMode != ErrorMode.WTF8) { // unlikely
if (errorMode == ErrorMode.ERROR) throw new Error(E_UNPAIRED_SURROGATE);
c1 = 0xFFFD;
}
}
let b0 = c1 >> 12 | 224;
Expand Down
4 changes: 4 additions & 0 deletions std/assembly/util/error.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,7 @@ export const E_URI_MALFORMED: string = "URI malformed";
// @ts-ignore: decorator
@lazy @inline
export const E_INVALIDDATE: string = "Invalid Date";

// Message of the Error thrown by String.UTF8.encodeUnsafe when it meets an unpaired
// surrogate while the error mode is ErrorMode.ERROR.
// @ts-ignore: decorator
@lazy @inline
export const E_UNPAIRED_SURROGATE: string = "Unpaired surrogate";

0 comments on commit 3564848

Please sign in to comment.