-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.ts
152 lines (139 loc) · 6.28 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import assert from "assert"
type Nominal<Ty, Discriminant> = Ty & { __discriminant: Discriminant }
type string_as_utf_8_buffer = Nominal<string, "encoding: UTF-8">
// `fast-text-encoding` pollutes the global namespace. The following is an attempt to circumvent that.
let textEncoder = undefined,
textDecoder = undefined
if (typeof TextEncoder !== "undefined" || typeof TextDecoder !== "undefined") {
textEncoder = TextEncoder
textDecoder = TextDecoder
} else {
// This is a horrible hack, but I don't know a better way to avoid polluting the global
// namespace.
const ctx = (1, eval)("this")
require("fast-text-encoding")
textEncoder = TextEncoder
textDecoder = TextDecoder
delete ctx.TextEncoder
delete ctx.TextDecoder
// FIXME: add an unassert build-step
assert(typeof TextEncoder === "undefined")
assert(typeof TextDecoder === "undefined")
}
assert(typeof textEncoder !== "undefined")
assert(typeof textDecoder !== "undefined")
// This function is intended to be called on JavaScript strings (possibly containing Unicode
// characters outside the ASCII range) that need to be passed to OCaml functions; it
// ‘double-encodes’ those strings such that they will be perceived by BuckleScript-compiled OCaml
// as UTF-8-encoded `char`-arrays.
//
// #### Rationale
//
// Required reading: Mathias Bynens' [JavaScript has a Unicode problem](https://mathiasbynens.be/notes/javascript-unicode) and then [JavaScript's internal character encoding](https://mathiasbynens.be/notes/javascript-encoding).
//
// BuckleScript, at least as of `4.0.6`, treats JavaScript primitive `String`s as, basically, uint
// `char`-arrays. At the boundaries of BuckleScript-compiled code, especially when the OCaml code
// treats `char`-arrays as UTF-8, we need to massage those back into valid JavaScript UCS-2
// strings. This function, paired with `fromFakeUTF8String`, handle these transformations.
//
// (See the README for more information.)
//
// #### Input
//
// This function takes one argument, a ‘standard’ JavaScript `String`; that is, one with Unicode
// characters outside the ASCII range (but still within the BMP!) encoded as single, 16-bit
// code-units; and [higher-plane characters][] encoded as UTF-16-style [surrogate pairs][].
//
// - Example, as a UCS-2 sequence of 16-bit code-units:
//
// ```js
// [102, 111, 111, 183, 98, 97, 114]
// ```
//
// - Example, as typed into a UTF-8 JavaScript source-file:
//
// ```js
// "foo·bar"
// ```
//
// #### Output
//
// An abomination. This produces a JavaScript `String` (that is still technically encoded as UCS-2,
// mind you!) *containing a series of UTF-8 bytes, as interpreted as UCS-2 codepoints*.
//
// - Example, as a UCS-2 sequence of 16-bit code-units:
//
// ```js
// [102, 111, 111, 194, 183, 98, 97, 114]
// ```
//
// - Example, as typed into a UTF-8 JavaScript source-file:
//
// ```js
// "foo\xC2\xB7bar" // or "foo·bar", if you're a heathen
// ```
//
// See that in this example, the non-ASCII character U+00B7 “MIDDLE DOT”, which is one code-unit
// (literally `\u00B7`) in the original input-string, is encoded as *two* JavaScript / UCS-2
// code-units, `\xC2\xB7` — C2-B7 being the UTF-8 encoding of U+00B7.
//
// Finally, note that I mint a nominal type for the return-value of this function; I prefer to
// attempt to track encoding at the type-level if at all possible. It's trivial, though I'd assert
// a bad idea, to discard this additional information if it's giving you type-errors you don't care
// about:
//
// ```typescript
// let this_has_a_simple_type = toFakeUTF8String(some_input) as string
// ```
//
// [higher-plane characters]: <https://en.wikipedia.org/wiki/Plane_(Unicode)>
// [surrogate pairs]: <https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs>
//
//---
// FIXME: The original implementation of this was fast, but broken. Currently, I've resorted to
// simply iterating over the *entire string* an extra time on *both* encoding and decoding,
// which I hate and don't want to do ...
export function toFakeUTF8String(js_string: string): string_as_utf_8_buffer {
const byte_arr = new textEncoder("utf-8").encode(js_string)
let result = new String()
for (var i = 0; i < byte_arr.length; i++) {
// FIXME: I'm not clear on the performance of string-concatenation in JavaScript; all the
// references I find claim that it's not slower than a single join-operation, but that
// seems ... impossible? Oh well.
result = result + String.fromCharCode(byte_arr[i])
}
return result as string_as_utf_8_buffer
}
// Given a double-encoded (effectively, mis-encoded) BuckleScript ‘string’ that's been manipulated
// as if it's a UTF-8 `char`-array, this function will decode (effectively, re-encode) that value
// into a functional, correct JavaScript (i.e. UCS-2) string. See {@link toFakeUTF8String} for
// in-depth details.
//
// Takes a `String`, containing a series of UTF-8 bytes encoded as Unicode codepoints (in
// JavaScript's standard UCS-2, that is); returns a standard JavaScript `String` with those Unicode
// scalars properly represented in UCS-2 code units, ready for standard JavaScript manipulation.
//
// Finally, if you're using TypeScript, this expects a value known at compile-time to contain UTF-8
// text; this is tracked in the type-system via the minted `string_as_utf_8_buffer` type. Idiomatic
// usage would be to tag every stringish return-value from a BuckleScript module with this type:
//
// ```typescript
// import { toFakeUTF8String, fromFakeUTF8String } from 'ocaml-string-convert'
// import $AModule from './aModule.bs'
//
// let $yuck = $AModule.returns_a_string() as string_as_utf_8_buffer
// // ... manipulation ...
// let str = fromFakeUTF8String($yuck)
// ```
//
//---
// FIXME: This ends up iterating over the string *twice*. Not great; but I also don't have access
// to the internals of the UTF-8 decoder if it's built into the JavaScript engine, sooooo ... and
// this is becoming a mantra now ... forgive me. /=
export function fromFakeUTF8String(eldritch_horror: string_as_utf_8_buffer): string {
const utf8_arr = new Uint8Array(eldritch_horror.length)
for (var i = 0; i < eldritch_horror.length; i++) {
utf8_arr[i] = eldritch_horror.charCodeAt(i)
}
return new textDecoder("utf-8").decode(utf8_arr)
}