/
SmileConstants.java
358 lines (296 loc) · 12.6 KB
/
SmileConstants.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
package com.fasterxml.jackson.dataformat.smile;
/**
 * Shared constant definitions for the Smile binary format, referenced by
 * {@link SmileGenerator} and {@link SmileParser}: size thresholds, framing
 * markers, header bytes and flags, token type prefixes and literals, and a
 * UTF-8 lead-byte lookup table.
 *
 * @author tatu
 */
public final class SmileConstants
{
    /*
    /**********************************************************
    /* Thresholds
    /**********************************************************
     */

    /**
     * Upper bound (in UTF-8 encoded bytes) for String values that may use
     * the special "short" value encodings.
     */
    public static final int MAX_SHORT_VALUE_STRING_BYTES = 64;

    /**
     * Upper bound (in bytes) for names that may use the short ASCII name
     * encoding.
     */
    public static final int MAX_SHORT_NAME_ASCII_BYTES = 64;

    /**
     * Upper bound (in bytes) for names that may use the short non-ASCII
     * name encoding. Slightly lower than the ASCII limit because byte
     * values 0xF8 and above have to stay reserved (although one length is
     * gained back since lengths 0 and 1 are not valid).
     */
    public static final int MAX_SHORT_NAME_UNICODE_BYTES = 56;

    /**
     * Largest byte length any short name can have, whether ASCII or not.
     */
    public static final int MAX_SHORT_NAME_ANY_BYTES = 64;

    /**
     * Number of field names retained for back-referencing; the longest
     * name back reference is 10 bits, so keeping more would be pointless.
     */
    public static final int MAX_SHARED_NAMES = 1024;

    /**
     * Number of short String values retained for back-referencing; the
     * longest value back reference is 10 bits, i.e. {@code (1 << 10)}
     * entries.
     */
    public static final int MAX_SHARED_STRING_VALUES = 1024;

    /**
     * Longest String value (in bytes) eligible for sharing. Names of any
     * length may be referenced, but only text values encoded with a length
     * prefix ("tiny" or "short") are considered, so this has to be the
     * maximum length such encodings can express.
     */
    public static final int MAX_SHARED_STRING_LENGTH_BYTES = 65;

    /**
     * Amount of free output buffer space required before attempting to
     * encode a String that might turn out to be short; always demanding
     * this much keeps the encoding logic tight and simple. Computed as
     * 3 bytes per char (the longest UTF-8 encoding of a single Java char)
     * for the longest shareable String, plus one.
     *<p>
     * NOTE(review): the earlier comment stated that two extra bytes are
     * reserved (token indicator and terminating end marker), whereas the
     * formula adds only one; value is kept as-is (196) -- confirm intent.
     */
    public static final int MIN_BUFFER_FOR_POSSIBLE_SHORT_STRING =
            (3 * MAX_SHARED_STRING_LENGTH_BYTES) + 1;

    /*
    /**********************************************************
    /* Byte markers
    /**********************************************************
     */

    /**
     * Marker that terminates variable-length Strings. A null byte would be
     * the common choice but is avoided because it can not be embedded in
     * Web Sockets content (nor can 0xFF). Out of the byte values that
     * UTF-8 can never produce, 0xFC was picked so that framing markers are
     * ordered sensibly: higher value means more significant framing
     * function, with 0xFF being end-of-content.
     */
    public static final int INT_MARKER_END_OF_STRING = 0xFC;

    /**
     * {@link #INT_MARKER_END_OF_STRING} narrowed to a {@code byte}.
     */
    public static final byte BYTE_MARKER_END_OF_STRING = (byte) INT_MARKER_END_OF_STRING;

    /**
     * Optional framing marker that lets physical data (such as a file) be
     * split into distinct logical sections (such as JSON documents). 0xFF
     * is a natural fit as it also serves as end marker for Web Sockets.
     */
    public static final byte BYTE_MARKER_END_OF_CONTENT = (byte) 0xFF;

    /*
    /**********************************************************
    /* Format header: put smile on your data...
    /**********************************************************
     */

    /**
     * Header byte #1: character ':' (0x3A)
     */
    public static final byte HEADER_BYTE_1 = (byte) ':';

    /**
     * Header byte #2: character ')' (0x29)
     */
    public static final byte HEADER_BYTE_2 = (byte) ')';

    /**
     * Header byte #3: linefeed character
     */
    public static final byte HEADER_BYTE_3 = (byte) '\n';

    /**
     * Version nibble of the current format version: four zero bits.
     */
    public static final int HEADER_VERSION_0 = 0x0;

    /**
     * Header byte #4: version nibble in the upper four bits; the
     * remaining bits may carry the {@code HEADER_BIT_xxx} flags.
     */
    public static final byte HEADER_BYTE_4 = (byte) (HEADER_VERSION_0 << 4);

    /**
     * Header flag: content may contain shared names, that is, back
     * references to recently encoded field names. Content without a
     * header must be processed as if this flag was set. Only when a header
     * is present and the bit is 0 may a parser skip storing seen names,
     * since no back references can then exist.
     */
    public static final int HEADER_BIT_HAS_SHARED_NAMES = 0x01;

    /**
     * Header flag: content may contain shared String values, that is,
     * back references to recently encoded 'short' String values (short
     * meaning 64 bytes or less). Content without a header can be assumed
     * to have this as 0 (false). When a header is present and the bit is
     * 1, parsers must retain up to 1024 most recently seen distinct short
     * String values.
     */
    public static final int HEADER_BIT_HAS_SHARED_STRING_VALUES = 0x02;

    /**
     * Header flag: content may contain raw (unquoted) binary values.
     * Content without a header can be assumed to have this as 0 (false).
     * When a header is present and the bit is 1, a parser can not rely on
     * specific byte values always carrying their default meaning; in
     * particular the content end marker 0xFF and the header signature may
     * occur inside binary values.
     *<p>
     * A set bit only says that raw binary content may exist, not that it
     * does: the header is written before any binary data is.
     */
    public static final int HEADER_BIT_HAS_RAW_BINARY = 0x04;

    /*
    /**********************************************************
    /* Type prefixes: 3 MSB of token byte
    /**********************************************************
     */

    public static final int TOKEN_PREFIX_INTEGER = 0x24;

    public static final int TOKEN_PREFIX_FP = 0x28;

    // Prefix for short back references to previously seen short String values.
    // NOTE: 0x00 itself is reserved, not used with current version (may be used in future)
    // NOTE(review): earlier comment described these as references to the
    //   "last 63 short (< 64 byte) string values"; confirm count against format spec
    public static final int TOKEN_PREFIX_SHARED_STRING_SHORT = 0x00;

    // Literals live between 0x20 and 0x3F (together with ints/doubles) so that
    // the marker bytes of the smiley header stay reserved.
    // A "misc numbers" prefix (0x20) used to be sketched here but is not defined.

    public static final int TOKEN_PREFIX_SHARED_STRING_LONG = 0xEC;

    public static final int TOKEN_PREFIX_TINY_ASCII = 0x40;

    public static final int TOKEN_PREFIX_SMALL_ASCII = 0x60;

    public static final int TOKEN_PREFIX_TINY_UNICODE = 0x80;

    public static final int TOKEN_PREFIX_SHORT_UNICODE = 0xA0;

    // Small ints: integer constants in range -16 to +15
    public static final int TOKEN_PREFIX_SMALL_INT = 0xC0;

    // Misc types also leave the end of their range empty, reserving 0xF8 - 0xFF
    public static final int TOKEN_PREFIX_MISC_OTHER = 0xE0;

    /*
    /**********************************************************
    /* Token literals, normal mode
    /**********************************************************
     */

    // Scalar (non-structured) literals

    public static final byte TOKEN_LITERAL_EMPTY_STRING = 0x20;

    public static final byte TOKEN_LITERAL_NULL = 0x21;

    public static final byte TOKEN_LITERAL_FALSE = 0x22;

    public static final byte TOKEN_LITERAL_TRUE = 0x23;

    // Structural literals

    public static final byte TOKEN_LITERAL_START_ARRAY = (byte) 0xF8;

    public static final byte TOKEN_LITERAL_END_ARRAY = (byte) 0xF9;

    public static final byte TOKEN_LITERAL_START_OBJECT = (byte) 0xFA;

    public static final byte TOKEN_LITERAL_END_OBJECT = (byte) 0xFB;

    /*
    /**********************************************************
    /* Subtype constants for misc text/binary types
    /**********************************************************
     */

    public static final int INT_MISC_BINARY_7BIT = 0xE8;

    public static final int INT_MISC_BINARY_RAW = 0xFD;

    /**
     * Misc/other type for variable length UTF-8 encoded text that is
     * known to consist of ASCII characters only.
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final byte TOKEN_MISC_LONG_TEXT_ASCII = (byte) 0xE0;

    /**
     * Misc/other type for variable length UTF-8 encoded text that is NOT
     * known to be ASCII-only, and so MAY contain multi-byte characters.
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final byte TOKEN_MISC_LONG_TEXT_UNICODE = (byte) 0xE4;

    /**
     * Misc/other type for "safe" binary data, encoded using only the 7
     * LSB of each byte (an 8/7 expansion ratio). Typically used to make
     * sure certain byte values (like 0xFF) never appear in encoded data.
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final byte TOKEN_MISC_BINARY_7BIT = (byte) INT_MISC_BINARY_7BIT;

    /**
     * Marker for raw binary data; deliberately kept apart from the other
     * types because raw data can significantly affect framing (or rather,
     * fast scanning based on structure and framing markers).
     */
    public static final byte TOKEN_MISC_BINARY_RAW = (byte) INT_MISC_BINARY_RAW;

    /*
    /**********************************************************
    /* Modifiers for numeric entries
    /**********************************************************
     */

    /**
     * Integral numeric subtype (2 LSB): 32-bit integer (int)
     */
    public static final int TOKEN_MISC_INTEGER_32 = 0x00;

    /**
     * Integral numeric subtype (2 LSB): 64-bit integer (long)
     */
    public static final int TOKEN_MISC_INTEGER_64 = 0x01;

    /**
     * Integral numeric subtype (2 LSB): {@link java.math.BigInteger}
     */
    public static final int TOKEN_MISC_INTEGER_BIG = 0x02;

    // Note: type 3 (0xF3) reserved for future use

    /**
     * Floating-point numeric subtype (2 LSB): 32-bit IEEE single
     * precision number.
     */
    public static final int TOKEN_MISC_FLOAT_32 = 0x00;

    /**
     * Floating-point numeric subtype (2 LSB): 64-bit IEEE double
     * precision number.
     */
    public static final int TOKEN_MISC_FLOAT_64 = 0x01;

    /**
     * Floating-point numeric subtype (2 LSB): {@link java.math.BigDecimal}
     */
    public static final int TOKEN_MISC_FLOAT_BIG = 0x02;

    // Note: type 3 (0xF7) reserved for future use

    /*
    /**********************************************************
    /* Token types for keys
    /**********************************************************
     */

    /**
     * Empty key shares its code with the empty String value literal.
     */
    public static final byte TOKEN_KEY_EMPTY_STRING = 0x20;

    public static final int TOKEN_PREFIX_KEY_SHARED_LONG = 0x30;

    public static final byte TOKEN_KEY_LONG_STRING = 0x34;

    public static final int TOKEN_PREFIX_KEY_SHARED_SHORT = 0x40;

    public static final int TOKEN_PREFIX_KEY_ASCII = 0x80;

    public static final int TOKEN_PREFIX_KEY_UNICODE = 0xC0;

    /*
    /**********************************************************
    /* Basic UTF-8 decode/encode table
    /**********************************************************
     */

    /**
     * Lookup table indexed by unsigned byte value (0 - 255) that combines
     * UTF-8 decoding information: each entry is "byte length - 1" of the
     * sequence started by that byte. That is, 0 for single-byte codes,
     * 1 for 2-byte codes, 2 for 3-byte codes and 3 for 4-byte codes;
     * -1 marks bytes that can not start a sequence.
     */
    public static final int[] sUtf8UnitLengths = buildUtf8UnitLengths();

    /**
     * Builds the contents of {@link #sUtf8UnitLengths}.
     *
     * @return Newly allocated 256-entry table
     */
    private static int[] buildUtf8UnitLengths()
    {
        // Entries 0x00 - 0x7F keep the array default of 0 (single-byte codes)
        final int[] lengths = new int[256];
        for (int lead = 0x80; lead <= 0xFF; ++lead) {
            final int extra;
            if (lead < 0xC0) {
                // 10xxxxxx: matches none of the lead-byte patterns -> error marker
                extra = -1;
            } else if (lead < 0xE0) {
                // 110xxxxx: 2 bytes (0x0080 - 0x07FF)
                extra = 1;
            } else if (lead < 0xF0) {
                // 1110xxxx: 3 bytes (0x0800 - 0xFFFF)
                extra = 2;
            } else if (lead < 0xF8) {
                // 11110xxx: 4 bytes; double-char with surrogates and all...
                extra = 3;
            } else {
                // 0xF8 and above: -1 as the "universal" error marker
                extra = -1;
            }
            lengths[lead] = extra;
        }
        return lengths;
    }
}