smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileConstants.java

package com.fasterxml.jackson.dataformat.smile;

/**
 * Constants used by {@link SmileGenerator} and {@link SmileParser}
 * 
 * @author tatu
 */
public final class SmileConstants
{
    /*
    /**********************************************************
    /* Thresholds
    /**********************************************************
     */

    /**
     * Upper limit, in UTF-8 bytes, for value Strings that may use the
     * special "short" forms of the encoding (64 bytes or less).
     */
    public static final int MAX_SHORT_VALUE_STRING_BYTES = 64;

    /**
     * Short ASCII names may be at most 64 bytes long.
     */
    public static final int MAX_SHORT_NAME_ASCII_BYTES = 64;

    /**
     * Short non-ASCII names get a slightly lower byte limit, because
     * bytes 0xF8 and above must stay reserved (although one extra
     * length is gained since values 0 and 1 are not valid).
     */
    public static final int MAX_SHORT_NAME_UNICODE_BYTES = 56;

    /**
     * Maximum byte length of any short name, whether ASCII or not:
     * 64 bytes.
     */
    public static final int MAX_SHORT_NAME_ANY_BYTES = 64;

    /**
     * Back references to field names use at most 10 bits, so there is
     * no point in retaining many more names than that.
     */
    public static final int MAX_SHARED_NAMES = 1024;

    /**
     * Back references to short shared String values use at most 10 bits,
     * so up to {@code (1 << 10)} values need to be tracked.
     */
    public static final int MAX_SHARED_STRING_VALUES = 1024;

    /**
     * Names of any length can be referenced, but only text values that
     * count as "tiny" or "short" (the ones encoded with a length prefix)
     * are considered for sharing; this is therefore the maximum length
     * of Strings that can be encoded that way.
     */
    public static final int MAX_SHARED_STRING_LENGTH_BYTES = 65;

    /**
     * To keep encoding logic tight and simple, the output buffer can
     * always be required to have this much room before a possibly short
     * String is encoded (factor of 3 since the longest UTF-8 encoding of
     * a single Java char is 3 bytes).
     * Two extra bytes need to be reserved as well: one for the token
     * indicator, and one for the terminating end-of-string marker (in case
     * the value turns out not to be a short String after all).
     */
    public static final int MIN_BUFFER_FOR_POSSIBLE_SHORT_STRING = (3 * 65) + 1;

    /*
    /**********************************************************
    /* Byte markers
    /**********************************************************
     */

    /**
     * Marker byte that denotes the end of a variable-length String.
     * The commonly used null byte is avoided because it can not be
     * embedded in Web Sockets content (and neither can 0xFF). Several
     * byte values can never occur in UTF-8; 0xFC was picked to allow
     * reasonable ordering (highest values meaning most significant
     * framing function; 0xFF being end-of-content and so on).
     */
    public static final int INT_MARKER_END_OF_STRING = 0xFC;

    /**
     * {@link #INT_MARKER_END_OF_STRING} narrowed to a {@code byte}.
     */
    public static final byte BYTE_MARKER_END_OF_STRING = (byte) INT_MARKER_END_OF_STRING;

    /**
     * Marker for simple framing: splitting physical data (like a file)
     * into distinct logical sections such as JSON documents. 0xFF is a
     * natural choice as it is also the end marker for Web Sockets.
     */
    public static final byte BYTE_MARKER_END_OF_CONTENT = (byte) 0xFF;

    /*
    /**********************************************************
    /* Format header: put smile on your data...
    /**********************************************************
     */

    /**
     * Header byte #1: colon (0x3A)
     */
    public static final byte HEADER_BYTE_1 = (byte) ':';

    /**
     * Header byte #2: closing parenthesis (0x29)
     */
    public static final byte HEADER_BYTE_2 = (byte) ')';

    /**
     * Header byte #3: linefeed
     */
    public static final byte HEADER_BYTE_3 = (byte) '\n';

    /**
     * Version nibble of the current format version: four zero bits
     */
    public static final int HEADER_VERSION_0 = 0x0;

    /**
     * Header byte #4: carries the version nibble in its upper four bits
     * and may have flag bits set.
     */
    public static final byte HEADER_BYTE_4 = (byte) (HEADER_VERSION_0 << 4);

    /**
     * Header flag telling whether encoded content may contain shared
     * names (back references to recently encoded field names).
     * When no header is available, content must be processed as if this
     * flag was set.
     * Only when a header exists and the bit is 0 may a parser skip
     * storing seen names, since back references are then guaranteed
     * not to exist.
     */
    public static final int HEADER_BIT_HAS_SHARED_NAMES = 0x01;

    /**
     * Header flag telling whether encoded content may contain shared
     * String values (back references to recently encoded 'short' String
     * values, where short means 64 bytes or less).
     * When no header is available, can be assumed to be 0 (false).
     * When a header exists and the bit is 1, a parser has to store up
     * to 1024 most recently seen distinct short String values.
     */
    public static final int HEADER_BIT_HAS_SHARED_STRING_VALUES = 0x02;

    /**
     * Header flag telling whether encoded content may contain raw
     * (unquoted) binary values.
     * When no header is available, can be assumed to be 0 (false).
     * When a header exists and the bit is 1, a parser can not assume
     * that specific byte values always carry their default meaning
     * (specifically, the content end marker 0xFF and the header
     * signature may occur inside binary values).
     *<p>
     * A set bit does not guarantee that raw binary content is actually
     * present, only that it may be: the header gets written before any
     * binary data can be written.
     */
    public static final int HEADER_BIT_HAS_RAW_BINARY = 0x04;

    /*
    /**********************************************************
    /* Type prefixes: 3 MSB of token byte
    /**********************************************************
     */

    public static final int TOKEN_PREFIX_INTEGER = 0x24;

    public static final int TOKEN_PREFIX_FP = 0x28;

    // Shared strings are back references for last 63 short (< 64 byte) string values
    // NOTE: 0x00 is reserved, not used with current version (may be used in future)
    // NOTE(review): a 3-bit prefix leaves 5 index bits (0x01 - 0x1F, i.e. 31
    //   references); confirm the "63" above against the format specification
    public static final int TOKEN_PREFIX_SHARED_STRING_SHORT = 0x00;

    // Literals live between 0x20 and 0x3F, along with ints/doubles, so that
    // the marker bytes (smiley) stay reserved

    public static final int TOKEN_PREFIX_SHARED_STRING_LONG = 0xEC;

    // Length-prefixed text values: tiny/small ASCII, tiny/short Unicode
    public static final int TOKEN_PREFIX_TINY_ASCII = 0x40;
    public static final int TOKEN_PREFIX_SMALL_ASCII = 0x60;
    public static final int TOKEN_PREFIX_TINY_UNICODE = 0x80;
    public static final int TOKEN_PREFIX_SHORT_UNICODE = 0xA0;

    // Small ints are 4-bit (-16 to +15) integer constants
    public static final int TOKEN_PREFIX_SMALL_INT = 0xC0;

    // Misc types also leave the end of their range empty, to reserve 0xF8 - 0xFF
    public static final int TOKEN_PREFIX_MISC_OTHER = 0xE0;

    /*
    /**********************************************************
    /* Token literals, normal mode
    /**********************************************************
     */

    // Non-structured literals come first

    public static final byte TOKEN_LITERAL_EMPTY_STRING = 0x20;
    public static final byte TOKEN_LITERAL_NULL = 0x21;
    public static final byte TOKEN_LITERAL_FALSE = 0x22;
    public static final byte TOKEN_LITERAL_TRUE = 0x23;

    // Followed by structured literals (array/object start and end)

    public static final byte TOKEN_LITERAL_START_ARRAY = (byte) 0xF8;
    public static final byte TOKEN_LITERAL_END_ARRAY = (byte) 0xF9;
    public static final byte TOKEN_LITERAL_START_OBJECT = (byte) 0xFA;
    public static final byte TOKEN_LITERAL_END_OBJECT = (byte) 0xFB;

    /*
    /**********************************************************
    /* Subtype constants for misc text/binary types
    /**********************************************************
     */

    public static final int INT_MISC_BINARY_7BIT = 0xE8;

    public static final int INT_MISC_BINARY_RAW = 0xFD;

    /**
     * Type (for misc, other) for variable length UTF-8 encoded text
     * that is known to contain only ASCII chars.
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final byte TOKEN_MISC_LONG_TEXT_ASCII = (byte) 0xE0;

    /**
     * Type (for misc, other) for variable length UTF-8 encoded text
     * that is NOT known to contain only ASCII chars (so it MAY contain
     * multi-byte characters).
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final byte TOKEN_MISC_LONG_TEXT_UNICODE = (byte) 0xE4;

    /**
     * Type (for misc, other) for "safe" binary data, encoded using only
     * the 7 LSB of each byte (giving 8/7 expansion ratio).
     * This is usually done to ensure that certain bytes (like 0xFF)
     * never appear in encoded data.
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final byte TOKEN_MISC_BINARY_7BIT = (byte) INT_MISC_BINARY_7BIT;

    /**
     * Marker for raw binary data; deliberately kept separate from the
     * other types since it can have significant impact on framing
     * (or rather on fast scanning based on structure and framing markers).
     */
    public static final byte TOKEN_MISC_BINARY_RAW = (byte) INT_MISC_BINARY_RAW;

    /*
    /**********************************************************
    /* Modifiers for numeric entries
    /**********************************************************
     */

    /**
     * Numeric subtype (2 LSB) for a 32-bit integer (int)
     */
    public static final int TOKEN_MISC_INTEGER_32 = 0x00;

    /**
     * Numeric subtype (2 LSB) for a 64-bit integer (long)
     */
    public static final int TOKEN_MISC_INTEGER_64 = 0x01;

    /**
     * Numeric subtype (2 LSB) for a {@link java.math.BigInteger} value.
     */
    public static final int TOKEN_MISC_INTEGER_BIG = 0x02;

    // Note: subtype 3 (0x27 when combined with TOKEN_PREFIX_INTEGER) is
    // reserved for future use

    /**
     * Numeric subtype (2 LSB) for a 32-bit IEEE single precision
     * floating point number.
     */
    public static final int TOKEN_MISC_FLOAT_32 = 0x00;

    /**
     * Numeric subtype (2 LSB) for a 64-bit IEEE double precision
     * floating point number.
     */
    public static final int TOKEN_MISC_FLOAT_64 = 0x01;

    /**
     * Numeric subtype (2 LSB) for a {@link java.math.BigDecimal} value.
     */
    public static final int TOKEN_MISC_FLOAT_BIG = 0x02;

    // Note: subtype 3 (0x2B when combined with TOKEN_PREFIX_FP) is
    // reserved for future use

    /*
    /**********************************************************
    /* Token types for keys
    /**********************************************************
     */

    /**
     * Empty key shares its code with the empty String value
     */
    public static final byte TOKEN_KEY_EMPTY_STRING = 0x20;

    public static final int TOKEN_PREFIX_KEY_SHARED_LONG = 0x30;

    public static final byte TOKEN_KEY_LONG_STRING = 0x34;

    public static final int TOKEN_PREFIX_KEY_SHARED_SHORT = 0x40;

    public static final int TOKEN_PREFIX_KEY_ASCII = 0x80;

    public static final int TOKEN_PREFIX_KEY_UNICODE = 0xC0;

    /*
    /**********************************************************
    /* Basic UTF-8 decode/encode table
    /**********************************************************
     */

    /**
     * UTF-8 decoding info, indexed by the unsigned value (0 - 255) of
     * the first byte of a sequence.
     * Entries hold "byte length - 1": 0 for single-byte codes,
     * 1 for 2-byte codes, 2 for 3-byte codes and 3 for 4-byte codes;
     * -1 marks bytes that can not start a sequence.
     */
    public static final int[] sUtf8UnitLengths = buildUtf8UnitLengths();

    /**
     * Builds the 256-entry table exposed as {@link #sUtf8UnitLengths}.
     */
    private static int[] buildUtf8UnitLengths() {
        final int[] lengths = new int[256];
        // Entries 0x00 - 0x7F keep the array default of 0 (single-byte codes)
        for (int lead = 0x80; lead <= 0xFF; ++lead) {
            final int extraBytes;
            if (lead < 0xC0) {
                // 10xxxxxx: not matched by any lead pattern, use error marker
                extraBytes = -1;
            } else if (lead < 0xE0) {
                // 110xxxxx: 2 bytes (0x0080 - 0x07FF)
                extraBytes = 1;
            } else if (lead < 0xF0) {
                // 1110xxxx: 3 bytes (0x0800 - 0xFFFF)
                extraBytes = 2;
            } else if (lead < 0xF8) {
                // 11110xxx: 4 bytes; double-char with surrogates
                extraBytes = 3;
            } else {
                // 0xF8 - 0xFF: -1 as the "universal" error marker
                extraBytes = -1;
            }
            lengths[lead] = extraBytes;
        }
        return lengths;
    }
}