lib/Parser/RegexParser.h

//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------
#pragma once

namespace UnifiedRegex
{
    struct ParseError
    {
        bool isBody;
        CharCount pos;           // Position in unicode characters
        CharCount encodedPos;    // Position in underlying characters (eg utf-8 bytes)
        HRESULT error;

        ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error);
    };

    template <typename EncodingPolicy, const bool IsLiteral>
    class Parser : private EncodingPolicy, private Chars<char16>
    {
    private:
        typedef typename EncodingPolicy::EncodedChar EncodedChar;

        // A linked list node to track indices of surrogate pairs.
        struct SurrogatePairTracker
        {
            const EncodedChar* location;
            // If this surrogate pair is inside a range, then rangeLocation isn't null.
            const EncodedChar* rangeLocation;
            codepoint_t value;
            uint32 length;
            size_t multiUnits;
            SurrogatePairTracker* next;

            SurrogatePairTracker(const EncodedChar* location, codepoint_t value, uint32 length, size_t multiUnits)
                : location(location)
                , next(nullptr)
                , value(value)
                , length(length)
                , multiUnits(multiUnits)
                , rangeLocation(nullptr)
            {
            }

            SurrogatePairTracker(const EncodedChar* location, const EncodedChar* rangeLocation, codepoint_t value, uint32 length, size_t multiUnits)
                : location(location)
                , next(nullptr)
                , value(value)
                , length(length)
                , multiUnits(multiUnits)
                , rangeLocation(rangeLocation)
            {
            }

            bool IsInsideRange() const
            {
                return this->rangeLocation != nullptr;
            }
        };

        static const CharCount initLitbufSize = 16;

        Js::ScriptContext* scriptContext;
        // Arena for nodes and items needed only during compilation
        ArenaAllocator* ctAllocator;
        // Standard characters using raw encoding character representation (eg char for utf-8)
        StandardChars<EncodedChar>* standardEncodedChars;
        // Standard characters using final character representation (eg char16 for Unicode)
        StandardChars<Char>* standardChars;
#if ENABLE_REGEX_CONFIG_OPTIONS
        DebugWriter* w;
#endif

        const EncodedChar* input;
        const EncodedChar* inputLim;
        const EncodedChar* next;
        bool inBody;

        // Maximum number of capturing groups allowed, including the entire regexp, which is always
        // considered a capturing group.  Using INT16_MAX allows us to pass one value for each
        // group, plus a few additional values, to a JavaScript function without overflowing the
        // number of arguments.  This is important, for example, in the implementation of
        // String.prototype.replace, where the second argument is a function.
        static const uint16 MAX_NUM_GROUPS = INT16_MAX;

        uint16 numGroups; // determined in first parse

        // Keeps track of how many capturing groups we've seen during parsing.  We use an int, rather than
        // a uint16, to be sure we don't overflow during parsing and only check it against MAX_NUM_GROUPS at
        // the end.  (We know we can't overflow an int because strings and regex literals are limited to
        // 2G characters and therefore to 1G pairs of parentheses, which can fit into an int.  I'd prefer
        // to use size_t here, but making that change would go down a serious rabbit hole changing the
        // interface to UnifiedRegex::Node::AccumDefineGroups.)
        int nextGroupId;

        // Buffer accumulating all literals.
        // In compile-time allocator, must be transferred to runtime allocator when build program
        Char* litbuf;
        CharCount litbufLen;
        CharCount litbufNext;

        // During pass 0, if /u option for regex is provided, a linked list will be built up to
        // track positions of surrogate pairs in the buffer. During pass 1, these linked lists will be used
        // to figure out when to output a surrogate pair node.
        SurrogatePairTracker* surrogatePairList;
        SurrogatePairTracker* currentSurrogatePairNode;
        bool unicodeFlagPresent;
        bool caseInsensitiveFlagPresent;
        bool dotAllFlagPresent;

        // The following two variables are used to determine if the the surrogate pair has been encountered
        // First holds the temporary location, second holds the value of the codepoint
        const EncodedChar* tempLocationOfSurrogatePair;
        // This will be set to a location when we are parsing a range in TermPass0, and cleared when we are out of it.
        const EncodedChar* tempLocationOfRange;
        codepoint_t codePointAtTempLocation;

        // When a surrogate is added for tracking, this will be updated.
        const EncodedChar* positionAfterLastSurrogate;
        codepoint_t valueOfLastSurrogate;

        // deferred error state.
        ParseError* deferredIfNotUnicodeError;
        ParseError* deferredIfUnicodeError;

    private:

        //
        // Input buffer management
        //

        void SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody);

        // Current position in number of logical characters, regardless of underlying character encoding
        inline CharCount Pos();

        inline bool IsEOF();
        inline bool ECCanConsume(CharCount n = 1);
        inline EncodedChar ECLookahead(CharCount n = 0);
        inline EncodedChar ECLookback(CharCount n = 0);
        inline void ECConsume(CharCount n = 1);
        inline void ECConsumeMultiUnit(CharCount n = 1);
        inline void ECRevert(CharCount n = 1);

        //
        // Helpers
        //
        int TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair = false);
        void TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength);
        Node* CreateSurrogatePairAtom(char16 lower, char16 upper);
        AltNode* AppendSurrogateRangeToDisjunction(codepoint_t lowerCodePoint, codepoint_t upperCodePoint, AltNode *lastAlttNode);
        AltNode* AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAlttNode);

        //
        // Errors
        //

        void Fail(HRESULT error);
        void DeferredFailIfUnicode(HRESULT error);
        void DeferredFailIfNotUnicode(HRESULT error);
        inline void ECMust(EncodedChar ec, HRESULT error);
        inline Char NextChar();

        //
        // Patterns/Disjunctions/Alternatives
        //

        void PatternPass0();
        Node* PatternPass1();
        Node* UnionNodes(Node* prev, Node* curr);
        void DisjunctionPass0(int depth);
        Node* DisjunctionPass1();
        bool IsEndOfAlternative();
        void EnsureLitbuf(CharCount size);
        void AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode);
        Node* FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode);
        void AlternativePass0(int depth);
        Node* AlternativePass1();

        //
        // Terms
        //

        Node* NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body);
        bool AtQuantifier();
        bool OptNonGreedy();
        CharCount RepeatCount();
        void TermPass0(int depth);
        Node* TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
        bool AtomEscapePass0();
        bool AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
        bool SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);

        //
        // Classes
        //

        bool AtSecondSingletonClassAtom();
        void CharacterClassPass0();
        template <bool containsSurrogates>
        Node* CharacterClassPass1();
        bool ClassEscapePass0(Char& singleton, bool& previousSurrogatePart);
        Node* ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart);
        Node* GetNodeWithValidCharacterSet(EncodedChar ch);

        //
        // Options
        //

        void Options(RegexFlags& flags);

    public:

        Parser
            ( Js::ScriptContext* scriptContext
            , ArenaAllocator* ctAllocator
            , StandardChars<EncodedChar>* standardEncodedChars
            , StandardChars<Char>* standardChars
            , bool isUtf8
#if ENABLE_REGEX_CONFIG_OPTIONS
            , DebugWriter* w
#endif
            );

        //
        // Entry points
        //


        Node* ParseDynamic
            ( const EncodedChar* body           // non null, null terminated (may contain embedded nulls)
            , const EncodedChar* bodyLim        // points to terminating null of above
            , const EncodedChar* opts           // may be null if no options, otherwise null terminated
            , const EncodedChar* optsLim        // if above non-null, points to terminating null of above
            , RegexFlags& flags );

        // (*) For ParseLiteral:
        //  - input string must be null terminated
        //  - inputLim may point to the terminating null in above or before it
        //     - if the later, input is known to be syntactically well-formed so that the parser
        //       will find the natural end of the regex literal before passing inputLim
        //  - input may contain nulls before the inputLim

        Node* ParseLiteral
            ( const EncodedChar* input          // non null, null terminated (may contain embedded nulls)
            , const EncodedChar* inputLim       // see (*) above
            , CharCount& outBodyEncodedChars    // in encoded characters, not including trailing '/'
            , CharCount& outTotalEncodedChars   // in encoded characters, including trailing '/' and any options
            , CharCount& outBodyChars           // in unicode characters, not including ttrailing '/'
            , CharCount& outTotalChars          // in unicode characters, including trailing '/' and any options
            , RegexFlags& flags );

        void ParseLiteralNoAST
            ( const EncodedChar* input          // non null, null terminated
            , const EncodedChar* inputLim       // see (*) above
            , CharCount& outBodyEncodedChars
            , CharCount& outTotalEncodedChars
            , CharCount& outBodyChars
            , CharCount& outTotalChars );

        template<const bool buildAST>
        RegexPattern* CompileProgram
            ( Node* root,
              const EncodedChar*& currentCharacter,
              const CharCount totalLen,
              const CharCount bodyChars,
              const CharCount bodyEncodedChars,
              const CharCount totalChars,
              const RegexFlags flags );

        static void CaptureEmptySourceAndNoGroups(Program* program);

        // bodyChars is number of unicode characters in program body, which may be less than the number
        // of underlying UTF-8 characters
        void CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars, CharCount bodyEncodedChars);

        inline const Char* GetLitbuf() { return litbuf; }

        void FreeBody();

        size_t GetMultiUnits() { return this->m_cMultiUnits; }
    };
}