In [304]:
import re
from dataclasses import dataclass
from enum import Enum, auto
from queue import Queue

In [305]:
class Tag(Enum):
    """Lexical categories used to tag tokens."""

    ERROR = auto()  # not produced by any code visible in this file
    TERM = auto()  # text matching config["terminal"]
    NTERM = auto()  # text matching config["nonterminal"]
    AXIOM = auto()  # an "<axiom <X>>" line
    COMMENT = auto()  # matched by the lexer but dropped by Lexer.tokenize
    OPEN = auto()  # text matching config["open"]
    CLOSE = auto()  # text matching config["close"]
    ITERSTART = auto()  # text matching config["iter_start"]
    ITEREND = auto()  # text matching config["iter_end"]
    SEPARATOR = auto()  # an ALT token seen while no OPEN tag is pending
    GROUPSTART = auto()  # lexer mapping is currently commented out
    GROUPEND = auto()  # lexer mapping is currently commented out
    ASSIGNMENT = auto()  # text matching config["assignment"]
    ALT = auto()  # text matching config["alt"]
    CONCAT = auto()  # lexer mapping is currently commented out
    UNMATCHED = auto()  # a single character that no pattern matched
    END = auto()  # appended once by Lexer.tokenize after the last token


@dataclass
class Token:
    """Represents tokens for lexical analysis."""

    tag: Tag  # lexical category of the token
    value: str = ""  # text associated with the token; empty by default


@dataclass
class CoordsToken(Token):
    """Token that additionally records a start and an end index."""

    start_idx: int = 0
    end_idx: int = 0

    def __repr__(self):
        """Render the token as ``<tag> (<start>, <end>): <value>``."""
        span = f"({self.start_idx}, {self.end_idx})"
        return f"{self.tag} {span}: {self.value}"

In [306]:
# Lexer configuration. The values that Lexer reads are used as regular
# expressions and matched at the current input position.
config = {
    "open": "<",
    "close": ">",
    "groupstart": "|",  # currently not read by Lexer (its mapping is commented out)
    "groupend": "|",  # currently not read by Lexer (its mapping is commented out)
    "iter_start": "{",
    "iter_end": "}",
    "concat": "",  # currently not read by Lexer (its mapping is commented out)
    "assignment" : "\t",  # a tab character
    "alt": "\n",  # a newline; Lexer retags it as SEPARATOR when no "<" is pending
    "terminal": r"[a-z\/\-\+\*\(\)]",  # one lowercase letter or one of / - + * ( )
    "nonterminal": r"[A-Z]'?"  # one uppercase letter, optionally followed by '
}

In [307]:
class Lexer:
    """Performs lexical analysis of input data.

    The lexer is driven by a ``config`` dict that supplies the regular
    expressions of the individual token kinds (keys ``open``, ``close``,
    ``iter_start``, ``iter_end``, ``terminal``, ``alt``, ``assignment`` and
    ``nonterminal``).  Patterns are tried in the order in which they appear
    in ``_re_mapping``; the first one that matches at the current position
    wins.
    """

    def __init__(self, config: dict):
        """Build the pattern table from *config*.

        Raises:
            KeyError: if *config* lacks one of the keys listed on the class.
        """
        self.config = config
        # A comment or an axiom declaration on the very last line of the
        # input has no trailing newline, so both patterns accept either a
        # newline or the end of the input (\Z).
        self._axiom_pattern = r"<axiom <({})>>(?:\n|\Z)".format(config["nonterminal"])
        self._re_mapping = {
            r"'.*(?:\n|\Z)": Tag.COMMENT,
            self._axiom_pattern: Tag.AXIOM,
            config["open"]: Tag.OPEN,
            config["close"]: Tag.CLOSE,
            config["iter_start"]: Tag.ITERSTART,
            config["iter_end"]: Tag.ITEREND,
            config["terminal"]: Tag.TERM,
            config["alt"]: Tag.ALT,
            # Group and concat tokens are intentionally left disabled.  With
            # the config defined in this file "|" and "" are regexes that
            # match the empty string, so they could never consume input.
            # config["groupstart"]: Tag.GROUPSTART,
            # config["groupend"]: Tag.GROUPEND,
            config["assignment"]: Tag.ASSIGNMENT,
            # config["concat"]: Tag.CONCAT,
            config["nonterminal"]: Tag.NTERM,
        }

    def _match_token(self, input_str: str) -> Token:
        """Return the token found at the beginning of *input_str*.

        Args:
            input_str: the not yet consumed rest of the input; must not be
                empty.

        Returns:
            A ``Token`` for the first pattern that matches, or a
            ``Tag.UNMATCHED`` token holding the first character when no
            pattern matches.
        """
        for pattern, tag in self._re_mapping.items():
            matched = re.match(pattern, input_str)
            # A zero-width match must be ignored: accepting it would leave
            # the scan position unchanged and make tokenize() loop forever.
            if matched and matched.end() > 0:
                return Token(tag=tag, value=matched.group())

        return Token(tag=Tag.UNMATCHED, value=input_str[0])

    def tokenize(self, input_str: str) -> Queue:
        """Split *input_str* into a queue of ``CoordsToken`` objects.

        Whitespace that matches no pattern and comments are skipped.  An ALT
        token seen while no OPEN tag is pending is retagged as SEPARATOR.
        Any other unmatched character is emitted as a ``Tag.UNMATCHED``
        token.  The queue always ends with a single ``Tag.END`` token.

        Args:
            input_str: the complete text to analyse.

        Returns:
            A ``queue.Queue`` with the tokens in input order.
        """
        tokens = Queue()
        idx = 0
        open_tags = 0  # OPEN tokens seen minus CLOSE tokens seen so far

        while idx < len(input_str):
            token = self._match_token(input_str[idx:])
            end_idx = idx + len(token.value)
            is_blank = token.tag == Tag.UNMATCHED and token.value.isspace()

            if not (is_blank or token.tag == Tag.COMMENT):
                tag, value = token.tag, token.value
                if tag == Tag.AXIOM:
                    # Take the name from the capture group.  Searching the
                    # whole lexeme for the nonterminal pattern could hit the
                    # literal word "axiom" with a different configuration.
                    value = re.match(self._axiom_pattern, token.value).group(1)  # type: ignore
                elif tag == Tag.OPEN:
                    open_tags += 1
                elif tag == Tag.CLOSE:
                    open_tags -= 1
                elif tag == Tag.ALT and open_tags == 0:
                    tag = Tag.SEPARATOR
                tokens.put(CoordsToken(tag, value, idx, end_idx))

            idx = end_idx

        tokens.put(CoordsToken(Tag.END, "", idx + 1, idx + 1))
        return tokens

In [308]:
# Build a lexer from the configuration defined above.
lexer = Lexer(config)

# Sample grammar text used to exercise the lexer and the parser.  "\t"
# follows the name at the start of each "<...>" block; spaces are skipped by
# the lexer, newlines become ALT or SEPARATOR tokens.
test_str = """<E\t<T {<
            <+>
            <->
          > T}>>
<T\t<F {< 
            <*> 
            </>
          > F}>>
<F\t<n>
      <- F>
      <( E )>>"""

# Queue of CoordsToken objects terminated by a Tag.END token.
tokens = lexer.tokenize(test_str)

In [309]:
# while not (tokens.empty()):
#     print(tokens.get())

In [310]:
from dataclasses import dataclass
from typing import List, Optional, Union


@dataclass
class Empty:
    """Grammar symbol that carries a ``symbol`` string.

    NOTE(review): presumably stands for the empty string (epsilon); the only
    parser branch that would create it is commented out - confirm.
    """

    symbol: str


@dataclass
class Terminal:
    """Terminal grammar symbol; the parser creates one per TERM token."""

    symbol: str  # text of the terminal


@dataclass
class Nonterminal:
    """Nonterminal grammar symbol identified by its ``symbol`` string.

    Equality and hashing are written by hand and depend on ``symbol`` only,
    which keeps instances usable as set members and dict keys.
    """

    symbol: str

    def __hash__(self):
        """Hash exactly like the underlying ``symbol``."""
        return hash(self.symbol)

    def __eq__(self, other):
        """Equal to any other ``Nonterminal`` that has the same ``symbol``."""
        if not isinstance(other, Nonterminal):
            return False
        return self.symbol == other.symbol


@dataclass
class GroupNode:
    """Grouping node; the parser stores the parsed ``RHS`` of the group in it."""

    # Forward reference: RHS is declared further down in the file.
    value: Optional["RHS"]


@dataclass
class OptionalNode:
    """Node wrapping an optional part of a right-hand side.

    NOTE(review): nothing in this file constructs an OptionalNode; the
    payload is assumed to be an ``RHS`` like the one the parser stores in
    GroupNode and IterNode - confirm.
    """

    # Forward reference: RHS is declared further down in the file.
    value: Optional["RHS"]


@dataclass
class IterNode:
    """Iteration node; the parser stores the ``RHS`` found between the
    ITERSTART and ITEREND tokens in it."""

    # Forward reference: RHS is declared further down in the file.
    value: Optional["RHS"]


@dataclass
class AltNode:
    """One alternative of a right-hand side: the list of its factors in order."""

    nodes: List[Union[Empty, Terminal, Nonterminal, GroupNode, OptionalNode, IterNode]]


@dataclass
class RHS:
    """Right-hand side of a rule: the list of its alternatives."""

    nodes: List[AltNode]


@dataclass
class Rule:
    """A parsed rule: a nonterminal together with its right-hand side."""

    lhs: Nonterminal  # the nonterminal being defined
    rhs: RHS  # the alternatives it expands to


@dataclass
class CFGRule:
    """A rule whose right-hand side is a flat list of symbols.

    NOTE(review): not used by any code visible in this file; presumably the
    target form of a conversion from ``Rule`` - confirm.
    """

    lhs: Nonterminal
    rhs: List[Union[Empty, Terminal, Nonterminal]]


@dataclass
class CFGrammar:
    """A grammar made of a start nonterminal and a list of ``CFGRule`` objects.

    NOTE(review): not used by any code visible in this file.
    """

    start: Nonterminal
    rules: List[CFGRule]

In [311]:
class Parser:
    """Performs syntax analysis of input data."""

    def parse(self, tokens: Queue):
        """Parse a token queue into a list of rules.

        A recursive-descent parser.  Tokens are consumed from *tokens* (the
        queue is modified in place); the upcoming token is inspected through
        ``tokens.queue[0]`` without being removed.

        Grammar recognised by the code as written::

            rules  := rule (SEPARATOR rule)*
            rule   := OPEN NTERM ASSIGNMENT rhs
            rhs    := term (ALT term)*
            term   := factor (CONCAT factor | factor)*
            factor := TERM | ALT rhs GROUPEND | ITERSTART rhs ITEREND

        NOTE(review): known gaps, kept as they are because the intended
        grammar cannot be derived from this file:
          * a factor may not start with OPEN or GROUPSTART although the
            lookahead in ``rhs_term`` lets them through, so input with a
            "<" after the assignment fails with "Wrong Factor <Tag.OPEN>";
          * there is no factor branch for NTERM (nonterminal) or for an
            empty symbol;
          * the group branch is entered on ALT but then requires GROUPEND,
            a tag that the Lexer in this file never emits.

        Args:
            tokens: queue of tokens terminated by a ``Tag.END`` token.

        Returns:
            A tuple ``(rules, nonterminals)``: the list of ``Rule`` objects
            and the set of ``Nonterminal`` objects seen on left-hand sides.

        Raises:
            AssertionError: if a required token is missing.
            Exception: if a factor starts with an unsupported token.
        """
        nonterminals = set()
        # Tags that let rhs_term() continue with another factor even though
        # no explicit CONCAT token is present.
        factor_start = (Tag.TERM, Tag.OPEN, Tag.GROUPSTART, Tag.ITERSTART)

        def peek_tag():
            # Look at the next token without consuming it.
            return tokens.queue[0].tag

        def expect(tag):
            # Consume the next token and make sure it carries `tag`.  The
            # token is fetched outside of an `assert` statement: `python -O`
            # strips asserts, and the fetch would be stripped with them.
            # AssertionError is kept so that existing callers see the same
            # exception type.
            token = tokens.get()
            if token.tag != tag:
                raise AssertionError(f"Expected {tag}, got {token!r}")
            return token

        def rules():
            parsed = [rule()]
            while peek_tag() == Tag.SEPARATOR:
                tokens.get()
                parsed.append(rule())
            return parsed

        def rule():
            expect(Tag.OPEN)
            lhs = expect(Tag.NTERM)
            nonterminals.add(Nonterminal(lhs.value))
            expect(Tag.ASSIGNMENT)
            rhs_ = rhs()
            # TODO: the closing tag of the rule is not consumed here.
            return Rule(Nonterminal(lhs.value), rhs_)

        def rhs():
            terms = [rhs_term()]
            while peek_tag() == Tag.ALT:
                tokens.get()
                terms.append(rhs_term())
            return RHS(terms)

        def rhs_term():
            factors = [rhs_factor()]
            while True:
                tag = peek_tag()
                if tag == Tag.CONCAT:
                    tokens.get()
                elif tag not in factor_start:
                    break
                factors.append(rhs_factor())
            return AltNode(factors)

        def rhs_factor():
            token = tokens.get()
            # TODO: factor branches for an empty symbol and for a
            # nonterminal (OPEN NTERM ...) are not implemented yet.
            if token.tag == Tag.TERM:
                return Terminal(token.value)
            if token.tag == Tag.ALT:
                rhs_ = rhs()
                expect(Tag.GROUPEND)
                return GroupNode(rhs_)
            if token.tag == Tag.ITERSTART:
                rhs_ = rhs()
                expect(Tag.ITEREND)
                return IterNode(rhs_)
            raise Exception(f"Wrong Factor <{token.tag}> with value <{token.value}>")

        rules_ = rules()
        return rules_, nonterminals

In [312]:
# Parse the token queue into the rule list and the set of nonterminals.
# NOTE(review): the output recorded for this cell shows the call raising
# "Wrong Factor <Tag.OPEN>" for the sample grammar.
rules, nont = Parser().parse(tokens=tokens)

Exception: Wrong Factor <Tag.OPEN> with value <<>

In [None]:
# Print every parsed rule, one per line.
for rule in rules:
    print(rule)

Rule(lhs=Nonterminal(symbol='E'), rhs=RHS(nodes=[AltNode(nodes=[Nonterminal(symbol='{')])]))
