Aula 2 - Construindo derivação e árvores sintáticas

Célula 1 — Base: Lexer + Parser (LL(1)) + AST + impressão

In [2]:
#@title Base: Lexer + Parser (LL(1)) + AST + impressão
import re
from dataclasses import dataclass
from typing import List, Optional

# ----------------- Lexer -----------------
@dataclass
class Token:
    """A lexical token produced by ``lex``."""
    kind: str    # 'NUM','ID','PLUS','MINUS','MUL','DIV','LPAREN','RPAREN','INVALID','EOF'
    lexeme: str  # exact source text of the token ('' for EOF)
    pos: int     # 0-based index of the token's first character in the source

# Ordered (kind, regex) pairs; alternatives are tried left to right.
_token_spec = [
    ('NUM',   r'\d+'),
    ('ID',    r'[a-zA-Z_][a-zA-Z0-9_]*'),
    ('PLUS',  r'\+'),
    ('MINUS', r'-'),
    ('MUL',   r'\*'),
    ('DIV',   r'/'),
    ('LPAREN',r'\('),
    ('RPAREN',r'\)'),
    ('WS',    r'[ \t\n\r]+'),
    # Catch-all, must stay last: without it finditer() silently skips any
    # character no rule matches, so "2 $ + 3" would be accepted as "2 + 3".
    ('INVALID', r'.'),
]
# DOTALL makes the catch-all match every remaining character, newlines included.
_token_re = re.compile('|'.join(f'(?P<{k}>{p})' for k,p in _token_spec), re.DOTALL)

def lex(src: str) -> List[Token]:
    """Split ``src`` into tokens, always ending with an 'EOF' token.

    Whitespace is dropped. Every character that belongs to no token class is
    returned as a one-character 'INVALID' token (instead of being skipped),
    so a consumer can report it at its exact position.

    Args:
        src: source text of the expression.

    Returns:
        The tokens in source order; the last one is ``Token('EOF', '', len(src))``.
    """
    toks = [
        Token(m.lastgroup, m.group(), m.start())
        for m in _token_re.finditer(src)
        if m.lastgroup != 'WS'
    ]
    toks.append(Token('EOF', '', len(src)))
    return toks

# ------------- Parse Tree (derivação) -------------
class PTNode:
    """Node of the derivation tree (parse tree).

    Attributes:
        kind: 'NT' (non-terminal), 'T' (terminal) or 'EPS' (epsilon).
        symbol: grammar symbol or token name stored in the node.
        lexeme: source text attached to the node, or ``None`` when not given.
        children: child nodes, in the order they were added.
    """

    def __init__(self, kind: str, symbol: str, lexeme: Optional[str]=None):
        self.kind, self.symbol, self.lexeme = kind, symbol, lexeme
        self.children: List['PTNode'] = []

    def add(self, *kids: 'PTNode'):
        """Append ``kids`` to the children, in order, and return ``self`` for chaining."""
        for kid in kids:
            self.children.append(kid)
        return self

# ------------------- AST -------------------
class AST:
    """Common base class of all abstract-syntax-tree nodes."""


class Num(AST):
    """Integer literal."""

    def __init__(self, value: int):
        self.value = value

    def __repr__(self):
        return "Num({})".format(self.value)


class Id(AST):
    """Identifier reference."""

    def __init__(self, name: str):
        self.name = name

    def __repr__(self):
        return "Id({})".format(self.name)


class Bin(AST):
    """Binary operation ``left op right``."""

    def __init__(self, op: str, left: AST, right: AST):
        self.op = op
        self.left = left
        self.right = right

    def __repr__(self):
        return "Bin('{}', {}, {})".format(self.op, self.left, self.right)

# ----------------- Parser LL(1) -----------------
class ParseError(Exception):
    """Raised when the token stream does not match the grammar."""

class Parser:
    """Recursive-descent LL(1) parser that builds a parse tree and an AST together.

    Grammar (left recursion removed):
        E  → T E'
        E' → '+' T E' | '-' T E' | ε
        T  → F T'
        T' → '*' F T' | '/' F T' | ε
        F  → '(' E ')' | num | id

    ``parse_E``, ``parse_T`` and ``parse_F`` return ``(PTNode, AST)``.
    ``parse_Ep`` and ``parse_Tp`` return ``(PTNode, fold)`` where ``fold`` takes
    the AST of the operand on the left and returns the combined AST; chaining
    the folds this way makes the operators left-associative.
    """

    def __init__(self, tokens: List[Token]):
        self.toks = tokens
        self.i = 0

    def peek(self):
        """Return the current token without consuming it."""
        return self.toks[self.i]

    def eat(self, kind: str):
        """Consume and return the current token; raise ParseError if its kind is not ``kind``."""
        tok = self.peek()
        if tok.kind == kind:
            self.i += 1
            return tok
        raise ParseError(f"Esperado {kind}, veio {tok.kind} em pos {tok.pos}")

    def parse(self):
        """Parse a whole expression followed by EOF; return ``(parse_tree, ast)``."""
        tree, ast = self.parse_E()
        self.eat('EOF')
        return tree, ast

    def parse_E(self):
        """E → T E'"""
        term_tree, term_ast = self.parse_T()
        rest_tree, rest_fold = self.parse_Ep()
        tree = PTNode('NT', 'E').add(term_tree, rest_tree)
        return tree, rest_fold(term_ast)

    def parse_Ep(self):
        """E' → '+' T E' | '-' T E' | ε"""
        tree = PTNode('NT', "E'")
        look = self.peek().kind
        if look not in ('PLUS', 'MINUS'):
            # ε-production: nothing consumed, the left operand passes through unchanged
            tree.add(PTNode('EPS', 'ε'))
            return tree, (lambda x: x)
        op_tok = self.eat(look)
        term_tree, term_ast = self.parse_T()
        rest_tree, rest_fold = self.parse_Ep()
        tree.add(PTNode('T', op_tok.kind, op_tok.lexeme), term_tree, rest_tree)
        symbol = '+' if op_tok.kind == 'PLUS' else '-'

        def fold(lhs):
            # combine with the left operand first, then let the tail continue
            return rest_fold(Bin(symbol, lhs, term_ast))

        return tree, fold

    def parse_T(self):
        """T → F T'"""
        factor_tree, factor_ast = self.parse_F()
        rest_tree, rest_fold = self.parse_Tp()
        tree = PTNode('NT', 'T').add(factor_tree, rest_tree)
        return tree, rest_fold(factor_ast)

    def parse_Tp(self):
        """T' → '*' F T' | '/' F T' | ε"""
        tree = PTNode('NT', "T'")
        look = self.peek().kind
        if look not in ('MUL', 'DIV'):
            tree.add(PTNode('EPS', 'ε'))
            return tree, (lambda x: x)
        op_tok = self.eat(look)
        factor_tree, factor_ast = self.parse_F()
        rest_tree, rest_fold = self.parse_Tp()
        tree.add(PTNode('T', op_tok.kind, op_tok.lexeme), factor_tree, rest_tree)
        symbol = '*' if op_tok.kind == 'MUL' else '/'

        def fold(lhs):
            return rest_fold(Bin(symbol, lhs, factor_ast))

        return tree, fold

    def parse_F(self):
        """F → '(' E ')' | num | id"""
        tree = PTNode('NT', 'F')
        tok = self.peek()
        if tok.kind == 'NUM':
            num = self.eat('NUM')
            tree.add(PTNode('T', 'num', num.lexeme))
            return tree, Num(int(num.lexeme))
        if tok.kind == 'ID':
            ident = self.eat('ID')
            tree.add(PTNode('T', 'id', ident.lexeme))
            return tree, Id(ident.lexeme)
        if tok.kind == 'LPAREN':
            opening = self.eat('LPAREN')
            inner_tree, inner_ast = self.parse_E()
            closing = self.eat('RPAREN')
            tree.add(
                PTNode('T', 'LPAREN', opening.lexeme),
                inner_tree,
                PTNode('T', 'RPAREN', closing.lexeme),
            )
            # parentheses only group: the AST is that of the inner expression
            return tree, inner_ast
        raise ParseError(f"Token inesperado {tok.kind} em pos {tok.pos}")

# --------- Utilitários de impressão ----------
def parse_input(src: str):
    """Lex and parse ``src``; return ``(tokens, parse_tree, ast)``."""
    tokens = lex(src)
    tree, ast = Parser(tokens).parse()
    return tokens, tree, ast

def print_parse_tree(node: PTNode, prefix: str = "", is_last: bool = True):
    """Print the subtree rooted at ``node`` as an indented box-drawing tree.

    Args:
        node: root of the subtree to print.
        prefix: indentation accumulated from the ancestors.
        is_last: whether ``node`` is the last child of its parent; selects
            the connector drawn in front of it.
    """
    # terminals that carry a lexeme are shown as symbol('lexeme')
    if node.kind == 'T' and node.lexeme is not None:
        label = f"{node.symbol}('{node.lexeme}')"
    else:
        label = node.symbol
    connector = "└── " if is_last else "├── "
    print(prefix + connector + label)

    child_prefix = prefix + ("    " if is_last else "│   ")
    last_index = len(node.children) - 1
    for index, child in enumerate(node.children):
        print_parse_tree(child, child_prefix, index == last_index)

def ast_to_infix(ast: AST) -> str:
    """Render ``ast`` as a fully parenthesised infix string.

    Nodes that are not ``Num``, ``Id`` or ``Bin`` are rendered as ``"<??>"``.
    """
    if isinstance(ast, Bin):
        lhs = ast_to_infix(ast.left)
        rhs = ast_to_infix(ast.right)
        return f"({lhs} {ast.op} {rhs})"
    if isinstance(ast, Id):
        return ast.name
    if isinstance(ast, Num):
        return str(ast.value)
    return "<??>"

def print_ast(ast: AST):
    """Print the AST's repr followed by its parenthesised infix form."""
    print(f"AST: {ast}")
    print(f"Infixo: {ast_to_infix(ast)}")

# Feedback for the notebook user: reaching this line means the cell ran to the end.
print("✓ Base pronta: use as próximas células para LMD/RMD e exemplos.")


✓ Base pronta: use as próximas células para LMD/RMD e exemplos.


Célula 2 — LMD/RMD (passo a passo) + exemplos prontos

In [3]:
#@title LMD/RMD passo a passo + exemplos prontos
# Non-terminal symbols of the grammar
NT_SET = {'E', "E'", 'T', "T'", 'F'}

def pt_to_sentential(node: PTNode):
    """Return the symbols that ``node``'s expansion contributes to a sentential form.

    Non-terminal children contribute their symbol. Terminal children
    contribute the operator/parenthesis character for their token name, or
    the symbol itself when it has no such spelling ('num', 'id'). Children of
    any other kind (ε) contribute nothing.
    """
    spelling = {'PLUS':'+','MINUS':'-','MUL':'*','DIV':'/','LPAREN':'(','RPAREN':')'}
    return [
        child.symbol if child.kind == 'NT' else spelling.get(child.symbol, child.symbol)
        for child in node.children
        if child.kind in ('NT', 'T')
    ]

def derivation_steps(root: PTNode, strategy='LMD'):
    """Replay the parse tree as a derivation and return its sentential forms.

    Args:
        root: root of the parse tree; the first form is always ``'E'``.
        strategy: ``'LMD'`` expands the leftmost non-terminal at each step;
            any other value expands the rightmost one.

    Returns:
        List of strings, one per sentential form, symbols separated by spaces.
    """
    form = ['E']        # current sentential form (start symbol)
    frontier = [root]   # frontier[k] is the tree node behind form[k]
    history = [' '.join(form)]
    leftmost = strategy == 'LMD'
    while True:
        pending = [idx for idx, sym in enumerate(form) if sym in NT_SET]
        if not pending:
            return history
        at = pending[0] if leftmost else pending[-1]
        expanding = frontier[at]
        # replace the chosen symbol by its expansion, keeping both lists aligned
        # (ε children vanish from the form and from the frontier alike)
        form[at:at + 1] = pt_to_sentential(expanding)
        frontier[at:at + 1] = [c for c in expanding.children if c.kind in ('NT', 'T')]
        history.append(' '.join(form))

def show(src: str):
    """Parse ``src`` and print its LMD, RMD, parse tree and AST."""
    print(f"\nEntrada: {src}")
    _, tree, ast = parse_input(src)
    leftmost = derivation_steps(tree, 'LMD')
    rightmost = derivation_steps(tree, 'RMD')

    sections = (
        ("\nLMD (Leftmost Derivation):", leftmost),
        ("\nRMD (Rightmost Derivation):", rightmost),
    )
    for heading, steps in sections:
        print(heading)
        for number, sentential in enumerate(steps):
            print(f"{number:02d}: {sentential}")

    print("\nÁrvore de derivação (parse tree):")
    print_parse_tree(tree)
    print("\nAST:")
    print_ast(ast)

# Ready-made examples: each one is run through show() when the cell executes
examples = [
    "2 + 3 * 4",
    "(2 + 3) * 4",
    "x * (y + 1) - 5"
]
for ex in examples:
    show(ex)

print("\n✓ Exemplos prontos executados. Use a próxima célula para testar livremente.")



Entrada: 2 + 3 * 4

LMD (Leftmost Derivation):
00: E
01: T E'
02: F T' E'
03: num T' E'
04: num E'
05: num + T E'
06: num + F T' E'
07: num + num T' E'
08: num + num * F T' E'
09: num + num * num T' E'
10: num + num * num E'
11: num + num * num

RMD (Rightmost Derivation):
00: E
01: T E'
02: T + T E'
03: T + T
04: T + F T'
05: T + F * F T'
06: T + F * F
07: T + F * num
08: T + num * num
09: F T' + num * num
10: F + num * num
11: num + num * num

Árvore de derivação (parse tree):
└── E
    ├── T
    │   ├── F
    │   │   └── num('2')
    │   └── T'
    │       └── ε
    └── E'
        ├── PLUS('+')
        ├── T
        │   ├── F
        │   │   └── num('3')
        │   └── T'
        │       ├── MUL('*')
        │       ├── F
        │       │   └── num('4')
        │       └── T'
        │           └── ε
        └── E'
            └── ε

AST:
AST: Bin('+', Num(2), Bin('*', Num(3), Num(4)))
Infixo: (2 + (3 * 4))

Entrada: (2 + 3) * 4

LMD (Leftmost Derivation):
00: E
01: T E'
02: F T

Célula 3 — Teste livre (sua expressão em uma linha)

In [4]:
#@title Teste livre (edite e execute)
# Expression to analyse; edit the string and re-run the cell.
# NOTE(review): the trailing "#@param" marker looks like a notebook form annotation — keep it intact.
expr = "2 + 3 * (x - 1)"  #@param {type:"string"}
try:
    show(expr)
except Exception as e:
    # Print any failure as a one-line message instead of a traceback.
    print("Erro:", e)



Entrada: 2 + 3 * (x - 1)

LMD (Leftmost Derivation):
00: E
01: T E'
02: F T' E'
03: num T' E'
04: num E'
05: num + T E'
06: num + F T' E'
07: num + num T' E'
08: num + num * F T' E'
09: num + num * ( E ) T' E'
10: num + num * ( T E' ) T' E'
11: num + num * ( F T' E' ) T' E'
12: num + num * ( id T' E' ) T' E'
13: num + num * ( id E' ) T' E'
14: num + num * ( id - T E' ) T' E'
15: num + num * ( id - F T' E' ) T' E'
16: num + num * ( id - num T' E' ) T' E'
17: num + num * ( id - num E' ) T' E'
18: num + num * ( id - num ) T' E'
19: num + num * ( id - num ) E'
20: num + num * ( id - num )

RMD (Rightmost Derivation):
00: E
01: T E'
02: T + T E'
03: T + T
04: T + F T'
05: T + F * F T'
06: T + F * F
07: T + F * ( E )
08: T + F * ( T E' )
09: T + F * ( T - T E' )
10: T + F * ( T - T )
11: T + F * ( T - F T' )
12: T + F * ( T - F )
13: T + F * ( T - num )
14: T + F * ( F T' - num )
15: T + F * ( F - num )
16: T + F * ( id - num )
17: T + num * ( id - num )
18: F T' + num * ( id - num )
19: F 

Aula 5 - Construindo árvore sintática manualmente

Célula 1 — Base: Lexer + AST + Parser (recursivo-descendente)

In [5]:
#@title Base: Lexer + AST + Parser (recursivo-descendente)
import re
from dataclasses import dataclass

# ===================== LEXER =====================

@dataclass
class Token:
    """A lexical token produced by ``lex``."""
    kind: str    # 'NUM','ID','PLUS','MINUS','MUL','DIV','LPAREN','RPAREN','INVALID','EOF'
    lexeme: str  # exact source text of the token ('' for EOF)
    pos: int     # index in the text (0-based)

# Ordered (kind, regex) pairs; alternatives are tried left to right.
_token_spec = [
    ('NUM',    r'\d+'),
    ('ID',     r'[a-zA-Z_][a-zA-Z0-9_]*'),
    ('PLUS',   r'\+'),
    ('MINUS',  r'-'),
    ('MUL',    r'\*'),
    ('DIV',    r'/'),
    ('LPAREN', r'\('),
    ('RPAREN', r'\)'),
    ('WS',     r'[ \t\n\r]+'),
    # Catch-all, must stay last: without it finditer() silently skips any
    # character no rule matches, so "2 $ + 3" would be accepted as "2 + 3".
    ('INVALID', r'.'),
]
# DOTALL makes the catch-all match every remaining character, newlines included.
_token_re = re.compile('|'.join(f'(?P<{k}>{p})' for k,p in _token_spec), re.DOTALL)

def lex(src: str):
    """Split ``src`` into a list of tokens, always ending with an 'EOF' token.

    Whitespace is dropped. Every character that belongs to no token class is
    returned as a one-character 'INVALID' token (instead of being skipped),
    so a consumer can report it at its exact position.

    Args:
        src: source text of the expression.

    Returns:
        The tokens in source order; the last one is ``Token('EOF', '', len(src))``.
    """
    toks = [
        Token(m.lastgroup, m.group(), m.start())
        for m in _token_re.finditer(src)
        if m.lastgroup != 'WS'
    ]
    toks.append(Token('EOF', '', len(src)))
    return toks

# Utility for error messages with line/column information and a ^ marker
def show_error(src: str, pos: int, msg: str):
    """Print a syntax-error report for character offset ``pos`` of ``src``.

    Three lines are printed: a header with the 1-based column (preceded by
    the 1-based line number when ``src`` spans several lines), the source
    line containing ``pos``, and a caret under the offending column.

    Args:
        src: full source text.
        pos: 0-based offset of the error within ``src``.
        msg: description appended to the header.
    """
    # locate the line that contains pos
    line_start = src.rfind('\n', 0, pos) + 1
    line_end = src.find('\n', pos)
    if line_end == -1:
        line_end = len(src)
    line = src[line_start:line_end]
    col = pos - line_start

    if '\n' in src:
        # a column alone is ambiguous in multi-line input
        line_no = src.count('\n', 0, pos) + 1
        where = f"linha {line_no}, coluna {col+1}"
    else:
        where = f"coluna {col+1}"
    print(f"Erro de sintaxe na {where}: {msg}")
    print(line)
    # Repeat the line's own tabs in the padding so the caret stays under the
    # right character however wide a tab is displayed.
    pad = ''.join(ch if ch == '\t' else ' ' for ch in line[:col])
    print(pad.ljust(col) + '^')

# ====================== AST ======================

# Common base class of the AST node types below.
class AST: pass

@dataclass
class Num(AST):
    """Integer literal leaf."""
    value: int  # numeric value of the literal

@dataclass
class Id(AST):
    """Identifier leaf."""
    name: str  # identifier text

@dataclass
class Bin(AST):
    """Binary operation node."""
    op: str     # operator symbol
    left: AST   # left operand subtree
    right: AST  # right operand subtree

def to_infix(ast: AST) -> str:
    """Render ``ast`` as a fully parenthesised infix string.

    Nodes that are not ``Num``, ``Id`` or ``Bin`` are rendered as ``"<?>"``.
    """
    if isinstance(ast, Bin):
        lhs = to_infix(ast.left)
        rhs = to_infix(ast.right)
        return f"({lhs} {ast.op} {rhs})"
    if isinstance(ast, Id):
        return ast.name
    if isinstance(ast, Num):
        return str(ast.value)
    return "<?>"

# ==================== PARSER =====================

class ParseError(Exception):
    """Syntax error; raised with a single ``(pos, message)`` tuple as argument."""

class Parser:
    """
    Recursive-descent parser that builds the AST directly.

    Grammar (no left recursion):
      Expr   → Term (('+'|'-') Term)*
      Term   → Factor (('*'|'/') Factor)*
      Factor → '(' Expr ')' | NUM | ID

    Errors are raised as ``ParseError((pos, message))``.
    """

    def __init__(self, src: str):
        self.src = src
        self.toks = lex(src)
        self.i = 0

    def peek(self):
        """Return the current token without consuming it."""
        return self.toks[self.i]

    def eat(self, kind: str):
        """Consume and return the current token; raise ParseError if its kind is not ``kind``."""
        tok = self.peek()
        if tok.kind == kind:
            self.i += 1
            return tok
        raise ParseError((tok.pos, f"esperado {kind}; veio {tok.kind}"))

    def parse(self) -> AST:
        """Parse one complete expression and require EOF right after it."""
        tree = self.parse_expr()
        self.eat('EOF')
        return tree

    # Expr -> Term (('+'|'-') Term)*
    def parse_expr(self) -> AST:
        node = self.parse_term()
        while True:
            kind = self.peek().kind
            if kind == 'PLUS':
                symbol = '+'
            elif kind == 'MINUS':
                symbol = '-'
            else:
                return node
            self.eat(kind)
            # the tree built so far becomes the left operand: left associativity
            node = Bin(symbol, node, self.parse_term())

    # Term -> Factor (('*'|'/') Factor)*
    def parse_term(self) -> AST:
        node = self.parse_factor()
        while True:
            kind = self.peek().kind
            if kind == 'MUL':
                symbol = '*'
            elif kind == 'DIV':
                symbol = '/'
            else:
                return node
            self.eat(kind)
            node = Bin(symbol, node, self.parse_factor())

    # Factor -> '(' Expr ')' | NUM | ID
    def parse_factor(self) -> AST:
        tok = self.peek()
        if tok.kind == 'NUM':
            return Num(int(self.eat('NUM').lexeme))
        if tok.kind == 'ID':
            return Id(self.eat('ID').lexeme)
        if tok.kind != 'LPAREN':
            # unexpected token at the start of Factor
            raise ParseError((tok.pos, f"token inesperado em Factor: {tok.kind}"))
        self.eat('LPAREN')
        inner = self.parse_expr()
        closing = self.peek()
        if closing.kind != 'RPAREN':
            # explicit check so the message says a closing parenthesis was expected
            raise ParseError((closing.pos, f"esperado RPAREN; veio {closing.kind}"))
        self.eat('RPAREN')
        return inner

def parse_text(src: str):
    """Parse ``src`` without raising on syntax errors.

    Returns:
        ``(ast, None)`` on success, or ``(None, (pos, msg))`` when the parser
        raised ``ParseError``.
    """
    parser = Parser(src)
    try:
        return parser.parse(), None
    except ParseError as exc:
        position, message = exc.args[0]
        return None, (position, message)


Célula 2 — Exemplos prontos (AST + forma infixa + erros claros)

In [6]:
#@title Exemplos prontos (AST + infixo + erros)
def demo(src: str):
    """Parse ``src`` and print either its AST and infix form or a syntax-error report."""
    print("="*60)
    print("Entrada:", src)
    ast, err = parse_text(src)
    if err is not None:
        position, message = err
        show_error(src, position, message)
        return
    print("AST:", ast)
    print("Infixo:", to_infix(ast))

# Cases that illustrate the principles:
# 1) Precedence: * before +
demo("2 + 3 * 4")       # expected: (2 + (3 * 4))
# 2) Parentheses change the order
demo("(2 + 3) * 4")     # expected: ((2 + 3) * 4)
# 3) Mixed identifiers and numbers
demo("x * (y + 1) - 5") # expected: ((x * (y + 1)) - 5)
# 4) Simple error (unexpected token)
demo("2 + * 3")         # expected: clear message pointing at '*'
# 5) Missing ')'
demo("(2 + 3 * 4")      # expected: clear message asking for ')'


Entrada: 2 + 3 * 4
AST: Bin(op='+', left=Num(value=2), right=Bin(op='*', left=Num(value=3), right=Num(value=4)))
Infixo: (2 + (3 * 4))
Entrada: (2 + 3) * 4
AST: Bin(op='*', left=Bin(op='+', left=Num(value=2), right=Num(value=3)), right=Num(value=4))
Infixo: ((2 + 3) * 4)
Entrada: x * (y + 1) - 5
AST: Bin(op='-', left=Bin(op='*', left=Id(name='x'), right=Bin(op='+', left=Id(name='y'), right=Num(value=1))), right=Num(value=5))
Infixo: ((x * (y + 1)) - 5)
Entrada: 2 + * 3
Erro de sintaxe na coluna 5: token inesperado em Factor: MUL
2 + * 3
    ^
Entrada: (2 + 3 * 4
Erro de sintaxe na coluna 11: esperado RPAREN; veio EOF
(2 + 3 * 4
          ^
