Ferramentas modernas (LLVM e Clang)

In [1]:
# Gera IR "estilo LLVM" para funções aritméticas simples e executa um intérprete didático.
from dataclasses import dataclass
from typing import List, Tuple, Dict

# AST mínima: soma/sub/mul e retorno
@dataclass
class Func: name:str; params:List[str]; body:List[Tuple[str,Tuple]]
# Instruções: ("add", dst, a, b) | ("sub", dst, a, b) | ("mul", dst, a, b) | ("ret", v)

def emit_ir(f: Func) -> str:
    head = f"define i32 @{f.name}({', '.join('i32 %'+p for p in f.params)}) {{"
    lines = [head]
    for op in f.body:
        k, args = op[0], op[1]
        if k in ("add","sub","mul"):
            dst,a,b = args
            opcode = {"add":"add","sub":"sub","mul":"mul"}[k]
            lines.append(f"  %{dst} = {opcode} i32 %{a}, %{b}")
        elif k == "ret":
            v, = args
            lines.append(f"  ret i32 %{v}")
    lines.append("}")
    return "\n".join(lines)

def exec_ir(f: Func, *argv:int) -> int:
    env: Dict[str,int] = {p:v for p,v in zip(f.params, argv)}
    tmp: Dict[str,int] = {}
    for op in f.body:
        k, args = op[0], op[1]
        if k == "add":
            dst,a,b = args; tmp[dst] = env.get(a, tmp.get(a)) + env.get(b, tmp.get(b))
        elif k == "sub":
            dst,a,b = args; tmp[dst] = env.get(a, tmp.get(a)) - env.get(b, tmp.get(b))
        elif k == "mul":
            dst,a,b = args; tmp[dst] = env.get(a, tmp.get(a)) * env.get(b, tmp.get(b))
        elif k == "ret":
            v, = args; return env.get(v, tmp.get(v))
    raise RuntimeError("Função sem ret")

# Demonstração: soma e combinação
f1 = Func("soma", ["a","b"], [("add",("t1","a","b")), ("ret",("t1",))])
f2 = Func("expr", ["x","y"], [("mul",("t1","x","y")), ("sub",("t2","t1","x")), ("ret",("t2",))])

print("IR de 'soma':\n", emit_ir(f1), sep="")
print("\nExec soma(3,4) →", exec_ir(f1,3,4), "(esperado 7)")
print("\nIR de 'expr':\n", emit_ir(f2), sep="")
print("\nExec expr(5,2) →", exec_ir(f2,5,2), "(esperado 5)")

IR de 'soma':
define i32 @soma(i32 %a, i32 %b) {
  %t1 = add i32 %a, %b
  ret i32 %t1
}

Exec soma(3,4) → 7 (esperado 7)

IR de 'expr':
define i32 @expr(i32 %x, i32 %y) {
  %t1 = mul i32 %x, %y
  %t2 = sub i32 %t1, %x
  ret i32 %t2
}

Exec expr(5,2) → 5 (esperado 5)


Análise estática de código

In [2]:
# Linguagem mini: let, assign, print, if, while, blocos { } e expr inteiras.
# Gera CFG, roda reachability + constant propagation, e emite diagnósticos.
import re
from dataclasses import dataclass
from typing import List, Optional, Dict, Tuple

# ---------- Lexer ----------
TOKS = [
    ("LET", r"let\b"), ("IF", r"if\b"), ("ELSE", r"else\b"),
    ("WHILE", r"while\b"), ("PRINT", r"print\b"),
    ("LBR", r"\{"), ("RBR", r"\}"), ("LP", r"\("), ("RP", r"\)"), ("SEMI", r";"),
    ("ASSIGN", r"="), ("PLUS", r"\+"), ("MINUS", r"-"), ("MUL", r"\*"), ("DIV", r"/"),
    ("NUM", r"\d+"),
    ("ID", r"[A-Za-z_]\w*"),
    ("WS", r"[ \t\r\n]+"), ("COM", r"//[^\n]*"),
]
MASTER = re.compile("|".join(f"(?P<{n}>{p})" for n,p in TOKS))

@dataclass
class Tok: t:str; v:str; line:int; col:int
def lex(src:str)->List[Tok]:
    line=1; line_start=0; out=[]
    for m in MASTER.finditer(src):
        k=m.lastgroup; v=m.group(); col=m.start()-line_start+1
        if k in ("WS","COM"):
            for i,ch in enumerate(v):
                if ch=="\n": line+=1; line_start=m.start()+i+1
            continue
        out.append(Tok(k,v,line,col))
        if "\n" in v:
            for i,ch in enumerate(v):
                if ch=="\n": line+=1; line_start=m.start()+i+1
    out.append(Tok("EOF","",line,1)); return out

# ---------- AST ----------
@dataclass
class Node: line:int; col:int
@dataclass
class Program(Node): body:List['Stmt']
@dataclass
class Block(Node): body:List['Stmt']
@dataclass
class Let(Node): name:str; expr:'Expr'
@dataclass
class Assign(Node): name:str; expr:'Expr'
@dataclass
class Print(Node): expr:'Expr'
@dataclass
class If(Node): cond:'Expr'; then:Block; els:Optional[Block]
@dataclass
class While(Node): cond:'Expr'; body:Block

@dataclass
class Expr(Node): pass
@dataclass
class Num(Expr): val:int
@dataclass
class Var(Expr): name:str
@dataclass
class Bin(Expr): op:str; l:Expr; r:Expr

# ---------- Parser ----------
class P:
    def __init__(self,toks): self.t=toks; self.i=0
    def cur(self): return self.t[self.i]
    def eat(self,k):
        if self.cur().t==k: tok=self.cur(); self.i+=1; return tok
        raise SyntaxError(f"esperado {k}, obtido {self.cur().t} em {self.cur().line}:{self.cur().col}")
    def parse(self)->Program:
        body=[];
        while self.cur().t!="EOF": body.append(self.stmt())
        return Program(1,1,body)
    def block(self)->Block:
        l=self.eat("LBR"); body=[]
        while self.cur().t!="RBR": body.append(self.stmt())
        self.eat("RBR"); return Block(l.line,l.col,body)
    def stmt(self):
        t=self.cur()
        if t.t=="LET":
            self.eat("LET"); name=self.eat("ID"); self.eat("ASSIGN"); e=self.expr(); self.eat("SEMI")
            return Let(t.line,t.col,name.v,e)
        if t.t=="ID" and self.t[self.i+1].t=="ASSIGN":
            name=self.eat("ID"); self.eat("ASSIGN"); e=self.expr(); self.eat("SEMI")
            return Assign(t.line,t.col,name.v,e)
        if t.t=="PRINT":
            self.eat("PRINT"); e=self.expr(); self.eat("SEMI"); return Print(t.line,t.col,e)
        if t.t=="IF":
            self.eat("IF"); self.eat("LP"); c=self.expr(); self.eat("RP"); th=self.block(); el=None
            if self.cur().t=="ELSE": self.eat("ELSE"); el=self.block()
            return If(t.line,t.col,c,th,el)
        if t.t=="WHILE":
            self.eat("WHILE"); self.eat("LP"); c=self.expr(); self.eat("RP"); return While(t.line,t.col,self.expr(),self.block())
        if t.t=="LBR": return self.block()
        raise SyntaxError(f"comando inválido em {t.line}:{t.col}")
    # expr: +, -, *, /
    def expr(self):
        e=self.term()
        while self.cur().t in ("PLUS","MINUS"):
            op=self.eat(self.cur().t).v; r=self.term(); e=Bin(self.cur().line,self.cur().col,op,e,r)
        return e
    def term(self):
        e=self.prim()
        while self.cur().t in ("MUL","DIV"):
            op=self.eat(self.cur().t).v; r=self.prim(); e=Bin(self.cur().line,self.cur().col,op,e,r)
        return e
    def prim(self):
        t=self.cur()
        if t.t=="NUM": self.eat("NUM"); return Num(t.line,t.col,int(t.v))
        if t.t=="ID":  self.eat("ID");  return Var(t.line,t.col,t.v)
        if t.t=="LP":  self.eat("LP"); e=self.expr(); self.eat("RP"); return e
        raise SyntaxError("expr inválida")

# ---------- CFG + análises ----------
@dataclass
class BlockCFG: id:int; stmts:List[Node]; succ:List[int]

def build_linear_cfg(prog:Program)->List[BlockCFG]:
    # simplificado: cada comando é um bloco; if/while viram blocos com saltos ingênuos
    cfg=[]
    for i,st in enumerate(prog.body):
        succ=[i+1] if i+1<len(prog.body) else []
        cfg.append(BlockCFG(i,[st],succ))
    return cfg

TOP, BOT = "Top", "Bot"
def join(a,b):
    if a==b: return a
    if a==BOT: return b
    if b==BOT: return a
    return TOP

def const_prop_and_reach(prog:Program):
    cfg = build_linear_cfg(prog)
    # tabela de constantes por variável
    C: List[Dict[str,object]] = [dict() for _ in cfg]
    reach = [False]*len(cfg); reach[0]=True
    changed=True
    while changed:
        changed=False
        for i,b in enumerate(cfg):
            if not reach[i]: continue
            env = dict(C[i])  # ambiente de entrada
            for st in b.stmts:
                if isinstance(st, Let):
                    val = eval_const(st.expr, env)
                    env[st.name] = val
                elif isinstance(st, Assign):
                    val = eval_const(st.expr, env)
                    env[st.name] = val
                elif isinstance(st, Print):
                    pass
            for s in b.succ:
                reach_before = reach[s]
                reach[s] = True or reach[s]
                # propaga ambiente por junção
                new = {}
                for k in set(env.keys()) | set(C[s].keys()):
                    new[k] = join(env.get(k, BOT), C[s].get(k, BOT))
                if new != C[s] or reach[s] != reach_before:
                    C[s]=new; changed=True
    return cfg, C, reach

def eval_const(e, env):
    if isinstance(e, Num): return e.val
    if isinstance(e, Var): return env.get(e.name, TOP)
    if isinstance(e, Bin):
        l = eval_const(e.l, env); r = eval_const(e.r, env)
        if isinstance(l,int) and isinstance(r,int):
            if e.op=="+": return l+r
            if e.op=="-": return l-r
            if e.op=="*": return l*r
            if e.op=="/":
                if r==0: return ("DIV0",)
                return l//r
        if l==("DIV0",) or r==("DIV0",): return ("DIV0",)
        return TOP
    return TOP

def diagnostics(prog:Program):
    cfg, C, reach = const_prop_and_reach(prog)
    warns=[]
    # dead-code: bloco não alcançável (exceto o primeiro se vazio)
    for b in cfg:
        if not reach[b.id]:
            st = b.stmts[0]
            warns.append(f"[dead-code] {st.line}:{st.col} bloco inalcançável. Dica: remova o trecho.")
    # div zero e undef-use simples por ordem
    defined=set()
    def walk_expr(e, line, col):
        nonlocal warns, defined
        if isinstance(e, Num): return
        if isinstance(e, Var):
            if e.name not in defined:
                warns.append(f"[undef-use] {line}:{col} '{e.name}' não definido. Dica: use 'let'.")
            return
        if isinstance(e, Bin):
            if isinstance(e.r, Num) and e.op=="/" and e.r.val==0:
                warns.append(f"[div-zero] {line}:{col} divisão por zero literal. Dica: garanta denominador != 0.")
            walk_expr(e.l, line, col); walk_expr(e.r, line, col)
    for b in cfg:
        if not reach[b.id]: continue
        st = b.stmts[0]
        if isinstance(st, Let):
            walk_expr(st.expr, st.line, st.col); defined.add(st.name)
        elif isinstance(st, Assign):
            walk_expr(st.expr, st.line, st.col); defined.add(st.name)
        elif isinstance(st, Print):
            walk_expr(st.expr, st.line, st.col)
    return warns

# Demonstração
src = """
let a=10;
let b=2;
let c=a/b;
if(0){ print c; }
print a;
"""
prog = P(lex(src)).parse()
print("Entrada:\n", src.strip(), "\n", sep="")
for w in diagnostics(prog):
    print(w)


Entrada:
let a=10;
let b=2;
let c=a/b;
if(0){ print c; }
print a;



Integração de ferramentas ao projeto final

In [3]:
# Driver didático integrando parse simples, dois passes e saída JSON.
import json, argparse, sys, io, contextlib, re
from dataclasses import dataclass

# Parser super simples linha-a-linha (let/assign/print)
@dataclass
class Stmt: kind:str; name:str; val:str; line:int
def parse_lines(src:str):
    out=[]
    for i,raw in enumerate(src.strip().splitlines(), start=1):
        s=raw.strip()
        if not s or s.startswith("//"): continue
        m = re.fullmatch(r"let\s+([A-Za-z_]\w*)\s*=\s*(.+);", s)
        if m: out.append(Stmt("LET", m.group(1), m.group(2), i)); continue
        m = re.fullmatch(r"([A-Za-z_]\w*)\s*=\s*(.+);", s)
        if m: out.append(Stmt("ASSIGN", m.group(1), m.group(2), i)); continue
        m = re.fullmatch(r"print\s+(.+);", s)
        if m: out.append(Stmt("PRINT", "", m.group(1), i)); continue
        out.append(Stmt("INVALID", "", s, i))
    return out

def pass_undef(stmts):
    defined=set(); diags=[]
    for st in stmts:
        if st.kind in ("LET","ASSIGN"): defined.add(st.name)
        if st.kind=="PRINT":
            for v in re.findall(r"[A-Za-z_]\w*", st.val):
                if v not in defined:
                    diags.append({"kind":"undef-use","line":st.line,"col":1,"msg":f"'{v}' não definido.","hint":"declare com 'let'."})
    return diags

def pass_unused(stmts):
    defs={}; uses=set()
    for st in stmts:
        if st.kind in ("LET","ASSIGN") and st.name not in defs:
            defs[st.name]=st.line
        if st.kind=="PRINT":
            for v in re.findall(r"[A-Za-z_]\w*", st.val):
                uses.add(v)
    return [{"kind":"unused","line":ln,"col":1,"msg":f"'{n}' nunca usado.","hint":"remova ou use."}
            for n,ln in defs.items() if n not in uses]

def run_pipeline(src:str, enable=("undef","unused"), json_out=False, werror=False):
    stmts = parse_lines(src)
    diags=[]
    if "undef" in enable: diags += pass_undef(stmts)
    if "unused" in enable: diags += pass_unused(stmts)
    if json_out:
        print(json.dumps(diags, ensure_ascii=False, indent=2))
    else:
        for d in diags:
            print(f"[{d['kind']}] {d['line']}:{d['col']} {d['msg']} Dica: {d['hint']}")
    exit_code = 1 if (werror and diags) else 0
    print(f"\nExit code: {exit_code}")
    return exit_code

# Demonstração integrada
codigo = """
let a = 10;
let b = 20;
print a;
"""
print("Entrada:\n", codigo.strip(), "\n", sep="")
run_pipeline(codigo, enable=("undef","unused"), json_out=True, werror=True)


Entrada:
let a = 10;
let b = 20;
print a;

[
  {
    "kind": "unused",
    "line": 2,
    "col": 1,
    "msg": "'b' nunca usado.",
    "hint": "remova ou use."
  }
]

Exit code: 1


1

Alternativas a Flex/Bison

In [4]:
# Parser de expressões e 'print' sem Flex/Bison: Pratt + regex.
import re
from dataclasses import dataclass

# ---------- Lexer ----------
TOK = re.compile(r"\s*(?:(print)|(\()|(\))|(\+)|(-)|(\*)|(/)|(\d+)|(.))")
def lex(s):
    out=[]
    i=0
    while i<len(s):
        m=TOK.match(s,i)
        if not m: raise SyntaxError("token inválido")
        i=m.end()
        if m.group(1): out.append(("PRINT","print"))
        elif m.group(2): out.append(("LP","("))
        elif m.group(3): out.append(("RP",")"))
        elif m.group(4): out.append(("PLUS","+"))
        elif m.group(5): out.append(("MINUS","-"))
        elif m.group(6): out.append(("MUL","*"))
        elif m.group(7): out.append(("DIV","/"))
        elif m.group(8): out.append(("NUM",int(m.group(8))))
        elif m.group(9): raise SyntaxError(f"caractere inválido: {m.group(9)}")
    out.append(("EOF",""))
    return out

# ---------- AST ----------
@dataclass
class Num: v:int
@dataclass
class Bin: op:str; l:object; r:object
@dataclass
class Unary: op:str; e:object
@dataclass
class Print: e:object

# ---------- Pratt ----------
BP = {"+":10,"-":10,"*":20,"/":20}
class Parser:
    def __init__(self,toks): self.t=toks; self.i=0
    def cur(self): return self.t[self.i]
    def eat(self,k):
        if self.cur()[0]==k: self.i+=1; return True
        raise SyntaxError(f"esperado {k}, obteve {self.cur()}")
    def parse(self):
        if self.cur()[0]=="PRINT":
            self.eat("PRINT")
            e=self.expr(0)
            self.eat("EOF")
            return Print(e)
        raise SyntaxError("esperado 'print'")
    def nud(self):
        t,v=self.cur()
        if t=="NUM":
            self.i+=1; return Num(v)
        if t=="MINUS":
            self.i+=1; return Unary("-", self.expr(100))
        if t=="LP":
            self.i+=1; e=self.expr(0); self.eat("RP"); return e
        raise SyntaxError("nud inválido")
    def led(self, left, op):
        t,opv=op
        self.i+=1
        right=self.expr(BP[opv])
        return Bin(opv, left, right)
    def expr(self, rbp):
        left=self.nud()
        while True:
            t,_=self.cur()
            if t not in ("PLUS","MINUS","MUL","DIV"): break
            op=self.cur()
            if BP[op[1]]<=rbp: break
            left=self.led(left, op)
        return left

def pretty(n):
    if isinstance(n, Num): return f"Num({n.v})"
    if isinstance(n, Unary): return f"Unary('{n.op}', {pretty(n.e)})"
    if isinstance(n, Bin): return f"Bin('{n.op}', {pretty(n.l)}, {pretty(n.r)})"
    if isinstance(n, Print): return f"Print({pretty(n.e)})"
    return str(n)

# Demonstração
src = "print 1 + 2 * 3"
tree = Parser(lex(src)).parse()
print("Entrada:", src)
print("AST:", pretty(tree))

Entrada: print 1 + 2 * 3
AST: Print(Bin('+', Num(1), Bin('*', Num(2), Num(3))))


Explorando compiladores reais de código aberto

In [6]:
# Cria uma árvore sintética inspirada em projetos reais e permite "consultas".
from pathlib import Path

root = Path("repo-demo")
(root/"clang/lib/Sema").mkdir(parents=True, exist_ok=True)
(root/"clang/lib/CodeGen").mkdir(parents=True, exist_ok=True)
(root/"llvm/lib/Transforms").mkdir(parents=True, exist_ok=True)
(root/"tools/driver").mkdir(parents=True, exist_ok=True)
(root/"tests").mkdir(parents=True, exist_ok=True)

files = {
    "README.md": "# Demo de Layout\nEste repositório simula um compilador.\n",
    "tools/driver/main.cpp": "// ponto de entrada\nint main(){/* args, flags, orquestra */}\n",
    "clang/lib/Sema/SemaExpr.cpp": "// verificação de tipos e regras\n// TODO: checar binários +, -, *\n",
    "clang/lib/CodeGen/CGExpr.cpp": "// lowering de expressões para IR\n// gera add/sub/mul\n",
    "llvm/lib/Transforms/InstCombine.cpp": "// otimizador local\n// junta padrões de instruções\n",
    "tests/arith.c": "int f(){return 1+2*3;}\n",
}
for p,content in files.items():
    path = root/p
    path.write_text(content, encoding="utf-8")

def grep(pattern:str):
    hits=[]
    for p in root.rglob("*"):
        if p.is_file():
            txt=p.read_text(encoding="utf-8", errors="ignore")
            if pattern in txt:
                hits.append(str(p))
    return hits

print("Árvore criada em ./repo-demo")
print("\nArquivos que mencionam 'add':")
for h in grep("add"): print(" -", h)

Árvore criada em ./repo-demo

Arquivos que mencionam 'add':
 - repo-demo/clang/lib/CodeGen/CGExpr.cpp
