In [3]:
import re

In [4]:
class Lexer:
    """Regex-based tokenizer for the small `fn`/`let` language used in this notebook.

    Usage: ``Lexer(text).tokenize()``; the resulting ``(kind, value)`` tuples
    are returned and also kept on ``self.tokens``.
    """

    def __init__(self, source_code):
        # Raw program text to scan.
        self.source_code = source_code
        # (kind, value) tuples produced by the most recent tokenize() call.
        self.tokens = []

    def tokenize(self):
        """Scan ``self.source_code`` and rebuild ``self.tokens``.

        Whitespace is discarded. Every other lexeme becomes a
        ``(kind, value)`` tuple, where ``kind`` is the name of the matching
        rule below and ``value`` is the matched text.

        Returns:
            The list stored in ``self.tokens``.

        Raises:
            RuntimeError: on the first character that no rule accepts.
        """
        # Alternatives are tried in listed order, so order matters:
        #  * keywords come before ID, and end with \b so that they only match
        #    whole words ("integer" is an ID, not INT followed by "eger");
        #  * multi-character operators precede their one-character prefixes
        #    ("->" before "-", "==" before "=", ">=" before ">");
        #  * FLOAT_CONST precedes INT_CONST so "1.8" is not split at the dot.
        token_specification = [
            ("FUNCTION", r'fn\b'),
            ("MAIN", r'main\b'),
            ("LET", r'let\b'),
            ("INT", r'int\b'),
            ("CHAR", r'char\b'),
            ("FLOAT", r'float\b'),
            ("IF", r'if\b'),
            ("ELSE", r'else\b'),
            ("WHILE", r'while\b'),
            ("PRINTLN", r'println\b'),
            ("RETURN", r'return\b'),
            ("LBRACKET", r'\('),
            ("RBRACKET", r'\)'),
            ("LBRACE", r'\{'),
            ("RBRACE", r'\}'),
            ("ARROW", r'->'),
            ("COLON", r':'),
            ("SEMICOLON", r';'),
            ("COMMA", r','),
            ("EQ", r'=='),
            ("NE", r'!='),
            ("GE", r'>='),
            ("LE", r'<='),
            ("GT", r'>'),
            ("LT", r'<'),
            ("ASSIGN", r'='),
            ("PLUS", r'\+'),
            ("MINUS", r'-'),
            ("MULT", r'\*'),
            ("DIV", r'/'),
            ("FLOAT_CONST", r'[0-9]+\.[0-9]+'),
            ("INT_CONST", r'[0-9]+'),
            ("CHAR_LITERAL", r"'[^']'"),
            ("FMT_STRING", r'"([^"\\]|\\.)*"'),
            ("ID", r'[a-zA-Z]([a-zA-Z0-9_])*'),
            # \r is skipped too so sources with CRLF line endings tokenize.
            ("SKIP", r'[ \t\r\n]+'),
            # Catch-all: must stay last; anything reaching it is an error.
            ("MISMATCH", r'.')
        ]

        tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)

        # Start from a clean list so that calling tokenize() again (e.g. when
        # a notebook cell is re-run) does not duplicate the tokens.
        self.tokens = []
        for mo in re.finditer(tok_regex, self.source_code):
            kind = mo.lastgroup
            value = mo.group()
            if kind == "SKIP":
                continue
            elif kind == "MISMATCH":
                raise RuntimeError(f'Unexpected character: {value}')
            else:
                self.tokens.append((kind, value))
        return self.tokens

In [6]:
# Demo cell: run the Lexer over a sample source text and print each token.

# Sample input. It defines `soma`, `calculadora` and three separate
# `fn main()` bodies; in this cell the text is only tokenized.
source_code = """
fn soma(x: int, y: int) -> int {
    return x + y;
}
fn main(){
    let a, b, c: int;
    b = 40;
    c = 39;
    a = soma(b, c);
    println("valor = %d", a);
}

fn calculadora(op: char, x: float, y: float) -> float {
    if op == '+' {
        return x + y;
    }
    else if op == '-'{
        return x - y;
    }
    else if op == '*'{
        return x * y;
    }
    else if op == '/' {
        if y == 0.0 {
            return 0.0;
        }
        return x / y;
    }
    return 0.0;
}
fn main(){
    let a, b: float;
    a = 1.8;
    b = 7.2;
    println("%f", calculadora('*', a, b));
}

fn main(){
    let i: int;
    i = 0;
    while i < 10 {
        println("%d", i);
        i = i + 1;
    }
}
"""

# tokenize() appends its (kind, value) tuples to lexer.tokens.
lexer = Lexer(source_code)
lexer.tokenize()

# Print one (kind, value) tuple per line.
for token in lexer.tokens:
    print(token)

('FUNCTION', 'fn')
('ID', 'soma')
('LBRACKET', '(')
('ID', 'x')
('COLON', ':')
('INT', 'int')
('COMMA', ',')
('ID', 'y')
('COLON', ':')
('INT', 'int')
('RBRACKET', ')')
('ARROW', '->')
('INT', 'int')
('LBRACE', '{')
('RETURN', 'return')
('ID', 'x')
('PLUS', '+')
('ID', 'y')
('SEMICOLON', ';')
('RBRACE', '}')
('FUNCTION', 'fn')
('MAIN', 'main')
('LBRACKET', '(')
('RBRACKET', ')')
('LBRACE', '{')
('LET', 'let')
('ID', 'a')
('COMMA', ',')
('ID', 'b')
('COMMA', ',')
('ID', 'c')
('COLON', ':')
('INT', 'int')
('SEMICOLON', ';')
('ID', 'b')
('ASSIGN', '=')
('INT_CONST', '40')
('SEMICOLON', ';')
('ID', 'c')
('ASSIGN', '=')
('INT_CONST', '39')
('SEMICOLON', ';')
('ID', 'a')
('ASSIGN', '=')
('ID', 'soma')
('LBRACKET', '(')
('ID', 'b')
('COMMA', ',')
('ID', 'c')
('RBRACKET', ')')
('SEMICOLON', ';')
('PRINTLN', 'println')
('LBRACKET', '(')
('FMT_STRING', '"valor = %d"')
('COMMA', ',')
('ID', 'a')
('RBRACKET', ')')
('SEMICOLON', ';')
('RBRACE', '}')
('FUNCTION', 'fn')
('ID', 'calculadora')
('LBRACK

In [None]:
class Parser:
    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0

    def current_token(self):
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return ("EOF", None)

    def eat(self, token_type):
        if self.current_token()[0] == token_type:
            self.pos += 1
        else:
            raise RuntimeError(f'Unexpected token: {self.current_token()}, expected: {token_type}')

    def parse_var_decl(self):
        self.eat("LET")
        self.parse_var_list()
        self.eat("COLON")
        if self.current_token()[0] in ("INT", "CHAR", "FLOAT"):
            self.eat(self.current_token()[0])
        else:
            raise RuntimeError(f'Unexpected type: {self.current_token()}')
        self.eat("SEMICOLON")

    def parse_var_list(self):
        self.eat("ID")
        while self.current_token()[0] == "COMMA":
            self.eat("COMMA")
            self.eat("ID")

    def parse_function(self):
        self.eat("FUNCTION")
        if self.current_token()[0] in ("ID", "MAIN"):
            self.eat(self.current_token()[0])
        else:
            raise RuntimeError(f"Expected function name, got: {self.current_token()}")
        self.eat("LBRACKET")
        self.parse_param_list()
        self.eat("RBRACKET")
        if self.current_token()[0] == "ARROW":
            self.eat("ARROW")
            if self.current_token()[0] in ("INT", "FLOAT", "CHAR"):
                self.eat(self.current_token()[0])
            else:
                raise RuntimeError("Invalid return type")
        self.eat("LBRACE")
        while self.current_token()[0] != "RBRACE":
            self.parse_statement()
        self.eat("RBRACE")

    def parse_param_list(self):
        if self.current_token()[0] == "ID":
            self.eat("ID")
            self.eat("COLON")
            if self.current_token()[0] in ("INT", "FLOAT", "CHAR"):
                self.eat(self.current_token()[0])
            else:
                raise RuntimeError(f"Invalid parameter type: {self.current_token()}")
            while self.current_token()[0] == "COMMA":
                self.eat("COMMA")
                self.eat("ID")
                self.eat("COLON")
                if self.current_token()[0] in ("INT", "FLOAT", "CHAR"):
                    self.eat(self.current_token()[0])
                else:
                    raise RuntimeError(f"Invalid parameter type: {self.current_token()}")

    def parse_func_call(self, expect_semicolon=True):
        self.eat("ID")
        self.eat("LBRACKET")
        self.parse_args_list()
        self.eat("RBRACKET")
        if expect_semicolon and self.current_token()[0] == "SEMICOLON":
            self.eat("SEMICOLON")

    def parse_statement(self):
        token_type = self.current_token()[0]
        if token_type == "LET":
            self.parse_var_decl()
        elif token_type == "RETURN":
            self.eat("RETURN")
            self.parse_expression()
            self.eat("SEMICOLON")
        elif token_type == "PRINTLN":
            self.eat("PRINTLN")
            self.eat("LBRACKET")
            if self.current_token()[0] == "FMT_STRING":
                self.eat("FMT_STRING")
                if self.current_token()[0] == "COMMA":
                    self.eat("COMMA")
                    self.parse_args_list()
            self.eat("RBRACKET")
            self.eat("SEMICOLON")
        elif token_type == "IF":
            self.parse_if()
        elif token_type == "WHILE":
            self.parse_while()
        elif token_type == "ID":
            if self.pos + 1 < len(self.tokens):
                next_token = self.tokens[self.pos + 1][0]
                if next_token == "ASSIGN":
                    self.parse_assignment()
                elif next_token == "LBRACKET":
                    self.parse_func_call(expect_semicolon=True)
                else:
                    self.pos += 1
            else:
                self.pos += 1
        else:
            self.pos += 1

    # Exemplo de parsing de expressão com precedência (simplificado)
    def parse_expression(self):
        self.parse_rel()
        while self.current_token()[0] in ("EQ", "NE"):
            self.eat(self.current_token()[0])
            self.parse_rel()

    def parse_rel(self):
        self.parse_add()
        while self.current_token()[0] in ("LT", "LE", "GT", "GE"):
            self.eat(self.current_token()[0])
            self.parse_add()

    def parse_add(self):
        self.parse_term()
        while self.current_token()[0] in ("PLUS", "MINUS"):
            self.eat(self.current_token()[0])
            self.parse_term()

    def parse_term(self):
        self.parse_factor()
        while self.current_token()[0] in ("MULT", "DIV"):
            self.eat(self.current_token()[0])
            self.parse_factor()

    def parse_factor(self):
        token_type = self.current_token()[0]
        if token_type == "ID":
            self.eat("ID")
            if self.current_token()[0] == "LBRACKET":
                self.parse_func_call(expect_semicolon=False)
        elif token_type in ("INT_CONST", "FLOAT_CONST", "CHAR_LITERAL"):
            self.eat(token_type)
        elif token_type == "LBRACKET":
            self.eat("LBRACKET")
            self.parse_expression()
            self.eat("RBRACKET")
        else:
            raise RuntimeError(f"Unexpected token in factor: {self.current_token()}")

    def parse_assignment(self):
        self.eat("ID")
        self.eat("ASSIGN")
        self.parse_expression()
        self.eat("SEMICOLON")

    def parse_args_list(self):
        if self.current_token()[0] in ("ID", "INT_CONST", "FLOAT_CONST", "CHAR_LITERAL"):
            self.parse_arg()
            while self.current_token()[0] == "COMMA":
                self.eat("COMMA")
                self.parse_arg()

    def parse_arg(self):
        if self.current_token()[0] == "ID":
            self.eat("ID")
            if self.current_token()[0] == "LBRACKET":
                self.parse_func_call(expect_semicolon=False)
        elif self.current_token()[0] in ("INT_CONST", "FLOAT_CONST", "CHAR_LITERAL"):
            self.eat(self.current_token()[0])
        else:
            raise RuntimeError(f"Invalid argument: {self.current_token()}")

    def parse_if(self):
        self.eat("IF")
        self.eat("LBRACKET")
        self.parse_expression()
        self.eat("RBRACKET")
        self.eat("LBRACE")
        while self.current_token()[0] != "RBRACE":
            self.parse_statement()
        self.eat("RBRACE")
        if self.current_token()[0] == "ELSE":
            self.eat("ELSE")
            self.eat("LBRACE")
            while self.current_token()[0] != "RBRACE":
                self.parse_statement()
            self.eat("RBRACE")

    def parse_while(self):
        self.eat("WHILE")
        self.eat("LBRACKET")
        self.parse_expression()
        self.eat("RBRACKET")
        self.eat("LBRACE")
        while self.current_token()[0] != "RBRACE":
            self.parse_statement()
        self.eat("RBRACE")