diff --git a/norminette/exceptions.py b/norminette/exceptions.py index c45239d..a90ff59 100644 --- a/norminette/exceptions.py +++ b/norminette/exceptions.py @@ -1,4 +1,8 @@ -class CParsingError(Exception): +class NorminetteError(Exception): + pass + + +class CParsingError(NorminetteError): def __init__(self, errmsg): self.msg = errmsg @@ -7,3 +11,12 @@ def __str__(self): def __repr__(self): return self.__str__ + + +class MaybeInfiniteLoop(NorminetteError): + def __init__(self) -> None: + super().__init__("The maximum number of iterations a loop can have has been reached") + + +class UnexpectedEOF(NorminetteError): + pass diff --git a/norminette/lexer/dictionary.py b/norminette/lexer/dictionary.py index de90828..0aa57aa 100644 --- a/norminette/lexer/dictionary.py +++ b/norminette/lexer/dictionary.py @@ -1,5 +1,25 @@ """ Dictionary that correlates lexeme with token """ +trigraphs = { + "??<": '{', + "??>": '}', + "??(": '[', + "??)": ']', + "??=": '#', + "??/": '\\', + "??'": '^', + "??!": '|', + "??-": '~', +} + +digraphs = { + "<%": '{', + "%>": '}', + "<:": '[', + ":>": ']', + "%:": '#', +} + keywords = { # C reserved keywords # "auto": "AUTO", diff --git a/norminette/lexer/lexer.py b/norminette/lexer/lexer.py index 1589118..9beaed5 100644 --- a/norminette/lexer/lexer.py +++ b/norminette/lexer/lexer.py @@ -1,5 +1,8 @@ import string +from typing import Optional, Tuple +from norminette.exceptions import UnexpectedEOF, MaybeInfiniteLoop +from norminette.lexer.dictionary import digraphs, trigraphs from norminette.lexer.dictionary import brackets from norminette.lexer.dictionary import keywords from norminette.lexer.dictionary import operators @@ -7,11 +10,6 @@ from norminette.file import File -def read_file(filename): - with open(filename) as f: - return f.read() - - class TokenError(Exception): def __init__(self, pos, message=None): self.msg = message or f"Error: Unrecognized token line {pos[0]}, col {pos[1]}" @@ -26,7 +24,6 @@ def __init__(self, file: File): self.src = file.source self.len = len(file.source) - self.__char = self.src[0] if self.src != "" else None self.__pos = int(0) self.__line_pos = self.__line = 1 self.tokens = [] @@ -34,17 +31,51 @@ def __init__(self, file: File): def peek_sub_string(self, size): return self.src[self.__pos : self.__pos + size] - def peek(self, *, offset: int = 0, collect: int = 1): + def raw_peek(self, *, offset: int = 0, collect: int = 1): assert collect > 0 and offset >= 0 if (pos := self.__pos + offset) < self.len: return ''.join(self.src[pos:pos+collect]) return None + def peek(self, *, offset: int = 0) -> Optional[Tuple[str, int]]: + if (trigraph := self.raw_peek(offset=offset, collect=3)) in trigraphs: + return trigraphs[trigraph], 3 + if (digraph := self.raw_peek(offset=offset, collect=2)) in digraphs: + return digraphs[digraph], 2 + if word := self.raw_peek(offset=offset): + return word, 1 + return None # Let it crash :D + def pop(self, *, times: int = 1, use_spaces: bool = False): assert times > 0 result = "" for _ in range(times): - char = self.peek() + for _ in range(100): + char, size = self.peek() + if char != '\\': + break + if self.peek(offset=size) is None: + break + temp, _ = self.peek(offset=size) # Don't change the `temp` to `char` + if temp != '\n': + break + self.__pos += size + 1 + self.__line += 1 + self.__line_pos = 0 + if self.peek() is None: + raise UnexpectedEOF() + char, size = self.peek() + else: + # It hits when we have multiple lines followed by `\`, e.g: + # ```c + # // hello \ + # a \ + # b \ + # c\ + # \ + # a + # ``` + raise MaybeInfiniteLoop() if char == '\n': self.__line_pos = 0 self.__line += 1 @@ -52,8 +83,8 @@ def pop(self, *, times: int = 1, use_spaces: bool = False): self.__line_pos += (spaces := 4 - (self.__line_pos - 1) % 4) - 1 if use_spaces: char = ' ' * spaces - self.__line_pos += 1 - self.__pos += 1 + self.__line_pos += size + self.__pos += size result += char return result @@ -63,14 +94,12 @@ def peek_char(self): character is appended to the return value. It will allow us to parse escaped characters easier. """ + char = None if self.__pos < self.len: + char = self.src[self.__pos] if self.src[self.__pos] == "\\": - self.__char = self.src[self.__pos : self.__pos + 2] - else: - self.__char = self.src[self.__pos] - else: - self.__char = None - return self.__char + char = self.src[self.__pos : self.__pos + 2] + return char def pop_char(self, skip_escaped=True): """Pop a character that's been read by increasing self.__pos, @@ -93,10 +122,7 @@ def line_pos(self): def is_string(self): """True if current character could start a string constant""" - if self.peek_sub_string(2) == 'L"' or self.peek_char() == '"': - return True - else: - return False + return self.raw_peek(collect=2) == 'L"' or self.raw_peek() == '"' def is_constant(self): """True if current character could start a numeric constant""" @@ -115,10 +141,7 @@ def is_constant(self): def is_char_constant(self): """True if current character could start a character constant""" - if self.peek_char() == "'" or self.peek_sub_string(2) == "L'": - return True - else: - return False + return self.raw_peek() == "'" or self.raw_peek(collect=2) == "L'" def string(self): """String constants can contain any characer except unescaped newlines. @@ -313,15 +336,20 @@ def constant(self): def mult_comment(self): pos = self.line_pos() val = self.pop(times=2) + # TODO Add to put `UnexpectedEOF` exception as an error in `file.errors` while self.peek(): - if self.peek(collect=2) == "*/": - val += self.pop(times=2) - break + # the `.pop(...)` can raise an `UnexpectedEOF` if source is like: + # ```c + # /*\ + # + # ``` + # note the backslash followed by an empty line val += self.pop(use_spaces=True) - if val.endswith("*/"): - self.tokens.append(Token("MULT_COMMENT", pos, val)) + if val.endswith("*/"): + break else: - raise TokenError(pos) + raise UnexpectedEOF() + self.tokens.append(Token("MULT_COMMENT", pos, val)) def comment(self): """Comments are anything after '//' characters, up until a newline or @@ -329,8 +357,14 @@ def comment(self): """ pos = self.line_pos() val = self.pop(times=2) - while self.peek() and self.peek() != '\n': - val += self.pop() + while self.peek(): + char, _ = self.peek() + if char in ('\n', None): + break + try: + val += self.pop() + except UnexpectedEOF: + break self.tokens.append(Token("COMMENT", pos, val)) def identifier(self): diff --git a/tests/rules/samples/test_comment_escaping_with_backslash.c b/tests/rules/samples/test_comment_escaping_with_backslash.c new file mode 100644 index 0000000..e3a7b0b --- /dev/null +++ b/tests/rules/samples/test_comment_escaping_with_backslash.c @@ -0,0 +1,11 @@ +/* *\ +/ + +#include + +int main() { + for (int i=0; i<2;++i) + printf("%d\n", i); +} + +/**/ \ No newline at end of file diff --git a/tests/rules/samples/test_comment_escaping_with_backslash.out b/tests/rules/samples/test_comment_escaping_with_backslash.out new file mode 100644 index 0000000..e629146 --- /dev/null +++ b/tests/rules/samples/test_comment_escaping_with_backslash.out @@ -0,0 +1,37 @@ +test_comment_escaping_with_backslash.c - IsComment In "GlobalScope" from "None" line 1": + +test_comment_escaping_with_backslash.c - IsEmptyLine In "GlobalScope" from "None" line 3": + +test_comment_escaping_with_backslash.c - IsPreprocessorStatement In "GlobalScope" from "None" line 4": + +test_comment_escaping_with_backslash.c - IsEmptyLine In "GlobalScope" from "None" line 5": + +test_comment_escaping_with_backslash.c - IsFuncDeclaration In "GlobalScope" from "None" line 6": + +test_comment_escaping_with_backslash.c - IsBlockStart In "Function" from "GlobalScope" line 6": + +test_comment_escaping_with_backslash.c - IsControlStatement In "Function" from "GlobalScope" line 7": + +test_comment_escaping_with_backslash.c - IsFunctionCall In "ControlStructure" from "Function" line 8": + +test_comment_escaping_with_backslash.c - IsBlockEnd In "Function" from "GlobalScope" line 9": + +test_comment_escaping_with_backslash.c - IsEmptyLine In "GlobalScope" from "None" line 10": + +test_comment_escaping_with_backslash.c - IsComment In "GlobalScope" from "None" line 11": + +test_comment_escaping_with_backslash.c: Error! +Error: INVALID_HEADER (line: 3, col: 1): Missing or invalid 42 header +Error: SPACE_BEFORE_FUNC (line: 6, col: 4): space before function name +Error: NO_ARGS_VOID (line: 6, col: 10): Empty function argument requires void +Error: BRACE_NEWLINE (line: 6, col: 12): Expected newline before brace +Error: TOO_FEW_TAB (line: 7, col: 1): Missing tabs for indent level +Error: FORBIDDEN_CS (line: 7, col: 3): Forbidden control structure +Error: SPACE_REPLACE_TAB (line: 7, col: 3): Found space when expecting tab +Error: SPC_AFTER_OPERATOR (line: 7, col: 13): missing space after operator +Error: SPC_BFR_OPERATOR (line: 7, col: 13): missing space before operator +Error: SPC_AFTER_OPERATOR (line: 7, col: 18): missing space after operator +Error: SPC_BFR_OPERATOR (line: 7, col: 18): missing space before operator +Error: SPC_AFTER_OPERATOR (line: 7, col: 20): missing space after operator +Error: TOO_FEW_TAB (line: 8, col: 1): Missing tabs for indent level +Error: SPACE_REPLACE_TAB (line: 8, col: 5): Found space when expecting tab diff --git a/tests/rules/samples/test_comment_escaping_with_trigraph.c b/tests/rules/samples/test_comment_escaping_with_trigraph.c new file mode 100644 index 0000000..eba725d --- /dev/null +++ b/tests/rules/samples/test_comment_escaping_with_trigraph.c @@ -0,0 +1,11 @@ +/* *??/ +/ + +#include + +int main() { + for (int i=0; i<2;++i) + printf("%d\n", i); +} + +/**/ \ No newline at end of file diff --git a/tests/rules/samples/test_comment_escaping_with_trigraph.out b/tests/rules/samples/test_comment_escaping_with_trigraph.out new file mode 100644 index 0000000..9e2d122 --- /dev/null +++ b/tests/rules/samples/test_comment_escaping_with_trigraph.out @@ -0,0 +1,37 @@ +test_comment_escaping_with_trigraph.c - IsComment In "GlobalScope" from "None" line 1": + +test_comment_escaping_with_trigraph.c - IsEmptyLine In "GlobalScope" from "None" line 3": + +test_comment_escaping_with_trigraph.c - IsPreprocessorStatement In "GlobalScope" from "None" line 4": + +test_comment_escaping_with_trigraph.c - IsEmptyLine In "GlobalScope" from "None" line 5": + +test_comment_escaping_with_trigraph.c - IsFuncDeclaration In "GlobalScope" from "None" line 6": + +test_comment_escaping_with_trigraph.c - IsBlockStart In "Function" from "GlobalScope" line 6": + +test_comment_escaping_with_trigraph.c - IsControlStatement In "Function" from "GlobalScope" line 7": + +test_comment_escaping_with_trigraph.c - IsFunctionCall In "ControlStructure" from "Function" line 8": + +test_comment_escaping_with_trigraph.c - IsBlockEnd In "Function" from "GlobalScope" line 9": + +test_comment_escaping_with_trigraph.c - IsEmptyLine In "GlobalScope" from "None" line 10": + +test_comment_escaping_with_trigraph.c - IsComment In "GlobalScope" from "None" line 11": + +test_comment_escaping_with_trigraph.c: Error! +Error: INVALID_HEADER (line: 3, col: 1): Missing or invalid 42 header +Error: SPACE_BEFORE_FUNC (line: 6, col: 4): space before function name +Error: NO_ARGS_VOID (line: 6, col: 10): Empty function argument requires void +Error: BRACE_NEWLINE (line: 6, col: 12): Expected newline before brace +Error: TOO_FEW_TAB (line: 7, col: 1): Missing tabs for indent level +Error: FORBIDDEN_CS (line: 7, col: 3): Forbidden control structure +Error: SPACE_REPLACE_TAB (line: 7, col: 3): Found space when expecting tab +Error: SPC_AFTER_OPERATOR (line: 7, col: 13): missing space after operator +Error: SPC_BFR_OPERATOR (line: 7, col: 13): missing space before operator +Error: SPC_AFTER_OPERATOR (line: 7, col: 18): missing space after operator +Error: SPC_BFR_OPERATOR (line: 7, col: 18): missing space before operator +Error: SPC_AFTER_OPERATOR (line: 7, col: 20): missing space after operator +Error: TOO_FEW_TAB (line: 8, col: 1): Missing tabs for indent level +Error: SPACE_REPLACE_TAB (line: 8, col: 5): Found space when expecting tab diff --git a/tests/rules/samples/test_multiple_lines_escaping.c b/tests/rules/samples/test_multiple_lines_escaping.c new file mode 100644 index 0000000..e4ad888 --- /dev/null +++ b/tests/rules/samples/test_multiple_lines_escaping.c @@ -0,0 +1,13 @@ +//\ +\ ola \ +\ ola \ +\ ola \ + \ + \ + \ +\ +\ +\ +\ +oxi \ +eita diff --git a/tests/rules/samples/test_multiple_lines_escaping.out b/tests/rules/samples/test_multiple_lines_escaping.out new file mode 100644 index 0000000..f833cb1 --- /dev/null +++ b/tests/rules/samples/test_multiple_lines_escaping.out @@ -0,0 +1,4 @@ +test_multiple_lines_escaping.c - IsComment In "GlobalScope" from "None" line 1": + +test_multiple_lines_escaping.c: Error! +Error: INVALID_HEADER (line: 1, col: 1): Missing or invalid 42 header