diff --git a/norminette/__main__.py b/norminette/__main__.py index 60143a69..8ee31204 100644 --- a/norminette/__main__.py +++ b/norminette/__main__.py @@ -7,7 +7,7 @@ import argparse from norminette.errors import formatters from norminette.file import File -from norminette.lexer import Lexer, TokenError +from norminette.lexer import Lexer from norminette.exceptions import CParsingError from norminette.registry import Registry from norminette.context import Context @@ -127,10 +127,10 @@ def main(): for file in files: try: lexer = Lexer(file) - tokens = lexer.get_tokens() + tokens = list(lexer) context = Context(file, tokens, debug, args.R) registry.run(context) - except (TokenError, CParsingError) as e: + except CParsingError as e: print(file.path + f": Error!\n\t{colors(e.msg, 'red')}") sys.exit(1) except KeyboardInterrupt: diff --git a/norminette/errors.py b/norminette/errors.py index e3972c89..e8818f7c 100644 --- a/norminette/errors.py +++ b/norminette/errors.py @@ -3,8 +3,17 @@ import os import json from dataclasses import dataclass, field, asdict -from functools import cmp_to_key -from typing import TYPE_CHECKING, Sequence, Union, Literal, Optional, List +from typing import ( + TYPE_CHECKING, + Sequence, + Union, + Literal, + Optional, + List, + overload, + Any, + Type, +) from norminette.norm_error import NormError, NormWarning, errors as errors_dict @@ -12,15 +21,6 @@ from norminette.file import File -def sort_errs(a: Error, b: Error): - # TODO Add to Error and Highlight dataclasses be sortable to remove this fn - ah: Highlight = a.highlights[0] - bh: Highlight = b.highlights[0] - if ah.column == bh.column and ah.lineno == bh.lineno: - return 1 if a.name > b.name else -1 - return ah.column - bh.column if ah.lineno == bh.lineno else ah.lineno - bh.lineno - - @dataclass class Highlight: lineno: int @@ -28,29 +28,121 @@ class Highlight: length: Optional[int] = field(default=None) hint: Optional[str] = field(default=None) + def __lt__(self, other: Any) -> bool: + assert isinstance(other, Highlight) + if self.lineno == other.lineno: + if self.column == other.column: + return len(self.hint or '') > len(other.hint or '') + return self.column > other.column + return self.lineno > other.lineno + @dataclass class Error: name: str text: str - level: Literal["Error", "Notice"] - highlights: List[Highlight] + level: Literal["Error", "Notice"] = field(default="Error") + highlights: List[Highlight] = field(default_factory=list) + + @classmethod + def from_name(cls: Type[Error], /, name: str, **kwargs) -> Error: + return cls(name, errors_dict[name], **kwargs) + + def __lt__(self, other: Any) -> bool: + assert isinstance(other, Error) + if not self.highlights: + return bool(other.highlights) or self.name > other.name + if not other.highlights: + return bool(self.highlights) or other.name > self.name + ah, bh = min(self.highlights), min(other.highlights) + if ah.column == bh.column and ah.lineno == bh.lineno: + return self.name < other.name + return (ah.lineno, ah.column) < (bh.lineno, bh.column) + + @overload + def add_highlight( + self, + lineno: int, + column: int, + length: Optional[int] = None, + hint: Optional[str] = None, + ) -> None: ... + @overload + def add_highlight(self, highlight: Highlight, /) -> None: ... 
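+ # Runtime implementation of the two overloads above: a single positional
+ # argument is taken as a ready-made Highlight, otherwise the arguments are
+ # forwarded to the Highlight constructor.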
+ + def add_highlight(self, *args, **kwargs) -> None: + if len(args) == 1: + highlight, = args + else: + highlight = Highlight(*args, **kwargs) + self.highlights.append(highlight) class Errors: __slots__ = "_inner" def __init__(self) -> None: - self._inner = [] + self._inner: List[Error] = [] + + def __repr__(self) -> str: + return repr(self._inner) def __len__(self) -> int: return len(self._inner) def __iter__(self): - self._inner.sort(key=cmp_to_key(sort_errs)) + self._inner.sort() return iter(self._inner) - # TODO Add `add(...)` method to allow creating `Highlight`s and `Error`s easily + @overload + def add(self, error: Error) -> None: + """Add an `Error` instance to the errors. + """ + ... + + @overload + def add(self, name: str, *, level: Literal["Error", "Notice"] = "Error", highlights: List[Highlight] = ...) -> None: + """Builds an `Error` instance from a name in `errors_dict` and adds it to the errors. + + ```python + >>> errors.add("TOO_MANY_LINES") + >>> errors.add("INVALID_HEADER") + >>> errors.add("GLOBAL_VAR_DETECTED", level="Notice") + ``` + """ + ... + + @overload + def add( + self, + /, + name: str, + text: str, + *, + level: Literal["Error", "Notice"] = "Error", + highlights: List[Highlight] = ..., + ) -> None: + """Builds an `Error` instance and adds it to the errors. + + ```python + >>> errors.add("BAD_IDENTATION", "You forgot an column here") + >>> errors.add("CUSTOM_ERROR", f"name {not_defined!r} is not defined. Did you mean: {levenshtein_distance}?") + >>> errors.add("NOOP", "Empty if statement", level="Notice") + ``` + """ + ... + + def add(self, *args, **kwargs) -> None: + kwargs.setdefault("level", "Error") + error = None + if len(args) == 1: + error = args[0] + if isinstance(error, str): + error = Error.from_name(error, **kwargs) + if len(args) == 2: + error = Error(*args, **kwargs) + assert isinstance(error, Error), "bad function call" + return self._inner.append(error) @property def status(self) -> Literal["OK", "Error"]: @@ -60,15 +152,17 @@ def append(self, value: Union[NormError, NormWarning]) -> None: # TODO Remove NormError and NormWarning since it does not provide `length` data assert isinstance(value, (NormError, NormWarning)) level = "Error" if isinstance(value, NormError) else "Notice" - value = Error(value.errno, value.error_msg, level, highlights=[ + error = Error(value.errno, value.error_msg, level, highlights=[ Highlight(value.line, value.col, None), ]) - self._inner.append(value) + self._inner.append(error) class _formatter: + name: str + def __init__(self, files: Union[File, Sequence[File]]) -> None: - if not isinstance(files, list): + if not isinstance(files, Sequence): files = [files] self.files = files @@ -82,10 +176,9 @@ def __str__(self) -> str: for file in self.files: output += f"{file.basename}: {file.errors.status}!" 
for error in file.errors: - brief = errors_dict.get(error.name, "Error not found") highlight = error.highlights[0] output += f"\n{error.level}: {error.name:<20} " - output += f"(line: {highlight.lineno:>3}, col: {highlight.column:>3}):\t{brief}" + output += f"(line: {highlight.lineno:>3}, col: {highlight.column:>3}):\t{error.text}" output += '\n' return output @@ -102,7 +195,7 @@ def __str__(self): output = { "files": files, } - return json.dumps(output, separators=",:") + '\n' + return json.dumps(output, separators=(',', ':')) + '\n' formatters = ( diff --git a/norminette/file.py b/norminette/file.py index ac44a115..4f682946 100644 --- a/norminette/file.py +++ b/norminette/file.py @@ -15,7 +15,7 @@ def __init__(self, path: str, source: Optional[str] = None) -> None: @property def source(self) -> str: - if not self._source: + if self._source is None: with open(self.path) as file: self._source = file.read() return self._source diff --git a/norminette/lexer/__init__.py b/norminette/lexer/__init__.py index 1f219c5a..1d14758c 100644 --- a/norminette/lexer/__init__.py +++ b/norminette/lexer/__init__.py @@ -1,5 +1,4 @@ from norminette.lexer.lexer import Lexer -from norminette.lexer.lexer import TokenError from norminette.lexer.tokens import Token -__all__ = ["Lexer", "TokenError", "Token"] +__all__ = ["Lexer", "Token"] diff --git a/norminette/lexer/dictionary.py b/norminette/lexer/dictionary.py index 0aa57aa7..a971e125 100644 --- a/norminette/lexer/dictionary.py +++ b/norminette/lexer/dictionary.py @@ -108,6 +108,7 @@ ">>": "RIGHT_SHIFT", "<<": "LEFT_SHIFT", "?": "TERN_CONDITION", + "#": "HASH", } brackets = { diff --git a/norminette/lexer/lexer.py b/norminette/lexer/lexer.py index 9beaed5e..a3c79e1f 100644 --- a/norminette/lexer/lexer.py +++ b/norminette/lexer/lexer.py @@ -1,5 +1,6 @@ +import re import string -from typing import Optional, Tuple +from typing import Optional, Tuple, cast from norminette.exceptions import UnexpectedEOF, MaybeInfiniteLoop from norminette.lexer.dictionary import digraphs, trigraphs @@ -8,63 +9,179 @@ from norminette.lexer.dictionary import operators from norminette.lexer.tokens import Token from norminette.file import File - - -class TokenError(Exception): - def __init__(self, pos, message=None): - self.msg = message or f"Error: Unrecognized token line {pos[0]}, col {pos[1]}" - - def __repr__(self): - return self.msg +from norminette.errors import Error, Highlight as H + + +def c(a: str, b: str): + a = a.lower() + b = b.lower() + return ( + a + b, a.upper() + b, a + b.upper(), a.upper() + b.upper(), + b + a, b.upper() + a, b + a.upper(), b.upper() + a.upper(), + ) + + +quote_prefixes = (*"lLuU", "u8") +octal_digits = "01234567" +hexadecimal_digits = "0123456789abcdefABCDEF" +integer_suffixes = ( + '', + *"uUlLzZ", + "ll", "LL", + "wb", "WB", + "i64", "I64", + *c('u', 'l'), + *c('u', "ll"), + *c('u', 'z'), + *c('u', "wb"), + *c('u', "i64"), +) +float_suffixes = ( + '', + *"lLfFdD", + "dd", "DD", + "df", "DF", + "dl", "DL", + *c('f', 'i'), + *c('f', 'j'), +) + +INT_LITERAL_PATTERN = re.compile(r""" +^ +# (?P[-+]*) +(?P # prefix can be + 0[bBxX]* # 0, 0b, 0B, 0x, 0X, 0bb, 0BB, ... + | # or empty +) +(?P + # BUG If prefix is followed by two or more x, it doesn't works correctly + (?<=0[xX]) # is prefix for hex digits? + [\da-fA-F]+ # so, collect hex digits + | # otherwise + \d+ # collect decimal digits +) +(?P + (?<=[eE]) # is constant ending with an `E`? 
+ [\w\d+\-.]* # so, collect `+` and `-` operators + | # otherwise + \w # collect suffixes that starts with an letter + [\w\d.]* # and letters, digits and dots that follows it + | # finally, do suffix be optional (empty) +) +""", re.VERBOSE) + + +def _float_pattern(const: str, digit: str, exponent: Tuple[str, str]): + pattern = r""" + ^ + (?P{0}) + (?P + (?: + [{2}]+[-+]{3}+ + |[{2}]+{3}+ + |(?:[{2}][+-]?(?:[.{3}]+)?)+ + ){1} + ) + (?P[\w\d._]*|) + """.format(const, *exponent, digit) + return re.compile(pattern, re.VERBOSE) + + +FLOAT_EXPONENT_LITERAL_PATTERN = _float_pattern(r"\d+", digit=r"\d", exponent=('', "eE")) +FLOAT_FRACTIONAL_LITERAL_PATTERN = _float_pattern(r"(?:\d+)?\.\d+|\d+\.", digit=r"\d", exponent=('?', "eE")) +FLOAT_HEXADECIMAL_LITERAL_PATTERN = _float_pattern(r"0[xX]+[\da-fA-F]+(?:\.[\da-fA-F]+)?", + digit=r"[\da-fA-F]", exponent=('?', "pP")) class Lexer: def __init__(self, file: File): self.file = file - self.src = file.source - self.len = len(file.source) self.__pos = int(0) self.__line_pos = self.__line = 1 - self.tokens = [] - - def peek_sub_string(self, size): - return self.src[self.__pos : self.__pos + size] def raw_peek(self, *, offset: int = 0, collect: int = 1): - assert collect > 0 and offset >= 0 - if (pos := self.__pos + offset) < self.len: - return ''.join(self.src[pos:pos+collect]) + if (pos := self.__pos + offset) < len(self.file.source): + return ''.join(self.file.source[pos:pos+collect]) return None - def peek(self, *, offset: int = 0) -> Optional[Tuple[str, int]]: - if (trigraph := self.raw_peek(offset=offset, collect=3)) in trigraphs: - return trigraphs[trigraph], 3 - if (digraph := self.raw_peek(offset=offset, collect=2)) in digraphs: - return digraphs[digraph], 2 - if word := self.raw_peek(offset=offset): - return word, 1 + def peek(self, *, times: int = 1, offset: int = 0) -> Optional[Tuple[str, int]]: + char, size = '', 0 + for _ in range(times): + if (trigraph := self.raw_peek(offset=offset + size, collect=3)) in trigraphs: + char += trigraphs[trigraph] + size += 3 + elif (digraph := self.raw_peek(offset=offset + size, collect=2)) in digraphs: + char += digraphs[digraph] + size += 2 + elif word := self.raw_peek(offset=offset + size): + char += word + size += 1 + else: + break + if size: + return char, size return None # Let it crash :D - def pop(self, *, times: int = 1, use_spaces: bool = False): - assert times > 0 + def pop( + self, + *, + times: int = 1, + use_spaces: bool = False, + use_escape: bool = False, + ) -> str: result = "" for _ in range(times): for _ in range(100): - char, size = self.peek() + if peek := self.peek(): + char, size = peek + else: + raise UnexpectedEOF() if char != '\\': break - if self.peek(offset=size) is None: + peek = self.peek(offset=size) + if peek is None: break - temp, _ = self.peek(offset=size) # Don't change the `temp` to `char` + temp, _ = peek # Don't change the `temp` to `char` if temp != '\n': + if use_escape: + if temp in r"abefnrtv\"'?": + size += 1 + char += temp + elif temp == 'x': + size += 1 + char += temp + # BUG It is just considering one `byte` (0x0 to 0xFF), so it not works correctly + # with prefixed strings like `L"\0x1234"`. 
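+ # Read at most two hexadecimal digits after the \x escape; if none
+ # follow, a NO_HEX_DIGITS notice is recorded instead of aborting the lexer.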
+ peek = self.raw_peek(offset=size, collect=2) + if peek is None or peek[0] not in hexadecimal_digits: + error = Error.from_name("NO_HEX_DIGITS", level="Notice") + error.add_highlight(self.__line, self.__line_pos + size - 1, length=1) + self.file.errors.add(error) + else: + for digit in peek: + if digit not in hexadecimal_digits: + break + size += 1 + char += digit + elif temp in octal_digits: + while (temp := self.raw_peek(offset=size)) and temp in octal_digits: + size += 1 + char += temp + else: + error = Error.from_name("UNKNOWN_ESCAPE", level="Notice") + error.add_highlight(self.__line, self.__line_pos + size, length=1) + self.file.errors.add(error) + char += temp + size += 1 break self.__pos += size + 1 self.__line += 1 self.__line_pos = 0 - if self.peek() is None: + peek = self.peek() + if peek is None: raise UnexpectedEOF() - char, size = self.peek() + char, size = peek else: # It hits when we have multiple lines followed by `\`, e.g: # ```c @@ -88,107 +205,129 @@ def pop(self, *, times: int = 1, use_spaces: bool = False): result += char return result - def peek_char(self): - """Return current character being checked, - if the character is a backslash character the following - character is appended to the return value. It will allow us to - parse escaped characters easier. - """ - char = None - if self.__pos < self.len: - char = self.src[self.__pos] - if self.src[self.__pos] == "\\": - char = self.src[self.__pos : self.__pos + 2] - return char - - def pop_char(self, skip_escaped=True): - """Pop a character that's been read by increasing self.__pos, - for escaped characters self.__pos will be increased twice - """ - if self.peek_char() == "\t": - self.__line_pos += 4 - (self.__line_pos - 1 & 3) - else: - self.__line_pos += len(self.peek_char()) - if self.__pos < self.len and skip_escaped and self.src[self.__pos] == "\\": - self.__pos += 1 - self.__pos += 1 - return self.peek_char() - - def peek_token(self): - return self.tokens[-1] - def line_pos(self): return self.__line, self.__line_pos - def is_string(self): - """True if current character could start a string constant""" - return self.raw_peek(collect=2) == 'L"' or self.raw_peek() == '"' - - def is_constant(self): - """True if current character could start a numeric constant""" - if self.peek_char() in string.digits: - return True - elif self.peek_char() == ".": - for i in range(0, self.len - self.__pos): - if self.src[self.__pos + i] == ".": - i += 1 - elif self.src[self.__pos + i] in "0123456789": - return True - else: - return False + def parse_char_literal(self) -> Optional[Token]: + pos = lineno, column = self.line_pos() + value = '' + for prefix in quote_prefixes: + length = len(prefix) + result = self.raw_peek(collect=length + 1) + if not result: + return + if result.startswith(prefix) and result.endswith('\''): + value += self.pop(times=length) + break + if self.raw_peek() != '\'': + return + chars = 0 + value += self.pop() + for _ in range(100): + try: + char = self.pop(use_escape=True) + except UnexpectedEOF: + error = Error.from_name("UNEXPECTED_EOF_CHR", highlights=[ + H(lineno, column, length=len(value)), + ]) + self.file.errors.add(error) + break + if char == '\n': + error = Error.from_name("UNEXPECTED_EOL_CHR", highlights=[ + H(lineno, column, length=len(value)), + H(lineno, column + len(value), length=1, hint="Perhaps you forgot a single quote (')?") + ]) + self.file.errors.add(error) + break + value += char + if char == '\'': + break + chars += 1 else: - return False - - def is_char_constant(self): - """True if 
current character could start a character constant""" - return self.raw_peek() == "'" or self.raw_peek(collect=2) == "L'" - - def string(self): + raise MaybeInfiniteLoop() + if value == "''": + error = Error.from_name("EMPTY_CHAR", highlights=[H(*pos, length=2)]) + self.file.errors.add(error) + if chars > 1 and value.endswith('\''): + error = Error.from_name("CHAR_AS_STRING", highlights=[ + H(*pos, length=len(value)), + H(*pos, length=1, + hint="Perhaps you want a string (double quote, \") instead of a char (single quote, ')?"), + ]) + self.file.errors.add(error) + return Token("CHAR_CONST", pos, value=value) + + def parse_string_literal(self): """String constants can contain any characer except unescaped newlines. An unclosed string or unescaped newline is a fatal error and thus parsing will stop here. """ - pos = self.line_pos() - tkn_value = "" - if self.peek_char() == "L": - tkn_value += self.peek_char() - self.pop_char() - tkn_value += self.peek_char() - self.pop_char() - while self.peek_char() not in [None]: - tkn_value += self.peek_char() - if self.peek_sub_string(2) == "\\\n": - self.__line += 1 - self.__line_pos = 1 - if self.peek_char() == '"': + if not self.peek(): + return + pos = lineno, column = self.line_pos() + val = '' + for prefix in quote_prefixes: + length = len(prefix) + result = self.raw_peek(collect=length + 1) + if not result: + return + if result.startswith(prefix) and result.endswith('"'): + val += self.pop(times=length) + break + if self.raw_peek() != '"': + return + val += self.pop() + while self.peek() is not None: + char = self.pop(use_escape=True) + val += char + if char == '"': break - if self.peek_char() == '\n': - raise TokenError(pos, f"String literal unterminated detected at line {pos[0]}") - self.pop_char() else: - raise TokenError(pos) + error = Error.from_name("UNEXPECTED_EOF_STR") + error.add_highlight(*pos, length=len(val)) + error.add_highlight(lineno, column + len(val), length=1, hint="Perhaps you forgot a double quote (\")?") + self.file.errors.add(error) + return Token("STRING", pos, val) + + def parse_integer_literal(self): + # TODO Add to support single quote (') to separate digits according to C23 + + match = INT_LITERAL_PATTERN.match(self.file.source[self.__pos:]) + if match is None: return - self.tokens.append(Token("STRING", pos, tkn_value)) - self.pop_char() - def char_constant(self): - """Char constants follow pretty much the same rule as string constants""" - pos = self.line_pos() - tkn_value = "'" - self.pop_char() - while self.peek_char(): - tkn_value += self.peek_char() - if self.peek_char() == "\n": - self.pop_char() - raise TokenError(pos) - if self.peek_char() == "'": - self.pop_char() - self.tokens.append(Token("CHAR_CONST", pos, tkn_value)) - return - self.pop_char() - raise TokenError(pos) + pos = lineno, column = self.line_pos() + token = Token("CONSTANT", pos, slice := self.pop(times=match.end())) - def constant(self): + if match["Suffix"] not in integer_suffixes: + suffix_length = len(match["Suffix"]) + string_length = len(slice) - suffix_length + if match["Suffix"][0] in "+-": + error = Error.from_name("MAXIMAL_MUNCH") + error.add_highlight(lineno, column + string_length, length=1, hint="Perhaps you forgot a space ( )?") + else: + error = Error.from_name("INVALID_SUFFIX") + error.add_highlight(lineno, column + string_length, length=suffix_length) + self.file.errors.add(error) + + def _check_bad_prefix(name: str, bucket: str): + error = Error.from_name(f"INVALID_{name}_INT") + for index, char in enumerate(match["Constant"], 
start=len(match["Prefix"])): + if char not in bucket: + error.add_highlight(lineno, column + index, length=1) + if error.highlights: + self.file.errors.add(error) + + if match["Prefix"] in ("0b", "0B"): + _check_bad_prefix("BIN", "01") + elif match["Prefix"] == '0': + _check_bad_prefix("OCT", "01234567") + elif match["Prefix"] in ("0x", "0X"): + _check_bad_prefix("HEX", "0123456789abcdefABCDEF") + + return token + + def parse_float_literal(self): """Numeric constants can take many forms: - integer constants only allow digits [0-9] - real number constant only allow digits [0-9], @@ -206,320 +345,193 @@ def constant(self): a numeric constant could start with a '.' (dot character) """ - pos = self.line_pos() - tkn_value = "" - bucket = ".0123456789aAbBcCdDeEfFlLuUxX-+" - while self.peek_char() and ( - self.peek_char() in bucket or self.peek_char() == "\\\n" - ): - if self.peek_char() in "xX": - if tkn_value.startswith("0") is False or len(tkn_value) > 1: - raise TokenError(pos) - for c in "xX": - if c in tkn_value: - raise TokenError(pos) - - elif self.peek_char() in "bB": - if ( - tkn_value != "0" - and tkn_value.startswith("0x") is False - and tkn_value.startswith("0X") is False - ): - raise TokenError(pos) - - elif self.peek_char() in "+-": - if ( - tkn_value.endswith("e") is False - and tkn_value.endswith("E") is False - or self.peek_sub_string(2) in ["++", "--"] - ): - break - - elif ( - self.peek_char() in "eE" - and "0x" not in tkn_value - and "0X" not in tkn_value - ): - if ( - "e" in tkn_value - or "E" in tkn_value - or "f" in tkn_value - or "F" in tkn_value - or "u" in tkn_value - or "U" in tkn_value - or "l" in tkn_value - or "L" in tkn_value - ): - raise TokenError(pos) - - elif self.peek_char() in "lL": - lcount = tkn_value.count("l") + tkn_value.count("L") - if ( - lcount > 1 - or (lcount == 1 and tkn_value[-1] not in "lL") - or ("f" in tkn_value or "F" in tkn_value) - and "0x" not in tkn_value - and "0X" not in tkn_value - ): - raise TokenError(pos) - elif ( - self.peek_char() == "l" - and "L" in tkn_value - or self.peek_char() == "L" - and "l" in tkn_value - ): - raise TokenError(pos) - - elif self.peek_char() in "uU": - if ( - "u" in tkn_value - or "U" in tkn_value - or ( - ( - "e" in tkn_value - or "E" in tkn_value - or "f" in tkn_value - or "F" in tkn_value - ) - and ("0x" not in tkn_value and "0X" not in tkn_value) - ) - ): - raise TokenError(pos) - - elif self.peek_char() in "Ff": - if ( - tkn_value.startswith("0x") is False - and tkn_value.startswith("0X") is False - and ("." not in tkn_value or "f" in tkn_value or "F" in tkn_value) - and "e" not in tkn_value - or "u" in tkn_value - or "U" in tkn_value - or "l" in tkn_value - or "L" in tkn_value - ): - raise TokenError(pos) - - elif ( - self.peek_char() in "aAbBcCdDeE" - and tkn_value.startswith("0x") is False - and tkn_value.startswith("0X") is False - or "u" in tkn_value - or "U" in tkn_value - or "l" in tkn_value - or "L" in tkn_value - ): - raise TokenError(pos) - - elif ( - self.peek_char() in "0123456789" - and "u" in tkn_value - or "U" in tkn_value - or "l" in tkn_value - or "L" in tkn_value - ): - raise TokenError(pos) - - elif self.peek_char() == "." and "." 
in tkn_value: - raise TokenError(pos) - - tkn_value += self.peek_char() - self.pop_char() - if ( - tkn_value[-1] in "eE" - and tkn_value.startswith("0x") is False - or tkn_value[-1] in "xX" - ): - raise TokenError(pos) + constant = self.raw_peek() + if constant is None: + return + pos = lineno, column = self.line_pos() + src = self.file.source[self.__pos:] + if match := FLOAT_EXPONENT_LITERAL_PATTERN.match(src): + type = "exponent" + elif match := FLOAT_FRACTIONAL_LITERAL_PATTERN.match(src): + type = "fractional" + elif match := FLOAT_HEXADECIMAL_LITERAL_PATTERN.match(src): + type = "hexadecimal" else: - self.tokens.append(Token("CONSTANT", pos, tkn_value)) - - def mult_comment(self): - pos = self.line_pos() + return + error = None + suffix = len(match["Suffix"]) + column += len(match["Constant"]) + badhex = match["Constant"].strip(hexadecimal_digits + '.') + if type == "exponent" and not re.match(r"[eE][-+]?\d+", match["Exponent"]): + error = Error.from_name("BAD_EXPONENT") + error.add_highlight(lineno, column, length=len(match["Exponent"]) + suffix) + elif type == "hexadecimal" and '.' not in match["Constant"] and not match["Exponent"]: + return # Hexadecimal Integer + elif type == "hexadecimal" and badhex not in ('x', 'X'): + error = Error.from_name("MULTIPLE_X") + error.add_highlight(lineno, column - len(match["Constant"]) + 1, length=len(badhex)) + elif match["Constant"].count('.') == 1 and match["Suffix"].count('.') > 0: + error = Error.from_name("MULTIPLE_DOTS") + error.add_highlight(lineno, column, length=len(match["Exponent"]) + suffix) + elif match["Suffix"] not in float_suffixes: + error = Error.from_name("BAD_FLOAT_SUFFIX") + error.add_highlight(lineno, column + len(match["Exponent"]), length=suffix) + if error: + self.file.errors.add(error) + return Token("CONSTANT", pos, self.pop(times=match.end())) + + def parse_multi_line_comment(self) -> Optional[Token]: + if self.raw_peek(collect=2) != "/*": + return + pos = lineno, column = self.line_pos() val = self.pop(times=2) - # TODO Add to put `UnexpectedEOF` exception as an error in `file.errors` + eof = False while self.peek(): - # the `.pop(...)` can raise an `UnexpectedEOF` if source is like: - # ```c - # /*\ - # - # ``` - # note the backslash followed by an empty line - val += self.pop(use_spaces=True) + try: + val += self.pop(use_spaces=True) + except UnexpectedEOF: + eof = True + break if val.endswith("*/"): break else: - raise UnexpectedEOF() - self.tokens.append(Token("MULT_COMMENT", pos, val)) - - def comment(self): + eof = True + if eof: + # TODO Add a better highlight since it is a multi-line token + error = Error.from_name("UNEXPECTED_EOF_MC") + error.add_highlight(lineno, column, length=len(val)) + self.file.errors.add(error) + return Token("MULT_COMMENT", pos, val) + + def parse_line_comment(self) -> Optional[Token]: """Comments are anything after '//' characters, up until a newline or end of file """ + if self.raw_peek(collect=2) != "//": + return pos = self.line_pos() val = self.pop(times=2) - while self.peek(): - char, _ = self.peek() - if char in ('\n', None): + while result := self.peek(): + char, _ = result + if char == '\n': break try: val += self.pop() except UnexpectedEOF: break - self.tokens.append(Token("COMMENT", pos, val)) + return Token("COMMENT", pos, val) - def identifier(self): + def parse_identifier(self) -> Optional[Token]: """Identifiers can start with any letter [a-z][A-Z] or an underscore and contain any letters [a-z][A-Z] digits [0-9] or underscores """ + char = self.raw_peek() + if not char or 
char not in string.ascii_letters + '_': + return pos = self.line_pos() - tkn_value = "" - while self.peek_char() and ( - self.peek_char() in string.ascii_letters + "0123456789_" - or self.peek_char() == "\\\n" - ): - if self.peek_char() == "\\\n": - self.pop_char() - continue - tkn_value += self.peek_char() - self.pop_char() - if tkn_value in keywords: - self.tokens.append(Token(keywords[tkn_value], pos)) - - else: - self.tokens.append(Token("IDENTIFIER", pos, tkn_value)) + val = self.pop() + while char := self.raw_peek(): + if char not in string.ascii_letters + "0123456789_": + break + val += self.pop() + if val in keywords: + return Token(keywords[val], pos) + return Token("IDENTIFIER", pos, val) - def operator(self): + def parse_operator(self): """Operators can be made of one or more sign, so the longest operators need to be looked up for first in order to avoid false positives eg: '>>' being understood as two 'MORE_THAN' operators instead of one 'RIGHT_SHIFT' operator """ + result = self.peek() + if not result: + return + char, _ = result + if char not in "+-*/,<>^&|!=%;:.~?#": + return pos = self.line_pos() - if self.peek_char() in ".+-*/%<>^&|!=": - if self.peek_sub_string(3) in [">>=", "<<=", "..."]: - self.tokens.append(Token(operators[self.peek_sub_string(3)], pos)) - self.pop_char(), self.pop_char(), self.pop_char() - - elif self.peek_sub_string(2) in [">>", "<<", "->"]: - self.tokens.append(Token(operators[self.peek_sub_string(2)], pos)) - self.pop_char(), self.pop_char() - - elif self.peek_sub_string(2) == self.peek_char() + "=": - self.tokens.append(Token(operators[self.peek_sub_string(2)], pos)) - self.pop_char(), self.pop_char() - - elif self.peek_char() in "+-<>=&|": - if self.peek_sub_string(2) == self.peek_char() * 2: - self.tokens.append(Token(operators[self.peek_sub_string(2)], pos)) - self.pop_char() - self.pop_char() - - else: - self.tokens.append(Token(operators[self.peek_char()], pos)) - self.pop_char() - - else: - self.tokens.append(Token(operators[self.peek_char()], pos)) - self.pop_char() - - else: - self.tokens.append(Token(operators[self.src[self.__pos]], pos)) - self.pop_char() - - def get_next_token(self): + if char in ".+-*/%<>^&|!=": + if self.raw_peek(collect=3) in (">>=", "<<=", "..."): + return Token(operators[self.pop(times=3)], pos) + temp, _ = self.peek(times=2) # type: ignore + if temp in (">>", "<<", "->"): + return Token(operators[self.pop(times=2)], pos) + if temp == char + "=": + return Token(operators[self.pop(times=2)], pos) + if char in "+-<>=&|": + if temp == char * 2: + return Token(operators[self.pop(times=2)], pos) + char = self.pop() + return Token(operators[char], pos) + + def parse_whitespace(self) -> Optional[Token]: + char = self.raw_peek() + if char is None or char not in "\n\t ": + return + if char == ' ': + token = Token("SPACE", self.line_pos()) + elif char == "\t": + token = Token("TAB", self.line_pos()) + elif char == "\n": + token = Token("NEWLINE", self.line_pos()) + self.pop() + return token + + def parse_brackets(self) -> Optional[Token]: + result = self.peek() + if result is None: + return + char, _ = result + if char not in brackets: + return + start = self.line_pos() + value = self.pop() + return Token(brackets[value], start) + + parsers = ( + parse_float_literal, # Need to be above: + # `parse_operator` to avoid `` + # `parse_integer_literal` to avoid `\d+` + parse_integer_literal, + parse_char_literal, + parse_string_literal, + parse_identifier, # Need to be bellow `char` and `string` + parse_whitespace, + 
parse_line_comment, + parse_multi_line_comment, + parse_operator, + parse_brackets, + ) + + def get_next_token(self) -> Optional[Token]: """Peeks one character and tries to match it to a token type, if it doesn't match any of the token types, an error will be raised and current file's parsing will stop """ - while self.peek_char() is not None: - if self.is_string(): - self.string() - - elif ( - self.peek_char().isalpha() and self.peek_char().isascii() - ) or self.peek_char() == "_": - self.identifier() - - elif self.is_constant(): - self.constant() - - elif self.is_char_constant(): - self.char_constant() - - elif self.peek_char() == "#": - self.tokens.append(Token("HASH", self.line_pos())) - self.pop_char() - - elif self.src[self.__pos :].startswith("/*"): - self.mult_comment() - - elif self.src[self.__pos :].startswith("//"): - self.comment() - - elif self.peek_char() in "+-*/,<>^&|!=%;:.~?": - self.operator() - - elif self.peek_char() == " ": - self.tokens.append(Token("SPACE", self.line_pos())) - self.pop_char() - - elif self.peek_char() == "\t": - self.tokens.append(Token("TAB", self.line_pos())) - self.pop_char() - - elif self.peek_char() == "\n": # or ord(self.peek_char()) == 8203: - self.tokens.append(Token("NEWLINE", self.line_pos())) - self.pop_char() - self.__line_pos = 1 + while self.raw_peek(): + if self.raw_peek(collect=2) == "\\\n" or self.raw_peek(collect=4) == "??/\n": + # Avoid using `.pop()` here since it ignores the escaped + # newline and pops and upcomes after it. E.g, if we have + # `\\\nab` and use `.pop()`, the parsers funcs will see `b``. + _, size = self.peek() # type: ignore + self.__pos += cast(int, size) + 1 self.__line += 1 - - elif self.peek_char() == "\\\n": - self.tokens.append(Token("ESCAPED_NEWLINE", self.line_pos())) - self.pop_char() self.__line_pos = 1 - self.__line += 1 - - elif self.peek_char() in brackets: - self.tokens.append(Token(brackets[self.peek_char()], self.line_pos())) - self.pop_char() else: - raise TokenError(self.line_pos()) - - return self.peek_token() - - return None - - def get_tokens(self): - """Iterate through self.get_next_token() to convert source code into a - token list - """ - while self.get_next_token(): - continue - return self.tokens - - def print_tokens(self): - if self.tokens == []: - return - for t in self.tokens: - if t.type == "NEWLINE": - print(t) - else: - print(t, end="") - if self.tokens[-1].type != "NEWLINE": - print("") + break + for parser in self.parsers: + if result := parser(self): + return result + if char := self.raw_peek(): + error = Error("BAD_LEXEME", f"No matchable token for '{char}' lexeme") + error.add_highlight(*self.line_pos(), length=1) + self.file.errors.add(error) + self.__pos += 1 + self.__line_pos += 1 + # BUG If we have multiples bad lexemes, it can raise RecursionError + return self.get_next_token() - def check_tokens(self): - """ - Only used for testing - """ - if self.tokens == []: - self.get_tokens() - if self.tokens == []: - return "" - ret = "" - for i in range(0, len(self.tokens)): - ret += self.tokens[i].test() - ret += "" if self.tokens[i].type != "NEWLINE" else "\n" - if self.tokens[-1].type != "NEWLINE": - ret += "\n" - return ret + def __iter__(self): + while token := self.get_next_token(): + yield token diff --git a/norminette/lexer/tokens.py b/norminette/lexer/tokens.py index 3506c1ae..46320198 100644 --- a/norminette/lexer/tokens.py +++ b/norminette/lexer/tokens.py @@ -1,25 +1,25 @@ +from typing import Optional, Tuple +from dataclasses import dataclass, field + + 
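+# Token is now a dataclass: equality compares (type, pos, value) field by field,
+# and `length` is derived from the stored value instead of being kept separately.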
+@dataclass(eq=True, repr=True) class Token: - def __init__(self, tkn_type, pos, tkn_value=None): - self.type = str(tkn_type) - self.pos = pos - if tkn_value is not None: - self.value = str(tkn_value) - self.length = len(tkn_value) - else: - self.value = None - self.length = 0 + type: str + pos: Tuple[int, int] + value: Optional[str] = field(default=None) + + @property + def length(self) -> int: + return len(self.value or '') @property def line_column(self): return self.pos[1] - def __repr__(self): + def __str__(self): """ Token representation for debugging, using the format or simply when value is None """ r = f"<{self.type}={self.value}>" if self.value else f"<{self.type}>" return r - - def test(self): - return self.__repr__() diff --git a/norminette/norm_error.py b/norminette/norm_error.py index f326a7ce..e62211ac 100644 --- a/norminette/norm_error.py +++ b/norminette/norm_error.py @@ -122,18 +122,32 @@ "FORBIDDEN_STRUCT": "Struct declaration are not allowed in .c files", "FORBIDDEN_UNION": "Union declaration are not allowed in .c files", "FORBIDDEN_ENUM": "Enum declaration are not allowed in .c files", + "UNEXPECTED_EOF_CHR": "Unexpected end of file (EOF) while parsing a char", + "UNEXPECTED_EOL_CHR": "Unexpected end of line (EOL) while parsing a char", + "UNEXPECTED_EOF_MC": "Unexpected end of file (EOF) while parsing a multiline comment", + "UNEXPECTED_EOF_STR": "Unexpected end of file (EOF) while parsing a string", + "EMPTY_CHAR": "Empty character constant", + "CHAR_AS_STRING": "Character constants can have only one character", + "INVALID_SUFFIX": "This suffix is invalid", + "BAD_FLOAT_SUFFIX": "Invalid suffix for float/double literal constant", + "INVALID_BIN_INT": "Invalid binary integer literal", + "INVALID_OCT_INT": "Invalid octal integer literal", + "INVALID_HEX_INT": "Invalid hexadecimal integer literal", + "MAXIMAL_MUNCH": "Potential maximal munch detected", + "NO_HEX_DIGITS": "No hexadecimal digits followed by the \\x", + "UNKNOWN_ESCAPE": "Unknown escape sequence", + "BAD_EXPONENT": "Exponent has no digits", + "MULTIPLE_DOTS": "Multiple dots in float constant", + "MULTIPLE_X": "Multiple 'x' in hexadecimal float constant", } class NormError: - def __init__(self, errno, line, col=None): + def __init__(self, errno, line, col): self.errno = errno self.line = line self.col = col - if col is not None: - self.error_pos = f"(line: {(str(self.line)).rjust(3)}, col: {(str(self.col)).rjust(3)}):\t" - else: - self.error_pos = f"(line: {(str(self.line)).rjust(3)}):\t " + self.error_pos = f"(line: {(str(self.line)).rjust(3)}, col: {(str(self.col)).rjust(3)}):\t" self.prefix = f"Error: {self.errno:<20} {self.error_pos:>21}" self.error_msg = f"{errors.get(self.errno, 'ERROR NOT FOUND')}" @@ -142,14 +156,11 @@ def __str__(self): class NormWarning: - def __init__(self, errno, line, col=None): + def __init__(self, errno, line, col): self.errno = errno self.line = line self.col = col - if col is not None: - self.error_pos = f"(line: {(str(self.line)).rjust(3)}, col: {(str(self.col)).rjust(3)}):\t" - else: - self.error_pos = f"(line: {(str(self.line)).rjust(3)}):\t " + self.error_pos = f"(line: {(str(self.line)).rjust(3)}, col: {(str(self.col)).rjust(3)}):\t" self.prefix = f"Notice: {self.errno:<20} {self.error_pos:>21}" self.error_msg = f"{errors.get(self.errno, 'WARNING NOT FOUND')}" diff --git a/norminette/registry.py b/norminette/registry.py index 3d9b5e17..9e10068e 100644 --- a/norminette/registry.py +++ b/norminette/registry.py @@ -58,7 +58,7 @@ def run(self, context): raise 
CParsingError( f"Error: Unrecognized line {unrecognized_tkns[0].pos} while parsing line {unrecognized_tkns}" # noqa: E501 ) - print("uncaught -> ", context.filename) + print("uncaught -> ", context.file.name) print("uncaught -> ", unrecognized_tkns) unrecognized_tkns = [] context.dprint(rule.name, jump) diff --git a/norminette/rules/rule.py b/norminette/rules/rule.py index 9c7c51fd..7c0465c1 100644 --- a/norminette/rules/rule.py +++ b/norminette/rules/rule.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import Tuple, Any from norminette.context import Context @@ -18,14 +18,14 @@ def __repr__(self) -> str: def __hash__(self) -> int: return hash(self.name) - def __eq__(self, value) -> bool: + def __eq__(self, value: Any) -> bool: if isinstance(value, str): return self.name == value - if hasattr(value, "name"): + if isinstance(value, Rule): return self.name == value.name return super().__eq__(value) - def __ne__(self, value) -> bool: + def __ne__(self, value: Any) -> bool: return not (self == value) @@ -80,7 +80,7 @@ class Primary: priority: int scope: Tuple[str, ...] - def __init_subclass__(cls, **kwargs): + def __init_subclass__(cls, **kwargs: Any): cls.priority = kwargs.pop("priority", 0) if not hasattr(cls, "scope"): cls.scope = () diff --git a/tests/lexer/brackets_tokens_test.py b/tests/lexer/brackets_tokens_test.py deleted file mode 100644 index 999b7a31..00000000 --- a/tests/lexer/brackets_tokens_test.py +++ /dev/null @@ -1,19 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer - -brackets = ( - ('{', "LBRACE"), - ('}', "RBRACE"), - ("(", "LPARENTHESIS"), - (")", "RPARENTHESIS"), - ("[", "LBRACKET"), - ("]", "RBRACKET"), -) - - -@pytest.mark.parametrize("lexeme,name", brackets) -def test_brackets_tokens(lexeme, name): - token = Lexer(File("", lexeme)).get_next_token() - assert token.type == name diff --git a/tests/lexer/char_constant_tokens_test.py b/tests/lexer/char_constant_tokens_test.py deleted file mode 100644 index b21adb7b..00000000 --- a/tests/lexer/char_constant_tokens_test.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer, TokenError - -char_constants = ( - ("'*'", ""), - ("'\\n'", ""), - ("'\\042'", ""), - ("'0x042'", ""), - ("'\n1'", None), - ("'\\n\n'", None), - ("'A", None), -) - - -@pytest.mark.parametrize("lexeme,expected", char_constants) -def test_char_constants_tokens(lexeme, expected): - lexer = Lexer(File("", lexeme)) - if expected is None: - with pytest.raises(TokenError): - lexer.get_next_token() - return - token = lexer.get_next_token() - assert token.test() == expected diff --git a/tests/lexer/constant_tokens_test.py b/tests/lexer/constant_tokens_test.py deleted file mode 100644 index 07adbde6..00000000 --- a/tests/lexer/constant_tokens_test.py +++ /dev/null @@ -1,48 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer, TokenError - -constants = ( - ("42", "\n"), - ("+42", "\n"), - ("-42", "\n"), - ("+-42", "\n"), - ("4.2", "\n"), - (".42", "\n"), - ("4e2", "\n"), - (".4e2", "\n"), - ("4e2f", "\n"), - (".4e2f", "\n"), - ("042", "\n"), - ("0x42", "\n"), - ("-0x4e2", "\n"), - ("42l", "\n"), - ("42ul", "\n"), - ("42ll", "\n"), - ("42ull", "\n"), - ("42u", "\n"), - ( - "-+-+-+-+-+-+-+-0Xe4Ae2", - "" - "\n" - ), - (".e42", "\n"), - ("4.4.4", None), - ("4e4e4", None), - ("4x4x4", None), - ("42uul", None), - ("42Lllu", None), - ("42lul", None), - (".42e", None), -) - - 
-@pytest.mark.parametrize("lexeme,expected", constants) -def test_constants_tokens(lexeme, expected): - lexer = Lexer(File("", lexeme)) - if expected is None: - with pytest.raises(TokenError): - lexer.get_next_token() - return - assert lexer.check_tokens() == expected diff --git a/tests/lexer/identifiers_tokens_test.py b/tests/lexer/identifiers_tokens_test.py deleted file mode 100644 index ee4e1b37..00000000 --- a/tests/lexer/identifiers_tokens_test.py +++ /dev/null @@ -1,46 +0,0 @@ -import unittest - -from norminette.file import File -from norminette.lexer.lexer import Lexer - - -def eat_tokens(line): - lex = Lexer(File("", line)) - line = "" - while lex.get_next_token(): - line += lex.peek_token().test() - if lex.peek_token().type in ["EOF", "ERROR"]: - break - return line - - -class IdentifiersTokensTest(unittest.TestCase): - def test_simple_identifier(self): - self.assertEqual(eat_tokens("foo"), "") - - def test_underscore_identifier(self): - self.assertEqual(eat_tokens("_foo"), "") - - def test_underscore_with_number_identifier(self): - self.assertEqual(eat_tokens("_foo42"), "") - - def test_double_underscore_with_number_identifier(self): - self.assertEqual(eat_tokens("_foo__42"), "") - - def test_underscore_and_uppercase_identifier(self): - self.assertEqual(eat_tokens("_FOO"), "") - - def test_underscore_at_the_end_and_uppercase_identifier(self): - self.assertEqual(eat_tokens("FOO_"), "") - - def test_identifier_can_not_start_with_a_number(self): - self.assertNotEqual(eat_tokens("5_FOO_"), "") - - def test_identifier_can_not_have_a_space(self): - self.assertNotEqual(eat_tokens("foo 1"), "", - ) diff --git a/tests/lexer/keywords_tokens_test.py b/tests/lexer/keywords_tokens_test.py deleted file mode 100644 index 9d21ad6f..00000000 --- a/tests/lexer/keywords_tokens_test.py +++ /dev/null @@ -1,162 +0,0 @@ -import unittest - -from norminette.file import File -from norminette.lexer.lexer import Lexer - - -def eat_tokens(line): - lex = Lexer(File("", line)) - tokens = [] - while lex.get_next_token(): - tokens.append(lex.peek_token().test()) - if len(tokens) == 1: - return tokens[0] - return tokens - - -class TokensKeywordsTest(unittest.TestCase): - def test_auto_keyword(self): - self.assertEqual(eat_tokens("auto"), "") - - def test_break_keyword(self): - self.assertEqual(eat_tokens("break"), "") - - def test_case_keyword(self): - self.assertEqual(eat_tokens("case"), "") - - def test_char_keyword(self): - self.assertEqual(eat_tokens("char"), "") - - def test_const_keyword(self): - self.assertEqual(eat_tokens("const"), "") - - def test_continue_keyword(self): - self.assertEqual(eat_tokens("continue"), "") - - def test_default_keyword(self): - self.assertEqual(eat_tokens("default"), "") - - def test_do_keyword(self): - self.assertEqual(eat_tokens("do"), "") - - def test_double_keyword(self): - self.assertEqual(eat_tokens("double"), "") - - def test_else_keyword(self): - self.assertEqual(eat_tokens("else"), "") - - def test_enum_keyword(self): - self.assertEqual(eat_tokens("enum"), "") - - def test_extern_keyword(self): - self.assertEqual(eat_tokens("extern"), "") - - def test_float_keyword(self): - self.assertEqual(eat_tokens("float"), "") - - def test_for_keyword(self): - self.assertEqual(eat_tokens("for"), "") - - def test_goto_keyword(self): - self.assertEqual(eat_tokens("goto"), "") - - def test_if_keyword(self): - self.assertEqual(eat_tokens("if"), "") - - def test_int_keyword(self): - self.assertEqual(eat_tokens("int"), "") - - def test_long_keyword(self): - 
self.assertEqual(eat_tokens("long"), "") - - def test_register_keyword(self): - self.assertEqual(eat_tokens("register"), "") - - def test_return_keyword(self): - self.assertEqual(eat_tokens("return"), "") - - def test_signed_keyword(self): - self.assertEqual(eat_tokens("signed"), "") - - def test_sizeof_keyword(self): - self.assertEqual(eat_tokens("sizeof"), "") - - def test_static_keyword(self): - self.assertEqual(eat_tokens("static"), "") - - def test_struct_keyword(self): - self.assertEqual(eat_tokens("struct"), "") - - def test_switch_keyword(self): - self.assertEqual(eat_tokens("switch"), "") - - def test_typedef_keyword(self): - self.assertEqual(eat_tokens("typedef"), "") - - def test_union_keyword(self): - self.assertEqual(eat_tokens("union"), "") - - def test_unsigned_keyword(self): - self.assertEqual(eat_tokens("unsigned"), "") - - def test_void_keyword(self): - self.assertEqual(eat_tokens("void"), "") - - def test_volatile_keyword(self): - self.assertEqual(eat_tokens("volatile"), "") - - def test_while_keyword(self): - self.assertEqual(eat_tokens("while"), "") - - def test_define_keyword(self): - self.assertEqual(eat_tokens("#define"), ["", ""]) - self.assertEqual(eat_tokens("# define "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#define "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#define //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#define//bla "), ["", "", ""]) - - def test_error_keyword(self): - self.assertEqual(eat_tokens("#error"), ["", ""]) - self.assertEqual(eat_tokens("# error "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#error "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#error //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#error//bla "), ["", "", ""]) - - def test_ifndef_keyword(self): - self.assertEqual(eat_tokens("#ifndef"), ["", ""]) - self.assertEqual(eat_tokens("# ifndef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifndef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifndef //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifndef//bla "), ["", "", ""]) - - def test_ifdef_keyword(self): - self.assertEqual(eat_tokens("#ifdef"), ["", ""]) - self.assertEqual(eat_tokens("# ifdef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifdef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifdef //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifdef//bla "), ["", "", ""]) - - def test_include_keyword(self): - self.assertEqual(eat_tokens("#include"), ["", ""]) - self.assertEqual(eat_tokens("# include "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#include "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#include //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#include//bla "), ["", "", ""]) - - def test_pragma_keyword(self): - self.assertEqual(eat_tokens("#pragma"), ["", ""]) - self.assertEqual(eat_tokens("# pragma "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#pragma "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#pragma //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#pragma//bla "), ["", "", ""]) - - def test_undef_keyword(self): - self.assertEqual(eat_tokens("#undef"), ["", ""]) - self.assertEqual(eat_tokens("# undef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#undef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#undef //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#undef//bla "), ["", "", ""]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/lexer/operators_tokens_test.py 
b/tests/lexer/operators_tokens_test.py deleted file mode 100644 index 781978c8..00000000 --- a/tests/lexer/operators_tokens_test.py +++ /dev/null @@ -1,53 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer - -operators = ( - (">>=", "RIGHT_ASSIGN"), - ("<<=", "LEFT_ASSIGN"), - ("+=", "ADD_ASSIGN"), - ("-=", "SUB_ASSIGN"), - ("*=", "MUL_ASSIGN"), - ("/=", "DIV_ASSIGN"), - ("%=", "MOD_ASSIGN"), - ("&=", "AND_ASSIGN"), - ("^=", "XOR_ASSIGN"), - ("|=", "OR_ASSIGN"), - ("<=", "LESS_OR_EQUAL"), - (">=", "GREATER_OR_EQUAL"), - ("==", "EQUALS"), - ("!=", "NOT_EQUAL"), - ("=", "ASSIGN"), - (";", "SEMI_COLON"), - (":", "COLON"), - (",", "COMMA"), - (".", "DOT"), - ("!", "NOT"), - ("-", "MINUS"), - ("+", "PLUS"), - ("*", "MULT"), - ("/", "DIV"), - ("%", "MODULO"), - ("<", "LESS_THAN"), - (">", "MORE_THAN"), - ("...", "ELLIPSIS"), - ("++", "INC"), - ("--", "DEC"), - ("->", "PTR"), - ("&&", "AND"), - ("||", "OR"), - ("^", "BWISE_XOR"), - ("|", "BWISE_OR"), - ("~", "BWISE_NOT"), - ("&", "BWISE_AND"), - (">>", "RIGHT_SHIFT"), - ("<<", "LEFT_SHIFT"), - ("?", "TERN_CONDITION"), -) - - -@pytest.mark.parametrize("operator,type", operators) -def test_operators_tokens(operator, type): - token = Lexer(File("", operator)).get_next_token() - assert token.type == type diff --git a/tests/lexer/string_tokens_test.py b/tests/lexer/string_tokens_test.py deleted file mode 100644 index bccbd605..00000000 --- a/tests/lexer/string_tokens_test.py +++ /dev/null @@ -1,17 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer - -strings = ( - ('"Basic string"', ''), - ('L"Basic string"', ''), - ('"Basic \\"string\\""', ''), - ('"Escaped \\\\\\"string\\\\\\\\\\"\\\\"', ''), -) - - -@pytest.mark.parametrize("string,expected", strings) -def test_string_tokens(string, expected): - token = Lexer(File("", string)).get_next_token() - assert token.test() == expected diff --git a/tests/rules/rules_generator_test.py b/tests/rules/rules_generator_test.py index d8aa6c63..19f86caa 100644 --- a/tests/rules/rules_generator_test.py +++ b/tests/rules/rules_generator_test.py @@ -22,7 +22,7 @@ def test_rule_for_file(file, capsys): file = File(file, file_to_lex) lexer = Lexer(file) - context = Context(file, lexer.get_tokens(), debug=2) + context = Context(file, list(lexer), debug=2) registry.run(context) errors = HumanizedErrorsFormatter(file) print(errors, end='') diff --git a/tests/rules/samples/check_preprocessor_define.out b/tests/rules/samples/check_preprocessor_define.out index bbf7d2e0..a3415a4a 100644 --- a/tests/rules/samples/check_preprocessor_define.out +++ b/tests/rules/samples/check_preprocessor_define.out @@ -17,7 +17,7 @@ check_preprocessor_define.c - IsEmptyLine In "GlobalScope" from "None" line 9": check_preprocessor_define.c - IsPreprocessorStatement In "GlobalScope" from "None" line 10": - + check_preprocessor_define.c - IsEmptyLine In "GlobalScope" from "None" line 12": check_preprocessor_define.c - IsPreprocessorStatement In "GlobalScope" from "None" line 13": diff --git a/tests/rules/samples/integer_constants.out b/tests/rules/samples/integer_constants.out index 718c161e..24b4b9ad 100644 --- a/tests/rules/samples/integer_constants.out +++ b/tests/rules/samples/integer_constants.out @@ -15,9 +15,9 @@ integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 8": integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 9": - + integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 
10": - + integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 11": integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 12": @@ -31,9 +31,9 @@ integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 16": integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 17": - + integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 18": - + integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 19": integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 20": @@ -47,9 +47,9 @@ integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 24": integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 25": - + integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 26": - + integer_constants.c - IsBlockEnd In "Function" from "GlobalScope" line 27": integer_constants.c - IsEmptyLine In "GlobalScope" from "None" line 28": diff --git a/tests/rules/samples/ok_func_classic.out b/tests/rules/samples/ok_func_classic.out index 9c275518..0e4b1cde 100644 --- a/tests/rules/samples/ok_func_classic.out +++ b/tests/rules/samples/ok_func_classic.out @@ -84,3 +84,4 @@ ok_func_classic.c: Error! Notice: GLOBAL_VAR_DETECTED (line: 1, col: 1): Global variable present in file. Make sure it is a reasonable choice. Error: INVALID_HEADER (line: 1, col: 1): Missing or invalid 42 header +Error: CHAR_AS_STRING (line: 48, col: 21): Character constants can have only one character diff --git a/tests/rules/samples/rule_tester.py b/tests/rules/samples/rule_tester.py deleted file mode 100644 index d77b7210..00000000 --- a/tests/rules/samples/rule_tester.py +++ /dev/null @@ -1,75 +0,0 @@ -import difflib -import glob -import sys -from io import StringIO - -from norminette.context import Context -from norminette.lexer import Lexer -from registry import Registry - - -registry = Registry() - - -def read_file(filename): - with open(filename) as f: - return f.read() - - -class norminetteRuleTester: - def __init__(self): - self.__tests = 0 - self.__failed = 0 - self.__success = 0 - self.result = [] - - def assertEqual(self, test, ref): - if test == ref: - self.__success += 1 - print("OK") - self.result.append("✓ ") - else: - self.__failed += 1 - print("Error") - diff = difflib.ndiff( - test.splitlines(keepends=True), ref.splitlines(keepends=True) - ) - diff = list(diff) - self.result.append("✗ ") - print("".join(diff)) - - def test_file(self, filename): - stdout = sys.stdout - sys.stdout = buff = StringIO() - lexer = Lexer(read_file(filename)) - context = Context(filename.split("/")[-1], lexer.get_tokens(), debug=2) - registry.run(context, read_file(filename)) - reference_output = read_file(filename.split(".")[0] + ".out") - sys.stdout = stdout - self.assertEqual(buff.getvalue(), reference_output) - - def run_tests(self): - files = glob.glob("tests/rules/*.[ch]") - files.sort() - for f in files: - self.__tests += 1 - print("TESTER -", f.split("/")[-1], end=": ") - try: - self.test_file(f) - except Exception as e: - self.__failed += 1 - print("Error") - print(e) - self.result.append("✗ ") - continue - print("----------------------------------") - print(f"Total {self.__tests}") - print("".join(self.result)) - print(f"Success {self.__success}, Failed {self.__failed}: ", end="") - print("✅ OK!" 
if self.__failed == 0 else "❌ Error!") - - sys.exit(0 if self.__failed == 0 else 1) - - -if __name__ == "__main__": - norminetteRuleTester().run_tests() diff --git a/tests/rules/samples/test_file_210223.out b/tests/rules/samples/test_file_210223.out index e1784e94..a93c258a 100644 --- a/tests/rules/samples/test_file_210223.out +++ b/tests/rules/samples/test_file_210223.out @@ -3,12 +3,11 @@ test_file_210223.c - IsBlockStart In "Function" from "GlobalScope" line 2": test_file_210223.c - IsControlStatement In "Function" from "GlobalScope" line 3": - + test_file_210223.c - IsExpressionStatement In "ControlStructure" from "Function" line 5": test_file_210223.c - IsFunctionCall In "Function" from "GlobalScope" line 6": - + test_file_210223.c - IsBlockEnd In "Function" from "GlobalScope" line 8": test_file_210223.c - IsEmptyLine In "GlobalScope" from "None" line 9": @@ -25,3 +24,5 @@ test_file_210223.c: Error! Error: INVALID_HEADER (line: 1, col: 1): Missing or invalid 42 header +Error: MIXED_SPACE_TAB (line: 3, col: 61): Mixed spaces and tabs +Error: TAB_INSTEAD_SPC (line: 4, col: 1): Found tab when expecting space diff --git a/tests/test_errors.py b/tests/test_errors.py index ac47ae3f..6ecce44c 100644 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -1,4 +1,6 @@ import json +from typing import List +from dataclasses import astuple import pytest from unittest.mock import patch @@ -8,6 +10,7 @@ from norminette.context import Context from norminette.registry import Registry from norminette.errors import JSONErrorsFormatter +from norminette.errors import Error, Errors, Highlight as H from norminette.errors import HumanizedErrorsFormatter @@ -51,13 +54,13 @@ }, ] ]) -def test_humanized_formatter_errored_file(files, expected_result): +def test_humanized_formatter_errored_file(files: List[File], expected_result: str): registry = Registry() with patch("norminette.rules.check_header.CheckHeader.run") as _: for file in files: lexer = Lexer(file) - context = Context(file, lexer.get_tokens()) + context = Context(file, list(lexer)) registry.run(context) formatter = HumanizedErrorsFormatter(files) @@ -96,8 +99,55 @@ def test_humanized_formatter_errored_file(files, expected_result): @pytest.mark.parametrize("file,test", [it.values() for it in tests]) def test_json_formatter_errored_file(file, test): lexer = Lexer(file) - context = Context(file, lexer.get_tokens()) + context = Context(file, list(lexer)) Registry().run(context) formatter = JSONErrorsFormatter(file) - assert str(formatter) == json.dumps(test, separators=",:") + '\n' + assert str(formatter) == json.dumps(test, separators=(',', ':')) + '\n' + + +def test_error_from_name(): + Error.from_name("NO_ARGS_VOID") + with pytest.raises(KeyError): + Error.from_name("KeyThatDoesNoExists") + + +@pytest.mark.parametrize("errors", [ + [ + Error("BAD_NAME", "Names can't be started with an '_'", "Error", [H(3, 5, 5)]), + Error("GLOBAL_VAR", "Global variables detected, take care", "Notice", [H(2, 1, 1)]), + Error("test", "ola", "Error", [H(1, 1, 1)]), + ], +]) +def test_add_error_signature(errors: List[Error]): + sequence = Errors() + + for error in errors: + sequence.add(error) + + assert len(sequence) == len(errors) + assert list(errors) == errors + + +@pytest.mark.parametrize("args, kwargs", [ + [["NO_ARGS_VOID",], {"highlights": [H(1, 1, 2)]}], +]) +def test_add_name_signature(args, kwargs): + assert isinstance(args, list) and len(args) == 1 + assert set() == set(kwargs) - {"level", "highlights"} + + errors = Errors() + errors.add(*args, **kwargs) + + 
+def test_error_add_highlight(): + highlights = [ + H(lineno=1, column=1, length=1), + H(lineno=1, column=2, length=1), + ] + + error = Error("42", "42") + error.add_highlight(highlights[0]) + error.add_highlight(*astuple(highlights[1])) + + assert error.highlights == highlights diff --git a/tests/test_lexer.py b/tests/test_lexer.py new file mode 100644 index 00000000..ad647b00 --- /dev/null +++ b/tests/test_lexer.py @@ -0,0 +1,587 @@ +from itertools import chain +from typing import Dict, Any, List, Optional, Tuple + +import pytest + +from norminette.lexer import Token as T +from norminette.lexer.dictionary import keywords, operators, brackets +from norminette.errors import Error as E, Highlight as H +from norminette.exceptions import UnexpectedEOF +from tests.utils import ( + dict_to_pytest_param, + lexer_from_source, +) + + +@pytest.mark.parametrize("source, parameters, expected", dict_to_pytest_param({ + "No args": ["oi", {}, 'o'], + "Empty source": ['', {}, None], + "Collect over than source length": ["hello", {"collect": 10}, "hello"], + "Collect with empty source": ['', {"collect": 3}, None], + "Offset in empty source": ['', {"offset": 3}, None], + "Offset": ["Hello", {"offset": 2}, 'l'], + "Offset with collect": ["Hello, world!", {"offset": 7, "collect": 5}, "world"], + "Offset over than source length with collect": ["Hello, world!", {"offset": 14, "collect": 3}, None], + "Newline": ["\naa", {}, '\n'], + "Escaped newline": ["\\\n", {}, '\\'], +})) +def test_lexer_raw_peek(source: str, parameters: Dict[str, Any], expected: Optional[str]): + lexer = lexer_from_source(source) + + assert lexer.raw_peek(**parameters) == expected + + +@pytest.mark.parametrize("source, parameters, expected", dict_to_pytest_param({ + "Single source char": ['{', {}, ('{', 1)], + "Single digraph source": ["<%", {}, ('{', 2)], + "Single trigraph source": ["??<", {}, ('{', 3)], + "Newline": ['\n', {}, ('\n', 1)], + "Escaped newline": ["\\\n", {}, ('\\', 1)], + "Times with exact chars": ["abc", {"times": 3}, ("abc", 3)], + "Times with trigraphs": [r"??", []], + "ASCII number": ["'9'", "", []], + "Single quote escaped": [r"'\''", r"", []], + "Newline": [r"'\n'", r"", []], + "Empty char": ["''", "", [ + E.from_name("EMPTY_CHAR", highlights=[H(lineno=1, column=1, length=2)])], + ], + "String quote": ['"a"', "None", []], + "Int literal": ['1', "None", []], + "Null": [r"'\0'", r"", []], + "Hexadecimal char E9 (é)": [r"'\xE9'", R"", []], + "Hexadecimal char without sequence": [r"'\x'", R"", [ + E.from_name("NO_HEX_DIGITS", level="Notice", highlights=[ + H(lineno=1, column=3, length=1), + ]), + ]], + "Escape sequence that doesn't exists": [r"'\j'", r"", [ + E.from_name("UNKNOWN_ESCAPE", level="Notice", highlights=[ + H(lineno=1, column=3, length=1), + ]), + ]], + "Char too long": ["'John Galt'", "", [ + E.from_name("CHAR_AS_STRING", highlights=[ + H(lineno=1, column=1, length=len("'John Galt'")), + H(lineno=1, column=1, length=1, + hint="Perhaps you want a string (double quote, \") instead of a char (single quote, ')?"), + ]) + ]], + "Char with L prefix": ["L'a'", "", []], + "Char escaped with L prefix": [r"L'\n'", r"", []], + "Hex with one digit": [r"'\xA'", r"", []], + "Hex with two digits": [r"'\x3F'", r"", []], + "U prefixed char": ["U'h'", "", []], + "u8 prefixed char": ["u8'h'", "", []], + "Bad prefixed char": ["s'h'", "None", []], +})) +def test_lexer_parse_char_literal(source: str, str_expected: str, errors: List[E]): + lexer = lexer_from_source(source) + token = lexer.parse_char_literal() + + assert str(token) 
== str_expected + assert repr(lexer.file.errors) == repr(errors) + + +@pytest.mark.parametrize("source, str_expected, errors", dict_to_pytest_param({ + "Empty string": ["\"\"", "", []], + "ASCII normal string": ["\"x+1=2, where x=1\"", "", []], + "Single quote string": ["'teste'", "None", []], + "Unexpected EOF with empty string": ['\"', "", [ + E.from_name("UNEXPECTED_EOF_STR", highlights=[ + H(lineno=1, column=1, length=1), + H(lineno=1, column=2, length=1, hint="Perhaps you forgot a double quote (\")?"), + ]), + ]], + "Unexpected EOF": ['\"asd', "", [ + E.from_name("UNEXPECTED_EOF_STR", highlights=[ + H(lineno=1, column=1, length=4), + H(lineno=1, column=5, length=1, hint="Perhaps you forgot a double quote (\")?"), + ]), + ]], + "String with escaped new line": ["\"first\\\n second\"", "", []], + "Basic string": ["\"Basic string\"", "", []], + "L basic string": ["L\"Basic string\"", "", []], + "U prefixed string": ["U\"hIGH\"", "", []], + "u8 prefixed string": ["u8\"hIGH\"", "", []], + "Bad prefixed string": ["s\"hIGH\"", "None", []], + "String with escaped quotes": ["\"Basic \\\"string\\\"\"", "", []], + "Multiples escapes and escaped quote": [r'"Escaped \\\"string\\\\\"\\"', + r'', + []], +})) +def test_lexer_parse_string_literal(source: str, str_expected: str, errors: List[E]): + lexer = lexer_from_source(source) + token = lexer.parse_string_literal() + + assert str(token) == str_expected + assert repr(lexer.file.errors) == repr(errors) + + +@pytest.mark.parametrize("source, str_expected", dict_to_pytest_param({ + "Empty comment": ["//", ""], + "Comment at EOF": ["// The sky is falling", ""], + "Comment at EOL": ["// The sky is falling\n", ""], + "Comment with escaped line in EOF": ["// The sky is falling\\", r""], + "Comment with escaped line in EOF using trigraph": [r"// The sky is falling??/", + r""], + "Comment with escaped line": ["// The sky is falling\\\n!", ""], + "Comment with escaped line using trigraph": ["// The sky is falling??/\n!", ""], +})) +def test_lexer_parse_line_comment(source: str, str_expected: str): + lexer = lexer_from_source(source) + token = lexer.parse_line_comment() + + assert str(token) == str_expected + assert lexer.file.errors.status == "OK" + + +@pytest.mark.parametrize("source, str_expected, errors", dict_to_pytest_param({ + "Multi-line comment in single line at EOF": ["/* The sky is falling*/", + "", []], + "Multi-line comment in multiples lines at EOF": ["/*\na\nb\n\n\n*/", "", []], + "Multi-line comment with escaped line": ["/*\\\na*/", "", []], + "Multi-line comment with escaped line using trigraph": ["/*??/\na*/", "", []], + "Multi-line comment not terminated with escaped line before EOF": ["/*\\\n", "", [ + E.from_name("UNEXPECTED_EOF_MC", highlights=[ + H(lineno=1, column=1, length=len("/*")), + ]), + ]], + "Multi-line comment not terminated": ["/* uepaaa\ne agora??", "", [ + E.from_name("UNEXPECTED_EOF_MC", highlights=[ + H(lineno=1, column=1, length=len("/* uepaaa\ne agora??")), + ]), + ]], + "Multi-line comment not terminate ending with a backslash": ["/*\\", r"", [ + E.from_name("UNEXPECTED_EOF_MC", highlights=[ + H(lineno=1, column=1, length=len("/*\\")), + ]), + ]], + "Comment (not multi-line)": ["// hey, i'm not a multi-line comment", "None", []], + "Space before a multi-line comment": [" /* */", "None", []], +})) +def test_lexer_parse_multi_line_comment(source: str, str_expected: str, errors: List[E]): + lexer = lexer_from_source(source) + token = lexer.parse_multi_line_comment() + + assert str(token) == str_expected + assert 
repr(lexer.file.errors) == repr(errors) + + +@pytest.mark.parametrize("source, str_expected, errors", dict_to_pytest_param({ + "Decimal integer": ["1234567890", "", []], + "Decimal integer with UL as suffix": ["1234567890UL", "", []], + "Decimal integer with bad suffix": ["1234567890ABC", "", [ + E.from_name("INVALID_SUFFIX", highlights=[ + H(lineno=1, column=11, length=len("ABC")), + ]), + ]], + "Binary integer": ["0b1101011", "", []], + "Binary integer with U as suffix": ["0b000001U", "", []], + "Binary integer with bad digits": ["0b1210491011", "", [ + E.from_name("INVALID_BIN_INT", highlights=[ + H(lineno=1, column=4, length=1, hint=None), # 2 + H(lineno=1, column=7, length=1, hint=None), # 4 + H(lineno=1, column=8, length=1, hint=None), # 9 + ]), + ]], + "Binary with bad suffix": ["0b0101e", "", [ + E.from_name("INVALID_SUFFIX", highlights=[H(lineno=1, column=7, length=1)]), + ]], + "Octal integer": ["01234567123", "", []], + "Octal integer with U as suffix": ["0123u", "", []], + "Octal integer with bad digits": ["00072189", "", [ + E.from_name("INVALID_OCT_INT", highlights=[ + H(lineno=1, column=7, length=1, hint=None), # 8 + H(lineno=1, column=8, length=1, hint=None), # 9 + ]), + ]], + "Octal integer with bad suffix with dots": ["000123u.23", "", [ + E.from_name("INVALID_SUFFIX", highlights=[ + H(lineno=1, column=len("000123") + 1, length=len("u.23")), + ]), + ]], + "Hexadecimal with bad suffix": ["0x1uLl;", "", [ + E.from_name("INVALID_SUFFIX", highlights=[H(lineno=1, column=4, length=3)]), + ]], + "Integer with u suffix": ["123u", "", []], + "Integer with U suffix": ["123U", "", []], + "Integer with uz suffix": ["123uz", "", []], + "Integer with UZ suffix": ["123UZ", "", []], + "Integer with z suffix": ["123z", "", []], + "Integer with Z suffix": ["123Z", "", []], + "Integer with ul suffix": ["123ul", "", []], + "Integer with UL suffix": ["123UL", "", []], + "Integer with ull suffix": ["123ull", "", []], + "Integer with ULL suffix": ["123ULL", "", []], + "Integer with ll suffix": ["9000000000ll", "", []], + "Integer with LL suffix": ["9000000000LL", "", []], + "Integer with bad suffix": ["10Uu", "", [ + E.from_name("INVALID_SUFFIX", highlights=[H(lineno=1, column=3, length=len("10"))]), + ]], +})) +def test_lexer_parse_integer_literal(source: str, str_expected: str, errors: List[E]): + lexer = lexer_from_source(source) + token = lexer.parse_integer_literal() + + assert str(token) == str_expected + assert repr(lexer.file.errors) == repr(errors) + + +@pytest.mark.parametrize("source, str_expected, errors", dict_to_pytest_param({ + "Integer": ["1234567890", "None", []], + "Integer with exponent-part": ["1e2", "", []], + "Integer with exponent-part and f as suffix": ["1e2f", "", []], + "Integer with bad exponent-part": ["1eeee2xf", "", [ + E.from_name("BAD_EXPONENT", highlights=[H(lineno=1, column=2, length=7)]), + ]], + "Exponent with sign": ["1e+3", "", []], + "Bad float followed by an unary expression": ["45e++ai", "", [ + E.from_name("BAD_EXPONENT", highlights=[H(lineno=1, column=3, length=2)]), + ]], + "Identifier with numbers": ["e42", "None", []], + "Fractional exponent with bad suffix": [".0e4x;", "", [ + E.from_name("BAD_FLOAT_SUFFIX", highlights=[H(lineno=1, column=5, length=1)]), + ]], + "Integer with bad suffix": ["10uu", "None", []], + "Bad suffix with all parts": ["10.12fe10", "", [ + E.from_name("BAD_FLOAT_SUFFIX", highlights=[H(lineno=1, column=6, length=len("fe10"))]), + ]], + "Float without fractional part but with suffix": ["10.f", "", []], + "Float without 
fractional part but bad suffix": ["10.fU", "", [ + E.from_name("BAD_FLOAT_SUFFIX", highlights=[H(lineno=1, column=4, length=2)]), + ]], + "Real bad suffix": ["21.3E56E4654", "", [ + E.from_name("BAD_FLOAT_SUFFIX", highlights=[H(lineno=1, column=8, length=5)]), + ]], + "Exponent with D suffix": ["105e4d", "", []], + "Bad exponent followed by a suffix": ["105eu", "", [ + E.from_name("BAD_EXPONENT", highlights=[H(lineno=1, column=4, length=2)]), + ]], + "Multiple dots": ["1.1..2.3.4.5", "", [ + E.from_name("MULTIPLE_DOTS", highlights=[H(lineno=1, column=4, length=len("..2.3.4.5"))]), + ]], + "Hexadecimal multiple dots": ["0xF.22..2.3.4.5", "", [ + E.from_name("MULTIPLE_DOTS", highlights=[H(lineno=1, column=7, length=len("..2.3.4.5"))]), + ]], + "Hexadecimal with just constant": ["0xC0FFE", "None", []], + "Hexadecimal integer with suffix": ["0XA0000024u", "None", []], + "Hexadecimal integer with double suffix": ["0XA0000021uL", "None", []], + "Multiple X": ["0xxXxxX123.32f", "", [ + E.from_name("MULTIPLE_X", highlights=[H(lineno=1, column=2, length=len("xxXxxX"))]), + ]], + "Multiple X in an integer hexadecimal": ["0xX1", "None", []], + "Multiple X with exponent": ["0xxAp2", "", [ + E.from_name("MULTIPLE_X", highlights=[H(lineno=1, column=2, length=2)]), + ]], + **{ + # https://www.gnu.org/software/c-intro-and-ref/manual/html_node/Floating-Constants.html + f"Float GNU {number} {source!r}": [source, f"", []] + for number, source in enumerate(( + "1500.0", "15e2", "15e+2", "15.0e2", "1.5e+3", ".15e4", "15000e-1", + "1.0", "1000.", "3.14159", ".05", ".0005", "1e0", "1.0000e0", "100e1", + "100e+1", "100E+1", "1e3", "10000e-1", "3.14159e0", "5e-2", ".0005e+2", + "5E-2", ".0005E2", ".05e-2", "3.14159f", "3.14159e0f", "1000.f", "100E1F", + ".0005f", ".05e-2f", + "0xAp2", "0xAp-1", "0x2.0Bp4", "0xE.2p3", "0x123.ABCp0", + "0x123.ABCp4", "0x100p-8", "0x10p-4", "0x1p+4", "0x1p+8", + )) + } +})) +def test_lexer_parse_float_literal(source: str, str_expected: str, errors: List[E]): + lexer = lexer_from_source(source) + token = lexer.parse_float_literal() + + assert str(token) == str_expected + assert repr(lexer.file.errors) == repr(errors) + + +@pytest.mark.parametrize("source, str_expected", dict_to_pytest_param({ + "Identifier starting with an integer": ["42_hello", "None"], + "Identifier starting with an underscore": ["_hello", ""], + "ft_printf identifier": ["ft_printf", ""], + "Identifier with just underscore": ['_', ""], + "Identifier with just one letter": ['a', ""], + "Identifier with uppercase letters": ["EGGS", ""], + "Identifier with mixedcase letters": ["AbCd", ""], + "Identifier with lowercase letters": ["duck", ""], + "Identifier with an hyphen": ["clojure-is-cool", ""], + "Identifier with integers, letters and underscores": ["ascii_2_bigint128", ""], + "String starting with an letter": ["L\"ola\"", ""], + "Char starting with an letter": ["L'1'", ""], +})) +def test_lexer_parse_identifier(source: str, str_expected: str): + lexer = lexer_from_source(source) + token = lexer.parse_identifier() + + assert str(token) == str_expected + assert lexer.file.errors.status == "OK" + + +@pytest.mark.parametrize("keyword", keywords.keys()) +def test_lexer_parse_identifier_keyword_only(keyword: str): + lexer = lexer_from_source(keyword) + token = lexer.parse_identifier() + + assert str(token) == f"<{keyword.upper()}>" + assert lexer.file.errors.status == "OK" + + +@pytest.mark.parametrize("operator, token_type", list(operators.items()) + [ + ["??=", "HASH"], + ["%:", "HASH"], + ["??'", "BWISE_XOR"], + 
["??'=", "XOR_ASSIGN"], + ["??!", "BWISE_OR"], + ["??!??!", "OR"], + ["??!=", "OR_ASSIGN"], + ["??-", "BWISE_NOT"], +]) +def test_lexer_parse_operator(operator: str, token_type: str): + lexer = lexer_from_source(operator) + token = lexer.parse_operator() + + assert str(token) == f"<{token_type}>" + assert lexer.file.errors.status == "OK" + + +@pytest.mark.parametrize("bracket, token_type", list(brackets.items()) + [ + ["<%", "LBRACE"], + ["??<", "LBRACE"], + ["%>", "RBRACE"], + ["??>", "RBRACE"], + ["<:", "LBRACKET"], + ["??(", "LBRACKET"], + [":>", "RBRACKET"], + ["??)", "RBRACKET"], +]) +def test_lexer_parse_brackets(bracket: str, token_type: str): + lexer = lexer_from_source(bracket) + token = lexer.parse_brackets() + + assert str(token) == f"<{token_type}>" + assert lexer.file.errors.status == "OK" + + +@pytest.mark.parametrize("source, expected_tokens", dict_to_pytest_param({ + "Empty source": ['', []], + "Just space source": [" ", [ + T("SPACE", (1, 1)), + T("SPACE", (1, 2)), + T("SPACE", (1, 3)), + ]], + "Identifier followed by a comment": ["test//comment", [ + T("IDENTIFIER", (1, 1), "test"), + T("COMMENT", (1, 5), "//comment"), + ]], + "Main function prototype with void": ["int\tmain(void);", [ + T("INT", (1, 1)), + T("TAB", (1, 4)), + T("IDENTIFIER", (1, 5), value="main"), + T("LPARENTHESIS", (1, 9)), + T("VOID", (1, 10)), + T("RPARENTHESIS", (1, 14)), + T("SEMI_COLON", (1, 15)), + ]], + # Checks if `identifier` is bellow to `char` and `string` + "Wide char/string followed by identifier": ["L'a' L\"bcd\" name", [ + T("CHAR_CONST", (1, 1), value="L'a'"), + T("SPACE", (1, 5)), + T("STRING", (1, 6), value="L\"bcd\""), + T("SPACE", (1, 12)), + T("IDENTIFIER", (1, 13), value="name"), + ]], + "Integer": ["42", [T("CONSTANT", (1, 1), value="42")]], + "Integer with plus sign": ["+42", [ + T("PLUS", (1, 1)), + T("CONSTANT", (1, 2), value="42"), + ]], + "Integer with minus sign": ["-42", [ + T("MINUS", (1, 1)), + T("CONSTANT", (1, 2), value="42"), + ]], + "Integer with double sign": ["+-42", [ + T("PLUS", (1, 1)), + T("MINUS", (1, 2)), + T("CONSTANT", (1, 3), value="42"), + ]], + "Float": ["4.2", [T("CONSTANT", (1, 1), value="4.2")]], + "Float without integer part": [".42", [T("CONSTANT", (1, 1), value=".42")]], + "Float exponential": ["4e2", [T("CONSTANT", (1, 1), value="4e2")]], + "Float with exponential in fractional part without integer": [".4e2", [T("CONSTANT", (1, 1), value=".4e2")]], + "Float exponential with suffix": ["4e2f", [T("CONSTANT", (1, 1), value="4e2f")]], + "Float exponential in fractional part with suffix": [".4e2f", [T("CONSTANT", (1, 1), value=".4e2f")]], + "Octal": ["042", [T("CONSTANT", (1, 1), value="042")]], + "Hexadecimal": ["0x42", [T("CONSTANT", (1, 1), value="0x42")]], + "Negative hexadecimal": ["-0x4e2", [ + T("MINUS", (1, 1)), + T("CONSTANT", (1, 2), value="0x4e2"), + ]], + "Integer with l as suffix": ["42l", [T("CONSTANT", (1, 1), value="42l")]], + "Integer with ul as suffix": ["42ul", [T("CONSTANT", (1, 1), value="42ul")]], + "Integer with ll as suffix": ["42ll", [T("CONSTANT", (1, 1), value="42ll")]], + "Integer with ull as suffix": ["42ull", [T("CONSTANT", (1, 1), value="42ull")]], + "Integer with u suffix": ["42u", [T("CONSTANT", (1, 1), value="42u")]], + "Multiples signs": ["-+-+-+-+-+-+-+-0Xe4Ae2", [ + T("MINUS", (1, 1)), + T("PLUS", (1, 2)), + T("MINUS", (1, 3)), + T("PLUS", (1, 4)), + T("MINUS", (1, 5)), + T("PLUS", (1, 6)), + T("MINUS", (1, 7)), + T("PLUS", (1, 8)), + T("MINUS", (1, 9)), + T("PLUS", (1, 10)), + T("MINUS", (1, 11)), + T("PLUS", (1, 
12)), + T("MINUS", (1, 13)), + T("PLUS", (1, 14)), + T("MINUS", (1, 15)), + T("CONSTANT", (1, 16), value="0Xe4Ae2"), + ]], + "Member expression with left part": [".e42", [ + T("DOT", (1, 1)), + T("IDENTIFIER", (1, 2), value="e42") + ]], + "Multiples dots in float": ["4.4.4", [T("CONSTANT", (1, 1), value="4.4.4")]], + "Multiples exponents": ["4e4e4", [T("CONSTANT", (1, 1), value="4e4e4")]], + "Bad suffix 1": ["4x4x4", [T("CONSTANT", (1, 1), value="4x4x4")]], + "Bad suffix 2": ["42uul", [T("CONSTANT", (1, 1), value="42uul")]], + "Bad suffix 3": ["42Lllu", [T("CONSTANT", (1, 1), value="42Lllu")]], + "Bad suffix 4": ["42lul", [T("CONSTANT", (1, 1), value="42lul")]], + "Bad exponent": [".42e", [T("CONSTANT", (1, 1), ".42e")]], + "Escaped newline followed by an identifier": ["\\\nhello;", [ + T("IDENTIFIER", (2, 1), value="hello"), + T("SEMI_COLON", (2, 6)), + ]], + # TODO Add tests for digraphs/trigraphs + **dict(chain.from_iterable(map(dict.items, ( + { + f"Empty {name}": [f"#{name}", [ + T("HASH", (1, 1)), + T("IDENTIFIER", (1, 2), value=name), + ]], + f"Empty spaced {name}": [f"# {name} ", [ + T("HASH", (1, 1)), + T("SPACE", (1, 2)), + T("IDENTIFIER", (1, 3), value=name), + T("SPACE", (1, 3 + len(name))), + ]], + f"Empty {name} ending with withespaces": [f"#{name} ", [ + T("HASH", (1, 1)), + T("IDENTIFIER", (1, 2), value=name), + T("SPACE", (1, 2 + len(name))), + T("TAB", (1, 3 + len(name))), + ]], + f"Empty {name} ending with a comment separated by space": [f"#{name} //bla", [ + T("HASH", (1, 1)), + T("IDENTIFIER", (1 , 2), value=name), + T("SPACE", (1, 2 + len(name))), + T("COMMENT", (1, 3 + len(name)), value="//bla"), + ]], + f"Empty {name} followed by a comment": [f"#{name}//bla ", [ + T("HASH", (1, 1)), + T("IDENTIFIER", (1, 2), value=name), + T("COMMENT", (1, 2 + len(name)), value="//bla "), + ]], + } + for name in ("define", "error", "ifndef", "ifdef", "include", "pragma", "undef") + )))), +})) +def test_lexer_tokens(source: str, expected_tokens: List[T]): + lexer = lexer_from_source(source) + tokens = list(lexer) + + assert tokens == expected_tokens diff --git a/tests/tokenizer/samples/ok/file_token_test.py b/tests/tokenizer/samples/ok/file_token_test.py deleted file mode 100644 index 88a72576..00000000 --- a/tests/tokenizer/samples/ok/file_token_test.py +++ /dev/null @@ -1,84 +0,0 @@ -import difflib -import glob -import sys - -from norminette.lexer import Lexer -from norminette.lexer import TokenError - - -def read_file(filename): - with open(filename) as f: - return f.read() - - -class norminetteFileTester: - def __init__(self): - self.__tests = 0 - self.__failed = 0 - self.__success = 0 - self.result = [] - - def assertEqual(self, first, second): - if first == second: - self.__success += 1 - print("OK") - self.result.append("✓ ") - else: - print("Error") - self.__failed += 1 - diff = difflib.ndiff( - first.splitlines(keepends=True), second.splitlines(keepends=True) - ) - diff = list(diff) - self.result.append("✗ ") - print("".join(diff)) - - def assertRaises(self, test, ref): - try: - diff = "".join(test()) - self.__failed += 1 - print("Error") - print(diff, end="") - self.result.append("✗ ") - except TokenError as e: - if e.msg == ref: - self.__success += 1 - print("OK") - self.result.append("✓ ") - else: - self.__failed += 1 - print("Error") - diff = difflib.ndiff(e.msg.splitlines(), ref.splitlines()) - diff = list(diff) - self.result.append("✗ ") - print("".join(diff)) - - def test_files(self): - files = glob.glob("tests/lexer/files/*.c") - files.sort() - for f in files: - 
self.__tests += 1 - print(f.split("/")[-1], end=": ") - - try: - output = Lexer(read_file(f)).check_tokens() - except TokenError as t: - self.__failed += 1 - print("Error") - print(t) - self.result.append("✗ ") - continue - reference_output = read_file(f.split(".")[0] + ".tokens") - self.assertEqual(output, reference_output) - - print("----------------------------------") - print(f"Total {self.__tests}") - print("".join(self.result)) - print(f"Success {self.__success}, Failed {self.__failed}: ", end="") - print("✅ OK!" if self.__failed == 0 else "❌ Error!") - - sys.exit(0 if self.__failed == 0 else 1) - - -if __name__ == "__main__": - norminetteFileTester().test_files() diff --git a/tests/tokenizer/token_errors_test.py b/tests/tokenizer/token_errors_test.py deleted file mode 100644 index e19bda1b..00000000 --- a/tests/tokenizer/token_errors_test.py +++ /dev/null @@ -1,49 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer, TokenError - - -failed_tokens_tests = [ - {"text": "\tdouble f=45e++ai", "line": 1, "pos": 14}, - {"text": '\tchar *b = "e42\n\n', "line": 1, "pos": 15}, - {"text": "int\t\t\tn\t= 0x1uLl;", "line": 1, "pos": 19}, - {"text": 'char\t\t\t*yo\t\t\t= "', "line": 1, "pos": 31}, - {"text": "{return 1;}\\\\\\n", "line": 1, "pos": 12}, - {"text": "int a = a+++++a;\ndouble b = .0e4x;", "line": 2, "pos": 12}, - {"text": "int a = 1;\nint b = 10ul;\nint c = 10lul;\n", "line": 3, "pos": 9}, - {"text": "int number = 0x1uLl;", "line": 1, "pos": 14}, - {"text": "int number = 0x1ULl;", "line": 1, "pos": 14}, - {"text": "int number = 0x1lL;", "line": 1, "pos": 14}, - {"text": "int number = 0x1Ll;", "line": 1, "pos": 14}, - {"text": "int number = 0x1UlL;", "line": 1, "pos": 14}, - {"text": "int number = 10ullll", "line": 1, "pos": 14}, - {"text": "int number = 10lul", "line": 1, "pos": 14}, - {"text": "int number = 10lUl", "line": 1, "pos": 14}, - {"text": "int number = 10LUl", "line": 1, "pos": 14}, - {"text": "int number = 10uu", "line": 1, "pos": 14}, - {"text": "int number = 10Uu", "line": 1, "pos": 14}, - {"text": "int number = 10UU", "line": 1, "pos": 14}, - {"text": "int number = 0b0101e", "line": 1, "pos": 14}, - {"text": "int number = 0b0101f", "line": 1, "pos": 14}, - {"text": "int number = 0b0X101f", "line": 1, "pos": 14}, - {"text": "int number = 0X101Uf", "line": 1, "pos": 14}, - {"text": "int number = 0101f", "line": 1, "pos": 14}, - {"text": "float number=10.12fe10", "line": 1, "pos": 14}, - {"text": "float number=10.fU", "line": 1, "pos": 14}, - {"text": "float number=21.3E56E4654", "line": 1, "pos": 14}, - {"text": "float number=105e4d", "line": 1, "pos": 14}, - {"text": "float number=105flu", "line": 1, "pos": 14}, - {"text": "float number=105fu", "line": 1, "pos": 14}, - {"text": "float number=105eu", "line": 1, "pos": 14}, -] - - -@pytest.mark.parametrize( - "data", failed_tokens_tests, ids=[data["text"] for data in failed_tokens_tests] -) -def test_tokenizing_errors(data): - text, line, pos = data.values() - - with pytest.raises(TokenError, match=f"({line}, {pos})"): - Lexer(File("", text)).check_tokens() diff --git a/tests/tokenizer/token_generator_test.py b/tests/tokenizer/token_generator_test.py index 4061f92a..67d91236 100644 --- a/tests/tokenizer/token_generator_test.py +++ b/tests/tokenizer/token_generator_test.py @@ -15,6 +15,14 @@ def test_rule_for_file(file): with open(f"{file.split('.')[0]}.tokens") as out_file: out_content = out_file.read() - output = Lexer(File(file)).check_tokens() + lexer = Lexer(File(file)) + + 
output = ''
+    tokens = list(lexer)
+    if tokens:
+        for token in tokens:
+            output += str(token) + '\n' * int(token.type == "NEWLINE")
+        if tokens[-1].type != "NEWLINE":
+            output += "\n"
 
     assert output == out_content
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 00000000..ecf7631b
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,20 @@
+from typing import Dict, Any, List
+
+import pytest
+from _pytest.mark.structures import ParameterSet
+
+from norminette.file import File
+from norminette.lexer import Lexer
+
+
+def lexer_from_source(source: str, /) -> Lexer:
+    file = File("", source)
+    return Lexer(file)
+
+
+def dict_to_pytest_param(data: Dict[str, List[Any]]) -> List[ParameterSet]:
+    params: List[ParameterSet] = []
+    for id, values in data.items():
+        param = pytest.param(*values, id=id)
+        params.append(param)
+    return params