Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add to translate (di|tri)graphs in Lexer, fix #474 #486

Merged
merged 5 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 14 additions & 1 deletion norminette/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
class CParsingError(Exception):
class NorminetteError(Exception):
    """Common base class for the exceptions defined by norminette."""


class CParsingError(NorminetteError):
def __init__(self, errmsg):
self.msg = errmsg

Expand All @@ -7,3 +11,12 @@ def __str__(self):

def __repr__(self):
    """Return the same text as ``str(self)``.

    ``__repr__`` must return a string; the previous body returned the bound
    method ``self.__str__`` without calling it, which makes ``repr()`` raise
    ``TypeError``.
    """
    return self.__str__()


class MaybeInfiniteLoop(NorminetteError):
    """Raised when a bounded loop uses up its whole iteration budget."""

    def __init__(self) -> None:
        # The message is fixed; callers raise this exception without arguments.
        message = "The maximum number of iterations a loop can have has been reached"
        super().__init__(message)


class UnexpectedEOF(NorminetteError):
    """Raised when the source ends while more input is still expected."""
20 changes: 20 additions & 0 deletions norminette/lexer/dictionary.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,25 @@
""" Dictionary that correlates lexeme with token """

# C trigraph sequences mapped to the single character each one stands for.
trigraphs = {
    "??<": "{",
    "??>": "}",
    "??(": "[",
    "??)": "]",
    "??=": "#",
    "??/": "\\",  # a single backslash
    "??'": "^",
    "??!": "|",
    "??-": "~",
}

# C digraph sequences mapped to the single character each one stands for.
digraphs = {
    "<%": "{",
    "%>": "}",
    "<:": "[",
    ":>": "]",
    "%:": "#",
}

keywords = {
# C reserved keywords #
"auto": "AUTO",
Expand Down
90 changes: 58 additions & 32 deletions norminette/lexer/lexer.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
import string
from typing import Optional, Tuple

from norminette.exceptions import UnexpectedEOF, MaybeInfiniteLoop
from norminette.lexer.dictionary import digraphs, trigraphs
from norminette.lexer.dictionary import brackets
from norminette.lexer.dictionary import keywords
from norminette.lexer.dictionary import operators
from norminette.lexer.tokens import Token
from norminette.file import File


def read_file(filename):
    """Return the whole content of *filename*, opened in default text mode."""
    with open(filename) as stream:
        content = stream.read()
    return content


class TokenError(Exception):
def __init__(self, pos, message=None):
self.msg = message or f"Error: Unrecognized token line {pos[0]}, col {pos[1]}"
Expand All @@ -26,34 +24,59 @@ def __init__(self, file: File):

self.src = file.source
self.len = len(file.source)
self.__char = self.src[0] if self.src != "" else None
self.__pos = int(0)
self.__line_pos = self.__line = 1
self.tokens = []

def peek_sub_string(self, size):
    """Return the next *size* characters of the source, starting at the
    current position, without moving that position.
    """
    begin = self.__pos
    return self.src[begin:begin + size]

def peek(self, *, offset: int = 0, collect: int = 1):
def raw_peek(self, *, offset: int = 0, collect: int = 1):
    """Look ahead in the source without consuming or translating anything.

    Returns up to ``collect`` characters starting ``offset`` positions after
    the current one, or ``None`` when that starting position is at or past
    the end of the source. Near the end the result may be shorter than
    ``collect``.
    """
    assert collect > 0 and offset >= 0
    start = self.__pos + offset
    if start >= self.len:
        return None
    stop = start + collect
    return "".join(self.src[start:stop])

def peek(self, *, offset: int = 0) -> Optional[Tuple[str, int]]:
    """Look ahead one logical character, translating trigraphs and digraphs.

    Returns a ``(character, size)`` pair, where ``size`` is the number of raw
    source characters the logical character occupies (3 for a trigraph, 2 for
    a digraph, 1 otherwise), or ``None`` when nothing is left at ``offset``.
    """
    # Longest sequences first: trigraphs are tried before digraphs.
    for table, width in ((trigraphs, 3), (digraphs, 2)):
        lexeme = self.raw_peek(offset=offset, collect=width)
        if lexeme in table:
            return table[lexeme], width
    single = self.raw_peek(offset=offset)
    if single:
        return single, 1
    # End of source: callers that unpack the result will fail on this None.
    return None

def pop(self, *, times: int = 1, use_spaces: bool = False):
assert times > 0
result = ""
for _ in range(times):
char = self.peek()
for _ in range(100):
char, size = self.peek()
if char != '\\':
break
if self.peek(offset=size) is None:
break
temp, _ = self.peek(offset=size) # Don't change the `temp` to `char`
if temp != '\n':
break
self.__pos += size + 1
self.__line += 1
self.__line_pos = 0
if self.peek() is None:
raise UnexpectedEOF()
char, size = self.peek()
break
else:
raise MaybeInfiniteLoop()
matthieu42Network marked this conversation as resolved.
Show resolved Hide resolved
if char == '\n':
self.__line_pos = 0
self.__line += 1
if char == '\t':
self.__line_pos += (spaces := 4 - (self.__line_pos - 1) % 4) - 1
if use_spaces:
char = ' ' * spaces
self.__line_pos += 1
self.__pos += 1
self.__line_pos += size
self.__pos += size
result += char
return result

Expand All @@ -63,14 +86,12 @@ def peek_char(self):
character is appended to the return value. It will allow us to
parse escaped characters easier.
"""
char = None
if self.__pos < self.len:
char = self.src[self.__pos]
if self.src[self.__pos] == "\\":
self.__char = self.src[self.__pos : self.__pos + 2]
else:
self.__char = self.src[self.__pos]
else:
self.__char = None
return self.__char
char = self.src[self.__pos : self.__pos + 2]
return char

def pop_char(self, skip_escaped=True):
"""Pop a character that's been read by increasing self.__pos,
Expand All @@ -93,10 +114,7 @@ def line_pos(self):

def is_string(self):
"""True if current character could start a string constant"""
if self.peek_sub_string(2) == 'L"' or self.peek_char() == '"':
return True
else:
return False
return self.raw_peek(collect=2) == 'L"' or self.raw_peek() == '"'

def is_constant(self):
"""True if current character could start a numeric constant"""
Expand All @@ -115,10 +133,7 @@ def is_constant(self):

def is_char_constant(self):
"""True if current character could start a character constant"""
if self.peek_char() == "'" or self.peek_sub_string(2) == "L'":
return True
else:
return False
return self.raw_peek() == "'" or self.raw_peek(collect=2) == "L'"

def string(self):
"""String constants can contain any characer except unescaped newlines.
Expand Down Expand Up @@ -313,24 +328,35 @@ def constant(self):
def mult_comment(self):
pos = self.line_pos()
val = self.pop(times=2)
# TODO: Report the `UnexpectedEOF` exception as an error in `file.errors`
while self.peek():
if self.peek(collect=2) == "*/":
val += self.pop(times=2)
break
# the `.pop(...)` can raise an `UnexpectedEOF` if source is like:
# ```c
# /*\
#
# ```
# note the backslash followed by an empty line
val += self.pop(use_spaces=True)
if val.endswith("*/"):
self.tokens.append(Token("MULT_COMMENT", pos, val))
if val.endswith("*/"):
break
else:
raise TokenError(pos)
raise UnexpectedEOF()
self.tokens.append(Token("MULT_COMMENT", pos, val))

def comment(self):
"""Comments are anything after '//' characters, up until a newline or
end of file
"""
pos = self.line_pos()
val = self.pop(times=2)
while self.peek() and self.peek() != '\n':
val += self.pop()
while self.peek():
char, _ = self.peek()
if char in ('\n', None):
break
try:
val += self.pop()
except UnexpectedEOF:
break
self.tokens.append(Token("COMMENT", pos, val))

def identifier(self):
Expand Down
11 changes: 11 additions & 0 deletions tests/rules/samples/test_comment_escaping_with_backslash.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/* *\
/

#include <stdio.h>

int main() {
for (int i=0; i<2;++i)
printf("%d\n", i);
}

/**/
37 changes: 37 additions & 0 deletions tests/rules/samples/test_comment_escaping_with_backslash.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
test_comment_escaping_with_backslash.c - IsComment In "GlobalScope" from "None" line 1":
<MULT_COMMENT=/* */> <NEWLINE>
test_comment_escaping_with_backslash.c - IsEmptyLine In "GlobalScope" from "None" line 3":
<NEWLINE>
test_comment_escaping_with_backslash.c - IsPreprocessorStatement In "GlobalScope" from "None" line 4":
<HASH> <IDENTIFIER=include> <SPACE> <LESS_THAN> <IDENTIFIER=stdio> <DOT> <IDENTIFIER=h> <MORE_THAN> <NEWLINE>
test_comment_escaping_with_backslash.c - IsEmptyLine In "GlobalScope" from "None" line 5":
<NEWLINE>
test_comment_escaping_with_backslash.c - IsFuncDeclaration In "GlobalScope" from "None" line 6":
<INT> <SPACE> <IDENTIFIER=main> <LPARENTHESIS> <RPARENTHESIS> <SPACE>
test_comment_escaping_with_backslash.c - IsBlockStart In "Function" from "GlobalScope" line 6":
<LBRACE> <NEWLINE>
test_comment_escaping_with_backslash.c - IsControlStatement In "Function" from "GlobalScope" line 7":
<SPACE> <SPACE> <FOR> <SPACE> <LPARENTHESIS> <INT> <SPACE> <IDENTIFIER=i> <ASSIGN> <CONSTANT=0> <SEMI_COLON> <SPACE> <IDENTIFIER=i> <LESS_THAN> <CONSTANT=2> <SEMI_COLON> <INC> <IDENTIFIER=i> <RPARENTHESIS> <NEWLINE>
test_comment_escaping_with_backslash.c - IsFunctionCall In "ControlStructure" from "Function" line 8":
<SPACE> <SPACE> <SPACE> <SPACE> <IDENTIFIER=printf> <LPARENTHESIS> <STRING="%d\n"> <COMMA> <SPACE> <IDENTIFIER=i> <RPARENTHESIS> <SEMI_COLON> <NEWLINE>
test_comment_escaping_with_backslash.c - IsBlockEnd In "Function" from "GlobalScope" line 9":
<RBRACE> <NEWLINE>
test_comment_escaping_with_backslash.c - IsEmptyLine In "GlobalScope" from "None" line 10":
<NEWLINE>
test_comment_escaping_with_backslash.c - IsComment In "GlobalScope" from "None" line 11":
<MULT_COMMENT=/**/>
test_comment_escaping_with_backslash.c: Error!
Error: INVALID_HEADER (line: 3, col: 1): Missing or invalid 42 header
Error: SPACE_BEFORE_FUNC (line: 6, col: 4): space before function name
Error: NO_ARGS_VOID (line: 6, col: 10): Empty function argument requires void
Error: BRACE_NEWLINE (line: 6, col: 12): Expected newline before brace
Error: TOO_FEW_TAB (line: 7, col: 1): Missing tabs for indent level
Error: FORBIDDEN_CS (line: 7, col: 3): Forbidden control structure
Error: SPACE_REPLACE_TAB (line: 7, col: 3): Found space when expecting tab
Error: SPC_AFTER_OPERATOR (line: 7, col: 13): missing space after operator
Error: SPC_BFR_OPERATOR (line: 7, col: 13): missing space before operator
Error: SPC_AFTER_OPERATOR (line: 7, col: 18): missing space after operator
Error: SPC_BFR_OPERATOR (line: 7, col: 18): missing space before operator
Error: SPC_AFTER_OPERATOR (line: 7, col: 20): missing space after operator
Error: TOO_FEW_TAB (line: 8, col: 1): Missing tabs for indent level
Error: SPACE_REPLACE_TAB (line: 8, col: 5): Found space when expecting tab
11 changes: 11 additions & 0 deletions tests/rules/samples/test_comment_escaping_with_trigraph.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/* *??/
/

#include <stdio.h>

int main() {
for (int i=0; i<2;++i)
printf("%d\n", i);
}

/**/
37 changes: 37 additions & 0 deletions tests/rules/samples/test_comment_escaping_with_trigraph.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
test_comment_escaping_with_trigraph.c - IsComment In "GlobalScope" from "None" line 1":
<MULT_COMMENT=/* */> <NEWLINE>
test_comment_escaping_with_trigraph.c - IsEmptyLine In "GlobalScope" from "None" line 3":
<NEWLINE>
test_comment_escaping_with_trigraph.c - IsPreprocessorStatement In "GlobalScope" from "None" line 4":
<HASH> <IDENTIFIER=include> <SPACE> <LESS_THAN> <IDENTIFIER=stdio> <DOT> <IDENTIFIER=h> <MORE_THAN> <NEWLINE>
test_comment_escaping_with_trigraph.c - IsEmptyLine In "GlobalScope" from "None" line 5":
<NEWLINE>
test_comment_escaping_with_trigraph.c - IsFuncDeclaration In "GlobalScope" from "None" line 6":
<INT> <SPACE> <IDENTIFIER=main> <LPARENTHESIS> <RPARENTHESIS> <SPACE>
test_comment_escaping_with_trigraph.c - IsBlockStart In "Function" from "GlobalScope" line 6":
<LBRACE> <NEWLINE>
test_comment_escaping_with_trigraph.c - IsControlStatement In "Function" from "GlobalScope" line 7":
<SPACE> <SPACE> <FOR> <SPACE> <LPARENTHESIS> <INT> <SPACE> <IDENTIFIER=i> <ASSIGN> <CONSTANT=0> <SEMI_COLON> <SPACE> <IDENTIFIER=i> <LESS_THAN> <CONSTANT=2> <SEMI_COLON> <INC> <IDENTIFIER=i> <RPARENTHESIS> <NEWLINE>
test_comment_escaping_with_trigraph.c - IsFunctionCall In "ControlStructure" from "Function" line 8":
<SPACE> <SPACE> <SPACE> <SPACE> <IDENTIFIER=printf> <LPARENTHESIS> <STRING="%d\n"> <COMMA> <SPACE> <IDENTIFIER=i> <RPARENTHESIS> <SEMI_COLON> <NEWLINE>
test_comment_escaping_with_trigraph.c - IsBlockEnd In "Function" from "GlobalScope" line 9":
<RBRACE> <NEWLINE>
test_comment_escaping_with_trigraph.c - IsEmptyLine In "GlobalScope" from "None" line 10":
<NEWLINE>
test_comment_escaping_with_trigraph.c - IsComment In "GlobalScope" from "None" line 11":
<MULT_COMMENT=/**/>
test_comment_escaping_with_trigraph.c: Error!
Error: INVALID_HEADER (line: 3, col: 1): Missing or invalid 42 header
Error: SPACE_BEFORE_FUNC (line: 6, col: 4): space before function name
Error: NO_ARGS_VOID (line: 6, col: 10): Empty function argument requires void
Error: BRACE_NEWLINE (line: 6, col: 12): Expected newline before brace
Error: TOO_FEW_TAB (line: 7, col: 1): Missing tabs for indent level
Error: FORBIDDEN_CS (line: 7, col: 3): Forbidden control structure
Error: SPACE_REPLACE_TAB (line: 7, col: 3): Found space when expecting tab
Error: SPC_AFTER_OPERATOR (line: 7, col: 13): missing space after operator
Error: SPC_BFR_OPERATOR (line: 7, col: 13): missing space before operator
Error: SPC_AFTER_OPERATOR (line: 7, col: 18): missing space after operator
Error: SPC_BFR_OPERATOR (line: 7, col: 18): missing space before operator
Error: SPC_AFTER_OPERATOR (line: 7, col: 20): missing space after operator
Error: TOO_FEW_TAB (line: 8, col: 1): Missing tabs for indent level
Error: SPACE_REPLACE_TAB (line: 8, col: 5): Found space when expecting tab