# Mini-Compiler Project Phase 1
Ali Osman	211001561

Mohamed Ashraf Qushta   

Saif Eldeen Sameh     211001048

In [17]:
import re

## Tokens
- `,` -> comma
- `a` -> identifier

In [18]:
# Ordered (token name, regex) pairs for the lexer.
#
# The pairs are joined into a single alternation below, and `re` tries the
# alternatives left to right and stops at the first one that matches.  Order is
# therefore significant:
#   * keywords come before `identifier`, otherwise every keyword would be
#     reported as an identifier;
#   * the two-character `++` / `--` come before the single-character
#     `operator` class, otherwise they would be split into two operators and
#     the `increment` / `decrement` tokens could never be produced;
#   * `unknown` is last so it only catches characters nothing else accepted.
TOKEN_SPECIFICATIONS = [
    ('let', r'\bLET\b'),                                # Keyword LET
    ('if', r'\bIF\b'),                                  # Keyword IF
    ('then', r'\bTHEN\b'),                              # Keyword THEN
    ('else', r'\bELSE\b'),                              # Keyword ELSE
    ('endif', r'\bENDIF\b'),                            # Keyword ENDIF
    ('while', r'\bWHILE\b'),                            # Keyword WHILE
    ('do', r'\bDO\b'),                                  # Keyword DO
    ('endwhile', r'\bENDWHILE\b'),                      # Keyword ENDWHILE
    ('for', r'\bFOR\b'),                                # Keyword FOR
    ('endfor', r'\bENDFOR\b'),                          # Keyword ENDFOR
    ('call', r'\bCALL\b'),                              # Keyword CALL
    ('func', r'\bFUNC\b'),                              # Keyword FUNC
    ('return', r'\bRETURN\b'),                          # Keyword RETURN
    ('repeat', r'\bREPEAT\b'),                          # Keyword REPEAT
    ('until', r'\bUNTIL\b'),                            # Keyword UNTIL
    ('not', r'\bNOT\b'),                                # Keyword NOT
    ('and', r'\bAND\b'),                                # Keyword AND
    ('or', r'\bOR\b'),                                  # Keyword OR
    ('in', r'\bIN\b'),                                  # Keyword IN
    ('to', r'\bTO\b'),                                  # Keyword TO
    ('step', r'\bSTEP\b'),                              # Keyword STEP
    ('begin', r'\bBEGIN\b'),                            # Keyword BEGIN
    ('end', r'\bEND\b'),                                # Keyword END
    # A leading '-' directly in front of digits is folded into the literal,
    # so `a-1` lexes as identifier `a` followed by number `-1`.
    ('number', r'-?\d+(?:\.\d+)?'),                     # Integer or decimal literal
    ('identifier', r'[a-zA-Z_][a-zA-Z0-9_]*'),          # Identifiers & function names
    ('equal', r'='),                                    # Assignment sign =
    ('increment', r'\+\+'),                             # ++ (must precede 'operator')
    ('decrement', r'--'),                               # -- (must precede 'operator')
    ('operator', r'!=|[<>]=?|[+\-*/]'),                 # Arithmetic and relational operators
    ('comment', r'\{[^}]*\}'),                          # { ... } comments
    ('left_paren', r'\('),                              # Left parenthesis
    ('right_paren', r'\)'),                             # Right parenthesis
    ('left_brack', r'\['),                              # Left bracket
    ('right_brack', r'\]'),                             # Right bracket
    ('comma', r','),                                    # Comma
    ('space', r'\s+'),                                  # Whitespace
    ('unknown', r'[^\s]'),                              # Any other single character
]

# One named group per token type; `match.lastgroup` then names the token.
# IGNORECASE makes the keywords (and everything else) case-insensitive.
token_re = re.compile(
    '|'.join(f'(?P<{name}>{pattern})' for name, pattern in TOKEN_SPECIFICATIONS),
    re.IGNORECASE,
)


In [19]:
def _infer_declared_type(tokens, name_index):
    """Guess the type of the variable declared at ``tokens[name_index]``.

    Looks at the first token of the initializer (skipping an optional ``=``):
    ``[`` means ``'list'``, a number containing ``.`` means ``'float'``, and
    anything else - including a missing initializer - means ``'integer'``.
    """
    value_index = name_index + 1
    if value_index < len(tokens) and tokens[value_index][0] == 'equal':
        value_index += 1
    if value_index < len(tokens):
        kind, lexeme = tokens[value_index]
        if kind == 'left_brack':
            return 'list'
        if kind == 'number' and '.' in lexeme:
            return 'float'
    return 'integer'


def tokenize(code):
    """Lex ``code`` and build a token table and a symbol table.

    Args:
        code: Source text of the program to scan.

    Returns:
        A ``(token_table, symbol_table)`` pair of lists of strings.
        ``token_table`` has one ``"Token: <type>, Lexeme: <text>"`` entry per
        token, in source order, with whitespace and comments omitted.
        ``symbol_table`` has one entry per name introduced by ``LET`` (typed
        ``list``, ``float`` or ``integer``) or by ``CALL`` (typed ``function``
        with the identifiers found in its argument list).  Names are stored
        lower-cased, so they are case-insensitive.
    """
    # Lex once up front; whitespace and comments carry no meaning afterwards.
    tokens = [
        (match.lastgroup, match.group())
        for match in token_re.finditer(code)
        if match.lastgroup not in ('space', 'comment')
    ]
    token_table = [f"Token: {kind}, Lexeme: {lexeme}" for kind, lexeme in tokens]

    symbol_table = {}
    current_function = None  # function whose argument list is being read
    paren_depth = 0          # nesting level inside that argument list

    for index, (kind, lexeme) in enumerate(tokens):
        name = lexeme.lower()
        # Token *type* of the predecessor; None for the very first token.
        prev_kind = tokens[index - 1][0] if index else None

        if kind == 'call':
            # A new CALL always abandons any call that is still open.
            current_function = None
            paren_depth = 0
            continue

        if current_function is not None:
            if kind == 'left_paren':
                paren_depth += 1
                continue
            if paren_depth == 0:
                # No '(' followed the function name: the call has no argument
                # list, so this token belongs to whatever comes next.
                current_function = None
            elif kind == 'right_paren':
                paren_depth -= 1
                if paren_depth == 0:
                    current_function = None  # argument list is closed
                continue
            else:
                if kind == 'identifier':
                    symbol_table[current_function]['parameters'].append(name)
                continue

        if kind != 'identifier':
            continue
        if prev_kind == 'call':
            current_function = name
            paren_depth = 0
            symbol_table[name] = {'type': 'function', 'parameters': []}
        elif prev_kind == 'let':
            symbol_table[name] = {'type': _infer_declared_type(tokens, index)}

    formatted_table = []
    for name, info in symbol_table.items():
        if info['type'] == 'function':
            formatted_table.append(f"Name: {name}, Type: {info['type']} (with parameters: {', '.join(info['parameters'])})")
        else:
            formatted_table.append(f"Name: {name}, Type: {info['type']}")

    return token_table, formatted_table

In [None]:
# Sample program: scalar and list declarations, an IF/ELSE block and a CALL
test_case = """
LET a = 5
LET b = 10
LET A = []
IF a < b
THEN
LET c = a + b
LET d = c * 2
ELSE
LET e = a - b
ENDIF
CALL myFunction(a, b,b,kk)
"""


# Lex the sample and list every token, one per line
token_table, symbol_table = tokenize(test_case)
for token_line in token_table:
    print(token_line)

# Then list the symbol table under its own heading
print()
print("Symbol Table:")
for symbol_line in symbol_table:
    print(symbol_line)

Token: let, Lexeme: LET
Token: identifier, Lexeme: a
Token: equal, Lexeme: =
Token: number, Lexeme: 5
Token: let, Lexeme: LET
Token: identifier, Lexeme: b
Token: equal, Lexeme: =
Token: number, Lexeme: 10
Token: let, Lexeme: LET
Token: identifier, Lexeme: A
Token: equal, Lexeme: =
Token: identifier, Lexeme: aa
Token: if, Lexeme: IF
Token: identifier, Lexeme: a
Token: operator, Lexeme: <
Token: identifier, Lexeme: b
Token: then, Lexeme: THEN
Token: let, Lexeme: LET
Token: identifier, Lexeme: c
Token: equal, Lexeme: =
Token: identifier, Lexeme: a
Token: operator, Lexeme: +
Token: identifier, Lexeme: b
Token: let, Lexeme: LET
Token: identifier, Lexeme: d
Token: equal, Lexeme: =
Token: identifier, Lexeme: c
Token: operator, Lexeme: *
Token: number, Lexeme: 2
Token: else, Lexeme: ELSE
Token: let, Lexeme: LET
Token: identifier, Lexeme: e
Token: equal, Lexeme: =
Token: identifier, Lexeme: a
Token: operator, Lexeme: -
Token: identifier, Lexeme: b
Token: endif, Lexeme: ENDIF
Token: call, Lexeme