# Mini-Compiler Project Phase 1
Ali Osman	                211001561

Mohamed Ashraf Qushta       211001221

Saif Eldeen Sameh           211001048

In [7]:
import re

In [8]:
import json
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor


def save_and_run(token_table, output_file='token_output.json', next_notebook='Syntax_Analysis.ipynb'):
    # Save data (only token table)
    with open(output_file, 'w') as f:
        json.dump(token_table, f, indent=0)

    # Run next notebook
    with open(next_notebook) as f:
        nb = nbformat.read(f, as_version=4)

    ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
    ep.preprocess(nb)

    # Save executed notebook
    with open('Executed_' + next_notebook, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)


## Tokens
- , -> comma
- a -> identifier

In [9]:
TOKEN_SPECIFICATIONS = [
    ('let', r'\bLET\b'),                                # Specific token for LET
    ('if', r'\bIF\b'),                                  # Specific token for IF
    ('then', r'\bTHEN\b'),                              # Specific token for THEN
    ('else', r'\bELSE\b'),                              # Specific token for ELSE
    ('endif', r'\bENDIF\b'),                            # Specific token for ENDIF
    ('while', r'\bWHILE\b'),                            # Specific token for WHILE
    ('do', r'\bDO\b'),                                  # Specific token for DO
    ('endwhile', r'\bENDWHILE\b'),                      # Specific token for ENDWHILE
    ('for', r'\bFOR\b'),                                # Specific token for FOR
    ('endfor', r'\bENDFOR\b'),                          # Specific token for ENDFOR
    ('call', r'\bCALL\b'),                              # Specific token for CALL
    ('func', r'\bFUNC\b'),                              # Specific token for FUNC
    ('return', r'\bRETURN\b'),                          # Specific token for RETURN
    ('repeat', r'\bREPEAT\b'),                          # Specific token for REPEAT
    ('until', r'\bUNTIL\b'),                            # Specific token for UNTIL
    ('not', r'\bNOT\b'),                                # Specific token for NOT
    ('and', r'\bAND\b'),                                # Specific token for AND
    ('or', r'\bOR\b'),                                  # Specific token for OR
    ('in', r'\bIN\b'),                                  # Specific token for IN
    ('to', r'\bTO\b'),                                  # Specific token for TO
    ('step', r'\bSTEP\b'),                              # Specific token for STEP
    ('begin', r'\bBEGIN\b'),                            # Specific token for BEGIN
    ('end', r'\bEND\b'),                                # Specific token for END
    ('range', r'\bRANGE\('),
    ('number', r'-?\d+(\.\d+)?'),                       # Number
    ('listElement',r'([a-zA-Z_][a-zA-Z0-9_]*)\[\s*([a-zA-Z_][a-zA-Z0-9_]*|\d+)\s*\]'),
    ('identifier', r'[a-zA-Z_][a-zA-Z0-9_]*'),          # Identifiers & Function names
    ('compound_operator', r'[+\-*/]='),
    ('increment', r'\+\+'),                             # Specific token for Increment
    ('decrement', r'--'),                               # Specific token for Decrement
    ('equal', r'='),                                    # Specific token for =
    ('operator', r'!=|[<>]=?|[+\-*/]'),                 # Combined Arithmetic and Relational Operators
    ('comment', r'\{[^}]*\}'),                          # Comments
    ('string', r'(\'\'\'[^\'\'\']*\'\'\'|"""[^"""]*"""|\'[^\']*\'|\"[^"]*\")'), #String
    ('left_paren', r'\('),                              # Left parenthesis
    ('right_paren', r'\)'),                             # Right parenthesis
    ('left_brack', r'\['),                              # Left bracket
    ('right_brack', r'\]'),                             # Right bracket
    ('comma', r','),                                    # Comma
    ('space', r'\s+'),                                  # Whitespace
    ('unknown', r'[^\s]'),                              # Any other character
]

# Compile regex with case-insensitive flag
token_re = re.compile('|'.join(f'(?P<{pair[0]}>{pair[1]})' for pair in TOKEN_SPECIFICATIONS), re.IGNORECASE)


In [10]:
def tokenize(code):
    token_table = []
    symbol_table = {}
    line_num = 0
    position = 0
    current_function = None
    
    for match in token_re.finditer(code):
        token_type = match.lastgroup
        lexeme = match.group(token_type)
        
        if token_type in ('space', 'comment'):
            if '\n' in lexeme:
                line_num += lexeme.count('\n')
                position = 0
            else:
                position += len(lexeme)
            continue
        
        if token_type == 'range':
            token_table.append(f"Token: range, Lexeme: 'RANGE'")
            token_table.append(f"Token: left_paren, Lexeme: '('")
        else:
            token_table.append(f"Token: {token_type}, Lexeme: {lexeme}")
        
        
        lexeme = lexeme.lower()
        if token_type == 'call':
            current_function = None
        elif token_type == 'identifier':
            if current_function is None and token_table[-2].endswith("CALL"):
                current_function = lexeme
                symbol_table[lexeme] = {'type': 'function', 'parameters': []}
            elif current_function and token_type == 'identifier':
                symbol_table[current_function]['parameters'].append(lexeme)
            elif token_table[-2].endswith("LET"):
                rest_of_code = code[match.end():]
                value_start = rest_of_code.find('=') + 1
                if value_start > 0:
                    value_str = rest_of_code[value_start:].strip().split()[0]
                    print(value_str)

                    if '"' in value_str or "'" in value_str:
                        symbol_table[lexeme] = {'type': 'string'}
                    elif '[' in value_str:
                        symbol_table[lexeme] = {'type': 'list'}
                    elif '.' in value_str:
                        symbol_table[lexeme] = {'type': 'float'}
                    else:
                        symbol_table[lexeme] = {'type': 'integer'}
    
    formatted_table = []
    for name, info in symbol_table.items():
        if info['type'] == 'function':
            formatted_table.append(f"Name: {name}, Type: {info['type']} (with parameters: {', '.join(info['parameters'])})")
        else:
            formatted_table.append(f"Name: {name}, Type: {info['type']}")
            
    return token_table, formatted_table

In [11]:
# Test cases
test_case =  """

LET a = 5
LET b = [1, 2, 3]
LET c = a + 10
CALL myFunction(a, b)
IF a < 10 THEN
    LET d = a * 2
    IF a < 10 THEN
        LET d = a * 2
    ELSE
        LET d = b[1]
    ENDIF
ELSE
    LET d = b[1]
ENDIF
WHILE a > 0 DO
    LET a = a - 1
    DO
        CALL anotherFunction(c)
    WHILE c < 100
ENDWHILE
DO
    CALL anotherFunction(c)
WHILE c < 100
FOR i = 1 TO 10 STEP 2 DO
    LET e = i * 2
    FOR x IN range(1, 5, 1) DO
        LET f = x + 3
    ENDFOR
ENDFOR
FOR x IN range(1, 5, 1) DO
    LET f = x + 3
ENDFOR
REPEAT
    LET g = f * 2
    REPEAT
        LET g = f * 2
    UNTIL g > 20
UNTIL g > 20
FUNC add(a, b) BEGIN
    FUNC add(a, b) BEGIN
    i--
    RETURN a + b
    END
    RETURN a + b
END
h += 1
i--
"""


# Run test cases
token_table, symbol_table = tokenize(test_case)
# save_and_run(token_table)

for token in token_table:
    print(token)

# Print Symbol Table
print("\nSymbol Table:")
for symbol in symbol_table:
    print(symbol)

5
[1,
a
Token: let, Lexeme: LET
Token: identifier, Lexeme: a
Token: equal, Lexeme: =
Token: number, Lexeme: 5
Token: let, Lexeme: LET
Token: identifier, Lexeme: b
Token: equal, Lexeme: =
Token: left_brack, Lexeme: [
Token: number, Lexeme: 1
Token: comma, Lexeme: ,
Token: number, Lexeme: 2
Token: comma, Lexeme: ,
Token: number, Lexeme: 3
Token: right_brack, Lexeme: ]
Token: let, Lexeme: LET
Token: identifier, Lexeme: c
Token: equal, Lexeme: =
Token: identifier, Lexeme: a
Token: operator, Lexeme: +
Token: number, Lexeme: 10
Token: call, Lexeme: CALL
Token: identifier, Lexeme: myFunction
Token: left_paren, Lexeme: (
Token: identifier, Lexeme: a
Token: comma, Lexeme: ,
Token: identifier, Lexeme: b
Token: right_paren, Lexeme: )
Token: if, Lexeme: IF
Token: identifier, Lexeme: a
Token: operator, Lexeme: <
Token: number, Lexeme: 10
Token: then, Lexeme: THEN
Token: let, Lexeme: LET
Token: identifier, Lexeme: d
Token: equal, Lexeme: =
Token: identifier, Lexeme: a
Token: operator, Lexeme: *
Toke

In [12]:
save_and_run(token_table)