# **Lexer Construction**

In [1]:
from token_definition import token_defs
from tokenize_regex import tokenize_regex
from postfix_conversion import to_postfix
from postfix_to_nfa import postfix_to_nfa,combine_nfas,print_nfa
from nfa_to_dfa import nfa_to_dfa
from plot_nfa import plot_nfa_with_graphviz
from optimize_dfa import DFAOptimizer
from dfa_table import DFATable
from scanner import LexicalScanner


print("Processing Lexer Token Definitions...")
print("=" * 80)

# Build one NFA per (token_name, regex) pair in token_defs. Every successful
# build is recorded in `results` as (token_name, regex, nfa, None); the list is
# later passed to combine_nfas.
# NOTE(review): the meaning of the trailing None is not visible here -- confirm
# against combine_nfas.
results = []
for token_name, regex in token_defs:
    print(f"\n📝 Processing: {token_name} -> {regex}")

    # Regex string -> token list -> postfix sequence.
    # Named `regex_tokens` so it is not confused with the scanner's `tokens`
    # produced by the test cells further down.
    regex_tokens = tokenize_regex(regex)
    print(f"   Tokens: {regex_tokens}")

    postfix = to_postfix(regex_tokens)
    print(f"   Postfix: {postfix}")

    # Postfix -> NFA. A falsy result is reported and the token is left out of
    # `results`, so it will not be part of the combined automaton.
    nfa = postfix_to_nfa(postfix)
    if nfa:
        # print_nfa(nfa)  # debug aid (disabled)
        # The plot helper is called for its side effect only; its return value
        # is not used anywhere in this cell.
        plot_nfa_with_graphviz(nfa, f"{token_name}: {regex}")
        results.append((token_name, regex, nfa, None))
    else:
        print(f"❌ Failed to create NFA for {token_name}")

    print("-" * 60)
    
# Merge the per-token NFAs collected in `results` into a single NFA.
final_nfa = combine_nfas(results)
# Debug aid (disabled) -- print and plot the merged NFA:
# if final_nfa:
#     print("\n Combined Final NFA:")
#     print_nfa(final_nfa)
#     plot_nfa_with_graphviz(final_nfa, "Combined Final NFA", "final_combined_nfa")

# Convert the merged NFA to a DFA. The call returns a pair that is unpacked
# into `start_dfa` and `all_dfa_states`.
print("\n🔄 Converting NFA to DFA...")
start_dfa, all_dfa_states = nfa_to_dfa(final_nfa)
# Debug aid (disabled) -- list every DFA state with its transitions:
# print("\n✅ DFA States and Transitions:")
# for state in all_dfa_states:
#     acc = "Accepting" if state.is_accepting else "Non-Accepting"
#     token = f"Token: {state.token_type}" if state.token_type else ""
#     print(f"{state} ({acc}) {token}")
#     for symbol, target in state.transitions.items():
#         print(f"   {state} --{symbol}--> {target}")

# Minimise the DFA.
optimizer = DFAOptimizer(start_dfa, all_dfa_states)
min_start, min_states = optimizer.minimize_dfa()

# Build the transition table from the minimised DFA, then print its stats,
# print the table itself, and export it.
# NOTE(review): export_to_excel() presumably writes a file -- confirm the
# target path in dfa_table.
dfa_table = DFATable(min_start, min_states)
dfa_table.print_table_stats()
dfa_table.print_table()
dfa_table.export_to_excel()

# Create the scanner that the test cells below use via `scanner.scan(...)`.
scanner = LexicalScanner(dfa_table, skip_terminators=True)
print("✅ Scanner ready for use!")




Processing Lexer Token Definitions...

📝 Processing: KEYWORD -> if|else|for|while|main|do
   Tokens: ['(\\bif\\b|\\belse\\b|\\bfor\\b|\\bwhile\\b|\\bmain\\b|\\bdo\\b)']
   Postfix: ['(\\bif\\b|\\belse\\b|\\bfor\\b|\\bwhile\\b|\\bmain\\b|\\bdo\\b)']
✅ NFA graph saved to: NFAs\KEYWORD_if_else_for_while_main_do.png
------------------------------------------------------------

📝 Processing: TYPE -> integer|float|string|boolean
   Tokens: ['(\\binteger\\b|\\bfloat\\b|\\bstring\\b|\\bboolean\\b)']
   Postfix: ['(\\binteger\\b|\\bfloat\\b|\\bstring\\b|\\bboolean\\b)']
✅ NFA graph saved to: NFAs\TYPE_integer_float_string_boolean.png
------------------------------------------------------------

📝 Processing: BOOL -> true|false
   Tokens: ['(\\btrue\\b|\\bfalse\\b)']
   Postfix: ['(\\btrue\\b|\\bfalse\\b)']
✅ NFA graph saved to: NFAs\BOOL_true_false.png
------------------------------------------------------------

📝 Processing: ARITHMETIC_OP -> add|subtract|multiply|divide|remainder|power
   Tok

In [2]:
"""Test the scanner with sample code"""
# Sample program: two integer declarations, a relational comparison and an
# `if` block.
test_code = '''integer main () {
    integer x equals to 5 semicolon
    integer y equals to 10 semicolon
    boolean result equals to x is less than y semicolon
    if (result) {
        integer sum equals to x add y semicolon
        }
}'''

# Echo the input so the token listing below can be read against it.
print("\n🧪 Testing scanner with sample code:")
print(f"Input: {test_code!r}")
print(f"Length: {len(test_code)} characters")

# `scanner` is the LexicalScanner created in the first cell.
tokens = scanner.scan(test_code)

# Number the tokens from 1 for display.
print("\n📝 Generated tokens:")
for position, token in enumerate(tokens, start=1):
    print(f"  {position:2d}. {token}")
    




🧪 Testing scanner with sample code:
Input: 'integer main () {\n    integer x equals to 5 semicolon\n    integer y equals to 10 semicolon\n    boolean result equals to x is less than y semicolon\n    if (result) {\n        integer sum equals to x add y semicolon\n        }\n}'
Length: 224 characters
🔍 Scanning input: 224 characters
✅ Scanning complete: 37 tokens generated

📝 Generated tokens:
   1. TYPE(integer)
   2. KEYWORD(main)
   3. LPAREN(()
   4. RPAREN())
   5. LBRACE({)
   6. TYPE(integer)
   7. IDENTIFIER(x)
   8. ASSIGN_OP(equals to)
   9. NUMBER(5)
  10. IDENTIFIER(semicolon)
  11. TYPE(integer)
  12. IDENTIFIER(y)
  13. ASSIGN_OP(equals to)
  14. NUMBER(10)
  15. IDENTIFIER(semicolon)
  16. TYPE(boolean)
  17. IDENTIFIER(result)
  18. ASSIGN_OP(equals to)
  19. IDENTIFIER(x)
  20. RELATIONAL_OP(is less than)
  21. IDENTIFIER(y)
  22. IDENTIFIER(semicolon)
  23. KEYWORD(if)
  24. LPAREN(()
  25. IDENTIFIER(result)
  26. RPAREN())
  27. LBRACE({)
  28. TYPE(integer)
  29. ID

In [3]:
# 1) Function with float arithmetic and a return statement.
# The sample contains no loop despite the `_loop` suffix; the variable name is
# kept unchanged so any other cell referring to it keeps working.
# NOTE(review): the output recorded for this cell lists KEYWORD(return), while
# the KEYWORD pattern printed by the first cell is if|else|for|while|main|do --
# the recorded outputs may be stale; re-run all cells to confirm.
test_code_loop = '''
float compute_area ( float radius ) {
    float pi equals to 3.14 semicolon
    float area equals to pi multiply radius multiply radius semicolon
    return area semicolon
}
'''

# Echo the input so the token listing below can be read against it.
print("\n🧪 Testing scanner with sample code:")
print(f"Input: {test_code_loop!r}")
print(f"Length: {len(test_code_loop)} characters")

# `scanner` is the LexicalScanner created in the first cell.
tokens = scanner.scan(test_code_loop)

# Number the tokens from 1 for display.
print("\n📝 Generated tokens:")
for position, token in enumerate(tokens, start=1):
    print(f"  {position:2d}. {token}")
    


🧪 Testing scanner with sample code:
Input: '\nfloat compute_area ( float radius ) {\n    float pi equals to 3.14 semicolon\n    float area equals to pi multiply radius multiply radius semicolon\n    return area semicolon\n}\n'
Length: 175 characters
🔍 Scanning input: 175 characters
✅ Scanning complete: 26 tokens generated

📝 Generated tokens:
   1. TYPE(float)
   2. IDENTIFIER(compute_area)
   3. LPAREN(()
   4. TYPE(float)
   5. IDENTIFIER(radius)
   6. RPAREN())
   7. LBRACE({)
   8. TYPE(float)
   9. IDENTIFIER(pi)
  10. ASSIGN_OP(equals to)
  11. FLOAT(3.14)
  12. IDENTIFIER(semicolon)
  13. TYPE(float)
  14. IDENTIFIER(area)
  15. ASSIGN_OP(equals to)
  16. IDENTIFIER(pi)
  17. ARITHMETIC_OP(multiply)
  18. IDENTIFIER(radius)
  19. ARITHMETIC_OP(multiply)
  20. IDENTIFIER(radius)
  21. IDENTIFIER(semicolon)
  22. KEYWORD(return)
  23. IDENTIFIER(area)
  24. IDENTIFIER(semicolon)
  25. RBRACE(})
  26. $($)


In [4]:
# 2) Conditional with relational and logical operators, booleans and string
#    literals.
# NOTE(review): the output recorded for this cell reports a lexical error,
# "unexpected character ','", for the comma in the parameter list on the first
# line of the sample. Confirm whether the comma is meant to exercise the
# scanner's error reporting or whether the sample (or the token definitions)
# should be adjusted.
test_code_cond = '''
boolean check_user ( string name , integer age ) {
    boolean is_adult equals to age is greater than or equal to 18 semicolon
    boolean valid_name equals to name dot length is greater than 0 semicolon
    if ( is_adult and valid_name ) {
        string msg equals to "Access granted" semicolon
    } else {
        string msg equals to "Access denied" semicolon
    }
    return is_adult semicolon
}
'''

# Echo the input so the token listing below can be read against it.
print("\n🧪 Testing scanner with sample code:")
print(f"Input: {test_code_cond!r}")
print(f"Length: {len(test_code_cond)} characters")

# `scanner` is the LexicalScanner created in the first cell.
tokens = scanner.scan(test_code_cond)

# Number the tokens from 1 for display.
print("\n📝 Generated tokens:")
for position, token in enumerate(tokens, start=1):
    print(f"  {position:2d}. {token}")
    


🧪 Testing scanner with sample code:
Input: '\nboolean check_user ( string name , integer age ) {\n    boolean is_adult equals to age is greater than or equal to 18 semicolon\n    boolean valid_name equals to name dot length is greater than 0 semicolon\n    if ( is_adult and valid_name ) {\n        string msg equals to "Access granted" semicolon\n    } else {\n        string msg equals to "Access denied" semicolon\n    }\n    return is_adult semicolon\n}\n'
Length: 404 characters
🔍 Scanning input: 404 characters
❌ Lexical error at line 2, position 34: unexpected character ','
✅ Scanning complete: 51 tokens generated

📝 Generated tokens:
   1. TYPE(boolean)
   2. IDENTIFIER(check_user)
   3. LPAREN(()
   4. TYPE(string)
   5. IDENTIFIER(name)
   6. TYPE(integer)
   7. IDENTIFIER(age)
   8. RPAREN())
   9. LBRACE({)
  10. TYPE(boolean)
  11. IDENTIFIER(is_adult)
  12. ASSIGN_OP(equals to)
  13. IDENTIFIER(age)
  14. RELATIONAL_OP(is greater than or equal to)
  15. NUMBER(18)
  16. IDENTI