In [1]:
tokens = {'keyword': ('KEYWORD', ("program", "var", "div", "integer", "real", "begin", "end",
                                      "procedure", "if", "then", "else", "while", "repeat", "until",
                                      "write", "writeln", "or",
                                      "and","div", "mod", "not", "trunc", "real", "do")),
              'ID': 'ID', 'sym':'SYM', ':=': 'ASSIGN', ':': 'COLON', ',': 'COMMA',
              ';': 'SEMICOLON', '.': 'DOT', 'eq': 'EQ', '!=': 'NQ', '<': 'LT',
              '<=': 'LTE', '>': 'GT', '>=': 'GTE', 'quote': 'QUOTE', 'integer': 'INTEGER', 
              '+': 'PLUS', '-': 'MINUS', '*': 'MUL', '/': 'DIV', '(': 'LPAREN', 
               ')': 'RPAREN', '!': 'not', 'eof': 'EOF', 'term': 'TERM'}

In [45]:
class lex_analyzer:
    def __init__(self, code_text):
        self.text = code_text
        self.line_no = 0
        self.col_no = 0
        self.current_index = 0
        self.current_char = self.text[self.current_index]
        self.symbol_table = {}
        self.text_len = len(self.text)
        self.all_tokens = []
    
    def next_char(self):
        if(self.current_index+1 < self.text_len):
            return (self.text[self.current_index+1])
        else:
            return None
    
    def proceed(self):
        self.current_index += 1
        self.col_no += 1
        
        if('\n' in [self.current_char]):
            self.line_no += 1
            self.col_no = 0
        
        if(self.current_index < self.text_len):
            self.current_char = self.text[self.current_index]
        else:
            self.current_char = None
    
    def get_word(self):
        word_str = ''
        while(self.current_char!=None and self.current_char.isalnum()):
            word_str += self.current_char
            self.proceed()
        return word_str
    
    def get_digit(self):
        digit = ''     
        while(self.current_char!=None and self.current_char.isdigit()):
            digit += self.current_char
            self.proceed()
        return digit
     
    def get_str_const(self):
        str_const = '"'
        self.proceed()
        while(self.current_char != None and self.current_char != '"'):
            str_const += self.current_char
            self.proceed()
        str_const += '"'
        self.proceed()
        return str_const
    
    def update_symbol_table(self): 

        for i in range(len(self.all_tokens)):
            print(self.all_tokens[i][1].lower()=='program')
            self.symbol_table[self.all_tokens[i+1][1]] = self.all_tokens[i][1].lower()
            break
            
        for i in self.symbol_table.keys():
            if(self.symbol_table[i] != 'program'):
                print(i, self.symbol_table[i], self.all_tokens[self.symbol_table[i]+2][1])
                self.symbol_table[i] = self.all_tokens[self.symbol_table[i]+2][1]
        return 

    def parse(self):        
        while(self.current_char is not None):
            if self.current_char.isspace():
                self.proceed()
                continue
            
            if(self.current_char.isalpha()):
                line = self.line_no
                col = self.col_no
                x = self.get_word()
                if(x in tokens['keyword'][1]):
                    self.all_tokens.append((tokens['keyword'][0], x, line, col))
                else:
                    self.all_tokens.append(('ID', x, line, col))
                    if(x not in self.symbol_table.keys()):
                        self.symbol_table[x] = len(self.all_tokens)-1
                    
            elif(self.current_char.isdigit()):
                line = self.line_no
                col = self.col_no
                x = self.get_digit()
                self.all_tokens.append(('INTCONST', x, line, col))
            
            elif(self.current_char == '"'):
                line = self.line_no
                col = self.col_no
                x = self.get_str_const()
                self.all_tokens.append(('STRCONST', x, line, col))
          
            elif(self.current_index+1<self.text_len and self.current_char+self.next_char() in tokens.keys()):
                line = self.line_no
                col = self.col_no
                x = self.current_char+self.next_char()
                tok_type = tokens[x]
                self.all_tokens.append((tok_type, x, line, col))
                self.proceed()
                self.proceed()
                
            elif(self.current_char in tokens.keys()):
                line = self.line_no
                col = self.col_no
                x = self.current_char
                tok_type = tokens[x]
                self.all_tokens.append((tok_type, x, line, col))
                self.proceed()
                
#             self.proceed()
        self.update_symbol_table()
        return self.all_tokens
                
source_text = ""
with open("source_code.pas", 'r') as source_file:
    source_text = "".join(source_file.readlines())
source_file.close()


lex = lex_analyzer(source_text)
toks = lex.parse()
toks

True
counter 4 integer
number 8 integer
factorial 12 integer
height 16 real
width 20 real
breadth 24 real
volume 28 real


[('KEYWORD', 'program', 0, 0),
 ('ID', 'checkMyAbility', 0, 8),
 ('SEMICOLON', ';', 0, 22),
 ('KEYWORD', 'var', 1, 0),
 ('ID', 'counter', 2, 0),
 ('COLON', ':', 2, 7),
 ('KEYWORD', 'integer', 2, 9),
 ('SEMICOLON', ';', 2, 16),
 ('ID', 'number', 3, 0),
 ('COLON', ':', 3, 6),
 ('KEYWORD', 'integer', 3, 8),
 ('SEMICOLON', ';', 3, 15),
 ('ID', 'factorial', 4, 0),
 ('COLON', ':', 4, 9),
 ('KEYWORD', 'integer', 4, 11),
 ('SEMICOLON', ';', 4, 18),
 ('ID', 'height', 5, 0),
 ('COLON', ':', 5, 7),
 ('KEYWORD', 'real', 5, 9),
 ('SEMICOLON', ';', 5, 13),
 ('ID', 'width', 6, 0),
 ('COLON', ':', 6, 6),
 ('KEYWORD', 'real', 6, 8),
 ('SEMICOLON', ';', 6, 12),
 ('ID', 'breadth', 7, 0),
 ('COLON', ':', 7, 8),
 ('KEYWORD', 'real', 7, 10),
 ('SEMICOLON', ';', 7, 14),
 ('ID', 'volume', 8, 0),
 ('COLON', ':', 8, 7),
 ('KEYWORD', 'real', 8, 9),
 ('SEMICOLON', ';', 8, 13),
 ('KEYWORD', 'begin', 9, 0),
 ('ID', 'number', 10, 0),
 ('ASSIGN', ':=', 10, 7),
 ('INTCONST', '6', 10, 10),
 ('SEMICOLON', ';', 10, 11),


In [46]:
lex.symbol_table

{'checkMyAbility': 'program',
 'counter': 'integer',
 'number': 'integer',
 'factorial': 'integer',
 'height': 'real',
 'width': 'real',
 'breadth': 'real',
 'volume': 'real'}