In [3]:
import re

def tokenize(code):
    """Split source text into a flat list of token strings.

    Three kinds of lexemes are recognised by the scanning regex:

    * ``\\d+``             -- a run of one or more digits
    * ``[a-zA-Z_]\\w*``    -- a letter or underscore followed by any number
      of letters, digits or underscores
    * ``[\\(\\)\\+\\-\\*/=]`` -- one of the single characters ( ) + - * / =

    Any other character (whitespace, quotes, ``:``, ``,``, ``.`` ...) matches
    none of the alternatives and is silently dropped.

    After scanning, the list is post-processed from left to right:

    * for a ``def`` token, the first ``(`` found anywhere later in the list
      gets an extra ``)`` token inserted directly after it;
    * for any other purely alphabetic token, a ``(`` that directly follows it
      gets an extra ``)`` token inserted directly after it.

    In both cases the walk then jumps four positions ahead instead of one, so
    the three tokens after the current one are not examined.

    NOTE(review): the inserted ``)`` does not correspond to any character of
    ``code``, so the result can contain more closing than opening
    parentheses. This looks unintended -- confirm the desired behaviour
    before relying on it.

    Returns:
        list[str]: the tokens in source order, including the inserted ``)``.
    """
    tokens = re.findall(r'\d+|[a-zA-Z_]\w*|[\(\)\+\-\*/=]', code)

    pos = 0
    while pos < len(tokens):
        current = tokens[pos]
        paren_at = None  # index of the '(' that receives an extra ')'

        if current == 'def':
            # First '(' anywhere after the keyword, if there is one.
            try:
                paren_at = tokens.index('(', pos + 1)
            except ValueError:
                paren_at = None
        elif current.isalpha():
            probe = pos + 1
            # Skips "." + name pairs; the regex above never emits a '.'
            # token, so with the current pattern this loop does not advance.
            while probe < len(tokens) and tokens[probe] == '.':
                probe += 2
            if probe < len(tokens) and tokens[probe] == '(':
                paren_at = probe

        if paren_at is not None:
            tokens.insert(paren_at + 1, ')')
            pos += 3  # together with the step below: four positions ahead
        pos += 1

    return tokens

def get_token_location(code, index):
    """Translate a character offset in ``code`` into a (line, column) pair.

    Lines are separated by ``'\\n'``. The returned line number is 1-based and
    the column is 0-based (the offset from the first character of that line).
    An offset that points at a newline character is attributed to the line
    that the newline terminates; an offset past the end of ``code`` is
    attributed to the last line.

    Args:
        code: the complete source text.
        index: non-negative character offset into ``code``.

    Returns:
        tuple[int, int]: ``(line_number, column_number)``.

    Raises:
        ValueError: if ``index`` is negative. Such an offset has no location;
            previously it silently produced line 0 and a column measured
            against the start of the last line.
    """
    if index < 0:
        raise ValueError(f'index must be non-negative, got {index}')

    line_number = 0
    line_start = 0   # offset of the first character of the located line
    next_start = 0   # offset at which the line under inspection begins

    # Single pass with a running offset: stop at the first line that starts
    # after `index`, instead of re-summing all previous line lengths for
    # every line.
    for line in code.split('\n'):
        if index < next_start:
            break
        line_number += 1
        line_start = next_start
        next_start += len(line) + 1  # +1 accounts for the '\n' separator

    return line_number, index - line_start

def get_token_type(token, line, token_length):
    """Classify a single token.

    Classification rules, checked in this order:

    * ``OPERATOR`` -- one of ``= + - * / ( )``
    * ``NUMBER``   -- ``token.isnumeric()`` is true
    * for purely alphabetic tokens (``token.isalpha()``):
        * ``KEYWORD``  -- one of ``if else for while def return``
        * ``VARIABLE`` -- ``token_length == 1`` and the token is lower-case
        * ``FUNCTION`` -- everything else

    Args:
        token: the token text.
        line: line number to report for the token; returned unchanged.
        token_length: value compared against 1 to tell VARIABLE from
            FUNCTION.
            NOTE(review): the caller in this file, ``interpret``, passes the
            token's *column* here rather than a length -- confirm which is
            intended.

    Returns:
        tuple[str, int, int]: ``(kind, line, len(token))``.

    Raises:
        ValueError: if the token is neither an operator, numeric, nor purely
            alphabetic (for example an identifier containing a digit or an
            underscore).
    """
    if token in {'=', '+', '-', '*', '/', '(', ')'}:
        kind = 'OPERATOR'
    elif token.isnumeric():
        kind = 'NUMBER'
    elif token.isalpha():
        if token in {'if', 'else', 'for', 'while', 'def', 'return'}:
            kind = 'KEYWORD'
        elif token_length == 1 and token.islower():
            kind = 'VARIABLE'
        else:
            kind = 'FUNCTION'
    else:
        # The message used to reference an undefined name `column`, which
        # turned every invalid token into a NameError. The third argument is
        # what the caller supplies as the column, so report that.
        raise ValueError(
            f'Invalid token: {token} at line {line}, column {token_length}'
        )

    return kind, line, len(token)

def interpret(code):
    """Tokenize ``code`` and classify every token.

    Each token produced by ``tokenize`` is given an estimated character
    offset, converted to a line/column with ``get_token_location`` and then
    classified with ``get_token_type``.

    Returns:
        list[tuple]: one ``(token, token_type, line, length)`` tuple per
        token, in token order. ``line`` and ``length`` are the values
        returned by ``get_token_type``.
    """
    results = []
    consumed = 0  # combined length of all tokens handled so far

    for position, token in enumerate(tokenize(code)):
        # NOTE(review): the offset is estimated as "characters of all
        # previous tokens + number of previous tokens", i.e. it assumes
        # exactly one skipped character between consecutive tokens. It is
        # not taken from the actual match position in `code` -- confirm
        # whether the reported lines are accurate enough.
        estimated_index = consumed + position
        line, column = get_token_location(code, estimated_index)

        # NOTE(review): `column` is handed to the parameter that
        # get_token_type names `token_length` -- confirm this is intended.
        token_type, token_line, token_length = get_token_type(token, line, column)

        results.append((token, token_type, token_line, token_length))
        consumed += len(token)

    return results

def run(file_path):
    """Read a source file, interpret it and print one report line per token.

    Args:
        file_path: path of the text file to read; it is opened in text mode
            for reading.

    Each printed line has the form
    ``'<token>': <TYPE> (length <n>, line <m>)``.
    """
    with open(file_path, 'r') as source:
        text = source.read()

    for entry in interpret(text):
        token, token_type, line, length = entry
        print(f"'{token}': {token_type} (length {length}, line {line})")
        
# Executed as soon as this cell/module runs: reads 'exemplu.txt' (resolved
# against the current working directory) and prints one line per token.
run('exemplu.txt')


'def': KEYWORD (length 3, line 1)
'square': FUNCTION (length 6, line 1)
'(': OPERATOR (length 1, line 1)
')': OPERATOR (length 1, line 1)
'x': FUNCTION (length 1, line 2)
')': OPERATOR (length 1, line 2)
'return': KEYWORD (length 6, line 2)
'12': NUMBER (length 2, line 2)
'x': FUNCTION (length 1, line 2)
'*': OPERATOR (length 1, line 2)
'x': FUNCTION (length 1, line 2)
'print': FUNCTION (length 5, line 4)
'(': OPERATOR (length 1, line 4)
')': OPERATOR (length 1, line 4)
'Enter': FUNCTION (length 5, line 4)
'a': FUNCTION (length 1, line 4)
'number': FUNCTION (length 6, line 4)
')': OPERATOR (length 1, line 5)
'num': FUNCTION (length 3, line 5)
'=': OPERATOR (length 1, line 5)
'int': FUNCTION (length 3, line 5)
'(': OPERATOR (length 1, line 5)
')': OPERATOR (length 1, line 5)
'input': FUNCTION (length 5, line 5)
'(': OPERATOR (length 1, line 7)
')': OPERATOR (length 1, line 7)
')': OPERATOR (length 1, line 7)
'if': KEYWORD (length 2, line 7)
'num': FUNCTION (length 3, line 8)
'0': NUMBER