In [2]:
class Token:
    """A lexical token: its value, its type tag and bookkeeping fields.

    Attributes:
        tokenval:   the token's value (whatever was passed as ``value``).
        tokentype:  the token's type tag (whatever was passed as ``ttype``).
        line_num:   the source line associated with the token, or None.
        array_size: stored as given; presumably the declared size of an
                    array identifier -- TODO confirm against the parser.
        dec_flag:   stored as given; presumably marks a declared
                    identifier -- TODO confirm against the parser.
    """
    def __init__(self, value=None, ttype=None, line=None, array_size=None, dec_flag=False):
        self.tokenval = value
        self.tokentype = ttype
        self.line_num = line
        self.array_size = array_size
        self.dec_flag = dec_flag

    def __repr__(self):
        """Return a constructor-style description, useful when printing tokens."""
        return "Token(value={!r}, ttype={!r}, line={!r}, array_size={!r}, dec_flag={!r})".format(
            self.tokenval, self.tokentype, self.line_num, self.array_size, self.dec_flag
        )

In [3]:
def getchar(m):
    """Advance the global cursor p by one and return the character of m there.

    Returns the empty string once the cursor has moved past the end of m.
    The cursor is advanced even in that case, so callers that back up with
    ``p -= 1`` stay consistent.
    """
    global p
    p = p + 1
    if p >= len(m):
        return ""
    return m[p]

def isdigit(c):
    """Return True when c is exactly one of the characters '0' through '9'."""
    # Membership in a tuple of single characters (not a substring test), so
    # the empty string and multi-character strings are rejected.
    decimal_chars = tuple("0123456789")
    return c in decimal_chars

def isalpha(c):
    """Return True when c is a single ASCII letter (A-Z, a-z) or an underscore."""
    if c == '_':
        return True
    # Build the 26 upper-case and 26 lower-case letters from their code points.
    letters = [
        chr(code)
        for first in ('A', 'a')
        for code in range(ord(first), ord(first) + 26)
    ]
    return c in letters

def isalnum(c):
    """Return True when c passes isdigit or isalpha (digit, letter or underscore)."""
    return any(check(c) for check in (isdigit, isalpha))

def isop(c):
    """Return True when c is one of the lexer's non-relational operators.

    Despite the parameter name, c may be a one- or two-character lexeme:
    the table contains both forms (e.g. '+' and '+=').
    """
    arithmetic = ('+', '-', '*', '/', '%')
    assignment = ('=', '+=', '-=', '*=', '/=')
    step = ('++', '--')
    bitwise = ('&', '|', '^', '>>', '<<', '~')
    other = ('!', '?')
    return c in arithmetic + assignment + step + bitwise + other

def isrelop(c):
    """Return True when c is a relational operator lexeme.

    Accepts the one-character forms '<' and '>' as well as the
    two-character forms '<=', '>=', '==' and '!='.
    """
    relational = ('<', '>', '<=', '>=', '==', '!=')
    return any(c == candidate for candidate in relational)

def ispun(c):
    """Return True when c is one of the punctuation characters ( ) { } [ ] ; : ,"""
    # tuple() splits the string into single characters, so only exact
    # one-character matches succeed (no substring matching).
    punctuation = tuple("(){}[];:,")
    return c in punctuation


In [2]:
def init_LA():
    """Reset the global lexer state before scanning a new input.

    Sets up:
        p        -- input position, -1 so the first getchar() reads index 0
        line_num -- current line number, starting at 1
        ST       -- symbol table, pre-loaded with the reserved words
        DONE     -- sentinel token returned at end of input
    """
    global p, line_num, ST, DONE

    p = -1
    line_num = 1

    # Each reserved word maps to a token whose type is the upper-cased keyword.
    reserved_words = (
        "int", "float", "if", "else", "for",
        "while", "do", "break", "continue", "goto",
    )
    ST = {word: Token(word, word.upper()) for word in reserved_words}

    DONE = Token("$", None)

In [5]:
def lexan(m):
    """Scan the input string m from the current position and return the next token.

    Modified lexical analyzer from section 2.9, page 74.

    Relies on the module globals set up by init_LA(): the cursor ``p``, the
    line counter ``line_num``, the symbol table ``ST`` and the end-of-input
    sentinel ``DONE``.

    Returns a Token whose tokentype is one of:
        "NUM"   -- tokenval is the int value of a run of decimal digits
        "OP"    -- tokenval is a one- or two-character operator accepted by isop
        "RELOP" -- tokenval is a relational operator accepted by isrelop
        "PUN"   -- tokenval is a single punctuation character
    or, for identifiers and reserved words, the shared entry stored in ST
    (new identifiers are added with type "ID"; the entry's line_num is
    overwritten with the line of the latest occurrence), or DONE when the
    input is exhausted.

    Spaces, tabs, newlines, ``//`` line comments and ``/* */`` block comments
    are skipped.  Any other character is reported with print() and skipped.
    An unterminated block comment is reported with print() and ends the scan.
    """
    global p           # string position (advanced by getchar, backed up after lookahead)
    global line_num    # line number

    t = Token()        # blank token, filled in for NUM / OP / RELOP / PUN results

    while True:
        c = getchar(m)                               # read first character

        if c in [' ', '\t']:                         # skip over space and tab
            pass
        elif c == '\n':
            line_num += 1                            # increment line counter if newline is read

        elif isdigit(c):
            tokenval = int(c)                        # start with integer value of first digit
            c = getchar(m)
            while isdigit(c):
                tokenval = 10*tokenval + int(c)      # accumulate the following digits
                c = getchar(m)
            p -= 1                                   # give back the non-digit lookahead
            t.tokenval, t.tokentype, t.line_num = tokenval, "NUM", line_num
            return t

        elif isalpha(c):
            lexeme = c
            c = getchar(m)
            while isalnum(c):
                lexeme += c
                c = getchar(m)
            p -= 1                                   # give back the lookahead character
            if lexeme not in ST:
                ST[lexeme] = Token(lexeme, "ID")     # add entry to symbol table (if not already present)
            t = ST[lexeme]                           # identifiers share one token per name
            t.line_num = line_num
            return t

        elif isop(c) or isrelop(c):
            # Type to use if c turns out to stand alone.
            single_type = "OP" if isop(c) else "RELOP"
            lexeme = c + getchar(m)                  # c plus one character of lookahead
            # At end of input the lookahead is "", so lexeme == c and the
            # first two tests below classify the single character.
            if isop(lexeme):
                t.tokenval, t.tokentype, t.line_num = lexeme, "OP", line_num
                return t
            elif isrelop(lexeme):
                t.tokenval, t.tokentype, t.line_num = lexeme, "RELOP", line_num
                return t
            elif lexeme == '//':                     # single line comment
                while c != '\n':
                    c = getchar(m)
                    if p >= len(m):                  # comment runs to end of input
                        DONE.line_num = line_num
                        return DONE
                line_num += 1                        # count the newline that ended the comment
            elif lexeme == '/*':                     # multi line comment
                start_line = line_num
                prev = ""                            # "" so the opener's '*' cannot close the comment
                while True:
                    c = getchar(m)
                    if p >= len(m):                  # input ended before '*/'
                        print("unterminated comment starting on line", start_line)
                        DONE.line_num = line_num
                        return DONE
                    if c == '\n':                    # count every newline inside the comment
                        line_num += 1
                    if prev == '*' and c == '/':
                        break
                    prev = c
            else:                                    # otherwise c was a single character operator
                p -= 1                               # give back the lookahead character
                t.tokenval, t.tokentype, t.line_num = c, single_type, line_num
                return t

        elif ispun(c):
            t.tokenval, t.tokentype, t.line_num = c, "PUN", line_num
            return t

        elif p >= len(m):
            DONE.line_num = line_num
            return DONE                              # return DONE flag for end of input

        else:
            print("invalid character",c,"found on line",line_num)