In [1]:
# Styling notebook
from IPython.core.display import HTML
def css_styling(path="./styles/custom.css"):
    """Read a styling file and wrap its text in an IPython ``HTML`` object.

    Args:
        path: File to read. Defaults to the stylesheet location this notebook
            has always used, so existing no-argument calls are unaffected.

    Returns:
        ``HTML(styles)`` where ``styles`` is the full text of the file. The
        text is passed through unchanged (presumably it already contains the
        ``<style>`` markup the notebook needs - verify against the file).

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises; the
    # previous open(...).read() left that to the garbage collector.
    with open(path, "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
# Bare call as the cell's last expression: the HTML object returned by
# css_styling() becomes the cell's displayed result.
css_styling()

In [20]:
import string 

class Scanner:
    """Hand-written lexical scanner for a small expression/statement language.

    The source text is terminated with the sentinel character ``EOI`` so the
    low-level helpers can always look at "the current character" without a
    bounds check.  Because the sentinel is an ordinary ``'$'``, a ``'$'``
    inside the source is indistinguishable from end of input and stops
    scanning there.

    Tokens are produced by :meth:`nextToken` as ``(token_class, value)``
    pairs.  Errors are reported through :meth:`error` (printed, not raised).
    """

    EOI = '$'
    START_COMMENT = '#'
    END_COMMENT = '#'
    START_STRING = '"'
    END_STRING = '"'
    EQUAL = '='
    NOT = '!'
    GREATER = '>'
    LESS = '<'

    WHITESPACE = {' ', '\t', '\n'}
    DIGITS = set(string.digits)
    LETTERS = set(string.ascii_letters)
    LETTERS_OR_DIGITS = LETTERS.union(DIGITS)

    # Single-character operators and punctuation.
    OP_TABLE = {
        '(' : 'lParen',
        ')' : 'rParen',
        '+' : 'plusSym',
        '-' : 'minusSym',
        '*' : 'timesSym',
        '/' : 'divSym',
        ';' : 'semicolon',
    }

    # Reserved words; anything else matching the identifier shape is an
    # 'identifier'.
    KEYWORD_TABLE = {
        'while'  : 'whileSym',
        'return' : 'returnSym',
        'if'     : 'ifSym',
        'else'   : 'elseSym',
        'do'     : 'doSym',
    }

    # First character of a one-or-two character operator mapped to
    # (token when it stands alone, token when directly followed by EQUAL).
    _TWO_CHAR_TABLE = {
        EQUAL   : ('assignSym', 'equalSym'),
        NOT     : ('notSym', 'notEqualSym'),
        GREATER : ('greaterSym', 'greaterEQSym'),
        LESS    : ('lessSym', 'lessEQSym'),
    }

    def __init__(self, source):
        """Prepare to scan ``source`` (a string) from its first character."""
        self.source = source + Scanner.EOI

        # Scanner state
        self.position = 0           # Index of the current character
        # Initialised here but not updated by any method of this class.
        self.currentText = None
        self.currentToken = None

    def error(self, message):
        """Report a scanning problem by printing it; scanning continues."""
        print(">>> Error: ", message)

    def currentCh(self):
        """Return the character at the current position."""
        return self.source[self.position]

    def move(self):
        """Advance one character without any check.

        Use :meth:`EAT` instead; it refuses to step past the EOI sentinel,
        which keeps ``position`` a valid index into ``source``.
        """
        self.position += 1

    def EAT(self):
        """Consume the current character, reporting an error at EOI instead."""
        # currentCh must be *called*: comparing the bound method itself to a
        # string is always False and would disable this guard.
        if self.currentCh() == Scanner.EOI:
            self.error('Cannot move beyond EOI!')
        else:
            self.move()

    def FIND(self, x):
        """Consume characters up to (not including) the next ``x``.

        Returns:
            The consumed text, or ``None`` (after reporting an error) when
            EOI is reached before ``x`` is found.
        """
        start = self.position
        while self.currentCh() != x and self.currentCh() != Scanner.EOI:
            self.EAT()
        if self.currentCh() == Scanner.EOI:
            self.error('EOI detected searching for ' + x)
            return None
        return self.source[start:self.position]

    def FINDSTAR(self, s):
        """Consume characters until the current one is a member of ``s``.

        ``s`` is any container supporting ``in`` (a set or a string).
        Reports an error when EOI is reached first.  Returns nothing.
        """
        while self.currentCh() not in s and self.currentCh() != Scanner.EOI:
            self.EAT()
        if self.currentCh() == Scanner.EOI:
            # str() because s is normally a set, which cannot be concatenated
            # to a string directly.
            self.error('EOI detected searching for ' + str(s))

    def SKIP(self, x):
        """Consume a run of the character ``x`` and return the run as text."""
        start = self.position
        # The EOI test keeps the loop finite even if x is the sentinel,
        # because EAT never moves past EOI.
        while self.currentCh() == x and self.currentCh() != Scanner.EOI:
            self.EAT()
        return self.source[start:self.position]

    def SKIPSTAR(self, s):
        """Consume a run of characters that are members of ``s``; return it."""
        start = self.position
        # Same EOI guard as SKIP, for the same reason.
        while self.currentCh() in s and self.currentCh() != Scanner.EOI:
            self.EAT()
        return self.source[start:self.position]

    def SKIPWS(self):
        """Consume a run of whitespace."""
        self.SKIPSTAR(Scanner.WHITESPACE)

    def SKIPCOMMENT(self):
        """Consume one comment; the current character must be START_COMMENT."""
        self.EAT()                                   # opening delimiter
        if self.FIND(Scanner.END_COMMENT) is not None:
            self.EAT()                               # closing delimiter
        # Otherwise the comment is unterminated: FIND already reported it and
        # we are sitting on EOI, so there is nothing further to consume.

    def JUMP(self):
        """Consume one whitespace run or one comment, whichever is current."""
        if self.currentCh() in Scanner.WHITESPACE:
            self.SKIPWS()
        elif self.currentCh() == Scanner.START_COMMENT:
            self.SKIPCOMMENT()

    def JUMPSTAR(self):
        """Consume every whitespace run and comment before the next token."""
        while self.currentCh() in Scanner.WHITESPACE or self.currentCh() == Scanner.START_COMMENT:
            self.JUMP()

    def NUM(self):
        """Scan an unsigned integer literal; the current character must be a digit.

        Returns:
            ``('numConstant', value)`` with ``value`` an ``int``.
        """
        return 'numConstant', int(self.SKIPSTAR(Scanner.DIGITS))

    def ID(self):
        """Scan a run of letters and digits.

        Returns:
            ``('identifier', text)``.  Keyword recognition is done by the
            caller, not here.
        """
        return 'identifier', self.SKIPSTAR(Scanner.LETTERS_OR_DIGITS)

    def STRI(self):
        """Scan a string literal; the current character must be START_STRING.

        Returns:
            ``('stringConstant', text)`` without the delimiters.  ``text`` is
            ``None`` when the string is unterminated (an error has been
            reported in that case).
        """
        self.EAT()                                   # opening quote
        chars = self.FIND(Scanner.END_STRING)
        if chars is not None:
            self.EAT()                               # closing quote
        return 'stringConstant', chars

    def twoCharSym(self, second, firstToken, secondToken):
        """Scan an operator that may be one or two characters long.

        Consumes the current character; if the next one equals ``second`` it
        is consumed too and ``secondToken`` is returned, otherwise
        ``firstToken`` is returned.
        """
        self.EAT()
        if self.currentCh() == second:
            self.EAT()
            return secondToken
        return firstToken

    def nextToken(self):
        """Return the next token as a ``(token_class, value)`` pair.

        * ``('eoI', 'eoI')`` - artificial "End of Input" token; returned on
          every call once the input is exhausted.
        * ``('numConstant', int)``, ``('identifier', str)``,
          ``('stringConstant', str)`` - value carries the payload.
        * keywords and operators - value is ``None``.
        * ``('Unknown', ch)`` - a character that starts no token.  It is
          consumed so that repeated calls always make progress.
        """
        # Skip whitespace/comments *before* testing for EOI so trailing
        # whitespace or a trailing comment does not produce a bogus token.
        self.JUMPSTAR()

        c = self.currentCh()
        if c == Scanner.EOI:
            return 'eoI', 'eoI'                 # Done! Both are the same

        if c in Scanner.DIGITS:
            return self.NUM()

        if c in Scanner.LETTERS:
            token, text = self.ID()
            keyword = Scanner.KEYWORD_TABLE.get(text)
            if keyword is not None:
                return keyword, None
            return token, text

        if c == Scanner.START_STRING:
            return self.STRI()

        if c in Scanner._TWO_CHAR_TABLE:
            alone, with_equal = Scanner._TWO_CHAR_TABLE[c]
            return self.twoCharSym(Scanner.EQUAL, alone, with_equal), None

        if c in Scanner.OP_TABLE:
            self.EAT()
            return Scanner.OP_TABLE[c], None

        # Unrecognised character: consume it, otherwise a caller looping
        # until 'eoI' would receive the same result forever.
        self.EAT()
        return 'Unknown', c

In [21]:
# Demo: tokenise a sample line and print one "class   value" row per token,
# stopping (without printing) at the artificial end-of-input token.
s = Scanner('xyz = # hello # while else (32*b);;')

while True:
    tok, text = s.nextToken()
    if tok == 'eoI':
        break
    print(tok,' ',text)

identifier   xyz
assignSym   None
whileSym   None
elseSym   None
lParen   None
numConstant   32
timesSym   None
identifier   b
rParen   None
semicolon   None
semicolon   None
