# Analisador Léxico

## Lista de tokens

In [19]:
# Token table: maps each token name to the regular expression that recognizes it.
#
# Insertion order is significant. tokenize() joins these patterns with '|' in
# dict order, and the regex engine takes the first alternative that matches at
# a given position, so the keyword rules and 'A' must stay ahead of the
# catch-all 'PREFIX' identifier rule.
token_exp = {
    # Keywords end with \b so that an identifier which merely starts with a
    # keyword (e.g. "SELECTED", "LIMITS") is not split into keyword + PREFIX.
    'SELECT': r'SELECT\b',
    'WHERE': r'WHERE\b',
    'LIMIT': r'LIMIT\b',
    # Variable: '?' followed by an identifier.
    'VAR': r'\?[A-Za-z_][A-Za-z0-9_]*',
    'LB': r'\{',
    'RB': r'\}',
    'COLON': r':',
    'NUM': r'[0-9]+',
    'DOT': r'\.',
    # Whitespace; tokenize() matches it but leaves it out of its result.
    'SKIP': r'[ \t\r\n]+',
    # The standalone word 'a'.
    'A': r'\ba\b',
    # Double-quoted string; the pattern has no escape-sequence support.
    'STRING': r'"[^"]*"',
    # '@' followed by exactly two lowercase letters (e.g. "@en").
    'ALT': r'@[a-z]{2}',
    # Generic identifier; must remain last so the rules above take precedence.
    'PREFIX': r'[A-Za-z_][A-Za-z0-9_]*',
}

## Função Tokenize

In [20]:
import re

def tokenize(input_string):
    """Split ``input_string`` into a list of lexical tokens.

    All patterns in ``token_exp`` are combined into a single alternation of
    named groups, in dict order, and the input is scanned with
    ``re.finditer``.

    Args:
        input_string: The text to tokenize.

    Returns:
        A list of ``(name, text, (start, end))`` tuples in input order, where
        ``name`` is the key of the ``token_exp`` rule that matched, ``text``
        is the matched substring and ``(start, end)`` are its offsets in
        ``input_string``. Matches of the ``SKIP`` rule are left out.

    Note:
        ``re.finditer`` steps over characters that no rule matches, so such
        characters are dropped silently rather than reported.
    """
    pattern = '|'.join(
        f'(?P<{name}>{exp})' for name, exp in token_exp.items()
    )
    # Exactly one named alternative matches per hit, and m.lastgroup is its
    # name, so there is no need to probe every group of the match.
    return [
        (m.lastgroup, m.group(), m.span())
        for m in re.finditer(pattern, input_string)
        if m.lastgroup != 'SKIP'
    ]

## Testes

In [21]:
# Test 1: a complete query with several triple patterns, a language-tagged
# string literal and a LIMIT clause.
query = '''
SELECT ?nome ?desc WHERE {
?s a dbo:MusicalArtist.
?s foaf:name "Chuck Berry"@en .
?w dbo:artist ?s.
?w foaf:name ?nome.
?w dbo:abstract ?desc
} LIMIT 1000
'''

# Print one (name, text, span) tuple per line.
for tok in tokenize(query):
    print(tok)


('SELECT', 'SELECT', (1, 7))
('VAR', '?nome', (8, 13))
('VAR', '?desc', (14, 19))
('WHERE', 'WHERE', (20, 25))
('LB', '{', (26, 27))
('VAR', '?s', (28, 30))
('A', 'a', (31, 32))
('PREFIX', 'dbo', (33, 36))
('COLON', ':', (36, 37))
('PREFIX', 'MusicalArtist', (37, 50))
('DOT', '.', (50, 51))
('VAR', '?s', (52, 54))
('PREFIX', 'foaf', (55, 59))
('COLON', ':', (59, 60))
('PREFIX', 'name', (60, 64))
('STRING', '"Chuck Berry"', (65, 78))
('ALT', '@en', (78, 81))
('DOT', '.', (82, 83))
('VAR', '?w', (84, 86))
('PREFIX', 'dbo', (87, 90))
('COLON', ':', (90, 91))
('PREFIX', 'artist', (91, 97))
('VAR', '?s', (98, 100))
('DOT', '.', (100, 101))
('VAR', '?w', (102, 104))
('PREFIX', 'foaf', (105, 109))
('COLON', ':', (109, 110))
('PREFIX', 'name', (110, 114))
('VAR', '?nome', (115, 120))
('DOT', '.', (120, 121))
('VAR', '?w', (122, 124))
('PREFIX', 'dbo', (125, 128))
('COLON', ':', (128, 129))
('PREFIX', 'abstract', (129, 137))
('VAR', '?desc', (138, 143))
('RB', '}', (144, 145))
('LIMIT', 'LIMIT', (146, 151))
('NUM', '1000', (152, 156))

In [22]:
# Test 2: a shorter query; note the two consecutive spaces before WHERE.
query = '''
SELECT ?nome  WHERE {
?s a dbo:MusicalArtist.
?s foaf:name "Chuck Berry"@en .
} LIMIT 10
'''

# Print one (name, text, span) tuple per line.
for tok in tokenize(query):
    print(tok)

('SELECT', 'SELECT', (1, 7))
('VAR', '?nome', (8, 13))
('WHERE', 'WHERE', (15, 20))
('LB', '{', (21, 22))
('VAR', '?s', (23, 25))
('A', 'a', (26, 27))
('PREFIX', 'dbo', (28, 31))
('COLON', ':', (31, 32))
('PREFIX', 'MusicalArtist', (32, 45))
('DOT', '.', (45, 46))
('VAR', '?s', (47, 49))
('PREFIX', 'foaf', (50, 54))
('COLON', ':', (54, 55))
('PREFIX', 'name', (55, 59))
('STRING', '"Chuck Berry"', (60, 73))
('ALT', '@en', (73, 76))
('DOT', '.', (77, 78))
('RB', '}', (79, 80))
('LIMIT', 'LIMIT', (81, 86))
('NUM', '10', (87, 89))


In [23]:
# Test 3: same query as the previous test, but with the keyword misspelled
# as "LIM67IT" to see which token the lexer assigns to it.
query = '''
SELECT ?nome  WHERE {
?s a dbo:MusicalArtist.
?s foaf:name "Chuck Berry"@en .
} LIM67IT 10
'''

# Print one (name, text, span) tuple per line.
for tok in tokenize(query):
    print(tok)

('SELECT', 'SELECT', (1, 7))
('VAR', '?nome', (8, 13))
('WHERE', 'WHERE', (15, 20))
('LB', '{', (21, 22))
('VAR', '?s', (23, 25))
('A', 'a', (26, 27))
('PREFIX', 'dbo', (28, 31))
('COLON', ':', (31, 32))
('PREFIX', 'MusicalArtist', (32, 45))
('DOT', '.', (45, 46))
('VAR', '?s', (47, 49))
('PREFIX', 'foaf', (50, 54))
('COLON', ':', (54, 55))
('PREFIX', 'name', (55, 59))
('STRING', '"Chuck Berry"', (60, 73))
('ALT', '@en', (73, 76))
('DOT', '.', (77, 78))
('RB', '}', (79, 80))
('PREFIX', 'LIM67IT', (81, 88))
('NUM', '10', (89, 91))
