Ideally, each token that Python requires would be its own token. This is not strictly necessary, but I believe it will help since we will be training quite small models. It will probably also reduce the number of syntax errors.

In [56]:
import tokenize

# The numeric token type that tokenize assigns to "=" is not the character's code point.
(tokenize.EQUAL, ord("="))

(22, 61)

In [91]:
from io import BytesIO


python_code = """
# sum from 1 to 9
accumulator = 0 # accumulates the sum
for i in range(10):
    accumulator += 1
"""

# tokenize.tokenize consumes a readline callable that returns bytes, so the
# source is encoded and wrapped in a BytesIO. The first token it yields is
# always an ENCODING token whose string is the detected encoding, which is
# why "utf-8" leads the list.
[
    token.string
    for token in tokenize.tokenize(BytesIO(python_code.encode("utf-8")).readline)
]

['utf-8',
 '\n',
 '# sum from 1 to 9',
 '\n',
 'accumulator',
 '=',
 '0',
 '# accumulates the sum',
 '\n',
 'for',
 'i',
 'in',
 'range',
 '(',
 '10',
 ')',
 ':',
 '\n',
 '    ',
 'accumulator',
 '+=',
 '1',
 '\n',
 '',
 '']

We can use `ast` to check for obvious syntax errors (`ast.parse` raises a `SyntaxError`). This won't find semantic errors, but those would be quite hard to detect regardless.

In [75]:
import ast

# One print call, two renderings of the parsed sample: the indented AST dump
# first, then the source regenerated from the tree, separated by a newline.
print(
    ast.dump(ast.parse(python_code), indent=4),
    ast.unparse(ast.parse(python_code)),
    sep="\n",
)

Module(
    body=[
        FunctionDef(
            name='example_function',
            args=arguments(
                posonlyargs=[],
                args=[
                    arg(arg='x')],
                kwonlyargs=[],
                kw_defaults=[],
                defaults=[]),
            body=[
                Assign(
                    targets=[
                        Name(id='accumulator', ctx=Store())],
                    value=Constant(value=0)),
                For(
                    target=Name(id='i', ctx=Store()),
                    iter=Call(
                        func=Name(id='range', ctx=Load()),
                        args=[
                            Name(id='x', ctx=Load())],
                        keywords=[]),
                    body=[
                        AugAssign(
                            target=Name(id='accumulator', ctx=Store()),
                            op=Add(),
                            value=Constant(value=1))],
               

In [92]:
def is_valid_syntax(source: str) -> bool:
    """Report whether ``source`` parses as Python.

    Args:
        source: Python source code to check.

    Returns:
        True when ``ast.parse`` accepts the source; False when it raises
        ``SyntaxError`` (which includes ``IndentationError``) or ``ValueError``
        (raised by some Python versions for source containing null bytes).

    Any other exception propagates, so interrupts and genuine bugs are not
    mistaken for syntax errors.
    """
    try:
        ast.parse(source)
    except (SyntaxError, ValueError):
        return False
    return True


# --- Snippets that ast.parse rejects ---

bad_python_code = """
def example(x):
return x
"""
if not is_valid_syntax(bad_python_code):
    print("finds syntax errors such as missing indents")

bad_python_code = "a  b"
if not is_valid_syntax(bad_python_code):
    print("... invalid statements")


bad_python_code = """
def example(x:
    return x
"""
if not is_valid_syntax(bad_python_code):
    print("... missing parentheses")

bad_python_code = """
fo i in range(5):
    print(i)
"""
if not is_valid_syntax(bad_python_code):
    print("also detects the misuse of python keywords")


# --- Snippets that parse fine even though they are semantically wrong ---
# The assertions make the "does not fail" claims checked rather than implied.

bad_python_code = """
for i in ra(5):
    p(i)
"""
# ... but not the misuse of built-in functions like range or print
assert is_valid_syntax(bad_python_code)

bad_python_code = """
def example(x):
    return x*2
xample(x)
"""
# calling a misspelled (undefined) name is not a syntax error
assert is_valid_syntax(bad_python_code)


bad_python_code = """
def foo(): return "bar"
a = 2
a += foo
"""
# adding a function object to an int is not a syntax error
assert is_valid_syntax(bad_python_code)


bad_python_code = "a = b"
# reading an undefined name is not a syntax error
assert is_valid_syntax(bad_python_code)

finds syntax errors such as missing indents
... invalid statements
... missing parentheses
also detects the misuse of python keywords
