processor_ast.py
import re
import tokenize as py_tokenize
import token as py_token
from io import BytesIO


class Preprocess:
    """Cleaning and tokenization utilities; behavior is selected by
    mode: 'anno', 'code', or 'docs'."""

    def __init__(self, mode):
        assert mode in ['anno', 'code', 'docs']
        self.mode = mode

    def tokenize_python(self, snippet: str):
        # Run Python's own tokenizer over the snippet and keep the raw
        # token strings, dropping bookkeeping and error tokens.
        toks = py_tokenize.tokenize(
            BytesIO(snippet.strip().encode('utf-8')).readline)

        def predicate(t):
            return py_token.tok_name[t.type] not in \
                ['ENCODING', 'NEWLINE', 'ENDMARKER', 'ERRORTOKEN']

        return [t.string for t in toks if predicate(t)]
    def clean(self, x):
        # Normalize unusual unicode punctuation: map dash- and
        # ellipsis-like characters to spaces, drop the rest outright.
        x = re.sub(r'[‘…—−–]', ' ', x)
        x = re.sub(r'[?,`“”’™•°]', '', x)
        if self.mode == 'anno' or self.mode == 'docs':
            x = re.sub(r'[,:;]', r'', x)
            # Put spaces around operators and punctuation so they become
            # standalone tokens, then strip trailing dots.
            x = re.sub(r'([\+\-\*/=(){}%^&\.])', r' \1 ', x)
            x = re.sub(r'\.+$', r'', x)
        if self.mode == 'docs':
            x = re.sub(r'[\t\r\n\v\f]', r'', x)
            x = re.sub(r'[\(\[\]\)]', r'', x)
        if self.mode == 'code':
            # Strip punctuation and operators, then retokenize whatever
            # is left with the Python tokenizer.
            x = re.sub(r'[\(\[\+\-\*/,:;=(){}%^&\]\)\'\"]', r'', x).strip()
            # x = re.sub(r"([])':;{}%^&|")
            # row = re.sub(r'[\[\]\(\)]', '', row).strip()
            x = ' '.join(x.split())
            x = ' '.join(self.tokenize_python(x))
        # Collapse runs of spaces and trim the result.
        x = re.sub(r'[ ]+', ' ', x)
        x = x.strip()
        return x
    def tokenize(self, x):
        if self.mode == 'anno':
            # TODO: something smarter?
            # return [tok.text for tok in nlp.tokenizer(x)]
            return x.split()
        if self.mode == 'code':
            return x.split()
        # 'docs' mode: plain whitespace split as well, so the method
        # never falls through and implicitly returns None.
        return x.split()
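

# A minimal usage sketch, not part of the original module: the inputs
# below are hypothetical examples of how two of the modes might be used.
if __name__ == '__main__':
    code_pre = Preprocess('code')
    anno_pre = Preprocess('anno')

    # Code mode strips operators/punctuation and retokenizes with the
    # Python tokenizer: 'x = a + b * 2' -> ['x', 'a', 'b', '2'].
    print(code_pre.tokenize(code_pre.clean('x = a + b * 2')))

    # Annotation mode drops commas and spaces out operators and dots
    # before a plain whitespace split.
    print(anno_pre.tokenize(anno_pre.clean('Adds two numbers, returning the sum.')))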