In [1]:
from copy import deepcopy
import glob
import pickle

from parso import parse

In [2]:
def check_folder(folder):
    """Turn a folder path into a recursive glob pattern for ``*.py`` files.

    Parameters
    ----------
    folder : str
        Either a directory path (with or without a trailing ``/``) or a
        pattern that already ends with ``**/*.py``.

    Returns
    -------
    str
        ``folder`` unchanged when it already ends with ``**/*.py``; otherwise
        ``folder`` followed by ``/`` (if missing) and ``**/*.py``.  An empty
        ``folder`` yields the relative pattern ``**/*.py`` (current working
        directory) rather than ``/**/*.py``, which would silently scan the
        whole filesystem from the root.
    """
    suffix = "**/*.py"
    # Strings are immutable, so no defensive copy of `folder` is needed.
    if folder.endswith(suffix):
        return folder
    # Only insert a separator when there is a directory part to separate.
    if folder and not folder.endswith("/"):
        folder += "/"
    return folder + suffix


def get_token_ind(node, token2ind):
    """Return the integer index of ``node``'s token, registering it if new.

    ``node`` is either one of the marker strings ``"push"`` / ``"pop"`` or an
    object exposing ``.type`` (and ``.value`` for keywords and operators).
    ``token2ind`` maps token names to indices and is updated in place: an
    unseen token receives the next free index, i.e. the current dict size.
    """
    if node in ("push", "pop"):
        # Structural markers are stored under their own name.
        key = node
    elif node.type in ("keyword", "operator"):
        # keywords: "for", "while", "if", etc.
        # operators: ":", ",", "+=", etc.
        # Each distinct keyword/operator gets its own vocabulary entry.
        key = node.type + "_" + node.value
    else:
        # Every other node is identified by its type alone.
        key = node.type
    return token2ind.setdefault(key, len(token2ind))


def _file_features(root, token2ind):
    """Lazily yield token indices for the tree rooted at ``root``.

    The node's own index comes first.  If the node has a ``children``
    attribute, a ``"push"`` marker follows, then the indices of every child
    subtree in order, then a ``"pop"`` marker.  ``token2ind`` is extended in
    place (through ``get_token_ind``) as new tokens are encountered.
    """
    yield get_token_ind(root, token2ind)

    # Nodes without a `children` attribute contribute only their own index.
    if not hasattr(root, "children"):
        return

    yield get_token_ind("push", token2ind)
    for child in root.children:
        for index in _file_features(child, token2ind):
            yield index
    yield get_token_ind("pop", token2ind)
    

def file_features(filename, token2ind):
    """Parse one source file and return its token-index sequence as a list.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    token2ind : dict
        Token-name -> index vocabulary; extended in place with any token
        first seen in this file.

    Returns
    -------
    list
        Indices produced by ``_file_features`` for the parsed tree.
    """
    # Python source is UTF-8 unless declared otherwise, so decode explicitly
    # instead of relying on the locale-dependent default of open().
    # Undecodable bytes are dropped (errors="ignore"), as before.
    with open(filename, encoding="utf-8", errors="ignore") as f:
        content = f.read()
    # Parsing happens after the file handle has been released.
    tree = parse(content)
    return list(_file_features(tree, token2ind))

    
def feat_extraction_pipeline(folder):
    """Extract token-index sequences from every ``*.py`` file under ``folder``.

    Parameters
    ----------
    folder : str
        Directory (or ready-made glob pattern) passed through ``check_folder``.

    Returns
    -------
    tuple
        ``(feat, token2ind)`` where ``feat`` holds one index list per file, in
        the order the files were yielded by ``glob.iglob``, and ``token2ind``
        is the vocabulary built while processing them.  When no file matches,
        ``([], {})`` is returned.
    """
    folder_ = check_folder(folder)

    token2ind = {}
    feat = []

    # Initialised up front so the final report also works when the glob
    # matches nothing (previously the loop variable was unbound in that case).
    n_files = 0
    for n_files, filename in enumerate(glob.iglob(folder_, recursive=True), start=1):
        if n_files % 10 == 0:
            # Progress line, overwritten in place thanks to the "\r" ending.
            print(n_files, len(token2ind), sep="\t\t", end="\r")
        feat.append(file_features(filename, token2ind))
    print(n_files, len(token2ind), sep="\t\t")
    return feat, token2ind

In [3]:
# Directory whose Python files make up the corpus.
folder = "/usr/local/lib/python3.5/dist-packages"

feat, token2ind = feat_extraction_pipeline(folder)

# Summary statistics of the extracted dataset.
n_samples = len(feat)
n_tokens = sum(len(sample) for sample in feat)

print("number of unique tokens:", len(token2ind))
print("number of samples:", n_samples)
print("total number of tokens:", n_tokens)
print("avg number of tokens per file:", n_tokens / n_samples)

6604		15439
number of unique tokens: 154
number of samples: 6604
total number of tokens: 29814322
avg number of tokens per file: 4514.585402786191


In [4]:
# Vocabulary entries as (token, index) pairs, ordered by ascending index.
sorted(token2ind.items(), key=lambda pair: pair[1])

[('file_input', 0),
 ('push', 1),
 ('simple_stmt', 2),
 ('string', 3),
 ('newline', 4),
 ('pop', 5),
 ('if_stmt', 6),
 ('keyword_if', 7),
 ('comparison', 8),
 ('name', 9),
 ('operator_==', 10),
 ('operator_:', 11),
 ('suite', 12),
 ('import_from', 13),
 ('keyword_from', 14),
 ('dotted_name', 15),
 ('operator_.', 16),
 ('keyword_import', 17),
 ('atom_expr', 18),
 ('trailer', 19),
 ('operator_(', 20),
 ('operator_)', 21),
 ('endmarker', 22),
 ('operator_*', 23),
 ('import_name', 24),
 ('expr_stmt', 25),
 ('operator_=', 26),
 ('import_as_name', 27),
 ('keyword_as', 28),
 ('try_stmt', 29),
 ('keyword_try', 30),
 ('except_clause', 31),
 ('keyword_except', 32),
 ('keyword_None', 33),
 ('atom', 34),
 ('operator_[', 35),
 ('testlist_comp', 36),
 ('operator_,', 37),
 ('operator_]', 38),
 ('subscript', 39),
 ('number', 40),
 ('keyword_else', 41),
 ('funcdef', 42),
 ('keyword_def', 43),
 ('parameters', 44),
 ('param', 45),
 ('arglist', 46),
 ('return_stmt', 47),
 ('keyword_return', 48),
 ('lambde

In [5]:
# some visualization functionality
def print_structure(root, offset=""):
    """Print the tree rooted at ``root``, one node per line.

    Each line is ``<offset>: <type>``, followed by the node's value in double
    quotes when the node has a ``value`` attribute.  For a node with a
    ``children`` attribute, a ``push`` line (one ``-`` deeper) is printed,
    then every child at that deeper offset, then a ``pop`` line back at the
    node's own offset.
    """
    pieces = [offset, ": ", root.type]
    if hasattr(root, "value"):
        pieces += [" \"", root.value, "\""]
    print("".join(pieces))

    # Nodes without children produce just the single line above.
    if not hasattr(root, "children"):
        return

    deeper = offset + "-"
    print(deeper + ": " + "push")
    for child in root.children:
        print_structure(child, deeper)
    print(offset + ": " + "pop")
        

# Small source snippets used to illustrate what the parser produces.
# They are only parsed, never executed.
codes = ["""import pickle
import numpy as np""", """
folder = "some_folder"

with open(folder, "wb") as f:
    pickle.dump((feat, token2ind), f""", """
some_var = 'asd'
some_int = 5
some_float = 5.
some_bool = True""", """
def a():
    return 0""", """[i for i in range(10)]""", "a = 5"]

parsers = [parse(code) for code in codes]

for c, p in zip(codes, parsers):
    # The backslash is escaped explicitly: "\|" is not a valid escape
    # sequence and triggers a warning on newer Python versions.  The printed
    # separator is unchanged.
    print("\\|/" * 10)
    print(c)
    print("+" * 10, "parser output")
    print_structure(p)

\|/\|/\|/\|/\|/\|/\|/\|/\|/\|/
import pickle
import numpy as np
++++++++++ parser output
: file_input
-: push
-: simple_stmt
--: push
--: import_name
---: push
---: keyword "import"
---: name "pickle"
--: pop
--: newline "
"
-: pop
-: import_name
--: push
--: keyword "import"
--: dotted_as_name
---: push
---: name "numpy"
---: keyword "as"
---: name "np"
--: pop
-: pop
-: endmarker ""
: pop
\|/\|/\|/\|/\|/\|/\|/\|/\|/\|/

folder = "some_folder"

with open(folder, "wb") as f:
    pickle.dump((feat, token2ind), f
++++++++++ parser output
: file_input
-: push
-: simple_stmt
--: push
--: expr_stmt
---: push
---: name "folder"
---: operator "="
---: string ""some_folder""
--: pop
--: newline "
"
-: pop
-: with_stmt
--: push
--: keyword "with"
--: with_item
---: push
---: atom_expr
----: push
----: name "open"
----: trailer
-----: push
-----: operator "("
-----: arglist
------: push
------: name "folder"
------: operator ","
------: string ""wb""
-----: pop
-----: operator ")"
----: pop
---:

In [None]:
# Persist the extracted sequences together with their vocabulary as a single
# pickled (feat, token2ind) tuple.
save_features = "location_to_save_features"
with open(save_features, mode="wb") as out_file:
    pickle.dump((feat, token2ind), out_file)