Better config schema collector

The current schema collector implementation has several issues. To extract option information from source files, it imports them into the Python runtime. If an imported module has third-party dependencies, those are imported as well, which means the third-party libraries must be installed on the system where schema collection is executed. There are also situations in which option group guessing is inaccurate. The new schema collector uses code analysis and rewriting techniques to extract just enough code to collect the configuration option data. It then executes that code, which causes the configuration options to be added to the vanilla oslo.config subsystem, and finally dumps the resulting information. The implementation uses various heuristics but is more accurate than the existing one. It was tested on the 'nova' and 'cinder' projects. Change-Id: I0bd2e93288478e7a3db273055597b7bedfbf0625
stackforge · Nov 21, 2013 · 5d9a7d3 · 5d9a7d3
1 parent 4bbe6ff
commit 5d9a7d3
Showing 1 changed file with 201 additions and 110 deletions.
diff --git a/rubick/schemas/collector.py b/rubick/schemas/collector.py
@@ -1,6 +1,9 @@
 import argparse
 from copy import copy
-import imp
+from lib2to3.pgen2 import driver
+from lib2to3.pgen2 import token
+from lib2to3.pygram import python_grammar, python_symbols as py
+from lib2to3.pytree import Node, Leaf
 import os
 import re
 import sys
@@ -187,10 +190,198 @@ def generate_schema_from_sample_config(project, version, config_file, writer):
 OPTION_REGEX = re.compile(r"(%s)" % "|".join(OPT_TYPE_MAPPING.keys()))
 
 
+def convert(gr, raw_node):
+    """Turn a raw parser node into a lib2to3 ``Node`` or ``Leaf``.
+
+    Used as the ``convert`` callback of ``driver.Driver`` in
+    ``extract_config_from_file``.
+
+    :param gr: grammar object; its ``number2symbol`` mapping is consulted
+        to decide whether ``type`` names a grammar symbol
+    :param raw_node: 4-tuple ``(type, value, context, children)``
+    :returns: a ``Node`` when the raw node has children or its type is a
+        grammar symbol, otherwise a ``Leaf`` carrying ``value``
+    """
+    type, value, context, children = raw_node
+    # Anything that has children, or whose type is a grammar symbol
+    # (nonterminal), becomes an inner Node; everything else is a Leaf.
+    if children or type in gr.number2symbol:
+        return Node(type, children, context=context)
+    else:
+        return Leaf(type, value, context=context)
+
+
+def walk_tree(root):
+    """Generate ``root`` followed by all of its descendants, parents first.
+
+    A run of nodes that each have exactly one child is followed in a loop
+    instead of by recursion. Once a node with zero or several children is
+    reached, each of its children is walked recursively. The child list is
+    copied before it is iterated, so changes made to ``children`` while the
+    generator is suspended do not alter which children are visited.
+
+    :param root: tree node exposing a ``children`` sequence
+    """
+    while True:
+        yield root
+
+        # Optimize traversing single-child nodes
+        if len(root.children) == 1:
+            root = root.children[0]
+            continue
+
+        break
+
+    # Iterate over a snapshot of the children (see docstring).
+    for child in copy(root.children):
+        for node in walk_tree(child):
+            yield node
+
+
+def extract_config_from_file(path):
+    with open(path) as f:
+        contents = f.read()
+
+    d = driver.Driver(python_grammar, convert=convert)
+    tree = d.parse_string(contents)
+
+    def mark_stmt(node):
+        n = node
+        while n:
+            if n.type == py.stmt:
+                n.marked = True
+                break
+            n = n.parent
+
+    fullnames = {}
+    # Process imports and renames
+    for node in walk_tree(tree):
+        if node.type == py.import_from:
+            mod = str(node.children[1]).strip()
+            for node2 in walk_tree(node.children[3]):
+                if node2.type == py.import_as_name:
+                    n = str(node2).strip()
+                    f = '.'.join([mod, n])
+                    fullnames[n] = f
+        elif node.type == py.expr_stmt:
+            if len(node.children) > 1 and node.children[1].type == token.EQUAL:
+                lhs = str(node.children[0]).strip()
+                rhs = str(node.children[2]).strip()
+                if re.match('\S+(\.\S+)*', rhs):
+                    parts = rhs.split('.')
+                    if parts[0] in fullnames:
+                        rhs = '.'.join([fullnames[parts[0]]] + parts[1:])
+                        fullnames[lhs] = rhs
+
+                        if any([rhs.startswith(s) for s in ['oslo.', 'oslo.config.', 'oslo.config.cfg.']]):
+                            mark_stmt(node)
+
+    # Process all callsites CONF.register*
+    for node in walk_tree(tree):
+        if node.type == py.power and node.children[0].children[0].type == token.NAME:
+            s = str(node.children[0]).strip()
+            if s in fullnames:
+                s = fullnames[s]
+
+            cs = node.children
+            i = 1
+            while i < len(cs) and cs[i].type == py.trailer:
+                c = cs[i]
+                if c.children[0].type != token.DOT:
+                    break
+
+                s += '.' + c.children[1].value
+                i += 1
+
+            if i < len(cs) and cs[i].type == py.trailer and cs[i].children[0].type == token.LPAR:
+                # call site
+                if s.startswith('oslo.config.cfg.CONF.'):
+                    rest = s[len('oslo.config.cfg.CONF.'):]
+                    if rest.startswith('register_'):
+                        mark_stmt(node)
+
+                if s.startswith('oslo.config.cfg.'):
+                    rest = s[len('oslo.config.cfg.'):]
+                    if rest.endswith('Opt'):
+                        mark_stmt(node)
+
+    # Traverse code and find all var references
+    seen_vars = set()
+    referenced_vars_queue = []
+
+    def find_definition(tree, name):
+        for node in walk_tree(tree):
+            if node.type == py.classdef and node.children[1].value == name:
+                return node
+            elif node.type == py.funcdef and node.children[1].value == name:
+                return node
+            elif node.type == py.import_name:
+                imported_name = str(node.children[1]).strip()
+                if imported_name == name:
+                    return node
+            elif node.type == py.import_from:
+                for n in walk_tree(node):
+                    if n.type == py.import_as_name:
+                        i = 0
+                        if len(n.children) == 3:
+                            i = 2
+
+                        if n.children[i].value == name:
+                            return node
+            elif node.type == py.expr_stmt:
+                if len(node.children) > 1 and node.children[1].type == token.EQUAL:
+                    for n in walk_tree(node):
+                        if n.type == py.power:
+                            assignment_name = str(n.children[0]).strip()
+                            if assignment_name == name:
+                                return node
+
+        return None
+
+    def collect_refs(root):
+        for n2 in walk_tree(root):
+            if n2.type == py.power and n2.children[0].children[0].type == token.NAME:
+                name = n2.children[0].children[0].value
+                x = 1
+                while (x < len(n2.children) and
+                       n2.children[x].type == py.trailer and
+                       n2.children[x].children[0].type == token.DOT):
+                    name += str(n2.children[x]).strip()
+                    x += 1
+
+                if '.' not in name:
+                    isKWArgName = False
+                    n = n2
+                    while n.parent:
+                        if n.parent.type == py.argument:
+                            arg = n.parent
+                            if len(arg.children) > 1 and arg.children[1].type == token.EQUAL and n == arg.children[0]:
+                                isKWArgName = True
+                        n = n.parent
+
+                    if isKWArgName:
+                        continue
+
+                    if name in dir(__builtins__):
+                        continue
+
+                if name not in seen_vars:
+                    seen_vars.add(name)
+                    referenced_vars_queue.append(name)
+
+    for node in tree.children:
+        if node.type == py.stmt and (hasattr(node, 'marked') and node.marked):
+            collect_refs(node)
+
+    for name in referenced_vars_queue:
+        node = find_definition(tree, name)
+        if node:
+            mark_stmt(node)
+            collect_refs(node)
+        else:
+            while '.' in name:
+                name = '.'.join(name.split('.')[:-1])
+                node = find_definition(tree, name)
+                if node:
+                    mark_stmt(node)
+                    collect_refs(node)
+
+    # Remove all unmarked top-level statements
+    for node in walk_tree(tree):
+        if node.type == py.stmt and node.parent.type == py.file_input:
+            if not (hasattr(node, 'marked') and node.marked):
+                node.remove()
+
+    code = str(tree)
+
+    try:
+        exec code in {'__file__': path}
+    except Exception:
+        sys.stderr.write("Error processing file %s\n" % path)
+        traceback.print_exc()
+        sys.stderr.write(code)
+
+
 def generate_schema_from_code(project, version, module_path, writer):
     old_sys_path = copy(sys.path)
 
-    mods_by_pkg = dict()
     filepaths = []
     module_directory = ''
 
@@ -223,119 +414,19 @@ def generate_schema_from_code(project, version, module_path, writer):
         filepaths.append(module_path)
 
     for filepath in filepaths:
-        pkg_name = filepath.split(os.sep)[1]
-        mod_path = filepath
-        if module_directory != '':
-            mod_path = filepath.replace(module_directory + '/', '', 1)
-        mod_str = '.'.join(['.'.join(mod_path.split(os.sep)[:-1]),
-                           os.path.basename(mod_path).split('.')[0]])
-
-        mods_by_pkg.setdefault(pkg_name, list()).append(mod_str)
-
-    pkg_names = filter(lambda x: x.endswith('.py'), mods_by_pkg.keys())
-    pkg_names.sort()
-    ext_names = filter(lambda x: x not in pkg_names, mods_by_pkg.keys())
-    ext_names.sort()
-    pkg_names.extend(ext_names)
-
-    # opts_by_group is a mapping of group name to an options list
-    # The options list is a list of (module, options) tuples
-    opts_by_group = {'DEFAULT': []}
-
-    for pkg_name in pkg_names:
-        mods = mods_by_pkg.get(pkg_name)
-        mods.sort()
-        for mod_str in mods:
-            if mod_str.endswith('.__init__'):
-                mod_str = mod_str[:mod_str.rfind(".")]
-
-            mod_obj = _import_module(mod_str)
-            if not mod_obj:
-                sys.stderr.write("Unable to import module %s" % mod_str)
-
-            for group, opts in _list_opts(mod_obj):
-                opts_by_group.setdefault(group, []).append((mod_str, opts))
-
-    print_group_opts(writer, 'DEFAULT', opts_by_group.pop('DEFAULT', []))
-    for group, opts in opts_by_group.items():
-        print_group_opts(writer, group, opts)
-
-    sys.path = old_sys_path
-
-
-def _import_module(mod_str):
-    try:
-        if mod_str.startswith('bin.'):
-            imp.load_source(mod_str[4:], os.path.join('bin', mod_str[4:]))
-            return sys.modules[mod_str[4:]]
-        else:
-            __import__(mod_str)
-            return sys.modules[mod_str]
-    except ImportError:
-        traceback.print_exc()
-        # sys.stderr.write("%s\n" % str(ie))
-        return None
-    except Exception:
-        traceback.print_exc()
-        return None
-
+        extract_config_from_file(filepath)
 
-def _is_in_group(opt, group):
-    "Check if opt is in group."
-    for key, value in group._opts.items():
-        if value['opt'] == opt:
-            return True
-    return False
+    print_group_opts(writer, 'DEFAULT', cfg.CONF._opts.values())
+    for group_name in cfg.CONF._groups:
+        print_group_opts(writer, group_name, cfg.CONF._groups[group_name]._opts.values())
 
-
-def _guess_groups(opt, mod_obj):
-    # is it in the DEFAULT group?
-    if _is_in_group(opt, cfg.CONF):
-        return 'DEFAULT'
-
-    # what other groups is it in?
-    for key, value in cfg.CONF.items():
-        if not isinstance(value, cfg.CONF.GroupAttr):
-            continue
-
-        if _is_in_group(opt, value._group):
-            return value._group.name
-
-    # raise RuntimeError(
-    #     "Unable to find group for option %s, "
-    #     "maybe it's defined twice in the same group?"
-    #     % opt.name
-    # )
-
-    return 'DEFAULT'
-
-
-def _list_opts(obj):
-    def is_opt(o):
-        return (isinstance(o, cfg.Opt) and
-                not isinstance(o, cfg.SubCommandOpt))
-
-    opts = list()
-    for attr_str in dir(obj):
-        attr_obj = getattr(obj, attr_str)
-        if is_opt(attr_obj):
-            opts.append(attr_obj)
-        elif (isinstance(attr_obj, list) and
-              all(map(lambda x: is_opt(x), attr_obj))):
-            opts.extend(attr_obj)
-
-    ret = {}
-    for opt in opts:
-        ret.setdefault(_guess_groups(opt, obj), []).append(opt)
-    return ret.items()
+    sys.path = old_sys_path
 
 
-def print_group_opts(writer, group, opts_by_module):
+def print_group_opts(writer, group, opts):
     writer.section(group)
-    for mod, opts in opts_by_module:
-        writer.comment("Options defined in %s" % mod)
-        for opt in opts:
-            print_opt(writer, opt)
+    for opt in opts:
+        print_opt(writer, opt['opt'])
 
 
 def print_opt(writer, opt):