Skip to content

Commit

Permalink
Better config schema collector
Browse files Browse the repository at this point in the history
Current schema collector implementation has issues. When it extracts
option information from source files, it imports them into python
runtime. If imported module has third-party dependencies, they are
also imported thus making it a requirement to have third-party
libraries installed on the system where schema collection is
executed. Also, there are situations when option group guessing
is inaccurate.
New schema collector uses code analysis and rewriting techniques
to extract just enough code to collect configuration option data.
Then it evals it, causing configuration options to be added to
vanilla oslo.config subsystem. Then this information gets dumped.
Implementation uses various heuristics but is more accurate than
existing one. Was tested on 'nova' and 'cinder' projects.

Change-Id: I0bd2e93288478e7a3db273055597b7bedfbf0625
  • Loading branch information
Maxim Kulkin committed Nov 21, 2013
1 parent 4bbe6ff commit 5d9a7d3
Showing 1 changed file with 201 additions and 110 deletions.
311 changes: 201 additions & 110 deletions rubick/schemas/collector.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import argparse
from copy import copy
import imp
from lib2to3.pgen2 import driver
from lib2to3.pgen2 import token
from lib2to3.pygram import python_grammar, python_symbols as py
from lib2to3.pytree import Node, Leaf
import os
import re
import sys
Expand Down Expand Up @@ -187,10 +190,198 @@ def generate_schema_from_sample_config(project, version, config_file, writer):
OPTION_REGEX = re.compile(r"(%s)" % "|".join(OPT_TYPE_MAPPING.keys()))


def convert(gr, raw_node):
type, value, context, children = raw_node
# if has children or correspond to nonterminal
if children or type in gr.number2symbol:
return Node(type, children, context=context)
else:
return Leaf(type, value, context=context)


def walk_tree(root):
while True:
yield root

# Optimize traversing single-child nodes
if len(root.children) == 1:
root = root.children[0]
continue

break

for child in copy(root.children):
for node in walk_tree(child):
yield node


def extract_config_from_file(path):
with open(path) as f:
contents = f.read()

d = driver.Driver(python_grammar, convert=convert)
tree = d.parse_string(contents)

def mark_stmt(node):
n = node
while n:
if n.type == py.stmt:
n.marked = True
break
n = n.parent

fullnames = {}
# Process imports and renames
for node in walk_tree(tree):
if node.type == py.import_from:
mod = str(node.children[1]).strip()
for node2 in walk_tree(node.children[3]):
if node2.type == py.import_as_name:
n = str(node2).strip()
f = '.'.join([mod, n])
fullnames[n] = f
elif node.type == py.expr_stmt:
if len(node.children) > 1 and node.children[1].type == token.EQUAL:
lhs = str(node.children[0]).strip()
rhs = str(node.children[2]).strip()
if re.match('\S+(\.\S+)*', rhs):
parts = rhs.split('.')
if parts[0] in fullnames:
rhs = '.'.join([fullnames[parts[0]]] + parts[1:])
fullnames[lhs] = rhs

if any([rhs.startswith(s) for s in ['oslo.', 'oslo.config.', 'oslo.config.cfg.']]):
mark_stmt(node)

# Process all callsites CONF.register*
for node in walk_tree(tree):
if node.type == py.power and node.children[0].children[0].type == token.NAME:
s = str(node.children[0]).strip()
if s in fullnames:
s = fullnames[s]

cs = node.children
i = 1
while i < len(cs) and cs[i].type == py.trailer:
c = cs[i]
if c.children[0].type != token.DOT:
break

s += '.' + c.children[1].value
i += 1

if i < len(cs) and cs[i].type == py.trailer and cs[i].children[0].type == token.LPAR:
# call site
if s.startswith('oslo.config.cfg.CONF.'):
rest = s[len('oslo.config.cfg.CONF.'):]
if rest.startswith('register_'):
mark_stmt(node)

if s.startswith('oslo.config.cfg.'):
rest = s[len('oslo.config.cfg.'):]
if rest.endswith('Opt'):
mark_stmt(node)

# Traverse code and find all var references
seen_vars = set()
referenced_vars_queue = []

def find_definition(tree, name):
for node in walk_tree(tree):
if node.type == py.classdef and node.children[1].value == name:
return node
elif node.type == py.funcdef and node.children[1].value == name:
return node
elif node.type == py.import_name:
imported_name = str(node.children[1]).strip()
if imported_name == name:
return node
elif node.type == py.import_from:
for n in walk_tree(node):
if n.type == py.import_as_name:
i = 0
if len(n.children) == 3:
i = 2

if n.children[i].value == name:
return node
elif node.type == py.expr_stmt:
if len(node.children) > 1 and node.children[1].type == token.EQUAL:
for n in walk_tree(node):
if n.type == py.power:
assignment_name = str(n.children[0]).strip()
if assignment_name == name:
return node

return None

def collect_refs(root):
for n2 in walk_tree(root):
if n2.type == py.power and n2.children[0].children[0].type == token.NAME:
name = n2.children[0].children[0].value
x = 1
while (x < len(n2.children) and
n2.children[x].type == py.trailer and
n2.children[x].children[0].type == token.DOT):
name += str(n2.children[x]).strip()
x += 1

if '.' not in name:
isKWArgName = False
n = n2
while n.parent:
if n.parent.type == py.argument:
arg = n.parent
if len(arg.children) > 1 and arg.children[1].type == token.EQUAL and n == arg.children[0]:
isKWArgName = True
n = n.parent

if isKWArgName:
continue

if name in dir(__builtins__):
continue

if name not in seen_vars:
seen_vars.add(name)
referenced_vars_queue.append(name)

for node in tree.children:
if node.type == py.stmt and (hasattr(node, 'marked') and node.marked):
collect_refs(node)

for name in referenced_vars_queue:
node = find_definition(tree, name)
if node:
mark_stmt(node)
collect_refs(node)
else:
while '.' in name:
name = '.'.join(name.split('.')[:-1])
node = find_definition(tree, name)
if node:
mark_stmt(node)
collect_refs(node)

# Remove all unmarked top-level statements
for node in walk_tree(tree):
if node.type == py.stmt and node.parent.type == py.file_input:
if not (hasattr(node, 'marked') and node.marked):
node.remove()

code = str(tree)

try:
exec code in {'__file__': path}
except Exception:
sys.stderr.write("Error processing file %s\n" % path)
traceback.print_exc()
sys.stderr.write(code)


def generate_schema_from_code(project, version, module_path, writer):
old_sys_path = copy(sys.path)

mods_by_pkg = dict()
filepaths = []
module_directory = ''

Expand Down Expand Up @@ -223,119 +414,19 @@ def generate_schema_from_code(project, version, module_path, writer):
filepaths.append(module_path)

for filepath in filepaths:
pkg_name = filepath.split(os.sep)[1]
mod_path = filepath
if module_directory != '':
mod_path = filepath.replace(module_directory + '/', '', 1)
mod_str = '.'.join(['.'.join(mod_path.split(os.sep)[:-1]),
os.path.basename(mod_path).split('.')[0]])

mods_by_pkg.setdefault(pkg_name, list()).append(mod_str)

pkg_names = filter(lambda x: x.endswith('.py'), mods_by_pkg.keys())
pkg_names.sort()
ext_names = filter(lambda x: x not in pkg_names, mods_by_pkg.keys())
ext_names.sort()
pkg_names.extend(ext_names)

# opts_by_group is a mapping of group name to an options list
# The options list is a list of (module, options) tuples
opts_by_group = {'DEFAULT': []}

for pkg_name in pkg_names:
mods = mods_by_pkg.get(pkg_name)
mods.sort()
for mod_str in mods:
if mod_str.endswith('.__init__'):
mod_str = mod_str[:mod_str.rfind(".")]

mod_obj = _import_module(mod_str)
if not mod_obj:
sys.stderr.write("Unable to import module %s" % mod_str)

for group, opts in _list_opts(mod_obj):
opts_by_group.setdefault(group, []).append((mod_str, opts))

print_group_opts(writer, 'DEFAULT', opts_by_group.pop('DEFAULT', []))
for group, opts in opts_by_group.items():
print_group_opts(writer, group, opts)

sys.path = old_sys_path


def _import_module(mod_str):
try:
if mod_str.startswith('bin.'):
imp.load_source(mod_str[4:], os.path.join('bin', mod_str[4:]))
return sys.modules[mod_str[4:]]
else:
__import__(mod_str)
return sys.modules[mod_str]
except ImportError:
traceback.print_exc()
# sys.stderr.write("%s\n" % str(ie))
return None
except Exception:
traceback.print_exc()
return None

extract_config_from_file(filepath)

def _is_in_group(opt, group):
"Check if opt is in group."
for key, value in group._opts.items():
if value['opt'] == opt:
return True
return False
print_group_opts(writer, 'DEFAULT', cfg.CONF._opts.values())
for group_name in cfg.CONF._groups:
print_group_opts(writer, group_name, cfg.CONF._groups[group_name]._opts.values())


def _guess_groups(opt, mod_obj):
# is it in the DEFAULT group?
if _is_in_group(opt, cfg.CONF):
return 'DEFAULT'

# what other groups is it in?
for key, value in cfg.CONF.items():
if not isinstance(value, cfg.CONF.GroupAttr):
continue

if _is_in_group(opt, value._group):
return value._group.name

# raise RuntimeError(
# "Unable to find group for option %s, "
# "maybe it's defined twice in the same group?"
# % opt.name
# )

return 'DEFAULT'


def _list_opts(obj):
def is_opt(o):
return (isinstance(o, cfg.Opt) and
not isinstance(o, cfg.SubCommandOpt))

opts = list()
for attr_str in dir(obj):
attr_obj = getattr(obj, attr_str)
if is_opt(attr_obj):
opts.append(attr_obj)
elif (isinstance(attr_obj, list) and
all(map(lambda x: is_opt(x), attr_obj))):
opts.extend(attr_obj)

ret = {}
for opt in opts:
ret.setdefault(_guess_groups(opt, obj), []).append(opt)
return ret.items()
sys.path = old_sys_path


def print_group_opts(writer, group, opts_by_module):
def print_group_opts(writer, group, opts):
writer.section(group)
for mod, opts in opts_by_module:
writer.comment("Options defined in %s" % mod)
for opt in opts:
print_opt(writer, opt)
for opt in opts:
print_opt(writer, opt['opt'])


def print_opt(writer, opt):
Expand Down

0 comments on commit 5d9a7d3

Please sign in to comment.