In [1]:
import yaml
import itertools

In [2]:
with open('dvc.yaml') as dvc_yaml_file:
    dvc = yaml.load(dvc_yaml_file)

  dvc = yaml.load(dvc_yaml_file)


In [3]:
stages = list(dvc['stages'].keys())

In [4]:
deps = {stage: dvc['stages'][stage]['deps'] for stage in stages}

In [5]:
outs = {stage: dvc['stages'][stage]['outs'] for stage in stages}

In [6]:
# v -> u means v needs to be run before u
children = lambda parent: [stage for stage in stages if any(out in deps[stage] for out in outs[parent])]
dep_graph = {parent: children(parent) for parent in stages} 

In [7]:
dep_graph

{'subsample': ['validate-column-names'],
 'validate-column-names': ['nullify'],
 'nullify': ['guess-schema', 'ignore', 'assemble-database'],
 'guess-schema': ['cgpm-schema',
  'ignore',
  'numericalize',
  'save-linear-stats',
  'xcat-complete-import',
  'qc-dashboard-spec',
  'qc-splom-spec',
  'predict',
  'prepare-publish'],
 'cgpm-schema': ['cgpm-generate-metadata'],
 'ignore': ['numericalize',
  'save-linear-stats',
  'ast-export',
  'sppl-sample',
  'qc-tag-samples',
  'xcat-complete-import',
  'predict'],
 'numericalize': ['cgpm-generate-metadata',
  'cgpm-infer-hyperparameters',
  'cgpm-infer-hyperparameters-refined',
  'xcat-complete-import'],
 'cgpm-generate-metadata': ['cgpm-infer-hyperparameters'],
 'cgpm-infer-hyperparameters': ['cgpm-infer-hyperparameters-refined',
  'save-dependencies',
  'save-max-number-views',
  'ast-export',
  'xcat-complete-import'],
 'cgpm-infer-hyperparameters-refined': [],
 'save-dependencies': ['dep-prob-vl',
  'linear-stats-vl',
  'compare-dep-

In [8]:
qc_stages = list(filter(lambda stage: stage[:2] == 'qc', stages))

In [9]:
def downstream(digraph, node):
    return set.union(set([node]), *(downstream(digraph, child) for child in digraph[node]))

is_child = lambda digraph, node, other_node: node in digraph[other_node]

def upstream(digraph, node):
    parents = filter(lambda other_node: is_child(digraph, node, other_node), digraph.keys())
    return set.union(set([node]), *(upstream(digraph, parent) for parent in parents))

In [10]:
def toposort(digraph, nodes):
    unsorted, sorted = nodes.copy(), []
    while unsorted:
        sink = unsorted[0]
        while any(node in digraph[sink] for node in unsorted):
            sink = next(node for node in unsorted if node in digraph[sink])
        sorted.append(sink)
        unsorted.remove(sink)
    return list(reversed(sorted))

In [11]:
qc_upstream = set.union(*(upstream(dep_graph, qc_stage) for qc_stage in qc_stages))

In [12]:
cgpm_downstream = downstream(dep_graph, 'cgpm-infer-hyperparameters')

In [13]:
relevant_stages = list(set.intersection(qc_upstream, cgpm_downstream))

In [14]:
toposort(dep_graph, relevant_stages)

['cgpm-infer-hyperparameters',
 'save-max-number-views',
 'ast-export',
 'sppl-import',
 'sppl-merge',
 'sppl-sample',
 'qc-tag-samples',
 'qc-dashboard-spec',
 'qc-dashboard-app',
 'qc-splom-spec',
 'qc-splom-app']

In [18]:
toposort(dep_graph, list(upstream(dep_graph, 'qc-dashboard-spec')))

['subsample',
 'validate-column-names',
 'nullify',
 'guess-schema',
 'cgpm-schema',
 'ignore',
 'numericalize',
 'cgpm-generate-metadata',
 'cgpm-infer-hyperparameters',
 'save-max-number-views',
 'ast-export',
 'sppl-import',
 'sppl-merge',
 'save-linear-stats',
 'sppl-sample',
 'qc-tag-samples',
 'qc-dashboard-spec']