In [27]:
# Before running this notebook, follow the steps to install the kiara examples:
# https://github.com/DHARPA-Project/kiara.examples

In [1]:
import os

import networkx
from kiara import KiaraAPI, Kiara
from networkx.readwrite import json_graph

In [2]:
# Obtain a Kiara context and wrap it in a KiaraAPI object; the cells below talk to
# kiara exclusively through `api`.
# NOTE(review): Kiara.instance() presumably returns the shared default context — confirm.
kiara = Kiara.instance()
api = KiaraAPI(kiara=kiara)

In [3]:
# version of Kiara used in this notebook
!pip show kiara

Name: kiara
Version: 0.4.20
Summary: Data-centric workflow orchestration.
Home-page: https://github.com/DHARPA-Project/kiara
Author: Markus Binsteiner
Author-email: markus@frkl.io
License: MPL-2.0
Location: /opt/miniconda3/envs/lineage_data/lib/python3.9/site-packages
Requires: airium, alembic, appdirs, bidict, black, click, dag-cbor, deepdiff, Deprecated, distro, dpath, filetype, humanfriendly, jinja2, jupytext, mistune, mkdocstrings, mmh3, multiformats, networkx, orjson, pp-ez, pydantic, python-dateutil, python-slugify, pyzmq, regex, rich, rich-click, ruamel.yaml, sortedcontainers, sqlalchemy, sqlalchemy-utc, sqlalchemy-utils, stevedore, structlog, textual, tzlocal
Required-by: kiara-plugin.core-types, kiara-plugin.playground, kiara-plugin.tabular


#### 1. Overview of the operations we will experiment with to create lineage data

- corpus onboarding
example corpus onboarding pipeline from https://github.com/DHARPA-Project/kiara.examples/blob/main/examples/pipelines/topic_modeling/corpus_onboarding.yaml 

In [4]:
# Look up the operation info for the text-file-bundle -> table import operation;
# as the cell's last expression, the result is displayed.
api.get_operation_info('import.table.from.text_file_bundle')

- text processing
example topic modeling pipeline from https://github.com/DHARPA-Project/kiara.examples/blob/main/examples/pipelines/topic_modeling/topic_modeling.yaml 

In [5]:
# Look up the operation info for the 'topic_modeling' operation;
# as the cell's last expression, the result is displayed.
# NOTE(review): presumably this is the topic-modeling pipeline from kiara.examples — confirm.
api.get_operation_info('topic_modeling')

#### 2. Lineage data for the onboarding step

- Data onboarding

In [14]:
# Local path to the example dataset, for example a checkout of
# https://github.com/DHARPA-Project/kiara.examples/tree/main/examples/data/text_corpus
# Set the KIARA_CORPUS_PATH environment variable to point at your own copy; the
# original hard-coded location is kept only as the fallback so existing runs behave
# exactly as before.
folder_path = os.environ.get(
    'KIARA_CORPUS_PATH',
    '/Users/mariella.decrouychan/Documents/GitHub/kiara_plugin.playground/examples/data/CI_newspaper_subcorpora',
)

In [15]:
# Choose a general alias tag for the current data preparation run. It is embedded in
# the aliases of values stored below (e.g. f'tm_{gen_alias}_onboard'), which makes
# data created in this session easy to spot in the data registry.
gen_alias = 'test3oct22'

In [16]:
# Run the "tm_onboarding" operation on the corpus folder and keep the job result;
# its "corpus_table" output is read in the next cell.
# NOTE(review): presumably "tm_onboarding" is the corpus-onboarding pipeline installed
# with kiara.examples — confirm the operation is registered before running.
onboarding_result = api.run_job(operation="tm_onboarding", inputs={'folder_path':folder_path})

In [17]:
# Take the "corpus_table" output of the onboarding job and store it under an alias
# derived from `gen_alias`. The store call is the last expression, so its result
# (including the alias that was assigned) is displayed.
table = onboarding_result["corpus_table"]
api.store_value(table, f'tm_{gen_alias}_onboard')

StoreValueResult(value=Value(id=efb83e08-ed00-48e8-b359-56b32881437e, type=table, status=set, initialized=True optional=False), aliases=['tm_test3oct22_onboard'], persisted_data=None, error=None)

In [18]:
# Check how the stored value appears from the kiara CLI: list the available aliases
# in the data store via a shell call.
! kiara data list


╭─ Available aliases ──────────────────────────────────────────────────────────╮
│                                                                              │
│  [1m [0m[1malias                [0m[1m [0m [1m [0m[1mtype [0m[1m [0m [1m [0m[1m    size[0m[1m [0m                                  │
│  ──────────────────────────────────────────                                  │
│   tm_test3oct22_onboard   table   16.06 MB                                   │
│                                                                              │
╰──────────────────────────────────────────────────────────────────────────────╯


- lineage data preparation

In [19]:
# Fetch the stored onboarding table back from the data store. The alias is rebuilt
# from `gen_alias` (same pattern as used when storing) instead of being hard-coded,
# so changing `gen_alias` can never make this cell read a stale value from an
# earlier run.
value = api.get_value(value=f'tm_{gen_alias}_onboard')

In [20]:
# `module_graph` is the lineage graph attached to the value; per the output below it
# contains 'value' and 'operation' nodes.
# node_link_data() serialises that graph into a plain dict in node-link form
# (with 'nodes' and 'links' entries).
graph = value.lineage.module_graph
result = json_graph.node_link_data(graph)

In [21]:
# Display the complete node-link dict of the lineage graph.
result

{'directed': True,
 'multigraph': False,
 'graph': {},
 'nodes': [{'data_type': 'table',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1,
   'id': 'value:efb83e08-ed00-48e8-b359-56b32881437e'},
  {'module_type': 'create.table',
   'module_config': {'constants': {},
    'defaults': {},
    'source_type': 'text_file_bundle',
    'target_type': 'table',
    'ignore_errors': False},
   'label': 'create.table',
   'node_type': 'operation',
   'level': 3,
   'id': 'module:zdpuAmNzF8e16HSzruEtEYa3p35f4uTP7dG6xTr8VZdHUTdVv'},
  {'module_type': 'import.file_bundle',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'import.file_bundle',
   'node_type': 'operation',
   'level': 5,
   'id': 'module:zdpuApZCg4Kvf7BzkNyJd3YZD8ih9kPdg7qPAbYr4D35Fj1VW'},
  {'label': 'path (string)',
   'node_type': 'value',
   'data_type': 'string',
   'data_type_config': {},
   'level': 6,
   'id': 'value:f4663d01-3c01-4d24-9be5-907f03410767'}],
 'links'

In [23]:
# Confirm that the serialised graph is a plain Python dict.
type(result)

dict

In [28]:
# Inspect the first node entry: the '[this value]' node, i.e. the stored table itself.
result['nodes'][0]

{'data_type': 'table',
 'label': '[this value]',
 'node_type': 'value',
 'data_type_config': {},
 'level': 1,
 'id': 'value:efb83e08-ed00-48e8-b359-56b32881437e'}

In [73]:
# View over the graph's nodes that yields (node_id, attribute_dict) pairs.
nodes = graph.nodes.data()
# Start from an empty mapping; it receives one summary entry per node.
augmented_nodes = {}

In [78]:
# Build a per-node summary keyed by position: the node id, a copy of its attributes,
# and the ids of its direct predecessors in the lineage graph (stored under "children").
# - The dict is rebuilt from scratch, so re-running the cell never leaves stale entries
#   and the cell does not depend on a separately initialised accumulator.
# - Attributes are shallow-copied because graph.nodes.data() hands out the graph's own
#   attribute dicts; edits to this summary must not write through to the graph.
augmented_nodes = {
    idx: {
        "id": node_id,
        "desc": dict(attrs),
        "children": list(graph.predecessors(node_id)),
    }
    for idx, (node_id, attrs) in enumerate(nodes)
}

In [79]:
augmented_nodes

{0: {'id': 'value:efb83e08-ed00-48e8-b359-56b32881437e',
  'desc': {'data_type': 'table',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1,
   'children': ['module:zdpuAmNzF8e16HSzruEtEYa3p35f4uTP7dG6xTr8VZdHUTdVv']},
  'children': ['module:zdpuAmNzF8e16HSzruEtEYa3p35f4uTP7dG6xTr8VZdHUTdVv']},
 1: {'id': 'module:zdpuAmNzF8e16HSzruEtEYa3p35f4uTP7dG6xTr8VZdHUTdVv',
  'desc': {'module_type': 'create.table',
   'module_config': {'constants': {},
    'defaults': {},
    'source_type': 'text_file_bundle',
    'target_type': 'table',
    'ignore_errors': False},
   'label': 'create.table',
   'node_type': 'operation',
   'level': 3},
  'children': ['module:zdpuApZCg4Kvf7BzkNyJd3YZD8ih9kPdg7qPAbYr4D35Fj1VW']},
 2: {'id': 'module:zdpuApZCg4Kvf7BzkNyJd3YZD8ih9kPdg7qPAbYr4D35Fj1VW',
  'desc': {'module_type': 'import.file_bundle',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'import.file_bundle',
   'node_type': 'operation',
   'le