In [1]:
# Before running this notebook, install the playground plugin (pip install https://github.com/DHARPA-Project/kiara_plugin.playground) to get access to the example pipelines.

In [2]:
from kiara import KiaraAPI, Kiara
import networkx
from networkx.readwrite import json_graph
import json
import os

In [3]:
# Notebook-wide configuration: every (global) variable the cells below rely on.

# working directory of the kernel; assumed to be the folder this notebook lives in
current_path = os.getcwd()

# folder (next to this notebook) that holds the pipeline descriptions
pipelines_folder = os.path.join(current_path, 'pipelines')
# the onboarding pipeline, defined in 'corpus_onboarding.yaml'
onboarding_pipeline = os.path.join(pipelines_folder, 'corpus_onboarding.yaml')
# the topic modeling pipeline, defined in 'topic_modeling.yaml'
topic_modeling_pipeline = os.path.join(pipelines_folder, 'topic_modeling.yaml')

# local path to the example dataset; the default is the `example_data/mini_corpus`
# folder next to this notebook. To use a different corpus, point the variable to
# another folder, e.g.:
# corpus_path = '/path/to/kiara_plugin.playground/examples/data/CI_newspaper_subcorpora'
corpus_path = os.path.join(current_path, 'example_data', 'mini_corpus')

# general alias fragment for the current data prep run (makes it easy to spot the
# values created by this notebook in the data registry)
gen_alias = 'test3oct22'

In [4]:
# obtain the KiaraAPI object via the class' `instance()` accessor; every later
# cell talks to kiara through this `api` variable
api = KiaraAPI.instance()

In [5]:
# version of Kiara used in this notebook
!pip show kiara

Name: kiara
Version: 0.4.20
Summary: Data-centric workflow orchestration.
Home-page: https://github.com/DHARPA-Project/kiara
Author: Markus Binsteiner
Author-email: markus@frkl.io
License: MPL-2.0
Location: /home/markus/.local/share/micromamba/envs/kiara-dev/lib/python3.10/site-packages
Requires: airium, alembic, appdirs, bidict, black, click, dag-cbor, deepdiff, Deprecated, distro, dpath, filetype, humanfriendly, jinja2, jupytext, mistune, mkdocstrings, mmh3, multiformats, networkx, orjson, pp-ez, pydantic, python-dateutil, python-slugify, pyzmq, regex, rich, rich-click, ruamel.yaml, sortedcontainers, sqlalchemy, sqlalchemy-utc, sqlalchemy-utils, stevedore, structlog, textual, tzlocal
Required-by: kiara-plugin.core-types, kiara-plugin.tabular


#### 1. Overview of the operations we will be experimenting on to create lineage data

- corpus onboarding
example corpus onboarding pipeline from https://github.com/DHARPA-Project/DHARPA-Project-viz-observable/blob/main/dag-lineage/pipelines/corpus_onboarding.yaml 

In [6]:
# build the kiara operation for the onboarding pipeline from its local yaml file
# (`allow_external=True` is presumably required because the pipeline is given as a
# file path rather than a registered operation id -- TODO confirm)
onboarding_op = api.get_operation(
    onboarding_pipeline,
    allow_external=True,
)
# last expression of the cell: render the operation
onboarding_op

- text processing
example topic modeling pipeline from https://github.com/DHARPA-Project/DHARPA-Project-viz-observable/blob/main/dag-lineage/pipelines/topic_modeling.yaml 

In [7]:
# build the kiara operation for the topic modeling pipeline from its local yaml file
# (`allow_external=True` is presumably required because the pipeline is given as a
# file path rather than a registered operation id -- TODO confirm)
topic_modeling_op = api.get_operation(
    topic_modeling_pipeline,
    allow_external=True,
)
# last expression of the cell: render the operation
topic_modeling_op

#### 2. Lineage data for the onboarding step

- Data onboarding

In [8]:
# the only input handed to the onboarding pipeline: the folder with the corpus files
onboarding_inputs = {'text_corpus_folder_path': corpus_path}
# run the pipeline; the result gives access to the pipeline outputs by field name
onboarding_result = api.run_job(operation=onboarding_op, inputs=onboarding_inputs)

In [9]:
# alias under which the onboarded table gets stored
onboard_alias = f'tm_{gen_alias}_onboard'
# pick the 'corpus_table' output of the onboarding job ...
table = onboarding_result["corpus_table"]
# ... and persist it under that alias (the store result is displayed as cell output)
api.store_value(table, onboard_alias)

StoreValueResult(value=Value(id=4853c2de-bb64-483c-8824-e441aa68d874, type=table, status=set, initialized=True optional=False), aliases=['tm_test3oct22_onboard'], persisted_data=PersistedData(model_id=zdpuAxPin6TsfMCtxHYfmPXujaY8AqcLgKG87vHrmfYKbvRbv, category=instance.persisted_data, fields=[data_type, data_type_config, serialization_profile, metadata, hash_codec, archive_id, chunk_id_map]), error=None)

In [10]:
# checking how that would appear in CLI
! kiara data list


╭─ Available aliases ──────────────────────────────────────────────────────────╮
│                                                                              │
│  [1m [0m[1malias                            [0m[1m [0m [1m [0m[1mtype [0m[1m [0m [1m [0m[1m     size[0m[1m [0m                     │
│  ───────────────────────────────────────────────────────                     │
│   tm_test3oct22_preprocessed_corpus   array   489.27 KB                      │
│   tm_test3oct22_onboard               table   300.77 KB                      │
│                                                                              │
╰──────────────────────────────────────────────────────────────────────────────╯


- lineage data preparation

In [11]:
# fetch the onboarded table back from the data store; the alias is rebuilt from
# `gen_alias` (instead of being hard-coded) so the lookup keeps matching the alias
# used when storing, even when `gen_alias` is changed in the configuration cell
corpus_table = api.get_value(value=f'tm_{gen_alias}_onboard')

In [12]:
# lineage graph of the onboarded table (its nodes are the values and operations
# the table was derived from)
graph = corpus_table.lineage.module_graph
# node-link (plain dict) representation of that graph, produced by networkx
result = json_graph.node_link_data(graph)

In [13]:
# display the node-link dict built from the lineage graph
result

{'directed': True,
 'multigraph': False,
 'graph': {},
 'nodes': [{'data_type': 'table',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1,
   'id': 'value:4853c2de-bb64-483c-8824-e441aa68d874'},
  {'module_type': 'table.merge',
   'module_config': {'constants': {},
    'defaults': {},
    'inputs_schema': {'source_table': {'type': 'table',
      'type_config': {},
      'default': '__not_set__',
      'optional': False,
      'is_constant': False,
      'doc': {'description': 'The original table.', 'doc': None}},
     'date_array': {'type': 'array',
      'type_config': {},
      'default': '__not_set__',
      'optional': False,
      'is_constant': False,
      'doc': {'description': 'The array containing the parsed date items.',
       'doc': None}}},
    'column_map': {'date': 'date_array',
     'content': 'source_table.content',
     'file_name': 'source_table.file_name'}},
   'label': 'table.merge',
   'node_type': 'operation',
   'lev

In [14]:
# container for the enriched node descriptions that get collected next
augmented_nodes = {}
# (node id, attribute dict) pairs of the lineage graph
nodes = graph.nodes.data()

- (message for Markus): cell below here is where the additional info would be needed

In [15]:
# Turn every node of the lineage graph into a small dict (id, attributes, ids of
# its predecessors) and collect those dicts by node position.
for position, (node_id, node_attrs) in enumerate(nodes):
    # print the raw attributes to make obvious what each node carries; what would
    # still be needed here is info about the input value itself (and not only its
    # type), and module info (doc)
    print(node_attrs)
    parent_ids = list(graph.predecessors(node_id))
    node_dict = {
        "id": node_id,
        "desc": node_attrs,
        "parentIds": parent_ids,
    }
    augmented_nodes[position] = node_dict
# show the entry that was built for the last node
node_dict

{'data_type': 'table', 'label': '[this value]', 'node_type': 'value', 'data_type_config': {}, 'level': 1}
{'module_type': 'table.merge', 'module_config': {'constants': {}, 'defaults': {}, 'inputs_schema': {'source_table': {'type': 'table', 'type_config': {}, 'default': '__not_set__', 'optional': False, 'is_constant': False, 'doc': {'description': 'The original table.', 'doc': None}}, 'date_array': {'type': 'array', 'type_config': {}, 'default': '__not_set__', 'optional': False, 'is_constant': False, 'doc': {'description': 'The array containing the parsed date items.', 'doc': None}}}, 'column_map': {'date': 'date_array', 'content': 'source_table.content', 'file_name': 'source_table.file_name'}}, 'label': 'table.merge', 'node_type': 'operation', 'level': 3}
{'module_type': 'parse.date_array', 'module_config': {'constants': {}, 'defaults': {}, 'add_inputs': True, 'input_fields': [], 'force_non_null': True, 'min_index': None, 'max_index': None, 'remove_tokens': []}, 'label': 'parse.date_ar

{'id': 'value:7c77b5c3-5aea-427a-85c4-c3d189e3298b',
 'desc': {'label': 'remove_tokens (list)',
  'node_type': 'value',
  'data_type': 'list',
  'data_type_config': {},
  'level': 6},
 'parentIds': []}

In [16]:
# display the collected node dicts (keyed by node position)
augmented_nodes

{0: {'id': 'value:4853c2de-bb64-483c-8824-e441aa68d874',
  'desc': {'data_type': 'table',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1},
  'parentIds': ['module:zdpuAmPsaKJ1FXFYQQUANyGmpQwBWnjrxVGZaMdpZVbW6o5dv']},
 1: {'id': 'module:zdpuAmPsaKJ1FXFYQQUANyGmpQwBWnjrxVGZaMdpZVbW6o5dv',
  'desc': {'module_type': 'table.merge',
   'module_config': {'constants': {},
    'defaults': {},
    'inputs_schema': {'source_table': {'type': 'table',
      'type_config': {},
      'default': '__not_set__',
      'optional': False,
      'is_constant': False,
      'doc': {'description': 'The original table.', 'doc': None}},
     'date_array': {'type': 'array',
      'type_config': {},
      'default': '__not_set__',
      'optional': False,
      'is_constant': False,
      'doc': {'description': 'The array containing the parsed date items.',
       'doc': None}}},
    'column_map': {'date': 'date_array',
     'content': 'source_table.content',
     '

In [17]:
# uncomment to export dataset for the viz

# res = json.dumps(augmented_nodes)
# with open("test_data.json", "w") as outfile:
#     outfile.write(res)

#### 3. Lineage data for nlp step

- running the example TM pipeline with previously onboarded data

In [18]:
# the previously onboarded corpus table is the only input handed to the pipeline
topic_modeling_inputs = {'corpus': corpus_table}
# run the topic modeling pipeline; outputs are available by field name on the result
nlp_step = api.run_job(operation=topic_modeling_op, inputs=topic_modeling_inputs)

In [19]:
# alias under which the preprocessed corpus gets stored
preprocessed_alias = f'tm_{gen_alias}_preprocessed_corpus'
# pick the 'preprocessed_corpus' output of the topic modeling job ...
preprocessed_corpus = nlp_step["preprocessed_corpus"]
# ... and persist it under that alias (the store result is displayed as cell output)
api.store_value(preprocessed_corpus, preprocessed_alias)

StoreValueResult(value=Value(id=fbe99dd9-7a65-4f99-9f4a-1e613cf5b9f1, type=array, status=set, initialized=True optional=False), aliases=['tm_test3oct22_preprocessed_corpus'], persisted_data=PersistedData(model_id=zdpuAnqZ1wH2BTnVWkBNaHsctiFsUMxWDi1Y5Kzo7L8odbHsE, category=instance.persisted_data, fields=[data_type, data_type_config, serialization_profile, metadata, hash_codec, archive_id, chunk_id_map]), error=None)

In [20]:
# list the stored values through the kiara command line
!kiara data list


╭─ Available aliases ──────────────────────────────────────────────────────────╮
│                                                                              │
│  [1m [0m[1malias                            [0m[1m [0m [1m [0m[1mtype [0m[1m [0m [1m [0m[1m     size[0m[1m [0m                     │
│  ───────────────────────────────────────────────────────                     │
│   tm_test3oct22_onboard               table   300.77 KB                      │
│   tm_test3oct22_preprocessed_corpus   array   489.27 KB                      │
│                                                                              │
╰──────────────────────────────────────────────────────────────────────────────╯


- lineage data preparation

In [21]:
# this is not really necessary, since the 'preprocessed_corpus' variable still holds that value
# (the alias is rebuilt from `gen_alias` instead of being hard-coded, so the lookup
# keeps matching the alias used when storing, even when `gen_alias` is changed)
preprocessed_corpus = api.get_value(value=f'tm_{gen_alias}_preprocessed_corpus')

In [22]:
# lineage graph of the preprocessed corpus (its nodes are the values and operations
# the corpus was derived from)
# NOTE: this re-binds the `graph` / `result` names used for the onboarding lineage
graph = preprocessed_corpus.lineage.module_graph
# node-link (plain dict) representation of that graph, produced by networkx
result = json_graph.node_link_data(graph)

In [23]:
# display the node-link dict built from the lineage graph
result

{'directed': True,
 'multigraph': False,
 'graph': {},
 'nodes': [{'data_type': 'array',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1,
   'id': 'value:fbe99dd9-7a65-4f99-9f4a-1e613cf5b9f1'},
  {'module_type': 'preprocess.tokens_array',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'preprocess.tokens_array',
   'node_type': 'operation',
   'level': 3,
   'id': 'module:zdpuAm4yHZ2hrjekacL8tU9afs5zdGFf3qfyuLcPKdgvW6W6z'},
  {'label': 'to_lowercase (boolean)',
   'node_type': 'value',
   'data_type': 'boolean',
   'data_type_config': {},
   'level': 4,
   'id': 'value:2d4b63fb-36bc-48c8-9a36-39bd22ae52d4'},
  {'label': 'remove_short_tokens (integer)',
   'node_type': 'value',
   'data_type': 'integer',
   'data_type_config': {},
   'level': 4,
   'id': 'value:cc99a2d0-4862-4c57-8dbd-1867ce0ead56'},
  {'module_type': 'create.stopwords_list',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'create.stopwo

In [24]:
# fresh container for the enriched node descriptions that get collected next
augmented_nodes = {}
# (node id, attribute dict) pairs of the lineage graph
nodes = graph.nodes.data()

- (message for Markus): cell below here is where the additional info would be needed

In [25]:
# Turn every node of the lineage graph into a small dict (id, attributes, ids of
# its predecessors) and collect those dicts by node position.
for position, (node_id, node_attrs) in enumerate(nodes):
    # what would still be needed here is info about the input value itself (and
    # not only its type), and module info (doc)
    parent_ids = list(graph.predecessors(node_id))
    node_dict = {
        "id": node_id,
        "desc": node_attrs,
        "parentIds": parent_ids,
    }
    augmented_nodes[position] = node_dict

In [26]:
# display the collected node dicts (keyed by node position)
augmented_nodes

{0: {'id': 'value:fbe99dd9-7a65-4f99-9f4a-1e613cf5b9f1',
  'desc': {'data_type': 'array',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1},
  'parentIds': ['module:zdpuAm4yHZ2hrjekacL8tU9afs5zdGFf3qfyuLcPKdgvW6W6z']},
 1: {'id': 'module:zdpuAm4yHZ2hrjekacL8tU9afs5zdGFf3qfyuLcPKdgvW6W6z',
  'desc': {'module_type': 'preprocess.tokens_array',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'preprocess.tokens_array',
   'node_type': 'operation',
   'level': 3},
  'parentIds': ['value:2d4b63fb-36bc-48c8-9a36-39bd22ae52d4',
   'value:cc99a2d0-4862-4c57-8dbd-1867ce0ead56',
   'module:zdpuB2qxnw2NwATaz7PJjN97hQhPPXESFrLt3EPAznJx8wQMK',
   'module:zdpuAsx6Q8GCKxEXMhkfPmjKvje7Pi7GrZvR7URsph26fY8Gn']},
 2: {'id': 'value:2d4b63fb-36bc-48c8-9a36-39bd22ae52d4',
  'desc': {'label': 'to_lowercase (boolean)',
   'node_type': 'value',
   'data_type': 'boolean',
   'data_type_config': {},
   'level': 4},
  'parentIds': []},
 3: {'id': 'valu

In [27]:
# uncomment to export dataset for the viz

# res = json.dumps(augmented_nodes)
# with open("test_data2.json", "w") as outfile:
#     outfile.write(res)