In [50]:
# Before running this notebook, install the playground plugin to get access to the example pipelines:
#   pip install git+https://github.com/DHARPA-Project/kiara_plugin.playground

In [48]:
import json
import os

import networkx
from kiara import KiaraAPI, Kiara
from networkx.readwrite import json_graph

In [25]:
# Obtain a Kiara instance and wrap it in the KiaraAPI object that the cells below use.
kiara = Kiara.instance()
api = KiaraAPI(kiara=kiara)

In [26]:
# version of Kiara used in this notebook
!pip show kiara

Name: kiara
Version: 0.4.20
Summary: Data-centric workflow orchestration.
Home-page: https://github.com/DHARPA-Project/kiara
Author: Markus Binsteiner
Author-email: markus@frkl.io
License: MPL-2.0
Location: /opt/miniconda3/envs/lineage_data/lib/python3.9/site-packages
Requires: airium, alembic, appdirs, bidict, black, click, dag-cbor, deepdiff, Deprecated, distro, dpath, filetype, humanfriendly, jinja2, jupytext, mistune, mkdocstrings, mmh3, multiformats, networkx, orjson, pp-ez, pydantic, python-dateutil, python-slugify, pyzmq, regex, rich, rich-click, ruamel.yaml, sortedcontainers, sqlalchemy, sqlalchemy-utc, sqlalchemy-utils, stevedore, structlog, textual, tzlocal
Required-by: kiara-plugin.core-types, kiara-plugin.playground, kiara-plugin.tabular


#### 1. Overview of the operations we will be experimenting on to create lineage data

- corpus onboarding
example corpus onboarding pipeline from https://github.com/DHARPA-Project/kiara.examples/blob/main/examples/pipelines/topic_modeling/corpus_onboarding.yaml 

In [27]:
# Display kiara's info for the 'import.table.from.text_file_bundle' operation.
api.get_operation_info('import.table.from.text_file_bundle')

- text processing
example topic modeling pipeline from https://github.com/DHARPA-Project/kiara.examples/blob/main/examples/pipelines/topic_modeling/topic_modeling.yaml 

In [28]:
# Display kiara's info for the 'topic_modeling' operation.
api.get_operation_info('topic_modeling')

#### 2. Lineage data for the onboarding step

- Data onboarding

In [29]:
# Local path to the example dataset, for example a checkout of:
# https://github.com/DHARPA-Project/kiara.examples/tree/main/examples/data/text_corpus
# Set the KIARA_CORPUS_FOLDER environment variable to point at your own copy.
# When it is unset or empty, the original local path is used, so existing runs are unchanged.
folder_path = (
    os.environ.get('KIARA_CORPUS_FOLDER')
    or '/Users/mariella.decrouychan/Documents/GitHub/kiara_plugin.playground/examples/data/CI_newspaper_subcorpora'
)

In [30]:
# Choose a general alias for the current data preparation. It is embedded in the
# aliases under which values are stored below (f'tm_{gen_alias}_...'), so data
# created in this run is easy to spot in the data registry.
gen_alias = 'test3oct22'

In [31]:
# Run the "tm_onboarding" operation with the corpus folder as its 'folder_path' input.
onboarding_result = api.run_job(operation="tm_onboarding", inputs={'folder_path':folder_path})

In [32]:
# Take the "corpus_table" output of the onboarding job and store it under an
# alias derived from gen_alias.
table = onboarding_result["corpus_table"]
api.store_value(table, f'tm_{gen_alias}_onboard')

StoreValueResult(value=Value(id=552718d6-4f10-400f-b7a9-5f6790de6333, type=table, status=set, initialized=True optional=False), aliases=['tm_test3oct22_onboard'], persisted_data=None, error=None)

In [33]:
# Check how the stored value appears in the CLI: the command lists the available aliases.
! kiara data list


╭─ Available aliases ──────────────────────────────────────────────────────────╮
│                                                                              │
│  [1m [0m[1malias                [0m[1m [0m [1m [0m[1mtype [0m[1m [0m [1m [0m[1m    size[0m[1m [0m                                  │
│  ──────────────────────────────────────────                                  │
│   tm_test3oct22_onboard   table   16.06 MB                                   │
│                                                                              │
╰──────────────────────────────────────────────────────────────────────────────╯


- lineage data preparation

In [34]:
# Look up the stored table by the alias it was saved under. The alias is derived
# from gen_alias (instead of being hard-coded) so this cell keeps working when
# gen_alias is changed.
value = api.get_value(value=f'tm_{gen_alias}_onboard')

In [35]:
# Take the module graph from the value's lineage and convert it to networkx
# node-link data for inspection.
graph = value.lineage.module_graph
result = json_graph.node_link_data(graph)

In [36]:
# Display the node-link representation of the lineage graph.
result

{'directed': True,
 'multigraph': False,
 'graph': {},
 'nodes': [{'data_type': 'table',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1,
   'id': 'value:552718d6-4f10-400f-b7a9-5f6790de6333'},
  {'module_type': 'create.table',
   'module_config': {'constants': {},
    'defaults': {},
    'source_type': 'text_file_bundle',
    'target_type': 'table',
    'ignore_errors': False},
   'label': 'create.table',
   'node_type': 'operation',
   'level': 3,
   'id': 'module:zdpuAqGye5Jd79Ae2t7pa5Q2bz4H7PwMoSVyfY6vXnpL5z8Xx'},
  {'module_type': 'import.file_bundle',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'import.file_bundle',
   'node_type': 'operation',
   'level': 5,
   'id': 'module:zdpuApZCg4Kvf7BzkNyJd3YZD8ih9kPdg7qPAbYr4D35Fj1VW'},
  {'label': 'path (string)',
   'node_type': 'value',
   'data_type': 'string',
   'data_type_config': {},
   'level': 6,
   'id': 'value:f4663d01-3c01-4d24-9be5-907f03410767'}],
 'links'

In [39]:
# Empty container that the next cell fills with one entry per node, and the
# lineage graph's nodes paired with their attribute dicts.
augmented_nodes = {}
nodes = graph.nodes(data=True)

- **Note for Markus:** the cell below is where the additional information (input values and module documentation) would be needed.

In [44]:
for idx, (node_id, node_attrs) in enumerate(nodes):
    # Print the attributes so it is obvious what each node currently contains.
    # Still missing here: info about the input value itself (not only its type)
    # and module info (doc).
    print(node_attrs)
    # One entry per node: its id, its attribute dict, and the ids of its predecessors.
    augmented_nodes[idx] = {
        "id": node_id,
        "desc": node_attrs,
        "parentIds": list(graph.predecessors(node_id)),
    }

In [45]:
# Display the augmented node entries (id, desc, parentIds) built in the previous cell.
augmented_nodes

{0: {'id': 'value:552718d6-4f10-400f-b7a9-5f6790de6333',
  'desc': {'data_type': 'table',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1},
  'parentIds': ['module:zdpuAqGye5Jd79Ae2t7pa5Q2bz4H7PwMoSVyfY6vXnpL5z8Xx']},
 1: {'id': 'module:zdpuAqGye5Jd79Ae2t7pa5Q2bz4H7PwMoSVyfY6vXnpL5z8Xx',
  'desc': {'module_type': 'create.table',
   'module_config': {'constants': {},
    'defaults': {},
    'source_type': 'text_file_bundle',
    'target_type': 'table',
    'ignore_errors': False},
   'label': 'create.table',
   'node_type': 'operation',
   'level': 3},
  'parentIds': ['module:zdpuApZCg4Kvf7BzkNyJd3YZD8ih9kPdg7qPAbYr4D35Fj1VW']},
 2: {'id': 'module:zdpuApZCg4Kvf7BzkNyJd3YZD8ih9kPdg7qPAbYr4D35Fj1VW',
  'desc': {'module_type': 'import.file_bundle',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'import.file_bundle',
   'node_type': 'operation',
   'level': 5},
  'parentIds': ['value:f4663d01-3c01-4d24-9be5-907f03410767']},
 

In [47]:
# uncomment to export dataset for the viz

# res = json.dumps(augmented_nodes)
# with open("test_data.json", "w") as outfile:
#     outfile.write(res)

#### 3. Lineage data for nlp step

- running the example topic modeling (TM) pipeline on the same corpus folder that was used for the onboarding step

In [51]:
# Run the "topic_modeling" operation, passing the corpus folder path as its
# 'text_corpus_folder_path' input.
nlp_step = api.run_job(operation="topic_modeling", inputs={'text_corpus_folder_path': folder_path})

In [52]:
# Take the "preprocessed_corpus" output of the topic modeling job and store it
# under an alias derived from gen_alias.
# NOTE(review): the variable is called `table`, but the store result below reports type=array.
table = nlp_step["preprocessed_corpus"]
api.store_value(table, f'tm_{gen_alias}_preprocessed_corpus')

StoreValueResult(value=Value(id=adf031bf-c26a-4c69-ab5e-f16dfd0e8b3d, type=array, status=set, initialized=True optional=False), aliases=['tm_test3oct22_preprocessed_corpus'], persisted_data=PersistedData(model_id=zdpuAsh9pUJAw8VZ2H6zF9WgYWhpAgFznrBAcV6PMZK8RnN32, category=instance.persisted_data, fields=[data_type, data_type_config, serialization_profile, metadata, hash_codec, archive_id, chunk_id_map]), error=None)

In [53]:
# List the available aliases again; the newly stored value should now appear.
!kiara data list


╭─ Available aliases ──────────────────────────────────────────────────────────╮
│                                                                              │
│  [1m [0m[1malias                            [0m[1m [0m [1m [0m[1mtype [0m[1m [0m [1m [0m[1m    size[0m[1m [0m                      │
│  ──────────────────────────────────────────────────────                      │
│   tm_test3oct22_onboard               table   16.06 MB                       │
│   tm_test3oct22_preprocessed_corpus   array   26.64 MB                       │
│                                                                              │
╰──────────────────────────────────────────────────────────────────────────────╯


- lineage data preparation

In [54]:
# Look up the stored preprocessed corpus by the alias it was saved under. The
# alias is derived from gen_alias (instead of being hard-coded) so this cell
# keeps working when gen_alias is changed.
value = api.get_value(value=f'tm_{gen_alias}_preprocessed_corpus')

In [55]:
# Take the module graph from the value's lineage and convert it to networkx
# node-link data for inspection.
graph = value.lineage.module_graph
result = json_graph.node_link_data(graph)

In [56]:
# Display the node-link representation of the lineage graph.
result

{'directed': True,
 'multigraph': False,
 'graph': {},
 'nodes': [{'data_type': 'array',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1,
   'id': 'value:adf031bf-c26a-4c69-ab5e-f16dfd0e8b3d'},
  {'module_type': 'preprocess.tokens_array',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'preprocess.tokens_array',
   'node_type': 'operation',
   'level': 3,
   'id': 'module:zdpuAncTwo9pDiXGFdUFmVSbbyh4xyC3NkhcWGTruDJh9uE4V'},
  {'label': 'to_lowercase (boolean)',
   'node_type': 'value',
   'data_type': 'boolean',
   'data_type_config': {},
   'level': 4,
   'id': 'value:19a56a11-3553-42d6-9d46-d8821157f992'},
  {'label': 'remove_short_tokens (integer)',
   'node_type': 'value',
   'data_type': 'integer',
   'data_type_config': {},
   'level': 4,
   'id': 'value:ad9909ee-827e-4968-a18c-b7a3f9c52cc3'},
  {'module_type': 'create.stopwords_list',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'create.stopwo

In [57]:
# Empty container that the next cell fills with one entry per node, and the
# lineage graph's nodes paired with their attribute dicts.
augmented_nodes = {}
nodes = graph.nodes(data=True)

- **Note for Markus:** the cell below is where the additional information (input values and module documentation) would be needed.

In [58]:
for idx, (node_id, node_attrs) in enumerate(nodes):
    # Still missing here: info about the input value itself (not only its type)
    # and module info (doc).
    # One entry per node: its id, its attribute dict, and the ids of its predecessors.
    augmented_nodes[idx] = {
        "id": node_id,
        "desc": node_attrs,
        "parentIds": list(graph.predecessors(node_id)),
    }

In [59]:
# Display the augmented node entries (id, desc, parentIds) built in the previous cell.
augmented_nodes

{0: {'id': 'value:adf031bf-c26a-4c69-ab5e-f16dfd0e8b3d',
  'desc': {'data_type': 'array',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1},
  'parentIds': ['module:zdpuAncTwo9pDiXGFdUFmVSbbyh4xyC3NkhcWGTruDJh9uE4V']},
 1: {'id': 'module:zdpuAncTwo9pDiXGFdUFmVSbbyh4xyC3NkhcWGTruDJh9uE4V',
  'desc': {'module_type': 'preprocess.tokens_array',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'preprocess.tokens_array',
   'node_type': 'operation',
   'level': 3},
  'parentIds': ['value:19a56a11-3553-42d6-9d46-d8821157f992',
   'value:ad9909ee-827e-4968-a18c-b7a3f9c52cc3',
   'module:zdpuAqkshVC2VdPNuL1qJUukbLDosgnffaL5WwnXMZV2GAM9q',
   'module:zdpuAtk17r31968BYdD7oNbRefp133rquw71Xi12CxdKs8wpt']},
 2: {'id': 'value:19a56a11-3553-42d6-9d46-d8821157f992',
  'desc': {'label': 'to_lowercase (boolean)',
   'node_type': 'value',
   'data_type': 'boolean',
   'data_type_config': {},
   'level': 4},
  'parentIds': []},
 3: {'id': 'valu

In [60]:
# uncomment to export dataset for the viz

# res = json.dumps(augmented_nodes)
# with open("test_data2.json", "w") as outfile:
#     outfile.write(res)