In [27]:
from kiara import KiaraAPI, Kiara
import networkx
from networkx.readwrite import json_graph
import json
import os

In [28]:
# Global settings shared by every cell of this notebook.

# Current working directory (expected to be the folder containing this notebook).
current_path = os.getcwd()

# Pipeline descriptions are kept in the 'pipelines' folder below `current_path`.
pipelines_dir = os.path.join(current_path, 'pipelines')
# The onboarding pipeline, defined in 'corpus_onboarding.yaml'.
onboarding_pipeline = os.path.join(pipelines_dir, 'corpus_onboarding.yaml')
# The topic modeling pipeline, defined in 'topic_modeling.yaml'.
topic_modeling_pipeline = os.path.join(pipelines_dir, 'topic_modeling.yaml')


# Local path to the example dataset; the default is the 'example_data/mini_corpus'
# folder below `current_path`. To use another corpus, point the variable elsewhere, e.g.:
# corpus_path = '/path/to/another/corpus_folder'
corpus_path = os.path.join(current_path, 'example_data', 'mini_corpus')

# Alias fragment for the current data prep, so that data created by this run
# is easy to spot in the data registry.
gen_alias = 'test3oct22'

In [29]:
# Obtain the KiaraAPI instance that every following cell talks to.
# Alternative kept for reference: passing "topic_modeling" to `instance()`
# (presumably selects a named kiara context -- TODO confirm).
#api = KiaraAPI.instance("topic_modeling")
api = KiaraAPI.instance()


In [30]:
# version of Kiara used in this notebook
!pip show kiara

Name: kiara
Version: 0.4.19.dev2+g72c1ab7a
Summary: Data-centric workflow orchestration.
Home-page: https://github.com/DHARPA-Project/kiara
Author: Markus Binsteiner
Author-email: markus@frkl.io
License: MPL-2.0
Location: /home/markus/projects/kiara/core/kiara/src
Requires: airium, alembic, appdirs, bidict, black, click, dag-cbor, deepdiff, Deprecated, distro, dpath, filetype, humanfriendly, jinja2, jupytext, mistune, mkdocstrings, mmh3, multiformats, networkx, orjson, pp-ez, pydantic, python-dateutil, python-slugify, pyzmq, regex, rich, rich-click, ruamel.yaml, sortedcontainers, sqlalchemy, sqlalchemy-utc, sqlalchemy-utils, stevedore, structlog, textual, tzlocal
Required-by: kiara-plugin.core-types, kiara-plugin.develop, kiara-plugin.html, kiara-plugin.language-processing, kiara-plugin.network-analysis, kiara-plugin.service, kiara-plugin.tabular


#### 1. Overview of the operations we will be experimenting on to create lineage data

- corpus onboarding
example corpus onboarding pipeline from https://github.com/DHARPA-Project/DHARPA-Project-viz-observable/blob/main/dag-lineage/pipelines/corpus_onboarding.yaml 

In [31]:
# Load the onboarding pipeline file as an operation.
# NOTE(review): `allow_external=True` presumably permits loading an operation
# from a file path outside kiara's registered operations -- confirm.
onboarding_op = api.get_operation(onboarding_pipeline, allow_external=True)
# Last expression of the cell: display the operation.
onboarding_op

- text processing
example topic modeling pipeline from https://github.com/DHARPA-Project/DHARPA-Project-viz-observable/blob/main/dag-lineage/pipelines/topic_modeling.yaml 

In [32]:
# Load the topic modeling pipeline file as an operation.
# NOTE(review): `allow_external=True` presumably permits loading an operation
# from a file path outside kiara's registered operations -- confirm.
topic_modeling_op = api.get_operation(topic_modeling_pipeline, allow_external=True)
# Last expression of the cell: display the operation.
topic_modeling_op

#### 2. Lineage data for the onboarding step

- Data onboarding

In [33]:
# Run the onboarding operation, passing the folder in `corpus_path` as its
# 'text_corpus_folder_path' input; the job results are kept in `onboarding_result`.
onboarding_result = api.run_job(operation=onboarding_op, inputs={'text_corpus_folder_path': corpus_path})

In [34]:
# Take the 'corpus_table' result of the onboarding job and store it under an
# alias built from `gen_alias`, so it can be looked up again later.
onboard_alias = f'tm_{gen_alias}_onboard'
table = onboarding_result["corpus_table"]
api.store_value(table, onboard_alias)

StoreValueResult(value=Value(id=f176d39e-7f0a-4aa0-ae52-6d50063a9048, type=table, status=set, initialized=True optional=False), aliases=['tm_test3oct22_onboard'], persisted_data=None, error=None)

In [35]:
# Check how the value stored above shows up in the kiara CLI
# (the command output includes the available aliases).
! kiara data list

╭─ [33mInvalid kiara module '[0m[3;33mdict_test[0m[33m'[0m ───────────────────────────────────────────╮
│                                                                              │
│ Invalid kiara module: [1mkiara[0m[1m_[0m[1mplugin.develop.modules.ExampleModule[0m ( [3mdict[0m[3m_[0m[3mtest[0m │
│ )                                                                            │
│                                                                              │
│ Missing method(s):                                                           │
│                                                                              │
│ [1;33m • [0m[3mcreate[0m[3m_[0m[3minputs[0m[3m_[0m[3mschema[0m                                                      │
╰──────────────────────────────────────────────────────────────────────────────╯

╭─ Available aliases ──────────────────────────────────────────────────────────╮
│                                                 

- lineage data preparation

In [36]:
# Fetch the onboarded corpus table by alias. The alias is derived from
# `gen_alias` (same pattern as when the value was stored), so this cell keeps
# working when `gen_alias` is changed in the configuration cell.
corpus_table = api.get_value(value=f'tm_{gen_alias}_onboard')

In [37]:
# Lineage graph of the corpus table, as exposed by `lineage.module_graph`.
graph = corpus_table.lineage.module_graph
# Node-link representation of that graph (a plain dict, shown in the next cell).
result = json_graph.node_link_data(graph)

In [38]:
# Display the node-link data of the lineage graph.
result

{'directed': True,
 'multigraph': False,
 'graph': {},
 'nodes': [{'data_type': 'table',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1,
   'id': 'value:f176d39e-7f0a-4aa0-ae52-6d50063a9048'},
  {'module_type': 'table.merge',
   'module_config': {'constants': {},
    'defaults': {},
    'inputs_schema': {'source_table': {'type': 'table',
      'type_config': {},
      'default': '__not_set__',
      'optional': False,
      'is_constant': False,
      'doc': {'description': 'The original table.', 'doc': None}},
     'date_array': {'type': 'array',
      'type_config': {},
      'default': '__not_set__',
      'optional': False,
      'is_constant': False,
      'doc': {'description': 'The array containing the parsed date items.',
       'doc': None}}},
    'column_map': {'date': 'date_array',
     'content': 'source_table.content',
     'file_name': 'source_table.file_name'}},
   'label': 'table.merge',
   'node_type': 'operation',
   'lev

In [39]:
# Empty mapping that the next cells fill with one augmented entry per node.
augmented_nodes = {}
# View over the lineage graph's nodes, each paired with its attribute dict.
nodes = graph.nodes.data()

- (message for Markus): cell below here is where the additional info would be needed

In [40]:
def get_info(node):
    """Collect additional information for a single lineage graph node.

    Args:
        node: A ``(node_id, attributes)`` pair as yielded by
            ``graph.nodes.data()``. ``attributes["node_type"]`` decides what
            is looked up.

    Returns:
        dict: For ``"operation"`` nodes, the result of
        ``api.retrieve_module_type_info(...)`` converted with ``.dict()``.
        For ``"value"`` nodes, ``{"preview": <value rendered as string>}``.

    Raises:
        ValueError: If the node has a ``node_type`` other than ``"operation"``
            or ``"value"`` (previously this surfaced as an UnboundLocalError).
    """
    node_id, attrs = node[0], node[1]
    node_type = attrs["node_type"]

    # NOTE: inefficient -- every node triggers its own API call(s).
    if node_type == "operation":
        return api.retrieve_module_type_info(attrs["module_type"]).dict()

    if node_type == "value":
        # Value node ids have the form 'value:<id>'; the API expects the bare id.
        value_prefix = "value:"
        if node_id.startswith(value_prefix):
            value_id = node_id[len(value_prefix):]
        else:
            value_id = node_id
        v = api.get_value(value_id)

        render_result = api.render_value(value=v, target_format="string").rendered
        return {"preview": render_result}

    raise ValueError(
        f"Unsupported node_type {node_type!r} for node {node_id!r}; "
        "expected 'operation' or 'value'."
    )

for position, node in enumerate(nodes):
    # Each entry bundles the node's own attributes with its predecessors in the
    # graph and the extra info from `get_info`.
    # Still wanted here: info about the input values themselves (not only their
    # type), and module info (doc).
    node_id = node[0]
    node_attrs = node[1]
    parent_ids = list(graph.predecessors(node_id))
    node_dict = {
        "id": node_id,
        "desc": node_attrs,
        "parentIds": parent_ids,
        "info": get_info(node),
    }
    augmented_nodes[position] = node_dict
# Show the entry built last, to make the structure of an entry obvious.
node_dict

{'id': 'value:e12d1e3c-bef8-4187-9906-c6b68e3d974d',
 'desc': {'label': 'remove_tokens (list)',
  'node_type': 'value',
  'data_type': 'list',
  'data_type_config': {},
  'level': 6},
 'parentIds': [],
 'info': {'preview': "list_data=[] item_schema={'title': 'list', 'type': 'object'} python_class=PythonClass(model_id=list, category=instance.wrapped_python_class, fields=[python_class_name, python_module_name, full_name])"}}

In [41]:
# Display all augmented node entries, keyed by their position in the node list.
augmented_nodes

{0: {'id': 'value:f176d39e-7f0a-4aa0-ae52-6d50063a9048',
  'desc': {'data_type': 'table',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1},
  'parentIds': ['module:zdpuAo1d9Dgbw2bDh74YRNYXCfdktruufsfareKYFdsXcNC6n'],
  'info': {'preview': "date\tcontent\tfile_name\t\n1917-04-25 00:00:00\tLA RAGIONE\tsn84037024_1917-04-25_ed-1_seq-1_ocr.txt\t\n1917-04-25 00:00:00\tLA RAG ONE\tsn84037024_1917-04-25_ed-2_seq-1_ocr.txt\t\n1917-04-25 00:00:00\tLA RAGIONE\tsn84037024_1917-04-25_ed-3_seq-1_ocr.txt\t\n1917-04-25 00:00:00\tcontro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici della patria di e di quella d adozione.\tsn84037024_1917-04-25_ed-4_seq-1_ocr.txt\t\n1917-05-05 00:00:00\tcontro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici della patria di origine e di quella d' adozione\tsn84037024_1917-05-05_ed-1_seq-1_ocr.txt\t\n1917-05-05 00:00:00\tLA RAGIONA\tsn84037024_1917-05-05_ed-2_seq-1_ocr.txt\t\n1917-0

In [42]:
# Uncomment the lines below to export the augmented node data as JSON for the visualization

# res = json.dumps(augmented_nodes)
# with open("test_data.json", "w") as outfile:
#     outfile.write(res)

#### 3. Lineage data for the NLP step

- running the example TM pipeline with previously onboarded data

In [43]:
# Run the topic modeling operation, passing the onboarded corpus table as its
# 'corpus' input; the job results are kept in `nlp_step`.
nlp_step = api.run_job(operation=topic_modeling_op, inputs={'corpus': corpus_table})

In [44]:
# Take the 'preprocessed_corpus' result of the topic modeling job and store it
# under an alias built from `gen_alias`.
preprocessed_alias = f'tm_{gen_alias}_preprocessed_corpus'
preprocessed_corpus = nlp_step["preprocessed_corpus"]
api.store_value(preprocessed_corpus, preprocessed_alias)

StoreValueResult(value=Value(id=e6487140-eb6f-49a3-afd4-ee897df71247, type=array, status=set, initialized=True optional=False), aliases=['tm_test3oct22_preprocessed_corpus'], persisted_data=None, error=None)

In [45]:
# Check via the kiara CLI how the stored data is listed after the NLP step.
!kiara data list

╭─ [33mInvalid kiara module '[0m[3;33mdict_test[0m[33m'[0m ───────────────────────────────────────────╮
│                                                                              │
│ Invalid kiara module: [1mkiara[0m[1m_[0m[1mplugin.develop.modules.ExampleModule[0m ( [3mdict[0m[3m_[0m[3mtest[0m │
│ )                                                                            │
│                                                                              │
│ Missing method(s):                                                           │
│                                                                              │
│ [1;33m • [0m[3mcreate[0m[3m_[0m[3minputs[0m[3m_[0m[3mschema[0m                                                      │
╰──────────────────────────────────────────────────────────────────────────────╯

╭─ Available aliases ──────────────────────────────────────────────────────────╮
│                                                 

- lineage data preparation

In [46]:
# Not strictly necessary: the 'preprocessed_corpus' variable still holds this value.
# The alias is derived from `gen_alias` (same pattern as when the value was
# stored), so the lookup stays valid when `gen_alias` is changed.
preprocessed_corpus = api.get_value(value=f'tm_{gen_alias}_preprocessed_corpus')

In [47]:
# Lineage graph of the preprocessed corpus, as exposed by `lineage.module_graph`.
# Note: this rebinds `graph` and `result`, replacing the onboarding-step objects.
graph = preprocessed_corpus.lineage.module_graph
# Node-link representation of that graph (a plain dict, shown in the next cell).
result = json_graph.node_link_data(graph)

In [48]:
# Display the node-link data of the lineage graph.
result

{'directed': True,
 'multigraph': False,
 'graph': {},
 'nodes': [{'data_type': 'array',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1,
   'id': 'value:e6487140-eb6f-49a3-afd4-ee897df71247'},
  {'module_type': 'preprocess.tokens_array',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'preprocess.tokens_array',
   'node_type': 'operation',
   'level': 3,
   'id': 'module:zdpuAxr2sGUG28Bi97JQr8QW5RRCt9dJYwewmnepmJfXwwhWz'},
  {'label': 'to_lowercase (boolean)',
   'node_type': 'value',
   'data_type': 'boolean',
   'data_type_config': {},
   'level': 4,
   'id': 'value:a8e23c9c-e88f-4846-bd98-02ed935adc5e'},
  {'label': 'remove_short_tokens (integer)',
   'node_type': 'value',
   'data_type': 'integer',
   'data_type_config': {},
   'level': 4,
   'id': 'value:9bbfe2c4-023b-41fe-92ea-e2d831cf58e4'},
  {'module_type': 'create.stopwords_list',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'create.stopwo

In [49]:
# Reset the mapping so it only holds entries for the current lineage graph.
augmented_nodes = {}
# View over the lineage graph's nodes, each paired with its attribute dict.
nodes = graph.nodes.data()

- (message for Markus): cell below here is where the additional info would be needed

In [50]:

for position, node in enumerate(nodes):
    # Each entry bundles the node's own attributes with its predecessors in the
    # graph and the extra info from `get_info`.
    # Still wanted here: info about the input values themselves (not only their
    # type), and module info (doc).
    node_id = node[0]
    node_attrs = node[1]
    parent_ids = list(graph.predecessors(node_id))
    node_dict = {
        "id": node_id,
        "desc": node_attrs,
        "parentIds": parent_ids,
        "info": get_info(node),
    }
    augmented_nodes[position] = node_dict

In [51]:
# Display all augmented node entries, keyed by their position in the node list.
augmented_nodes

{0: {'id': 'value:e6487140-eb6f-49a3-afd4-ee897df71247',
  'desc': {'data_type': 'array',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1},
  'parentIds': ['module:zdpuAxr2sGUG28Bi97JQr8QW5RRCt9dJYwewmnepmJfXwwhWz'],
  'info': {'preview': 'KiaraArray(model_id=-- n/a --, category=kiara_array, fields=[data_path])'}},
 1: {'id': 'module:zdpuAxr2sGUG28Bi97JQr8QW5RRCt9dJYwewmnepmJfXwwhWz',
  'desc': {'module_type': 'preprocess.tokens_array',
   'module_config': {'constants': {}, 'defaults': {}},
   'label': 'preprocess.tokens_array',
   'node_type': 'operation',
   'level': 3},
  'parentIds': ['value:a8e23c9c-e88f-4846-bd98-02ed935adc5e',
   'value:9bbfe2c4-023b-41fe-92ea-e2d831cf58e4',
   'module:zdpuAmCPKGoanw3o4rhkchgBKpX7ZxrMp1PGoDcb99qZMzucG',
   'module:zdpuAouJLh5QMhBecDkH1noQnkburvzjxE8h1Z1tfBAXovPWf'],
  'info': {'type_name': 'preprocess.tokens_array',
   'documentation': {'description': 'Preprocess lists of tokens, incl. lowercasing, r

In [52]:
# Uncomment the lines below to export the augmented node data as JSON for the visualization

# res = json.dumps(augmented_nodes)
# with open("test_data2.json", "w") as outfile:
#     outfile.write(res)