In [51]:
import os
from pathlib import Path
from IPython.core.display import display, Markdown
from ipywidgets import Output
from dharpa_toolbox.modules.core import GLOBAL_EVENT_MANAGER
from dharpa_toolbox.modules.utils import describe_module, list_available_module_names, print_module_desc
from dharpa_toolbox.modules.workflows import DharpaWorkflow
from dharpa_toolbox.utils import graph_to_image, print_file_content, get_data_from_file
from rich.jupyter import print

# Output widget handed to the global event manager; it is rendered with
# display() in the last cell of the notebook.
# NOTE(review): presumably module events are written into this widget — confirm
# against GLOBAL_EVENT_MANAGER.set_output.
module_events = Output()
GLOBAL_EVENT_MANAGER.set_output(module_events)

# Folder that holds the example workflow config and the text corpus used below.
# The original machine-specific location remains the default so existing runs
# are unaffected; set DHARPA_TOOLBOX_DEV_DIR to run the notebook on another
# machine without editing this cell.
base_path = Path(
    os.environ.get(
        "DHARPA_TOOLBOX_DEV_DIR",
        "/home/markus/projects/dharpa/dharpa-toolbox/dev",
    )
)

In [52]:
# Path of the YAML workflow definition inside the example folder; it is loaded
# in a later cell to build the workflow.
config_path = base_path / "corpus_processing.yaml"
# Print the descriptions of the three named module types, then the raw
# content of the config file.
print_module_desc("tokenize_corpus", "lowercase_corpus", "remove_stopwords_from_corpus")
print_file_content(config_path)

```yaml
---
modules:

- type: tokenize_corpus

- type: lowercase_corpus
  input_map:
    tokenized_text: tokenize_corpus.tokenized_text

- type: remove_stopwords_from_corpus
  input_map:
    tokenized_text: lowercase_corpus.tokenized_text
  workflow_outputs:
    tokenized_text: processed_text_corpus

```

In [53]:
# Load the config file and pass its top-level entries as keyword arguments
# to the workflow constructor.
workflow_config = get_data_from_file(config_path)
workflow = DharpaWorkflow(**workflow_config)

# NOTE(review): this call is not the last expression of the cell, so a value it
# returns is not auto-displayed — confirm graph_to_image renders the graph itself.
graph_to_image(workflow.execution_graph)
# Last expression of the cell: its return value becomes the cell output.
list_available_module_names()


['dharpa_workflow',
 'file_reader',
 'lowercase_corpus',
 'remove_stopwords_from_corpus',
 'tokenize_corpus',
 'tokenize_workflow']

In [54]:
# Print the description of the assembled workflow object (the helper is also
# called with module type names elsewhere in this notebook).
print_module_desc(workflow)


In [None]:
# Example corpus: document id -> raw text.
text_map = dict([("1", "Hello World!"), ("2", "Hello DHARPA!")])
# Tokens that the stopword-removal step is asked to drop.
stopwords = ["hello", "!"]

# Apply the workflow inputs one after another, in this exact order.
# NOTE(review): input names appear to follow '<module>__<input>' — confirm
# against the workflow description.
for input_name, input_value in (
    ("tokenize_corpus__text_map", text_map),
    ("lowercase_corpus__enabled", True),
    ("remove_stopwords_from_corpus__enabled", True),
    ("remove_stopwords_from_corpus__stopwords_list", stopwords),
):
    workflow.set_input(input_name, input_value)

# Read the workflow-level output and print it.
output1 = workflow.get_output("processed_text_corpus")
print(output1)

# Fetch the last module of the chain and end the cell with its own
# 'tokenized_text' output, so the value is shown as the cell result.
m = workflow.get_module('remove_stopwords_from_corpus')
m.get_output("tokenized_text")

{'1': ['world'], '2': ['dharpa']}

In [56]:
# Set the 'enabled' input of the stopword-removal step to True and read the
# workflow output a second time.
# NOTE(review): the cell that defines `stopwords` already sets this input to
# True; if this cell is meant to demonstrate toggling the step, the value here
# should probably differ — confirm intent.
workflow.set_input("remove_stopwords_from_corpus__enabled", True)
output2 = workflow.get_output("processed_text_corpus")

print(output2)

In [57]:
# Workflow subclass exposed under the module name "tokenize_workflow".
# Every instance is configured from the notebook-level `workflow_config`;
# caller-supplied keyword arguments survive only for keys that
# `workflow_config` does not define.
class TokenizeWorkflow(DharpaWorkflow):

    _module_name = "tokenize_workflow"

    def __init__(self, **config):
        # Entries of workflow_config win over caller-supplied entries with the same key.
        merged_config = {**config, **workflow_config}
        super().__init__(**merged_config)

# List the available module names again after TokenizeWorkflow has been defined
# and print the result.
# NOTE(review): 'tokenize_workflow' is presumably made available by the class
# definition itself — confirm how modules get registered.
all_modules_new = list_available_module_names()
print(all_modules_new)
#

In [58]:
# Print the descriptions of the two module types that the next workflow config combines.
print_module_desc('file_reader', 'tokenize_workflow')

In [59]:
# Config for a second workflow that combines a 'file_reader' module with the
# 'tokenize_workflow' module type.
new_config = {
    "modules": [
        {
            "type": "file_reader"
        },
        {
            "type": "tokenize_workflow",
            # Connect the file reader's 'content_map' output to the
            # 'tokenize_corpus__text_map' input of the nested workflow.
            "input_map": {
                "tokenize_corpus__text_map": "file_reader.content_map"
            },
            # NOTE(review): presumably maps the nested workflow's output name to the
            # name under which the outer workflow exposes it — confirm.
            "workflow_outputs": {
                "processed_text_corpus": "processed_text_corpus"
            }
        }
    ]
}
new_workflow = DharpaWorkflow(**new_config)

print_module_desc(new_workflow)

# Text file from the example folder that is handed to the file reader.
input_file = base_path / "text_corpus_1.txt"

new_workflow.set_input("file_reader__files", input_file)
# `stopwords` is the list defined in an earlier cell of this notebook.
new_workflow.set_input("tokenize_workflow__remove_stopwords_from_corpus__stopwords_list", stopwords)
# Only comments follow this call, so its return value is the cell output.
new_workflow.get_output("processed_text_corpus")

# Debugging aid (disabled): uncomment to print `current_state` of the two modules.
# m = new_workflow.get_module("file_reader")
# print(m.current_state)
# m = new_workflow.get_module("tokenize_workflow")
# print(m.current_state)

{'text_corpus_1.txt': ['world', 'dharpa']}

In [60]:
# Render the Output widget that was handed to GLOBAL_EVENT_MANAGER in the setup cell.
display(module_events)




Output()