Skip to content

Commit

Permalink
Merge pull request #446 from skalish/cat-ops
Browse files Browse the repository at this point in the history
TC: Add functions to run Categorization and Schema Mapping project workflows
  • Loading branch information
pcattori committed Sep 1, 2020
2 parents 0f10ba3 + 24eafcd commit d72d7eb
Show file tree
Hide file tree
Showing 18 changed files with 240 additions and 43 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
- [#443](https://github.com/Datatamer/tamr-client/pull/443) Added function to materialize datasets.
- [#445](https://github.com/Datatamer/tamr-client/pull/445) Added functions for getting projects and datasets by name via `tc.project.by_name` and `tc.dataset.by_name`
- Renamed functions `from_resource_id` to `by_resource_id` in `tc.attribute`, `tc.dataset`, `tc.operation`, and `tc.project`
- [#446](https://github.com/Datatamer/tamr-client/pull/446) Added functions for categorization workflow operations in `tc.categorization` and schema mapping workflow operations in `tc.schema_mapping`

**NEW FEATURES**
- [#383](https://github.com/Datatamer/tamr-client/issues/383) Now able to create an Operation from Job resource id
Expand Down
1 change: 1 addition & 0 deletions docs/beta/categorization.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Categorization

* [Categorization](/beta/categorization/categorization)
* [Project](/beta/categorization/project)
7 changes: 7 additions & 0 deletions docs/beta/categorization/categorization.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Categorization
==============

.. autofunction:: tamr_client.categorization.update_unified_dataset
.. autofunction:: tamr_client.categorization.apply_feedback
.. autofunction:: tamr_client.categorization.update_results
.. autofunction:: tamr_client.categorization.manual_labels
3 changes: 1 addition & 2 deletions docs/beta/categorization/project.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,4 @@ Categorization Project

.. autoclass:: tamr_client.CategorizationProject

.. autofunction:: tamr_client.categorization.project.create
.. autofunction:: tamr_client.categorization.project.manual_labels
.. autofunction:: tamr_client.categorization.project.create
1 change: 1 addition & 0 deletions docs/beta/schema_mapping.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Schema Mapping

* [Schema Mapping](/beta/schema_mapping/schema_mapping)
* [Project](/beta/schema_mapping/project)
4 changes: 4 additions & 0 deletions docs/beta/schema_mapping/schema_mapping.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Schema Mapping
==============

.. autofunction:: tamr_client.schema_mapping.update_unified_dataset
8 changes: 8 additions & 0 deletions tamr_client/categorization/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,11 @@
See https://docs.tamr.com/docs/overall-workflow-classification
"""
from tamr_client.categorization import project
from tamr_client.categorization._categorization import (
_apply_feedback_async,
_update_results_async,
apply_feedback,
manual_labels,
update_results,
update_unified_dataset,
)
81 changes: 81 additions & 0 deletions tamr_client/categorization/_categorization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""
Tamr - Categorization
See https://docs.tamr.com/docs/overall-workflow-classification
The terminology used here is consistent with Tamr UI terminology
Asynchronous versions of each function can be found with the suffix `_async` and may be of
interest to power users
"""
from tamr_client import operation
from tamr_client._types import CategorizationProject, Dataset, Operation, Session
from tamr_client.dataset import _dataset, unified


def manual_labels(session: Session, project: CategorizationProject) -> Dataset:
"""Get manual labels from a Categorization project.
Args:
project: Tamr project containing labels
Returns:
Dataset containing manual labels
Raises:
dataset.NotFound: If no dataset could be found at the specified URL
dataset.Ambiguous: If multiple targets match dataset name
"""
unified_dataset = unified.from_project(session=session, project=project)
labels_dataset_name = unified_dataset.name + "_manual_categorizations"
return _dataset.by_name(
session=session, instance=project.url.instance, name=labels_dataset_name
)


def update_unified_dataset(
session: Session, project: CategorizationProject
) -> Operation:
"""Apply changes to the unified dataset and wait for the operation to complete
Args:
project: Tamr Categorization project
"""
unified_dataset = unified.from_project(session, project)
op = unified._apply_changes_async(session, unified_dataset)
return operation.wait(session, op)


def apply_feedback(session: Session, project: CategorizationProject) -> Operation:
"""Train the categorization model according to verified labels and wait for the
operation to complete
Args:
project: Tamr Categorization project
"""
op = _apply_feedback_async(session, project)
return operation.wait(session, op)


def update_results(session: Session, project: CategorizationProject) -> Operation:
"""Generate classifications based on the latest categorization model and wait for the
operation to complete
Args:
project: Tamr Categorization project
"""
op = _update_results_async(session, project)
return operation.wait(session, op)


def _apply_feedback_async(
session: Session, project: CategorizationProject
) -> Operation:
r = session.post(str(project.url) + "/categorizations/model:refresh")
return operation._from_response(project.url.instance, r)


def _update_results_async(
session: Session, project: CategorizationProject
) -> Operation:
r = session.post(str(project.url) + "/categorizations:refresh")
return operation._from_response(project.url.instance, r)
22 changes: 0 additions & 22 deletions tamr_client/categorization/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,12 @@
from tamr_client import project
from tamr_client._types import (
CategorizationProject,
Dataset,
Instance,
JsonDict,
Project,
Session,
URL,
)
from tamr_client.dataset import _dataset, unified


def _from_json(url: URL, data: JsonDict) -> CategorizationProject:
Expand Down Expand Up @@ -58,23 +56,3 @@ def create(
external_id=external_id,
unified_dataset_name=unified_dataset_name,
)


def manual_labels(session: Session, project: CategorizationProject) -> Dataset:
"""Get manual labels from a Categorization project.
Args:
project: Tamr project containing labels
Returns:
Dataset containing manual labels
Raises:
dataset.NotFound: If no dataset could be found at the specified URL
dataset.Ambiguous: If multiple targets match dataset name
"""
unified_dataset = unified.from_project(session=session, project=project)
labels_dataset_name = unified_dataset.name + "_manual_categorizations"
return _dataset.by_name(
session=session, instance=project.url.instance, name=labels_dataset_name
)
1 change: 0 additions & 1 deletion tamr_client/mastering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
_update_cluster_results_async,
_update_high_impact_pairs_async,
_update_pair_results_async,
_update_unified_dataset_async,
apply_feedback,
estimate_pairs,
generate_pairs,
Expand Down
10 changes: 2 additions & 8 deletions tamr_client/mastering/_mastering.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ def update_unified_dataset(session: Session, project: MasteringProject) -> Opera
Args:
project: Tamr Mastering project
"""
op = _update_unified_dataset_async(session, project)
unified_dataset = unified.from_project(session, project)
op = unified._apply_changes_async(session, unified_dataset)
return operation.wait(session, op)


Expand Down Expand Up @@ -97,13 +98,6 @@ def publish_clusters(session: Session, project: MasteringProject) -> Operation:
return operation.wait(session, op)


def _update_unified_dataset_async(
session: Session, project: MasteringProject
) -> Operation:
unified_dataset = unified.from_project(session, project)
return unified._apply_changes_async(session, unified_dataset)


def _estimate_pairs_async(session: Session, project: MasteringProject) -> Operation:
r = session.post(str(project.url) + "/estimatedPairCounts:refresh")
return operation._from_response(project.url.instance, r)
Expand Down
1 change: 1 addition & 0 deletions tamr_client/schema_mapping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
See https://docs.tamr.com/new/docs/overall-workflow-schema
"""
from tamr_client.schema_mapping import project
from tamr_client.schema_mapping._schema_mapping import update_unified_dataset
25 changes: 25 additions & 0 deletions tamr_client/schema_mapping/_schema_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
Tamr - Schema Mapping
See https://docs.tamr.com/new/docs/overall-workflow-schema
The terminology used here is consistent with Tamr UI terminology
Asynchronous versions of each function can be found with the suffix `_async` and may be of
interest to power users
"""
from tamr_client import operation
from tamr_client._types import Operation, SchemaMappingProject, Session
from tamr_client.dataset import unified


def update_unified_dataset(
session: Session, project: SchemaMappingProject
) -> Operation:
"""Apply changes to the unified dataset and wait for the operation to complete
Args:
project: Tamr Schema Mapping project
"""
unified_dataset = unified.from_project(session, project)
op = unified._apply_changes_async(session, unified_dataset)
return operation.wait(session, op)
42 changes: 42 additions & 0 deletions tests/tamr_client/categorization/test_categorization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import tamr_client as tc
from tests.tamr_client import fake


@fake.json
def test_manual_labels():
s = fake.session()
project = fake.categorization_project()

tc.categorization.manual_labels(session=s, project=project)


@fake.json
def test_apply_feedback_async():
s = fake.session()
project = fake.categorization_project()

op = tc.categorization._apply_feedback_async(s, project)
assert op.type == "SPARK"
assert op.description == "Materialize views to Elastic"
assert op.status == {
"state": "PENDING",
"startTime": "",
"endTime": "",
"message": "Job has not yet been submitted to Spark",
}


@fake.json
def test_update_results_async():
s = fake.session()
project = fake.categorization_project()

op = tc.categorization._update_results_async(s, project)
assert op.type == "SPARK"
assert op.description == "Materialize views to Elastic"
assert op.status == {
"state": "PENDING",
"startTime": "",
"endTime": "",
"message": "Job has not yet been submitted to Spark",
}
10 changes: 0 additions & 10 deletions tests/tamr_client/categorization/test_project.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[
{
"request": {
"method": "POST",
"path": "projects/2/categorizations/model:refresh"
},
"response": {
"status": 200,
"json": {
"id": "1",
"type": "SPARK",
"description": "Materialize views to Elastic",
"status": {
"state": "PENDING",
"startTime": "",
"endTime": "",
"message": "Job has not yet been submitted to Spark"
},
"created": {
"username": "admin",
"time": "2020-06-12T18:21:42.288Z",
"version": "operation 1 created version"
},
"lastModified": {
"username": "admin",
"time": "2020-06-12T18:21:42.288Z",
"version": "operation 1 modified version"
},
"relativeId": "operations/1"
}
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[
{
"request": {
"method": "POST",
"path": "projects/2/categorizations:refresh"
},
"response": {
"status": 200,
"json": {
"id": "1",
"type": "SPARK",
"description": "Materialize views to Elastic",
"status": {
"state": "PENDING",
"startTime": "",
"endTime": "",
"message": "Job has not yet been submitted to Spark"
},
"created": {
"username": "admin",
"time": "2020-06-12T18:21:42.288Z",
"version": "operation 1 created version"
},
"lastModified": {
"username": "admin",
"time": "2020-06-12T18:21:42.288Z",
"version": "operation 1 modified version"
},
"relativeId": "operations/1"
}
}
}
]

0 comments on commit d72d7eb

Please sign in to comment.