Skip to content

Commit

Permalink
Merge pull request #425 from abafzal/cat_labels
Browse files Browse the repository at this point in the history
Categorization projects and manual categorization labels dataset
  • Loading branch information
pcattori committed Aug 12, 2020
2 parents 0a1b54d + c30c6f5 commit 6d067fe
Show file tree
Hide file tree
Showing 19 changed files with 265 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ignore = E203, E266, E501, W503, F403
max-line-length = 88
max-complexity = 18
select = B,C,E,F,I,W,T4,B9
exclude = build,.venv,*.egg-info
exclude = build,venv,.venv,*.egg-info
per-file-ignores =
tamr_client/__init__.py:E402,F401,I100,I202
tamr_client/*/__init__.py:F401
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

**NEW FEATURES**
- [#383](https://github.com/Datatamer/tamr-client/issues/383) Now able to create an Operation from Job resource id

- [#425](https://github.com/Datatamer/tamr-client/pull/425) Now able to get, update and delete manual labels for Categorization projects

## 0.12.0
**BETA**
Important: Do not use BETA features for production workflows.
Expand Down
1 change: 1 addition & 0 deletions docs/beta.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

* [Attribute](beta/attribute)
* [Auth](beta/auth)
* [Categorization](beta/categorization)
* [Dataset](beta/dataset)
* [Instance](beta/instance)
* [Mastering](beta/mastering)
Expand Down
3 changes: 3 additions & 0 deletions docs/beta/categorization.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Categorization

* [Project](/beta/categorization/project)
6 changes: 6 additions & 0 deletions docs/beta/categorization/project.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Categorization Project
======================

.. autoclass:: tamr_client.CategorizationProject

.. autofunction:: tamr_client.categorization.project.manual_labels
3 changes: 2 additions & 1 deletion docs/contributor-guide/dev-tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ See [`nox --list`](https://nox.thea.codes/en/stable/tutorial.html#selecting-whic
To run specific tests, see [these pytest docs](https://docs.pytest.org/en/latest/usage.html#specifying-tests-selecting-tests) and pass `pytest` args after `--` e.g.:

```sh
prn -s test -- tests/unit/test_attribute.py
prn -s test -- tests/unit/test_attribute.py # with alias
poetry run nox -s test -- tests/unit/test_attribute.py # without alias
```


Expand Down
2 changes: 2 additions & 0 deletions tamr_client/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
AnyDataset,
Attribute,
AttributeType,
CategorizationProject,
Dataset,
InputTransformation,
Instance,
Expand All @@ -38,6 +39,7 @@
###############

from tamr_client import attribute
from tamr_client import categorization
from tamr_client import dataset
from tamr_client import instance
from tamr_client import mastering
Expand Down
2 changes: 1 addition & 1 deletion tamr_client/_types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from tamr_client._types.instance import Instance
from tamr_client._types.json import JsonDict
from tamr_client._types.operation import Operation
from tamr_client._types.project import MasteringProject, Project
from tamr_client._types.project import CategorizationProject, MasteringProject, Project
from tamr_client._types.session import Session
from tamr_client._types.transformations import InputTransformation, Transformations
from tamr_client._types.url import URL
19 changes: 18 additions & 1 deletion tamr_client/_types/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,21 @@ class MasteringProject:
description: Optional[str] = None


Project = Union[MasteringProject]
@dataclass(frozen=True)
class CategorizationProject:
    """A Tamr Categorization project

    Instances are frozen: fields cannot be reassigned after construction.

    See https://docs.tamr.com/reference/the-project-object

    Args:
        url: URL of this project
        name: Name of this project
        description: Optional description of this project; ``None`` when omitted
    """

    url: URL
    name: str
    description: Optional[str] = None


Project = Union[MasteringProject, CategorizationProject]
5 changes: 5 additions & 0 deletions tamr_client/categorization/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""
Tamr - Categorization
See https://docs.tamr.com/docs/overall-workflow-classification
"""
from tamr_client.categorization import project
55 changes: 55 additions & 0 deletions tamr_client/categorization/project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from tamr_client._types import (
CategorizationProject,
Dataset,
Instance,
JsonDict,
Session,
URL,
)
from tamr_client.dataset import _dataset, unified


def _from_json(url: URL, data: JsonDict) -> CategorizationProject:
    """Build a Categorization project from its JSON representation (deserialize)

    Args:
        url: Project URL
        data: Project JSON data from Tamr server. Must contain "name";
            "description" is optional and becomes ``None`` when absent.

    Returns:
        The Categorization project described by ``data``

    Raises:
        KeyError: If ``data`` has no "name" entry
    """
    project_name = data["name"]
    project_description = data.get("description")
    return CategorizationProject(
        url, name=project_name, description=project_description
    )


def manual_labels(
    session: Session, instance: Instance, project: CategorizationProject
) -> Dataset:
    """Fetch the dataset holding the manual labels of a Categorization project

    The labels dataset is located by name: the name of the project's unified
    dataset followed by the suffix ``_manual_categorizations``.

    Args:
        session: Session used to send requests to the Tamr server
        instance: Tamr instance containing project
        project: Tamr project containing labels

    Returns:
        Dataset containing manual labels

    Raises:
        _dataset.NotFound: If no dataset matches the labels dataset name
        _dataset.Ambiguous: If multiple datasets match the labels dataset name
    """
    source = unified.from_project(session=session, instance=instance, project=project)
    labels_dataset_name = source.name + "_manual_categorizations"

    # Ask the server for every dataset whose name equals the labels dataset name.
    listing_url = URL(instance=instance, path="datasets")
    listing = session.get(
        url=str(listing_url), params={"filter": f"name=={labels_dataset_name}"}
    )
    candidates = listing.json()

    # Exactly one candidate is required; anything else is reported with the
    # URL of the listing request that produced it.
    count = len(candidates)
    if count == 0:
        raise _dataset.NotFound(str(listing.url))
    if count > 1:
        raise _dataset.Ambiguous(str(listing.url))

    labels_url = URL(instance=instance, path=candidates[0]["relativeId"])
    return _dataset._from_url(session=session, url=labels_url)
6 changes: 6 additions & 0 deletions tamr_client/dataset/_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ class NotFound(TamrClientException):
pass


class Ambiguous(TamrClientException):
    """Raised when a dataset referenced by name matches more than one possible target."""


def from_resource_id(session: Session, instance: Instance, id: str) -> Dataset:
"""Get dataset by resource ID
Expand Down
10 changes: 3 additions & 7 deletions tamr_client/project.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from tamr_client import response
from tamr_client._types import Instance, JsonDict, Project, Session, URL
from tamr_client.categorization import project as categorization_project
from tamr_client.exception import TamrClientException
from tamr_client.mastering import project as mastering_project

Expand All @@ -13,13 +14,10 @@ class NotFound(TamrClientException):

def from_resource_id(session: Session, instance: Instance, id: str) -> Project:
"""Get project by resource ID
Fetches project from Tamr server
Args:
instance: Tamr instance containing this dataset
id: Project ID
Raises:
project.NotFound: If no project could be found at the specified URL.
Corresponds to a 404 HTTP error.
Expand All @@ -31,12 +29,9 @@ def from_resource_id(session: Session, instance: Instance, id: str) -> Project:

def _from_url(session: Session, url: URL) -> Project:
"""Get project by URL
Fetches project from Tamr server
Args:
url: Project URL
Raises:
NotFound: If no project could be found at the specified URL.
Corresponds to a 404 HTTP error.
Expand All @@ -51,13 +46,14 @@ def _from_url(session: Session, url: URL) -> Project:

def _from_json(url: URL, data: JsonDict) -> Project:
"""Make project from JSON data (deserialize)
Args:
url: Project URL
data: Project JSON data from Tamr server
"""
proj_type = data["type"]
if proj_type == "DEDUP":
return mastering_project._from_json(url, data)
elif proj_type == "CATEGORIZATION":
return categorization_project._from_json(url, data)
else:
raise ValueError(f"Unrecognized project type '{proj_type}' in {repr(data)}")
Empty file.
13 changes: 13 additions & 0 deletions tests/tamr_client/categorization/test_project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import tamr_client as tc
from tests.tamr_client import fake


@fake.json
def test_manual_labels():
    """The manual labels dataset of a Categorization project is found by name"""
    s = fake.session()
    instance = fake.instance()
    project = fake.categorization_project()

    labels = tc.categorization.project.manual_labels(
        session=s, instance=instance, project=project
    )

    # Without assertions this test would only prove that no exception is raised;
    # check that the dataset described by the fake responses is what comes back.
    assert isinstance(labels, tc.Dataset)
    assert labels.name == "Party_Categorization_Unified_Dataset_manual_categorizations"
8 changes: 8 additions & 0 deletions tests/tamr_client/fake.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,14 @@ def mastering_project() -> tc.MasteringProject:
return mastering_project


def categorization_project() -> tc.CategorizationProject:
    """Build the fake Categorization project located at ``projects/2``"""
    return tc.CategorizationProject(
        tc.URL(path="projects/2"),
        name="Project 2",
        description="A Categorization Project",
    )


def transforms() -> tc.Transformations:
return tc.Transformations(
input_scope=[
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
[
{
"request": {
"method": "GET",
"path": "projects/2/unifiedDataset"
},
"response": {
"json": {
"id": "unify://unified-data/v1/datasets/161",
"name": "Party_Categorization_Unified_Dataset",
"description": "",
"version": "3607",
"keyAttributeNames": [
"tamr_id"
],
"tags": [],
"created": {
"username": "afsana.afzal",
"time": "2020-05-21T15:18:38.575Z",
"version": "18336"
},
"lastModified": {
"username": "workflow.bot",
"time": "2020-06-18T15:18:30.833Z",
"version": "149940"
},
"relativeId": "datasets/161",
"upstreamDatasetIds": [
"unify://unified-data/v1/datasets/106"
],
"externalId": "Party_Categorization_Unified_Dataset"
}
}
},
{
"request": {
"method": "GET",
"path": "datasets?filter=name==Party_Categorization_Unified_Dataset_manual_categorizations"
},
"response": {
"json": [
{
"id": "unify://unified-data/v1/datasets/167",
"name": "Party_Categorization_Unified_Dataset_manual_categorizations",
"description": "Manual categorizations",
"version": "2992",
"keyAttributeNames": [
"recordId"
],
"tags": [],
"created": {
"username": "afsana.afzal",
"time": "2020-06-01T20:49:46.549Z",
"version": "57920"
},
"lastModified": {
"username": "workflow.bot",
"time": "2020-06-18T15:32:44.631Z",
"version": "150069"
},
"relativeId": "datasets/167",
"upstreamDatasetIds": [],
"externalId": "Party_Categorization_Unified_Dataset_manual_categorizations"
}
]
}
},
{
"request": {
"method": "GET",
"path": "datasets/167"
},
"response": {
"json": {
"id": "unify://unified-data/v1/datasets/167",
"name": "Party_Categorization_Unified_Dataset_manual_categorizations",
"description": "Manual categorizations",
"version": "2992",
"keyAttributeNames": [
"recordId"
],
"tags": [],
"created": {
"username": "afsana.afzal",
"time": "2020-06-01T20:49:46.549Z",
"version": "57920"
},
"lastModified": {
"username": "workflow.bot",
"time": "2020-06-18T15:32:44.631Z",
"version": "150069"
},
"relativeId": "datasets/167",
"upstreamDatasetIds": [],
"externalId": "Party_Categorization_Unified_Dataset_manual_categorizations"
}
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[
{
"request": {
"method": "GET",
"path": "projects/2"
},
"response": {
"json": {
"id": "unify://unified-data/v1/projects/2",
"name": "Party Categorization",
"description": "Categorizes organization at the Party/Domestic level",
"type": "CATEGORIZATION",
"unifiedDatasetName": "party_categorization_unified_dataset",
"created": {
"username": "admin",
"time": "2020-08-04T14:54:11.767Z",
"version": "20"
},
"lastModified": {
"username": "admin",
"time": "2020-08-04T14:54:11.767Z",
"version": "21"
},
"relativeId": "projects/2",
"externalId": "98f9e4ee-1a35-4242-917d-1163363d5411"
}
}
}
]
11 changes: 11 additions & 0 deletions tests/tamr_client/test_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@ def test_from_resource_id_mastering():
assert project.description == "Mastering Project"


@fake.json
def test_from_resource_id_categorization():
    """Resource ID "2" resolves to the Categorization project from the fake responses"""
    fetched = tc.project.from_resource_id(fake.session(), fake.instance(), "2")

    assert isinstance(fetched, tc.CategorizationProject)
    expected_name = "Party Categorization"
    expected_description = "Categorizes organization at the Party/Domestic level"
    assert fetched.name == expected_name
    assert fetched.description == expected_description


@fake.json
def test_from_resource_id_not_found():
s = fake.session()
Expand Down

0 comments on commit 6d067fe

Please sign in to comment.