Skip to content

Commit

Permalink
Merge pull request #389 from ianbakst/master
Browse files Browse the repository at this point in the history
Unified Dataset dataclass
  • Loading branch information
pcattori committed Jun 17, 2020
2 parents 3bc45ee + 3519447 commit 344f024
Show file tree
Hide file tree
Showing 15 changed files with 242 additions and 18 deletions.
9 changes: 7 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
## 0.12.0-dev
**BETA**

Important: Do not use BETA features for production workflows.

- [#372](https://github.com/Datatamer/tamr-client/issues/372) TC:Design for unified datasets
- `AnyDataset` can be any type of dataset.
- Unified Dataset is `tc.dataset.unified.UnifiedDataset`
- Any other Dataset is `tc.dataset.dataset.Dataset`
- Added function to get unified dataset from its project
- Added function to commit unified dataset

- [#367](https://github.com/Datatamer/tamr-client/issues/367) Support for projects:
- generic projects via `tc.project`
- Mastering projects via `tc.mastering.project`
Expand Down
1 change: 1 addition & 0 deletions docs/beta/dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
* [Dataset](/beta/dataset/dataset)
* [Record](/beta/dataset/record)
* [Dataframe](/beta/dataset/dataframe)
* [Unified](/beta/dataset/unified)
2 changes: 1 addition & 1 deletion docs/beta/dataset/dataset.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ Dataset
Exceptions
----------

.. autoclass:: tamr_client.DatasetNotFound
.. autoclass:: tamr_client.dataset.NotFound
:no-inherited-members:
13 changes: 13 additions & 0 deletions docs/beta/dataset/unified.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Unified
=======

.. autoclass:: tamr_client.dataset.unified.UnifiedDataset

.. autofunction:: tamr_client.dataset.unified.from_project
.. autofunction:: tamr_client.dataset.unified.commit

Exceptions
----------

.. autoclass:: tamr_client.dataset.unified.NotFound
:no-inherited-members:
6 changes: 4 additions & 2 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ def lint(session):
@nox.session(python="3.6")
def format(session):
    """Run the black formatter over the repository.

    By default black runs in ``--check`` mode, which only reports files
    that would be reformatted. Passing ``--fix`` as a positional argument
    to the nox session makes black rewrite the files instead.

    Args:
        session: The nox session used to run commands.
    """
    session.run("poetry", "install", external=True)
    # Invoke black exactly once, and never hand it an empty-string argument:
    # building the flag as `"" if fixing else "--check"` would pass "" to
    # black as if it were a path.
    if "--fix" in session.posargs:
        session.run("black", ".")
    else:
        session.run("black", ".", "--check")


@nox.session(python="3.6")
Expand Down
2 changes: 1 addition & 1 deletion tamr_client/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from tamr_client import session

# datasets
from tamr_client.dataset import Dataset, DatasetNotFound
from tamr_client.dataset import AnyDataset, Dataset
from tamr_client import dataset

# records
Expand Down
6 changes: 3 additions & 3 deletions tamr_client/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# flake8: noqa
from tamr_client.dataset.dataset import Dataset
from tamr_client.dataset.dataset import AnyDataset, Dataset
from tamr_client.dataset.dataset import from_resource_id
from tamr_client.dataset.dataset import DatasetNotFound
from tamr_client.dataset import dataframe, record
from tamr_client.dataset.dataset import NotFound
from tamr_client.dataset import dataframe, record, unified
14 changes: 9 additions & 5 deletions tamr_client/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
"""
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional, Tuple
from typing import Optional, Tuple, Union

from tamr_client import response
from tamr_client.dataset.unified import UnifiedDataset
from tamr_client.instance import Instance
from tamr_client.session import Session
from tamr_client.types import JsonDict
from tamr_client.url import URL


class DatasetNotFound(Exception):
class NotFound(Exception):
"""Raised when referencing (e.g. updating or deleting) a dataset
that does not exist on the server.
"""
Expand All @@ -37,6 +38,9 @@ class Dataset:
description: Optional[str] = None


AnyDataset = Union[Dataset, UnifiedDataset]


def from_resource_id(session: Session, instance: Instance, id: str) -> Dataset:
"""Get dataset by resource ID
Expand All @@ -47,7 +51,7 @@ def from_resource_id(session: Session, instance: Instance, id: str) -> Dataset:
id: Dataset ID
Raises:
DatasetNotFound: If no dataset could be found at the specified URL.
dataset.NotFound: If no dataset could be found at the specified URL.
Corresponds to a 404 HTTP error.
requests.HTTPError: If any other HTTP error is encountered.
"""
Expand All @@ -64,13 +68,13 @@ def _from_url(session: Session, url: URL) -> Dataset:
url: Dataset URL
Raises:
DatasetNotFound: If no dataset could be found at the specified URL.
dataset.NotFound: If no dataset could be found at the specified URL.
Corresponds to a 404 HTTP error.
requests.HTTPError: If any other HTTP error is encountered.
"""
r = session.get(str(url))
if r.status_code == 404:
raise DatasetNotFound(str(url))
raise NotFound(str(url))
data = response.successful(r).json()
return _from_json(url, data)

Expand Down
4 changes: 2 additions & 2 deletions tamr_client/dataset/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import cast, Dict, IO, Iterable, Iterator, Optional

from tamr_client import response
from tamr_client.dataset.dataset import Dataset
from tamr_client.dataset.dataset import AnyDataset, Dataset
from tamr_client.session import Session
from tamr_client.types import JsonDict

Expand Down Expand Up @@ -145,7 +145,7 @@ def _delete_command(record: Dict, *, primary_key_name: str) -> Dict:
return {"action": "DELETE", "recordId": record[primary_key_name]}


def stream(session: Session, dataset: Dataset) -> Iterator[JsonDict]:
def stream(session: Session, dataset: AnyDataset) -> Iterator[JsonDict]:
"""Stream the records in this dataset as Python dictionaries.
Args:
Expand Down
108 changes: 108 additions & 0 deletions tamr_client/dataset/unified.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""
See https://docs.tamr.com/reference/dataset-models
"""
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional, Tuple

from tamr_client import response
from tamr_client.instance import Instance
from tamr_client.project import Project
from tamr_client.session import Session
from tamr_client.types import JsonDict
from tamr_client.url import URL


class NotFound(Exception):
    """Error for a unified dataset that is missing on the server.

    Raised when referencing (e.g. updating or deleting) a unified dataset
    that does not exist on the server.
    """


@dataclass(frozen=True)
class UnifiedDataset:
    """A Tamr unified dataset.

    Instances are immutable (the dataclass is frozen).

    See https://docs.tamr.com/reference/dataset-models

    Args:
        url: URL of the unified dataset
        name: Name of the unified dataset
        key_attribute_names: Names of the attributes that make up the key
        description: Optional description; defaults to ``None``
    """

    url: URL
    name: str
    key_attribute_names: Tuple[str, ...]
    description: Optional[str] = None


def from_project(
    session: Session, instance: Instance, project: Project
) -> UnifiedDataset:
    """Get the unified dataset of a project.

    Fetches the unified dataset of the given project from the Tamr server.
    The dataset URL is the project's URL path with ``/unifiedDataset``
    appended.

    Args:
        session: Tamr session used to make the request
        instance: Tamr instance containing this dataset
        project: Tamr project of this unified dataset

    Returns:
        The unified dataset of the project.

    Raises:
        unified.NotFound: If no unified dataset could be found at the specified URL.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    unified_dataset_path = f"{project.url.path}/unifiedDataset"
    return _from_url(session, URL(instance=instance, path=unified_dataset_path))


def _from_url(session: Session, url: URL) -> UnifiedDataset:
    """Get a unified dataset by URL.

    Fetches the dataset from the Tamr server and deserializes it.

    Args:
        session: Tamr session used to make the request
        url: Dataset URL

    Returns:
        The unified dataset built from the server's JSON response.

    Raises:
        unified.NotFound: If no dataset could be found at the specified URL.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    address = str(url)
    resp = session.get(address)
    # A 404 gets its own exception type; every other failure is left to
    # `response.successful`.
    if resp.status_code == 404:
        raise NotFound(address)
    payload = response.successful(resp).json()
    return _from_json(url, payload)


def _from_json(url: URL, data: JsonDict) -> UnifiedDataset:
    """Make a unified dataset from JSON data (deserialize).

    Reads the ``name``, ``description`` (optional) and ``keyAttributeNames``
    entries from a copy of ``data``; the input is not modified.

    Args:
        url: Unified Dataset URL
        data: Unified Dataset JSON data from Tamr server

    Returns:
        The deserialized unified dataset.
    """
    snapshot = deepcopy(data)
    name = snapshot["name"]
    description = snapshot.get("description")
    key_attribute_names = tuple(snapshot["keyAttributeNames"])
    return UnifiedDataset(
        url,
        name=name,
        description=description,
        key_attribute_names=key_attribute_names,
    )


def commit(session: Session, unified_dataset: UnifiedDataset) -> JsonDict:
    """Commit the unified dataset.

    Sends a POST request to the unified dataset's URL with ``:refresh``
    appended.

    Args:
        session: The Tamr Session
        unified_dataset: The UnifiedDataset which will be committed

    Returns:
        The JSON body of the server's response (presumably a description of
        the refresh operation -- confirm against the Tamr API).
    """
    refresh_url = str(unified_dataset.url) + ":refresh"
    json_headers = {"Content-Type": "application/json", "Accept": "application/json"}
    resp = session.post(refresh_url, headers=json_headers)
    return response.successful(resp).json()
2 changes: 1 addition & 1 deletion tamr_client/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def from_resource_id(session: Session, instance: Instance, id: str) -> Project:
id: Project ID
Raises:
NotFound: If no project could be found at the specified URL.
project.NotFound: If no project could be found at the specified URL.
Corresponds to a 404 HTTP error.
requests.HTTPError: If any other HTTP error is encountered.
"""
Expand Down
22 changes: 22 additions & 0 deletions tests/tamr_client/data/operation.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"id": "1",
"type": "SPARK",
"description": "operation 1 description",
"status": {
"state": "PENDING",
"startTime": "",
"endTime": "",
"message": "Job has not yet been submitted to Spark"
},
"created": {
"username": "admin",
"time": "2020-06-12T18:21:42.288Z",
"version": "operation 1 created version"
},
"lastModified": {
"username": "admin",
"time": "2020-06-12T18:21:42.288Z",
"version": "operation 1 modified version"
},
"relativeId": "operations/1"
}
2 changes: 1 addition & 1 deletion tests/tamr_client/dataset/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@ def test_from_resource_id_dataset_not_found():
url = tc.URL(path="datasets/1")
responses.add(responses.GET, str(url), status=404)

with pytest.raises(tc.DatasetNotFound):
with pytest.raises(tc.dataset.NotFound):
tc.dataset.from_resource_id(s, instance, "1")
53 changes: 53 additions & 0 deletions tests/tamr_client/dataset/test_unified.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import pytest
import responses

import tamr_client as tc
from tests.tamr_client import utils


@responses.activate
def test_from_project():
    """`from_project` deserializes the dataset served at the project's unifiedDataset URL."""
    session = utils.session()
    instance = utils.instance()
    project = utils.mastering_project()

    # Serve the canned dataset JSON at the project's unified dataset endpoint.
    endpoint = str(tc.URL(path="projects/1/unifiedDataset"))
    responses.add(responses.GET, endpoint, json=utils.load_json("dataset.json"))

    fetched = tc.dataset.unified.from_project(session, instance, project)

    assert fetched.name == "dataset 1 name"
    assert fetched.description == "dataset 1 description"
    assert fetched.key_attribute_names == ("tamr_id",)


@responses.activate
def test_from_project_dataset_not_found():
    """A 404 from the unifiedDataset endpoint raises `unified.NotFound`."""
    session = utils.session()
    instance = utils.instance()
    project = utils.mastering_project()

    endpoint = str(tc.URL(path="projects/1/unifiedDataset"))
    responses.add(responses.GET, endpoint, status=404)

    with pytest.raises(tc.dataset.unified.NotFound):
        tc.dataset.unified.from_project(session, instance, project)


@responses.activate
def test_commit():
    """`commit` POSTs to the `:refresh` endpoint and returns the response JSON."""
    session = utils.session()
    instance = utils.instance()
    project = utils.mastering_project()

    operation_json = utils.load_json("operation.json")
    dataset_json = utils.load_json("dataset.json")

    # First fetch the unified dataset through the mocked GET endpoint.
    dataset_endpoint = str(tc.URL(path="projects/1/unifiedDataset"))
    responses.add(responses.GET, dataset_endpoint, json=dataset_json)
    unified_dataset = tc.dataset.unified.from_project(session, instance, project)

    # Then mock the refresh endpoint that `commit` is expected to hit.
    refresh_endpoint = str(tc.URL(path="projects/1/unifiedDataset:refresh"))
    responses.add(responses.POST, refresh_endpoint, json=operation_json)

    result = tc.dataset.unified.commit(session, unified_dataset)
    assert result == operation_json
16 changes: 16 additions & 0 deletions tests/tamr_client/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,22 @@ def dataset():
return dataset


def unified_dataset():
    """Build a `UnifiedDataset` fixture located at ``projects/1/unifiedDataset``."""
    return tc.dataset.unified.UnifiedDataset(
        tc.URL(path="projects/1/unifiedDataset"),
        name="dataset.csv",
        key_attribute_names=("primary_key",),
    )


def mastering_project():
    """Build a mastering `Project` fixture located at ``projects/1``."""
    return tc.mastering.Project(
        tc.URL(path="projects/1"),
        name="Project 1",
        description="A Mastering Project",
    )


def capture_payload(request, snoop, status, response_json):
"""Capture request body within `snoop` so we can inspect that the request body is constructed correctly (e.g. for streaming requests).
Expand Down

0 comments on commit 344f024

Please sign in to comment.