Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
## 0.12.0-dev
**BETA**

Important: Do not use BETA features for production workflows.

- [#372](https://github.com/Datatamer/tamr-client/issues/372) TC:Design for unified datasets
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should go at the bottom of this section, below

- Support for streaming records from a dataset via `tc.record.stream`

so that these are in chronological order.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was my assumption that they should be in chronological order with most recent at top.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Historically, we haven't cared about order for changes that are part of the same release since those changes become user-facing at the same time.

- `AnyDataset` can be any type of dataset.
- Unified Dataset is `tc.dataset.unified.UnifiedDataset`
- Any other Dataset is `tc.dataset.dataset.Dataset`
- Added function to get unified dataset from its project
- Added function to commit unified dataset

- [#367](https://github.com/Datatamer/tamr-client/issues/367) Support for projects:
- generic projects via `tc.project`
- Mastering projects via `tc.mastering.project`
Expand Down
1 change: 1 addition & 0 deletions docs/beta/dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
* [Dataset](/beta/dataset/dataset)
* [Record](/beta/dataset/record)
* [Dataframe](/beta/dataset/dataframe)
* [Unified](/beta/dataset/unified)
2 changes: 1 addition & 1 deletion docs/beta/dataset/dataset.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ Dataset
Exceptions
----------

.. autoclass:: tamr_client.DatasetNotFound
.. autoclass:: tamr_client.dataset.NotFound
:no-inherited-members:
13 changes: 13 additions & 0 deletions docs/beta/dataset/unified.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Unified
=======

.. autoclass:: tamr_client.dataset.unified.UnifiedDataset

.. autofunction:: tamr_client.dataset.unified.from_project
.. autofunction:: tamr_client.dataset.unified.commit

Exceptions
----------

.. autoclass:: tamr_client.dataset.unified.NotFound
:no-inherited-members:
6 changes: 4 additions & 2 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ def lint(session):
@nox.session(python="3.6")
def format(session):
session.run("poetry", "install", external=True)
check = "" if "--fix" in session.posargs else "--check"
session.run("black", check, ".")
if "--fix" in session.posargs:
session.run("black", ".")
else:
session.run("black", ".", "--check")


@nox.session(python="3.6")
Expand Down
2 changes: 1 addition & 1 deletion tamr_client/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from tamr_client import session

# datasets
from tamr_client.dataset import Dataset, DatasetNotFound
from tamr_client.dataset import AnyDataset, Dataset
from tamr_client import dataset

# records
Expand Down
6 changes: 3 additions & 3 deletions tamr_client/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# flake8: noqa
from tamr_client.dataset.dataset import Dataset
from tamr_client.dataset.dataset import AnyDataset, Dataset
from tamr_client.dataset.dataset import from_resource_id
from tamr_client.dataset.dataset import DatasetNotFound
from tamr_client.dataset import dataframe, record
from tamr_client.dataset.dataset import NotFound
from tamr_client.dataset import dataframe, record, unified
14 changes: 9 additions & 5 deletions tamr_client/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
"""
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional, Tuple
from typing import Optional, Tuple, Union

from tamr_client import response
from tamr_client.dataset.unified import UnifiedDataset
from tamr_client.instance import Instance
from tamr_client.session import Session
from tamr_client.types import JsonDict
from tamr_client.url import URL


class DatasetNotFound(Exception):
class NotFound(Exception):
"""Raised when referencing (e.g. updating or deleting) a dataset
that does not exist on the server.
"""
Expand All @@ -37,6 +38,9 @@ class Dataset:
description: Optional[str] = None


AnyDataset = Union[Dataset, UnifiedDataset]


def from_resource_id(session: Session, instance: Instance, id: str) -> Dataset:
"""Get dataset by resource ID

Expand All @@ -47,7 +51,7 @@ def from_resource_id(session: Session, instance: Instance, id: str) -> Dataset:
id: Dataset ID

Raises:
DatasetNotFound: If no dataset could be found at the specified URL.
dataset.NotFound: If no dataset could be found at the specified URL.
Corresponds to a 404 HTTP error.
requests.HTTPError: If any other HTTP error is encountered.
"""
Expand All @@ -64,13 +68,13 @@ def _from_url(session: Session, url: URL) -> Dataset:
url: Dataset URL

Raises:
DatasetNotFound: If no dataset could be found at the specified URL.
dataset.NotFound: If no dataset could be found at the specified URL.
Corresponds to a 404 HTTP error.
requests.HTTPError: If any other HTTP error is encountered.
"""
r = session.get(str(url))
if r.status_code == 404:
raise DatasetNotFound(str(url))
raise NotFound(str(url))
data = response.successful(r).json()
return _from_json(url, data)

Expand Down
4 changes: 2 additions & 2 deletions tamr_client/dataset/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import cast, Dict, IO, Iterable, Iterator, Optional

from tamr_client import response
from tamr_client.dataset.dataset import Dataset
from tamr_client.dataset.dataset import AnyDataset, Dataset
from tamr_client.session import Session
from tamr_client.types import JsonDict

Expand Down Expand Up @@ -145,7 +145,7 @@ def _delete_command(record: Dict, *, primary_key_name: str) -> Dict:
return {"action": "DELETE", "recordId": record[primary_key_name]}


def stream(session: Session, dataset: Dataset) -> Iterator[JsonDict]:
def stream(session: Session, dataset: AnyDataset) -> Iterator[JsonDict]:
"""Stream the records in this dataset as Python dictionaries.

Args:
Expand Down
108 changes: 108 additions & 0 deletions tamr_client/dataset/unified.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""
See https://docs.tamr.com/reference/dataset-models
"""
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional, Tuple

from tamr_client import response
from tamr_client.instance import Instance
from tamr_client.project import Project
from tamr_client.session import Session
from tamr_client.types import JsonDict
from tamr_client.url import URL


class NotFound(Exception):
    """Signals that a referenced unified dataset does not exist on the server.

    Raised, for example, when a unified dataset that is being fetched, updated
    or deleted cannot be found.
    """


@dataclass(frozen=True)
class UnifiedDataset:
    """A Tamr unified dataset

    Instances are immutable (frozen dataclass).

    See https://docs.tamr.com/reference/dataset-models

    Args:
        url: URL of the unified dataset
        name: Name of the unified dataset
        key_attribute_names: Names of the key attributes
            (``keyAttributeNames`` in the server's JSON representation)
        description: Optional description of the unified dataset
    """

    url: URL
    name: str
    key_attribute_names: Tuple[str, ...]
    description: Optional[str] = None


def from_project(
    session: Session, instance: Instance, project: Project
) -> UnifiedDataset:
    """Look up the unified dataset that belongs to a project

    The dataset is requested from the Tamr server at the project's
    ``unifiedDataset`` endpoint.

    Args:
        session: Tamr session used to issue the request
        instance: Tamr instance containing this dataset
        project: Tamr project of this Unified Dataset

    Returns:
        The unified dataset of the given project

    Raises:
        unified.NotFound: If no unified dataset could be found for the project.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    unified_path = f"{project.url.path}/unifiedDataset"
    return _from_url(session, URL(instance=instance, path=unified_path))


def _from_url(session: Session, url: URL) -> UnifiedDataset:
    """Fetch the unified dataset located at a URL

    Issues a GET request against the Tamr server and deserializes the JSON body.

    Args:
        session: Tamr session used to issue the request
        url: Dataset URL

    Raises:
        unified.NotFound: If no dataset could be found at the specified URL.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    address = str(url)
    reply = session.get(address)
    # A 404 gets its own exception type; everything else goes through the
    # generic response check below.
    if reply.status_code == 404:
        raise NotFound(address)
    return _from_json(url, response.successful(reply).json())


def _from_json(url: URL, data: JsonDict) -> UnifiedDataset:
    """Deserialize JSON data into a unified dataset

    Args:
        url: Unified Dataset URL
        data: Unified Dataset JSON data from Tamr server

    Returns:
        A unified dataset built from the ``name``, ``description`` (optional)
        and ``keyAttributeNames`` entries of the JSON data
    """
    # Work on a private copy so the caller's JSON data is never shared.
    fields = deepcopy(data)
    name = fields["name"]
    description = fields.get("description")
    key_names = tuple(fields["keyAttributeNames"])
    return UnifiedDataset(
        url, name=name, key_attribute_names=key_names, description=description
    )


def commit(session: Session, unified_dataset: UnifiedDataset) -> JsonDict:
    """Commit a unified dataset

    Sends a POST request to the dataset's ``:refresh`` endpoint.

    Args:
        session: The Tamr Session
        unified_dataset: The UnifiedDataset which will be committed

    Returns:
        The JSON body of the server's response
    """
    refresh_url = str(unified_dataset.url) + ":refresh"
    json_headers = {"Content-Type": "application/json", "Accept": "application/json"}
    reply = session.post(refresh_url, headers=json_headers)
    return response.successful(reply).json()
2 changes: 1 addition & 1 deletion tamr_client/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def from_resource_id(session: Session, instance: Instance, id: str) -> Project:
id: Project ID

Raises:
NotFound: If no project could be found at the specified URL.
project.NotFound: If no project could be found at the specified URL.
Corresponds to a 404 HTTP error.
requests.HTTPError: If any other HTTP error is encountered.
"""
Expand Down
22 changes: 22 additions & 0 deletions tests/tamr_client/data/operation.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"id": "1",
"type": "SPARK",
"description": "operation 1 description",
"status": {
"state": "PENDING",
"startTime": "",
"endTime": "",
"message": "Job has not yet been submitted to Spark"
},
"created": {
"username": "admin",
"time": "2020-06-12T18:21:42.288Z",
"version": "operation 1 created version"
},
"lastModified": {
"username": "admin",
"time": "2020-06-12T18:21:42.288Z",
"version": "operation 1 modified version"
},
"relativeId": "operations/1"
}
2 changes: 1 addition & 1 deletion tests/tamr_client/dataset/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@ def test_from_resource_id_dataset_not_found():
url = tc.URL(path="datasets/1")
responses.add(responses.GET, str(url), status=404)

with pytest.raises(tc.DatasetNotFound):
with pytest.raises(tc.dataset.NotFound):
tc.dataset.from_resource_id(s, instance, "1")
53 changes: 53 additions & 0 deletions tests/tamr_client/dataset/test_unified.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import pytest
import responses

import tamr_client as tc
from tests.tamr_client import utils


@responses.activate
def test_from_project():
    # Serve the dataset fixture from the project's unified dataset endpoint.
    endpoint = tc.URL(path="projects/1/unifiedDataset")
    responses.add(responses.GET, str(endpoint), json=utils.load_json("dataset.json"))

    fetched = tc.dataset.unified.from_project(
        utils.session(), utils.instance(), utils.mastering_project()
    )

    assert fetched.name == "dataset 1 name"
    assert fetched.description == "dataset 1 description"
    assert fetched.key_attribute_names == ("tamr_id",)


@responses.activate
def test_from_project_dataset_not_found():
    # The server answers 404 for the project's unified dataset endpoint.
    endpoint = tc.URL(path="projects/1/unifiedDataset")
    responses.add(responses.GET, str(endpoint), status=404)

    sess = utils.session()
    inst = utils.instance()
    proj = utils.mastering_project()

    with pytest.raises(tc.dataset.unified.NotFound):
        tc.dataset.unified.from_project(sess, inst, proj)


@responses.activate
def test_commit():
    sess = utils.session()

    # Fetch the unified dataset first so the commit targets a real object.
    dataset_endpoint = tc.URL(path="projects/1/unifiedDataset")
    responses.add(
        responses.GET, str(dataset_endpoint), json=utils.load_json("dataset.json")
    )
    fetched = tc.dataset.unified.from_project(
        sess, utils.instance(), utils.mastering_project()
    )

    # The refresh endpoint replies with the operation fixture.
    expected_operation = utils.load_json("operation.json")
    refresh_endpoint = tc.URL(path="projects/1/unifiedDataset:refresh")
    responses.add(responses.POST, str(refresh_endpoint), json=expected_operation)

    assert tc.dataset.unified.commit(sess, fetched) == expected_operation
16 changes: 16 additions & 0 deletions tests/tamr_client/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,22 @@ def dataset():
return dataset


def unified_dataset():
    """Build a sample unified dataset located at ``projects/1/unifiedDataset``."""
    return tc.dataset.unified.UnifiedDataset(
        tc.URL(path="projects/1/unifiedDataset"),
        name="dataset.csv",
        key_attribute_names=("primary_key",),
    )


def mastering_project():
    """Build a sample mastering project located at ``projects/1``."""
    return tc.mastering.Project(
        tc.URL(path="projects/1"),
        name="Project 1",
        description="A Mastering Project",
    )


def capture_payload(request, snoop, status, response_json):
"""Capture request body within `snoop` so we can inspect that the request body is constructed correctly (e.g. for streaming requests).

Expand Down