Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[DEVX-396]: Dataset Version ID support #315

Merged
merged 2 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,7 @@ dataset.upload_from_csv(csv_path='csv_path', input_type='text', csv_type='raw',
dataset.upload_from_folder(folder_path='folder_path', input_type='text', labels=True)

# Export Dataset
from clarifai.client.dataset import Dataset
# Note: clarifai-data-protobuf.zip is acquired through exporting datasets within the Clarifai Platform.
Dataset().export(save_path='output.zip', local_archive_path='clarifai-data-protobuf.zip')
dataset.export(save_path='output.zip')
```


Expand Down
5 changes: 4 additions & 1 deletion clarifai/client/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def list_datasets(self, page_no: int = None,
for dataset_info in all_datasets_info:
if 'version' in dataset_info:
dataset_info['version'].pop('metrics', None)
dataset_info['version'].pop('export_info', None)
yield Dataset.from_auth_helper(auth=self.auth_helper, **dataset_info)

def list_models(self,
Expand Down Expand Up @@ -457,11 +458,12 @@ def create_module(self, module_id: str, description: str, **kwargs) -> Module:

return Module.from_auth_helper(auth=self.auth_helper, module_id=module_id, **kwargs)

def dataset(self, dataset_id: str, **kwargs) -> Dataset:
def dataset(self, dataset_id: str, dataset_version_id: str = None, **kwargs) -> Dataset:
"""Returns a Dataset object for the existing dataset ID.

Args:
dataset_id (str): The dataset ID for the dataset to interact with.
dataset_version_id (str): The version ID for the dataset version to interact with.

Returns:
Dataset: A Dataset object for the existing dataset ID.
Expand All @@ -480,6 +482,7 @@ def dataset(self, dataset_id: str, **kwargs) -> Dataset:
kwargs = self.process_response_keys(dict_response[list(dict_response.keys())[1]],
list(dict_response.keys())[1])
kwargs['version'] = response.dataset.version if response.dataset.version else None
kwargs['dataset_version_id'] = dataset_version_id
return Dataset.from_auth_helper(auth=self.auth_helper, **kwargs)

def model(self, model_id: str, model_version_id: str = "", **kwargs) -> Model:
Expand Down
18 changes: 11 additions & 7 deletions clarifai/client/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class Dataset(Lister, BaseClient):
def __init__(self,
url: str = None,
dataset_id: str = None,
dataset_version_id: str = None,
base_url: str = "https://api.clarifai.com",
pat: str = None,
token: str = None,
Expand All @@ -52,6 +53,7 @@ def __init__(self,
Args:
url (str): The URL to initialize the dataset object.
dataset_id (str): The Dataset ID within the App to interact with.
dataset_version_id (str): The Dataset Version ID within the Dataset to interact with.
base_url (str): Base API url. Default "https://api.clarifai.com"
pat (str): A personal access token for authentication. Can be set as env var CLARIFAI_PAT
token (str): A session token for authentication. Accepts either a session token or a pat. Can be set as env var CLARIFAI_SESSION_TOKEN
Expand All @@ -60,9 +62,13 @@ def __init__(self,
if url and dataset_id:
raise UserError("You can only specify one of url or dataset_id.")
if url:
user_id, app_id, _, dataset_id, _ = ClarifaiUrlHelper.split_clarifai_url(url)
user_id, app_id, _, dataset_id, dataset_version_id = ClarifaiUrlHelper.split_clarifai_url(
url)
kwargs = {'user_id': user_id, 'app_id': app_id}
self.kwargs = {**kwargs, 'id': dataset_id}
dataset_version = {
'id': dataset_version_id
} if dataset_version_id else kwargs['version'] if 'version' in kwargs else None
self.kwargs = {**kwargs, 'id': dataset_id, 'version': dataset_version}
self.dataset_info = resources_pb2.Dataset(**self.kwargs)
# Related to Dataset Upload
self.num_workers: int = min(10, cpu_count()) #15 req/sec rate limit
Expand Down Expand Up @@ -162,18 +168,16 @@ def list_versions(self, page_no: int = None,
for dataset_version_info in all_dataset_versions_info:
dataset_version_info['id'] = dataset_version_info['dataset_version_id']
del dataset_version_info['dataset_version_id']
del dataset_version_info['metrics']
dataset_version_info.pop('metrics', None)
dataset_version_info.pop('export_info', None)
kwargs = {
'dataset_id': self.id,
'version': resources_pb2.DatasetVersion(**dataset_version_info),
}
yield Dataset.from_auth_helper(self.auth_helper, **kwargs)

def iter_inputs(self):
return iter(DatasetExportReader(archive_url=self.archive_zip()))

def __iter__(self):
return self.iter_inputs()
return iter(DatasetExportReader(archive_url=self.archive_zip()))

def _concurrent_annot_upload(self, annots: List[List[resources_pb2.Annotation]]
) -> Union[List[resources_pb2.Annotation], List[None]]:
Expand Down
15 changes: 8 additions & 7 deletions clarifai/datasets/export/inputs_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ def __init__(self,
"""
self.input_iterator = input_iterator
self.num_workers = min(num_workers, 10) # Max 10 threads
self.num_inputs_annotations = 0
self.num_inputs = 0
self.num_annotations = 0
self.split_prefix = None
self.session = session
self.input_ext = dict(image=".png", text=".txt", audio=".mp3", video=".mp4")
Expand Down Expand Up @@ -187,14 +188,14 @@ def _write_archive(self, input_, new_archive, split: Optional[str] = None) -> No
self._save_audio_to_archive(new_archive, hosted_url, file_name)
elif input_type == "video":
self._save_video_to_archive(new_archive, hosted_url, file_name)
self.num_inputs_annotations += 1
self.num_inputs += 1

if data_dict.get("concepts") or data_dict.get("regions"):
file_name = os.path.join(split, "annotations", input_.id + ".json")
annot_data = data_dict.get("concepts") or data_dict.get("regions")

self._save_annotation_to_archive(new_archive, annot_data, file_name)
self.num_inputs_annotations += 1
self.num_annotations += 1

def _check_output_archive(self, save_path: str) -> None:
try:
Expand All @@ -203,8 +204,8 @@ def _check_output_archive(self, save_path: str) -> None:
raise e
assert len(
archive.namelist()
) == self.num_inputs_annotations, "Archive has %d inputs+annotations | expecting %d inputs+annotations" % (
len(archive.namelist()), self.num_inputs_annotations)
) == self.num_inputs + self.num_annotations, "Archive has %d inputs+annotations | expecting %d inputs+annotations" % (
len(archive.namelist()), self.num_inputs + self.num_annotations)

def download_archive(self, save_path: str, split: Optional[str] = None) -> None:
"""Downloads the archive from the URL into an archive of inputs, annotations in the directory format
Expand All @@ -223,5 +224,5 @@ def download_archive(self, save_path: str, split: Optional[str] = None) -> None:
progress.update()

self._check_output_archive(save_path)
logger.info("Downloaded %d inputs+annotations to %s" % (self.num_inputs_annotations,
save_path))
logger.info("Downloaded %d inputs and %d annotations to %s" %
(self.num_inputs, self.num_annotations, save_path))