Clarifai · sanjaychelliah · Mar 19, 2024 · Mar 19, 2024 · Mar 19, 2024
diff --git a/README.md b/README.md
@@ -148,9 +148,7 @@ dataset.upload_from_csv(csv_path='csv_path', input_type='text', csv_type='raw',
 dataset.upload_from_folder(folder_path='folder_path', input_type='text', labels=True)
 
 # Export Dataset
-from clarifai.client.dataset import Dataset
-# Note: clarifai-data-protobuf.zip is acquired through exporting datasets within the Clarifai Platform.
-Dataset().export(save_path='output.zip', local_archive_path='clarifai-data-protobuf.zip')
+dataset.export(save_path='output.zip')
 ```
 
 

diff --git a/clarifai/client/app.py b/clarifai/client/app.py
@@ -88,6 +88,7 @@ def list_datasets(self, page_no: int = None,
     for dataset_info in all_datasets_info:
       if 'version' in dataset_info:
         dataset_info['version'].pop('metrics', None)
+        dataset_info['version'].pop('export_info', None)
       yield Dataset.from_auth_helper(auth=self.auth_helper, **dataset_info)
 
   def list_models(self,
@@ -457,11 +458,12 @@ def create_module(self, module_id: str, description: str, **kwargs) -> Module:
 
     return Module.from_auth_helper(auth=self.auth_helper, module_id=module_id, **kwargs)
 
-  def dataset(self, dataset_id: str, **kwargs) -> Dataset:
+  def dataset(self, dataset_id: str, dataset_version_id: str = None, **kwargs) -> Dataset:
     """Returns a Dataset object for the existing dataset ID.
 
     Args:
         dataset_id (str): The dataset ID for the dataset to interact with.
+        dataset_version_id (str): The version ID for the dataset version to interact with.
 
     Returns:
         Dataset: A Dataset object for the existing dataset ID.
@@ -480,6 +482,7 @@ def dataset(self, dataset_id: str, **kwargs) -> Dataset:
     kwargs = self.process_response_keys(dict_response[list(dict_response.keys())[1]],
                                         list(dict_response.keys())[1])
     kwargs['version'] = response.dataset.version if response.dataset.version else None
+    kwargs['dataset_version_id'] = dataset_version_id
     return Dataset.from_auth_helper(auth=self.auth_helper, **kwargs)
 
   def model(self, model_id: str, model_version_id: str = "", **kwargs) -> Model:

diff --git a/clarifai/client/dataset.py b/clarifai/client/dataset.py
@@ -43,6 +43,7 @@ class Dataset(Lister, BaseClient):
   def __init__(self,
                url: str = None,
                dataset_id: str = None,
+               dataset_version_id: str = None,
                base_url: str = "https://api.clarifai.com",
                pat: str = None,
                token: str = None,
@@ -52,6 +53,7 @@ def __init__(self,
     Args:
         url (str): The URL to initialize the dataset object.
         dataset_id (str): The Dataset ID within the App to interact with.
+        dataset_version_id (str): The Dataset Version ID within the Dataset to interact with.
         base_url (str): Base API url. Default "https://api.clarifai.com"
         pat (str): A personal access token for authentication. Can be set as env var CLARIFAI_PAT
         token (str): A session token for authentication. Accepts either a session token or a pat. Can be set as env var CLARIFAI_SESSION_TOKEN
@@ -60,9 +62,13 @@ def __init__(self,
     if url and dataset_id:
       raise UserError("You can only specify one of url or dataset_id.")
     if url:
-      user_id, app_id, _, dataset_id, _ = ClarifaiUrlHelper.split_clarifai_url(url)
+      user_id, app_id, _, dataset_id, dataset_version_id = ClarifaiUrlHelper.split_clarifai_url(
+          url)
       kwargs = {'user_id': user_id, 'app_id': app_id}
-    self.kwargs = {**kwargs, 'id': dataset_id}
+    dataset_version = {
+        'id': dataset_version_id
+    } if dataset_version_id else kwargs['version'] if 'version' in kwargs else None
+    self.kwargs = {**kwargs, 'id': dataset_id, 'version': dataset_version}
     self.dataset_info = resources_pb2.Dataset(**self.kwargs)
     # Related to Dataset Upload
     self.num_workers: int = min(10, cpu_count())  #15 req/sec rate limit
@@ -162,18 +168,16 @@ def list_versions(self, page_no: int = None,
     for dataset_version_info in all_dataset_versions_info:
       dataset_version_info['id'] = dataset_version_info['dataset_version_id']
       del dataset_version_info['dataset_version_id']
-      del dataset_version_info['metrics']
+      dataset_version_info.pop('metrics', None)
+      dataset_version_info.pop('export_info', None)
       kwargs = {
           'dataset_id': self.id,
           'version': resources_pb2.DatasetVersion(**dataset_version_info),
       }
       yield Dataset.from_auth_helper(self.auth_helper, **kwargs)
 
-  def iter_inputs(self):
-    return iter(DatasetExportReader(archive_url=self.archive_zip()))
-
   def __iter__(self):
-    return self.iter_inputs()
+    return iter(DatasetExportReader(archive_url=self.archive_zip()))
 
   def _concurrent_annot_upload(self, annots: List[List[resources_pb2.Annotation]]
                               ) -> Union[List[resources_pb2.Annotation], List[None]]:

diff --git a/clarifai/datasets/export/inputs_annotations.py b/clarifai/datasets/export/inputs_annotations.py
@@ -116,7 +116,8 @@ def __init__(self,
     """
     self.input_iterator = input_iterator
     self.num_workers = min(num_workers, 10)  # Max 10 threads
-    self.num_inputs_annotations = 0
+    self.num_inputs = 0
+    self.num_annotations = 0
     self.split_prefix = None
     self.session = session
     self.input_ext = dict(image=".png", text=".txt", audio=".mp3", video=".mp4")
@@ -187,14 +188,14 @@ def _write_archive(self, input_, new_archive, split: Optional[str] = None) -> No
         self._save_audio_to_archive(new_archive, hosted_url, file_name)
       elif input_type == "video":
         self._save_video_to_archive(new_archive, hosted_url, file_name)
-      self.num_inputs_annotations += 1
+      self.num_inputs += 1
 
     if data_dict.get("concepts") or data_dict.get("regions"):
       file_name = os.path.join(split, "annotations", input_.id + ".json")
       annot_data = data_dict.get("concepts") or data_dict.get("regions")
 
       self._save_annotation_to_archive(new_archive, annot_data, file_name)
-      self.num_inputs_annotations += 1
+      self.num_annotations += 1
 
   def _check_output_archive(self, save_path: str) -> None:
     try:
@@ -203,8 +204,8 @@ def _check_output_archive(self, save_path: str) -> None:
       raise e
     assert len(
         archive.namelist()
-    ) == self.num_inputs_annotations, "Archive has %d inputs+annotations | expecting %d inputs+annotations" % (
-        len(archive.namelist()), self.num_inputs_annotations)
+    ) == self.num_inputs + self.num_annotations, "Archive has %d inputs+annotations | expecting %d inputs+annotations" % (
+        len(archive.namelist()), self.num_inputs + self.num_annotations)
 
   def download_archive(self, save_path: str, split: Optional[str] = None) -> None:
     """Downloads the archive from the URL into an archive of inputs, annotations in the directory format
@@ -223,5 +224,5 @@ def download_archive(self, save_path: str, split: Optional[str] = None) -> None:
             progress.update()
 
     self._check_output_archive(save_path)
-    logger.info("Downloaded %d inputs+annotations to %s" % (self.num_inputs_annotations,
-                                                            save_path))
+    logger.info("Downloaded %d inputs and %d annotations to %s" %
+                (self.num_inputs, self.num_annotations, save_path))