# cli_functions.py
import argparse
import concurrent.futures
import datetime
import json
import os
import sys
import traceback
from glob import glob
from itertools import tee
from pathlib import Path
from typing import Dict, Iterator, List, NoReturn, Optional, Set, Union
import humanize
from rich.console import Console
from rich.live import Live
from rich.progress import (
BarColumn,
DownloadColumn,
Progress,
SpinnerColumn,
TaskID,
TextColumn,
TimeRemainingColumn,
TransferSpeedColumn,
)
from rich.table import Table
from rich.theme import Theme
from darwin.client import Client
from darwin.config import Config
from darwin.dataset import RemoteDataset
from darwin.dataset.identifier import DatasetIdentifier
from darwin.dataset.release import Release
from darwin.dataset.split_manager import split_dataset
from darwin.dataset.upload_manager import LocalFile
from darwin.dataset.utils import get_release_path
from darwin.datatypes import (
AnnotatorReportGrouping,
ExportParser,
ImportParser,
NumberLike,
PathLike,
Team,
UnknownType,
)
from darwin.exceptions import (
AnnotationFileValidationError,
IncompatibleOptions,
InvalidLogin,
MissingConfig,
MissingDependency,
MissingSchema,
NameTaken,
NotFound,
Unauthenticated,
UnknownAnnotationFileSchema,
UnrecognizableFileEncoding,
UnsupportedExportFormat,
UnsupportedFileType,
ValidationError,
)
from darwin.exporter import ExporterNotFoundError, export_annotations, get_exporter
from darwin.exporter.formats import supported_formats as export_formats
from darwin.extractor import video
from darwin.importer import ImporterNotFoundError, get_importer, import_annotations
from darwin.importer.formats import supported_formats as import_formats
from darwin.item import DatasetItem
from darwin.utils import (
BLOCKED_UPLOAD_ERROR_ALREADY_EXISTS,
find_files,
persist_client_configuration,
prompt,
secure_continue_request,
validate_file_against_schema,
)
def validate_api_key(api_key: str) -> None:
"""
Validates the given API key. Exits the application if it fails validation.
Parameters
----------
api_key: str
The API key to be validated.
"""
example_key = "DHMhAWr.BHucps-tKMAi6rWF1xieOpUvNe5WzrHP"
if len(api_key) != 40:
_error(f"Expected key to be 40 characters long\n(example: {example_key})")
if "." not in api_key:
_error(f"Expected key formatted as prefix . suffix\n(example: {example_key})")
if len(api_key.split(".")[0]) != 7:
_error(f"Expected key prefix to be 7 characters long\n(example: {example_key})")
def authenticate(
api_key: str,
default_team: Optional[Union[str, bool]] = None,
datasets_dir: Optional[Path] = None,
) -> Config:
"""
Authenticates the API key against the server and creates a configuration file for it.
Parameters
----------
api_key : str
API key to use for the client login.
default_team: Optional[Union[str, bool]]
Either a team slug to set as the default team, or a flag to make the client's team the default one. Defaults to None.
datasets_dir: Optional[Path]
Dataset directory on the file system. Defaults to None.
Returns
-------
Config
A configuration object to handle YAML files.
"""
# Resolve the home folder if the dataset_dir starts with ~ or ~user
validate_api_key(api_key)
try:
client = Client.from_api_key(api_key=api_key)
config_path = Path.home() / ".darwin" / "config.yaml"
config_path.parent.mkdir(exist_ok=True)
if default_team is None:
default_team_name = (
client.default_team
if input(f"Make {client.default_team} the default team? [y/N] ")
in ["Y", "y"]
else None
)
elif default_team is False:
default_team_name = None
elif default_team is True:
default_team_name = client.default_team
else:
default_team_name = default_team
if datasets_dir is None:
datasets_dir = Path(prompt("Datasets directory", "~/.darwin/datasets"))
datasets_dir = Path(datasets_dir).expanduser()
Path(datasets_dir).mkdir(parents=True, exist_ok=True)
client.set_datasets_dir(datasets_dir)
return persist_client_configuration(client, default_team=default_team_name)
except InvalidLogin:
_error("Invalid API key")
def current_team() -> None:
"""Print the team currently authenticated against."""
client: Client = _load_client()
print(client.default_team)
def list_teams() -> None:
"""Print a table of teams to which the client belong to."""
for team in _config().get_all_teams():
if team.default:
print(f"{team.slug} (default)")
else:
print(team.slug)
def set_team(team_slug: str) -> None:
"""
Switches the client to the selected team and persists the change in the configuration file.
Parameters
----------
team_slug : str
Slug of the team to switch to.
"""
config = _config()
config.set_default_team(team_slug)
def set_compression_level(compression_level: int) -> None:
"""
Changes the compression level of text/JSON content sent to Darwin APIs and persists the change in the configuration file.
The level ranges from 0 (no compression) to 9 (best compression). By default, 0 is used.
Parameters
----------
compression_level : int
Compression level to use.
"""
config = _config()
config.set_compression_level(compression_level)
def create_dataset(dataset_slug: str) -> None:
"""
Creates a dataset remotely. Exits the application if the dataset's name is already taken or is
not valid.
Parameters
----------
dataset_slug : str
Slug of the new dataset.
"""
identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
client: Client = _load_client(team_slug=identifier.team_slug)
try:
dataset: RemoteDataset = client.create_dataset(name=identifier.dataset_slug)
print(
f"Dataset '{dataset.name}' ({dataset.team}/{dataset.slug}) has been created.\nAccess at {dataset.remote_path}"
)
print_new_version_info(client)
except NameTaken:
_error(f"Dataset name '{identifier.dataset_slug}' is already taken.")
except ValidationError:
_error(f"Dataset name '{identifier.dataset_slug}' is not valid.")
def local(team: Optional[str] = None) -> None:
"""
Lists the locally synced datasets stored in the datasets directory.
Parameters
----------
team: Optional[str]
The name of the team whose datasets to list; uses the default team if none is given. Defaults to None.
"""
table: Table = Table(show_header=True, header_style="bold cyan")
table.add_column("Name")
table.add_column("Image Count", justify="right")
table.add_column("Sync Date", justify="right")
table.add_column("Size", justify="right")
client: Client = _load_client()
for dataset_path in client.list_local_datasets(team_slug=team):
files_in_dataset_path = find_files([dataset_path])
table.add_row(
f"{dataset_path.parent.name}/{dataset_path.name}",
str(len(files_in_dataset_path)),
humanize.naturaldate(
datetime.datetime.fromtimestamp(dataset_path.stat().st_mtime)
),
humanize.naturalsize(sum(p.stat().st_size for p in files_in_dataset_path)),
)
Console().print(table)
def path(dataset_slug: str) -> Path:
"""
Returns the absolute path of the specified dataset.
Exits the application if the dataset does not exist locally.
Parameters
----------
dataset_slug: str
The dataset's slug.
Returns
-------
Path
The absolute path of the dataset.
"""
identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
client: Client = _load_client()
for path in client.list_local_datasets(team_slug=identifier.team_slug):
if identifier.dataset_slug == path.name:
return path
_error(
f"Dataset '{identifier.dataset_slug}' does not exist locally. "
f"Use 'darwin dataset remote' to see all the available datasets, "
f"and 'darwin dataset pull' to pull them."
)
def url(dataset_slug: str) -> None:
"""
Prints the url of the specified dataset.
Exits the application if no dataset was found.
Parameters
----------
dataset_slug: str
The dataset's slug.
"""
client: Client = _load_client()
try:
remote_dataset: RemoteDataset = client.get_remote_dataset(
dataset_identifier=dataset_slug
)
print(remote_dataset.remote_path)
except NotFound as e:
_error(f"Dataset '{e.name}' does not exist.")
def export_dataset(
dataset_slug: str,
include_url_token: bool,
name: str,
annotation_class_ids: Optional[List[str]] = None,
include_authorship: bool = False,
version: Optional[str] = None,
) -> None:
"""
Create a new release for the dataset.
Parameters
----------
dataset_slug : str
Slug of the dataset to which we perform the operation on.
include_url_token : bool, default: False
If ``True`` includes the url token, if ``False`` does not.
name : str
Name of the release.
annotation_class_ids : Optional[List[str]], default: None
List of the classes to filter.
include_authorship : bool, default: False
If ``True`` include annotator and reviewer metadata for each annotation.
version : Optional[str], default: None
When used for a V2 dataset, forces generation of either Darwin JSON 1.0 (legacy) or the newer 2.0 format.
Omit this option to use your team's default.
"""
client: Client = _load_client()
identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
ds: RemoteDataset = client.get_remote_dataset(identifier)
try:
ds.export(
annotation_class_ids=annotation_class_ids,
name=name,
include_url_token=include_url_token,
include_authorship=include_authorship,
version=version,
)
except ValidationError:
_error("Nothing to export")
else:
identifier.version = name
print(f"Dataset {dataset_slug} successfully exported to {identifier}")
print_new_version_info(client)
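# Example (hedged sketch): creating a release named "v1.0"; the slug below is a made-up
# placeholder in the team/dataset form understood by DatasetIdentifier.parse.
#
#   export_dataset(
#       dataset_slug="my-team/my-dataset",
#       include_url_token=False,
#       name="v1.0",
#   )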
def pull_dataset(
dataset_slug: str,
only_annotations: bool = False,
folders: bool = True,
video_frames: bool = False,
force_slots: bool = False,
ignore_slots: bool = False,
no_folders: bool = False,
retry: bool = False,
retry_timeout: int = 600,
retry_interval: int = 10,
) -> None:
"""
Downloads a remote dataset (images and annotations) in the datasets directory.
Exits the application if the dataset is not found, the user is not authenticated, there are no
releases, or the export format of the latest release is not supported.
Parameters
----------
dataset_slug: str
Slug of the dataset to which we perform the operation on.
only_annotations: bool
Download only the annotations and no corresponding images. Defaults to False.
folders: bool
Recreates the folders in the dataset. Defaults to True.
video_frames: bool
Pulls video frames images instead of video files. Defaults to False.
force_slots: bool
Pulls all slots of items into a deeper file structure ({prefix}/{item_name}/{slot_name}/{file_name}). Defaults to False.
no_folders: bool
Does not recreate the folders in the dataset. Defaults to False.
retry: bool
If True, will repeatedly try to download the release if it is still processing until the timeout is reached.
retry_timeout: int
If retrying, the total time to wait for the release to be ready for download. Defaults to 600.
retry_interval: int
If retrying, time to wait between retries of checking if the release is ready for download.
"""
version: str = DatasetIdentifier.parse(dataset_slug).version or "latest"
client: Client = _load_client(maybe_guest=True)
try:
dataset: RemoteDataset = client.get_remote_dataset(
dataset_identifier=dataset_slug
)
except NotFound:
_error(
f"Dataset '{dataset_slug}' does not exist, please check the spelling. "
"Use 'darwin remote' to list all the remote datasets."
)
except Unauthenticated:
_error("please re-authenticate")
if no_folders:
folders = False
try:
release: Release = dataset.get_release(version, retry)
dataset.pull(
release=release,
only_annotations=only_annotations,
use_folders=folders,
video_frames=video_frames,
force_slots=force_slots,
ignore_slots=ignore_slots,
retry=retry,
retry_timeout=retry_timeout,
retry_interval=retry_interval,
)
print_new_version_info(client)
except NotFound:
_error(
f"Version '{dataset.identifier}:{version}' does not exist. "
f"Use 'darwin dataset releases' to list all available versions."
)
except UnsupportedExportFormat as uef:
_error(
f"Version '{dataset.identifier}:{version}' is of format '{uef.format}', "
f"only the darwin formats ('json', 'darwin_json_2') are supported for `darwin dataset pull`"
)
except MissingDependency as e:
_error(str(e))
print(f"Dataset {release.identifier} downloaded at {dataset.local_path} .")
def split(
dataset_slug: str, val_percentage: float, test_percentage: float, seed: int = 0
) -> None:
"""
Splits a local version of a dataset into train, validation, and test partitions.
Parameters
----------
dataset_slug: str
Slug of the dataset to which we perform the operation on.
val_percentage: float
Percentage in the validation set.
test_percentage: float
Percentage in the test set.
seed: int
Random seed. Defaults to 0.
"""
identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
client: Client = _load_client()
for p in client.list_local_datasets(team_slug=identifier.team_slug):
if identifier.dataset_slug == p.name:
try:
split_path = split_dataset(
dataset_path=p,
release_name=identifier.version,
val_percentage=val_percentage,
test_percentage=test_percentage,
split_seed=seed,
)
print(f"Partition lists saved at {split_path}")
return
except ImportError as e:
_error(e.msg)
except NotFound as e:
_error(e.name)
except ValueError as e:
_error(e.args[0])
_error(
f"Dataset '{identifier.dataset_slug}' does not exist locally. "
f"Use 'darwin dataset remote' to see all the available datasets, "
f"and 'darwin dataset pull' to pull them."
)
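# Example (hedged sketch): splitting a locally pulled placeholder dataset into train/val/test
# partitions with 10% validation and 20% test data.
#
#   split("my-team/my-dataset", val_percentage=0.1, test_percentage=0.2, seed=42)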
def list_remote_datasets(all_teams: bool, team: Optional[str] = None) -> None:
"""
Lists remote datasets with their annotation progress.
Parameters
----------
all_teams: bool
If True, lists remote datasets from all teams; if False, lists only datasets from the given
team.
team: Optional[str]
Name of the team whose datasets we want to see. Uses the default team if none is given.
Defaults to None.
"""
# TODO: add listing open datasets
table: Table = Table(show_header=True, header_style="bold cyan")
table.add_column("Name")
table.add_column("Item Count", justify="right")
table.add_column("Complete Items", justify="right")
datasets: List[RemoteDataset] = []
client: Optional[Client] = None
if all_teams:
teams: List[Team] = _config().get_all_teams()
for a_team in teams:
client = _load_client(a_team.slug)
datasets += list(client.list_remote_datasets())
else:
client = _load_client(team)
datasets = list(client.list_remote_datasets())
for dataset in datasets:
table.add_row(
f"{dataset.team}/{dataset.slug}",
str(dataset.item_count),
f"{dataset.progress * 100:.1f}%",
)
if table.row_count == 0:
print("No dataset available.")
else:
Console().print(table)
print_new_version_info(client)
def remove_remote_dataset(dataset_slug: str) -> None:
"""
Removes a remote dataset from the workview. The dataset gets archived.
Exits the application if no dataset with the given slug was found.
Parameters
----------
dataset_slug: str
The dataset's slug.
"""
client: Client = _load_client()
try:
dataset: RemoteDataset = client.get_remote_dataset(
dataset_identifier=dataset_slug
)
print(f"About to delete {dataset.identifier} on darwin.")
if not secure_continue_request():
print("Cancelled.")
return
dataset.remove_remote()
print_new_version_info(client)
except NotFound:
_error(f"No dataset with name '{dataset_slug}'")
def dataset_list_releases(dataset_slug: str) -> None:
"""
Lists all the releases from the given dataset.
Exits the application if no dataset with the given slug was found.
Parameters
----------
dataset_slug: str
The dataset's slug.
"""
client: Client = _load_client()
try:
dataset: RemoteDataset = client.get_remote_dataset(
dataset_identifier=dataset_slug
)
releases: List[Release] = dataset.get_releases()
if len(releases) == 0:
print("No available releases, export one first.")
return
table: Table = Table(show_header=True, header_style="bold cyan")
table.add_column("Name")
table.add_column("Item Count", justify="right")
table.add_column("Class Count", justify="right")
table.add_column("Export Date", justify="right")
for release in releases:
if not release.available:
continue
table.add_row(
str(release.identifier),
str(release.image_count),
str(release.class_count),
str(release.export_date),
)
Console().print(table)
print_new_version_info(client)
except NotFound:
_error(f"No dataset with name '{dataset_slug}'")
def upload_data(
dataset_identifier: str,
files: Optional[List[Union[PathLike, LocalFile]]],
files_to_exclude: Optional[List[PathLike]],
fps: int,
path: Optional[str],
frames: bool,
extract_views: bool = False,
handle_as_slices: bool = False,
preserve_folders: bool = False,
verbose: bool = False,
item_merge_mode: Optional[str] = None,
) -> None:
"""
Uploads the provided files to the remote dataset.
Exits the application if no dataset with the given name is found, the files in the given path
have unsupported formats, or if no files are found in the given path.
Parameters
----------
dataset_identifier : str
Slug of the dataset to retrieve.
files : List[Union[PathLike, LocalFile]]
List of files to upload. Can be None.
files_to_exclude : List[PathLike]
List of files to exclude from the file scan (which is done only if files is None).
fps : int
Frame rate to split videos in.
path : Optional[str]
If provided, files will be placed under this path in the V7 platform. If `preserve_folders`
is `True` then it must be possible to draw a relative path from this folder to the one the
files are in, otherwise an error will be raised.
frames : bool
Specify whether the files will be uploaded as a list of frames or not.
extract_views : bool
If providing a volume, specify whether to extract the orthogonal views or not.
handle_as_slices : bool
Whether to upload DICOM files as slices
preserve_folders : bool
Specify whether or not to preserve folder paths when uploading.
verbose : bool
Specify whether to print full error traces when uploading files or not.
item_merge_mode : Optional[str]
If set, each file path passed to `files` behaves as follows:
- Paths pointing directly to individual files are ignored
- Paths pointing to folders of files will be uploaded according to the following mode rules.
Note that folders will not be recursively searched, so only files in the first level of the folder will be uploaded:
- "slots": Each file in the folder will be uploaded to a different slot of the same item.
- "series": All `.dcm` files in the folder will be concatenated into a single slot. All other files are ignored.
- "channels": Each file in the folder will be uploaded to a different channel of the same item.
"""
client: Client = _load_client()
try:
max_workers: int = concurrent.futures.ThreadPoolExecutor()._max_workers # type: ignore
dataset: RemoteDataset = client.get_remote_dataset(
dataset_identifier=dataset_identifier
)
sync_metadata: Progress = Progress(
SpinnerColumn(), TextColumn("[bold blue]Syncing metadata")
)
overall_progress = Progress(
TextColumn("[bold blue]{task.fields[filename]}"),
BarColumn(),
"{task.completed} of {task.total}",
)
file_progress = Progress(
TextColumn("[bold green]{task.fields[filename]}", justify="right"),
BarColumn(),
"[progress.percentage]{task.percentage:>3.1f}%",
DownloadColumn(),
"•",
TransferSpeedColumn(),
"•",
TimeRemainingColumn(),
)
progress_table: Table = Table.grid()
progress_table.add_row(sync_metadata)
progress_table.add_row(file_progress)
progress_table.add_row(overall_progress)
with Live(progress_table):
sync_task: TaskID = sync_metadata.add_task("")
file_tasks: Dict[str, TaskID] = {}
overall_task = overall_progress.add_task(
"[green]Total progress",
filename="Total progress",
total=0,
visible=False,
)
def progress_callback(
total_file_count: NumberLike, file_advancement: NumberLike
) -> None:
sync_metadata.update(sync_task, visible=False)
overall_progress.update(
overall_task,
total=total_file_count,
advance=file_advancement,
visible=True,
)
def file_upload_callback(
file_name: str,
file_total_bytes: NumberLike,
file_bytes_sent: NumberLike,
) -> None:
if file_name not in file_tasks:
file_tasks[file_name] = file_progress.add_task(
f"[blue]{file_name}", filename=file_name, total=file_total_bytes
)
# Rich has a concurrency issue, so sometimes updating progress
# or removing a task fails. Wrapping this logic in a try/except block
# is a workaround; we should consider solving this properly (e.g. using locks).
try:
file_progress.update(
file_tasks[file_name], completed=file_bytes_sent
)
for task in file_progress.tasks:
if task.finished and len(file_progress.tasks) >= max_workers:
file_progress.remove_task(task.id)
except Exception:
pass
upload_manager = dataset.push(
files,
files_to_exclude=files_to_exclude,
fps=fps,
as_frames=frames,
extract_views=extract_views,
handle_as_slices=handle_as_slices,
path=path,
preserve_folders=preserve_folders,
progress_callback=progress_callback,
file_upload_callback=file_upload_callback,
item_merge_mode=item_merge_mode,
)
console = Console(theme=_console_theme())
console.print()
if not upload_manager.blocked_count and not upload_manager.error_count:
console.print(
f"All {upload_manager.total_count} files have been successfully uploaded.\n",
style="success",
)
return
already_existing_items = []
other_skipped_items = []
for item in upload_manager.blocked_items:
for slot in item.slots:
if (slot.reason is not None) and (
slot.reason.upper() == BLOCKED_UPLOAD_ERROR_ALREADY_EXISTS
):
already_existing_items.append(item)
else:
other_skipped_items.append(item)
if already_existing_items:
console.print(
f"Skipped {len(already_existing_items)} files already in the dataset.\n",
style="warning",
)
if upload_manager.error_count or other_skipped_items:
error_count = upload_manager.error_count + len(other_skipped_items)
console.print(
f"{error_count} files couldn't be uploaded because an error occurred.\n",
style="error",
)
if not verbose and upload_manager.error_count:
console.print('Re-run with "--verbose" for further details')
return
error_table: Table = Table(
"Dataset Item ID",
"Filename",
"Remote Path",
"Stage",
"Reason",
show_header=True,
header_style="bold cyan",
)
for item in upload_manager.blocked_items:
for slot in item.slots:
if (slot.reason is not None) and (
slot.reason.upper() != BLOCKED_UPLOAD_ERROR_ALREADY_EXISTS
):
error_table.add_row(
str(item.dataset_item_id),
item.filename,
item.path,
"UPLOAD_REQUEST",
slot.reason,
)
for error in upload_manager.errors:
for local_file in upload_manager.local_files:
if local_file.local_path != error.file_path:
continue
for pending_item in upload_manager.pending_items:
if pending_item.filename != local_file.data["filename"]:
continue
error_table.add_row(
str(pending_item.dataset_item_id),
pending_item.filename,
pending_item.path,
error.stage.name,
str(error.error),
)
break
if error_table.row_count:
console.print(error_table)
print_new_version_info(client)
except NotFound as e:
_error(f"No dataset with name '{e.name}'")
except UnsupportedFileType as e:
_error(f"Unsupported file type {e.path.suffix} ({e.path.name})")
except ValueError as e:
_error(f"{e}")
def dataset_import(
dataset_slug: str,
format: str,
files: List[PathLike],
append: bool,
class_prompt: bool = True,
delete_for_empty: bool = False,
import_annotators: bool = False,
import_reviewers: bool = False,
overwrite: bool = False,
use_multi_cpu: bool = False,
cpu_limit: Optional[int] = None,
) -> None:
"""
Imports annotation files to the given dataset.
Exits the application if no dataset with the given slug is found.
Parameters
----------
dataset_slug : str
The dataset's slug.
format : str
Format of the export files.
files : List[PathLike]
Paths of the annotation files to import.
append : bool, default: True
If ``True`` it appends the annotations from the files to the dataset, if ``False`` it will
overwrite the dataset's current annotations with the ones from the given files.
Incompatible with ``delete-for-empty``.
delete_for_empty : bool, default: False
If ``True`` will use empty annotation files to delete all annotations from the remote file.
If ``False``, empty annotation files will simply be skipped.
Only works for V2 datasets.
Incompatible with ``append``.
import_annotators : bool, default: False
If ``True`` it will import the annotators from the files to the dataset, if available.
If ``False`` it will not import the annotators.
import_reviewers : bool, default: False
If ``True`` it will import the reviewers from the files to the dataset, if available.
If ``False`` it will not import the reviewers.
overwrite : bool, default: False
If ``True`` it will bypass the warning that the import will overwrite the current annotations, if any are present.
If ``False`` the warning will be shown and the import will only overwrite existing annotations after confirmation.
use_multi_cpu : bool, default: False
If ``True`` it will use multiple CPUs to speed up the import process.
cpu_limit : Optional[int], default: Core count - 2
The maximum number of CPUs to use for the import process.
"""
client: Client = _load_client(dataset_identifier=dataset_slug)
try:
importer: ImportParser = get_importer(format)
dataset: RemoteDataset = client.get_remote_dataset(
dataset_identifier=dataset_slug
)
if cpu_limit is not None:
use_multi_cpu = True
import_annotations(
dataset,
importer,
files,
append,
class_prompt,
delete_for_empty,
import_annotators,
import_reviewers,
overwrite,
use_multi_cpu,
cpu_limit,
)
except ImporterNotFoundError:
_error(
f"Unsupported import format: {format}, currently supported: {import_formats}"
)
except AttributeError as e:
_error(f"Internal problem with import occured: {str(e)}")
except NotFound as e:
_error(f"No dataset with name '{e.name}'")
except IncompatibleOptions as e:
_error(str(e))
except UnrecognizableFileEncoding as e:
_error(str(e))
except UnknownAnnotationFileSchema as e:
_error(str(e))
except AnnotationFileValidationError as e:
_error(str(e))
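# Example (hedged sketch): importing annotation files into a placeholder dataset, assuming
# "coco" is among the installed import formats (see darwin.importer.formats.supported_formats).
#
#   dataset_import(
#       dataset_slug="my-team/my-dataset",
#       format="coco",
#       files=["./annotations/instances.json"],
#       append=False,
#   )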
def list_files(
dataset_slug: str,
statuses: Optional[str],
path: Optional[str],
only_filenames: bool,
sort_by: Optional[str] = "updated_at:desc",
) -> None:
"""
Lists all files from the given dataset.
Exits the application if it finds unknown file statuses, if no dataset with the given slug is
found, or if another general error occurs.
Parameters
----------
dataset_slug: str
The dataset's slug.
statuses: Optional[str]
Only list files with the given statuses. Valid statuses are: 'annotate', 'archived',
'complete', 'new', 'review'.
path: Optional[str]
Only list files whose path matches the given path.
only_filenames: bool
If True, only prints the filenames; if False, it also prints the status and workview URL.
sort_by: Optional[str]
Sort order for listing files. Defaults to 'updated_at:desc'.
"""
client: Client = _load_client(dataset_identifier=dataset_slug)
try:
dataset: RemoteDataset = client.get_remote_dataset(
dataset_identifier=dataset_slug
)
filters: Dict[str, UnknownType] = {}
if statuses:
for status in statuses.split(","):
if not _has_valid_status(status):
_error(
f"Invalid status '{status}', available statuses: annotate, archived, complete, new, review"
)
filters["statuses"] = statuses
else:
filters["statuses"] = "new,annotate,review,complete"
if path:
filters["path"] = path
if not sort_by:
sort_by = "updated_at:desc"
table: Table = Table(show_header=True, header_style="bold cyan")
table.add_column("Name", justify="left")
if not only_filenames:
table.add_column("Status", justify="left")
table.add_column("URL", justify="left")
for file in dataset.fetch_remote_files(filters, sort_by): # type: ignore
if only_filenames:
table.add_row(file.filename)
else:
image_url = dataset.workview_url_for_item(file)
table.add_row(
file.filename,
f"{file.status if not file.archived else 'archived'}",
image_url,
)
Console().print(table)
except NotFound as e:
_error(f"No dataset with name '{e.name}'")
except ValueError as e:
_error(str(e))
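# Example (hedged sketch): listing only the filenames of completed items in a placeholder
# dataset; statuses, path, and only_filenames are required arguments of this function.
#
#   list_files(
#       dataset_slug="my-team/my-dataset",
#       statuses="complete",
#       path=None,
#       only_filenames=True,
#   )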
def set_file_status(dataset_slug: str, status: str, files: List[str]) -> None:
"""
Sets the status of the given files from the given dataset.
Exits the application if the given status is unknown or if no dataset was found.
Parameters
----------
dataset_slug: str
The dataset's slug.
status: str
The new status for the files.