## Uploading a dataset

In [1]:
from pubweb import PubWeb

# Create the client object; the cells below go through it for the project, process and dataset calls
client = PubWeb()

List the available projects and processes to find the IDs you need.
You can also find the project ID in the URL on the data portal.

In [2]:
# Fetch the list of projects
projects = client.project.list()
# This example simply takes the first project returned; index a different entry to target another project
project = projects[0]
# Display the selected project; its `id` is passed to the dataset calls further down
project

Project(id='5d747907-7d2a-4562-bcd3-bb4563ae861e', name='Single Cell', description='Public Datasets')

In [3]:
from pubweb.models.process import Executor

# List the processes for the INGEST executor
ingest_processes = client.process.list(Executor.INGEST)

# Select the process by its ID instead of by list position, so the choice
# does not silently change if the service returns processes in a different order.
process_id = 'sequencing-run'
process = next((p for p in ingest_processes if p['id'] == process_id), None)
if process is None:
    raise ValueError(f"No ingest process with id {process_id!r} was found")
# Display the selected process; its 'id' is used when creating the dataset below
process

{'id': 'sequencing-run',
 'name': 'Illumina Sequencing Run',
 'desc': 'Illumina Sequencing Run Data (Metrics)',
 '_deleted': None}

We've included two helper functions: `get_files_in_directory` lists the files in the specified directory, and `filter_files_by_pattern` filters that list by a pattern.

You can also create the list of files manually, using paths relative to the directory being uploaded.

In [4]:
from pubweb.file_utils import get_files_in_directory, filter_files_by_pattern

# Directory whose files will be uploaded (passed again to upload_files below)
directory_to_upload = '/test'

# Collect the files in the directory, then keep only those matching the pattern
files = get_files_in_directory(directory_to_upload)
files_to_upload = filter_files_by_pattern(files, '*.fastq.gz')
# Display the filtered list to confirm what will be sent
files_to_upload

['test2_R1.fastq.gz',
 'test2_R2.fastq.gz',
 'test_R1.fastq.gz',
 'test_R2.fastq.gz']

In [5]:
from pubweb.models.dataset import CreateIngestDatasetInput

# Describe the dataset to register: which project and process it belongs to,
# how it is labelled, and which files it will contain.
dataset_create_request = CreateIngestDatasetInput(
    name='Test dataset',
    description='',
    project_id=project.id,
    process_id=process['id'],
    files=files_to_upload,
)

# Register the dataset first; the response carries the ID of the new dataset
create_response = client.dataset.create(dataset_create_request)
dataset_id = create_response['datasetId']

# Upload the files listed on the request from the local directory into that dataset
client.dataset.upload_files(
    directory=directory_to_upload,
    files=dataset_create_request.files,
    project_id=project.id,
    dataset_id=dataset_id,
)

# Show the ID of the dataset that was just created
dataset_id

Uploading file test2_R1.fastq.gz (1.00 B) | 100.0%|█████████████████████████ | 3.57B/s
Uploading file test2_R2.fastq.gz (1.00 B) | 100.0%|█████████████████████████ | 12.7B/s
Uploading file test_R1.fastq.gz (1.00 B) | 100.0%|█████████████████████████ | 16.1B/s
Uploading file test_R2.fastq.gz (1.00 B) | 100.0%|█████████████████████████ | 8.98B/s


'145100fb-5c65-415c-8613-6e5eec3d9d82'