## Analyzing a dataset

In [1]:
from pubweb import PubWeb
from pubweb.auth import UsernameAndPasswordAuth
from pubweb.config import load_config

# Build the auth object from whatever load_config() returns (unpacked as
# positional arguments), then hand it to the API client.
auth = UsernameAndPasswordAuth(*load_config())
client = PubWeb(auth_info=auth)

In [2]:
# Work in the first project returned by the project listing.
available_projects = client.project.list()
project = available_projects[0]

# Identifiers of the dataset to analyze and of the process to run on it.
dataset_id = '06886bbc-793a-46d7-8a92-e5392b71704b'
process_id = 'process-hutch-magic_count-1_0'

See the [05_References](05_References.ipynb) notebook for more information on how to find references.

In [3]:
# Reference type to list and the name of the entry to pick from that list.
reference_type = 'crispr_libraries'
library_name = 'CUL3'

references = client.project.get_references(project.id, reference_type)
reference_library = references.find_by_name(library_name)
reference_library

Reference(path=data/references/crispr_libraries/CUL3/library.csv)

### Discovering process parameters

In [4]:
# Fetch the parameter spec for the selected process and print the parameters
# it describes (display name, key, type, and default/description where given).
param_spec = client.process.get_parameter_spec(process_id)
param_spec.print()

Parameters:
	FASTQ (key=fastq, type=string)
	Library (key=library, type=string)
	5' Adapter (key=adapter, default=CTTGTGGAAAGGACGAAACACCG, type=string, description=Adapter sequence to be trimmed from the 5' end of each read)
	Insert Length (key=insert_length, default=20, type=integer, description=Length of the sgRNA sequences contained in each read)


In [5]:
from pubweb.file_utils import filter_files_by_pattern

# Glob pattern selecting the gzipped FASTQ files under any `controls` directory.
control_fastq_pattern = '**/controls/*.fastq.gz'

# List every file in the dataset, then keep only those matching the pattern.
files = client.dataset.get_dataset_files(
    project_id=project.id,
    dataset_id=dataset_id,
)
fastqs = filter_files_by_pattern(files, control_fastq_pattern)
fastqs

[File(path=data/controls/Piplup_20211116_7.fastq.gz),
 File(path=data/controls/Piplup_20211116_8.fastq.gz)]

In [6]:
from pubweb.models.process import RunAnalysisCommand

# Refuse to build parameters from an empty selection: joining zero files would
# silently produce an empty 'fastq' value instead of failing here.
if not fastqs:
    raise ValueError('No FASTQ files were selected; check the dataset ID and file pattern')

params = {
    # 'fastq' is a single string parameter, so the file paths are passed
    # comma-separated.
    'fastq': ','.join(f.absolute_path for f in fastqs),
    # These two match the defaults listed by the parameter spec; they are
    # spelled out here so the submitted values are explicit.
    'adapter': 'CTTGTGGAAAGGACGAAACACCG',
    'insert_length': 20,
    'library': reference_library.absolute_path,
}
params

{'fastq': 's3://z-1e58ebdf-1619-4f14-9a11-369f6b650abc/datasets/06886bbc-793a-46d7-8a92-e5392b71704b/data/controls/Piplup_20211116_7.fastq.gz,s3://z-1e58ebdf-1619-4f14-9a11-369f6b650abc/datasets/06886bbc-793a-46d7-8a92-e5392b71704b/data/controls/Piplup_20211116_8.fastq.gz',
 'adapter': 'CTTGTGGAAAGGACGAAACACCG',
 'insert_length': 20,
 'library': 's3://z-1e58ebdf-1619-4f14-9a11-369f6b650abc/resources/data/references/crispr_libraries/CUL3/library.csv'}

Before submitting the analysis, the client checks that the parameters are valid.
You can also validate them manually, as shown below.

In [11]:
# Deliberately pass an integer where the spec requires a string, to show what a
# validation failure looks like. The error is caught and printed so that the
# notebook still runs to completion under "Restart Kernel -> Run All".
try:
    param_spec.validate_params({
        'library': 1
    })
except RuntimeError as validation_error:
    print('RuntimeError:', validation_error)

RuntimeError: Parameter at $.library error: 1 is not of type 'string'

In [7]:
# Describe the analysis to launch: where it runs (project, parent dataset),
# what runs (process and its parameters), and how the run is labelled.
command = RunAnalysisCommand(
    project_id=project.id,
    parent_dataset_id=dataset_id,
    process_id=process_id,
    params=params,
    name='count analysis',
    description='test from SDK',
    notifications_emails=[],
)

# Submit the command and print the value returned by run_analysis.
new_dataset_id = client.process.run_analysis(command)
print(new_dataset_id)

GraphQLError: Unknown type 'RunAnalysisInput'.

GraphQL request:2:40
1 |
2 |           mutation RunAnalysis($input: RunAnalysisInput!) {
  |                                        ^
3 |             runAnalysis(input: $input)