In [2]:
import logging
import os
from typing import Optional

import awswrangler as wr
import boto3
import pandas as pd

## Pipeline parameters

In [3]:
# Base S3 location and the project/prefix components under which this run's
# seedfile is stored.
pipeline_output_path = "s3://genomics-workflow-core/Results/GenomeMining"
project = "IMG"
prefix = "20220707"
seedfile_name = "test_seedfile.csv"

# S3 prefix holding the input genome archives, and the key suffix used to
# select them when listing.
input_s3_path = "s3://maf-versioned/GenomeMining/Genomes/IMG/"
file_extension = "tar.gz"

# S3 URIs always use "/" as the separator, so join explicitly rather than with
# os.path.join (which uses the local OS separator, e.g. "\" on Windows).
# Stripping a trailing "/" from the base avoids an accidental "//" in the key.
seedfile = "/".join(
    [pipeline_output_path.rstrip("/"), project, prefix, seedfile_name]
)

In [4]:
# List every object under the input prefix whose key ends with the archive suffix.
file_paths = wr.s3.list_objects(path=input_s3_path, suffix=file_extension)

In [5]:
# Sanity check: how many matching objects were found under `input_s3_path`.
len(file_paths)

78357

In [6]:
# Take only the first 5 listed paths as a small test subset; this subset is
# what gets written to the test seedfile.
file_paths_test = file_paths[:5]
file_paths_test

['s3://maf-versioned/GenomeMining/Genomes/IMG/2022827000.tar.gz',
 's3://maf-versioned/GenomeMining/Genomes/IMG/2040502012.tar.gz',
 's3://maf-versioned/GenomeMining/Genomes/IMG/2140918011.tar.gz',
 's3://maf-versioned/GenomeMining/Genomes/IMG/2228664006.tar.gz',
 's3://maf-versioned/GenomeMining/Genomes/IMG/2228664007.tar.gz']

In [7]:
# Derive each genome ID by stripping the configured archive suffix from the end
# of the file name. This keeps the cell in sync with `file_extension` (instead
# of a hard-coded ".tar.gz") and, unlike str.replace, never removes the suffix
# text if it happens to occur in the middle of a name.
archive_suffix = "." + file_extension.lstrip(".")


def genome_id_from_path(path: str, suffix: str) -> str:
    """Return the base name of `path` with a trailing `suffix` removed, if present."""
    file_name = os.path.basename(path)
    if suffix and file_name.endswith(suffix):
        return file_name[: -len(suffix)]
    return file_name


seedfile_df = pd.DataFrame({
    "genome_id": [genome_id_from_path(path, archive_suffix) for path in file_paths_test],
    "genome_path": file_paths_test,
})

# Upload the seedfile to S3 (no index column); the call's result is displayed
# as the cell output.
wr.s3.to_csv(df=seedfile_df, path=seedfile, index=False)

{'paths': ['s3://genomics-workflow-core/Results/GenomeMining/IMG/20220707/test_seedfile.csv'],
 'partitions_values': {}}

In [None]:
def submit_batch_job(
    project: str,
    prefix: str,
    seedfile: Optional[str] = None,
    genome: Optional[str] = None,
    branch: str = "main",
    job_queue: str = "priority-maf-pipelines",
    job_definition: str = "nextflow-production",
    aws_profile: Optional[str] = None,
    dry_run: bool = False,
) -> Optional[dict]:
    """Submit a nf-genome-mining job to AWS Batch

    Exactly one of `seedfile` or `genome` must be supplied; it is forwarded to
    the pipeline as `--seedfile` or `--genome` respectively.

    Args:
        project (_str_): name of the project
        prefix (_str_): name of the sample/batch
        seedfile (_str_, optional): s3 path to seedfile (incompatible with genome). Defaults to "None".
        genome (_str_, optional): s3 path to individual genome tarball (incompatible with seedfile). Defaults to "None".
        branch (_str_, optional): Branch of nf-genome-mining to use (passed as `-r`). Defaults to "main".
        job_queue (_str_, optional): name of the queue for the head node. Defaults to "priority-maf-pipelines".
        job_definition (_str_, optional): nextflow job definition. Doesn't usually change. Defaults to "nextflow-production".
        aws_profile (_str_, optional): if a non-default aws profile should be used to submit jobs. Defaults to "None".
        dry_run (_bool_, optional): don't submit the job, just log (at INFO level) what the
            submission command would look like. No AWS session is created. Defaults to "False".

    Returns:
        _dict_: a response object that contains details of the job submission from AWS
        (https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/batch.html#Batch.Client.submit_job),
        or `None` when `dry_run` is True.

    Raises:
        ValueError: if neither or both of `seedfile` and `genome` are provided.
    """
    # The two input modes are mutually exclusive, and the pipeline needs one of them.
    if (seedfile is None) == (genome is None):
        raise ValueError(
            "Exactly one of `seedfile` or `genome` must be provided"
        )

    # Set the pipeline flags for the analysis
    command = [
        "FischbachLab/nf-genome-mining",
        "-r",
        branch,
        "--project",
        project,
        "--prefix",
        prefix,
    ]

    # Select the input mode: a seedfile of many genomes, or a single genome tarball
    if seedfile is not None:
        command += ["--seedfile", seedfile]
    else:
        command += ["--genome", genome]

    # Return before touching AWS so that a dry run needs no credentials/profile.
    if dry_run:
        logging.info("command: '%s'", " ".join(command))
        return None

    # profile_name=None makes boto3 fall back to its default credential resolution.
    session = boto3.session.Session(profile_name=aws_profile)
    batch = session.client("batch")

    # Submit job
    return batch.submit_job(
        jobName=f"nf-gm-{project}",
        jobQueue=job_queue,
        jobDefinition=job_definition,
        containerOverrides={"command": command},
    )