From d8a16595b8ff03270521915eec9a3c776ac6fdb1 Mon Sep 17 00:00:00 2001 From: dglemos Date: Thu, 23 Mar 2023 16:27:51 +0000 Subject: [PATCH 1/4] Add EVA pipeline first steps --- nextflow/EVAImport/main.nf | 128 +++++++++++++++++++++++++++++ nextflow/EVAImport/nextflow.config | 37 +++++++++ 2 files changed, 165 insertions(+) create mode 100644 nextflow/EVAImport/main.nf create mode 100644 nextflow/EVAImport/nextflow.config diff --git a/nextflow/EVAImport/main.nf b/nextflow/EVAImport/main.nf new file mode 100644 index 000000000..af546563e --- /dev/null +++ b/nextflow/EVAImport/main.nf @@ -0,0 +1,128 @@ +/* + * Nextflow pipeline to import variants from EVA + */ + +nextflow.enable.dsl=2 +nextflow.enable.strict = true + +// Scripts +eva_script = "${ENSEMBL_ROOT_DIR}/ensembl-variation/scripts/import/import_vcf.pl" +var_syn_script = "${ENSEMBL_ROOT_DIR}/ensembl-variation/scripts/import/import_variant_synonyms" + +// Common params +//params.help = false +params.species = null +params.release = null +params.registry = null + +// Params for EVA import script +params.input_file = null +params.source = "EVA" +params.description = "Short variant data imported from EVA" +params.version = 4 +params.remove_prefix = false +params.chr_synonyms = false +params.merge_all_types = true +params.fork = 10 +params.skip_tables = "allele,allele_code,population,population_genotype,genotype_code,compressed_genotype_var,sample" +params.output_file = null +params.sort_vf = true + +// Params for variant synonyms import +params.var_syn_file = null + + +// Check input params +if(!params.species) { + exit 1, "ERROR: species name must be provided when running EVA import" +} + +if(!params.input_file || !file(params.input_file)) { + exit 1, "ERROR: a valid input file must be provided when running EVA import" +} + +if(!params.release || !params.registry) { + exit 1, "ERROR: release version and registry file must be provided when running EVA import" +} + + +// Build command to run +command_to_run = " -i ${params.input_file} --source ${params.source} --source_description '${params.description}' --version ${params.version} --registry ${params.registry} --species ${params.species} --skip_tables '${params.skip_tables}'" + +if(params.merge_all_types) { +command_to_run += " --merge_all_types" +} + +if(params.fork) { +command_to_run += " --fork ${params.fork}" +} + +if(params.chr_synonyms) { +command_to_run += " --chr_synonyms ${params.chr_synonyms}" +} + +if(params.remove_prefix) { +command_to_run += " --remove_prefix ${params.remove_prefix}" +} + +if(params.sort_vf) { +command_to_run += " --sort_vf" +} + +log.info """ + Import EVA script: ${eva_script} \ + Options: ${command_to_run} +""" + + +process run_eva { + input: + path eva_script + val options + val output_file + + output: + + script: + """ + perl ${eva_script} ${options} --output_file ${output_file} + """ +} + +process run_variant_synonyms { + input: + path var_syn_script + val source_name + val species + val input_file + val registry + + output: + + script: + + if(species == "sus_scrofa") + """ + perl ${var_syn_script} --source_name ${source_name} --species ${species} --data_file ${input_file} --registry ${registry} + perl ${var_syn_script} --source_name "pig_chip" --species ${species} --registry ${registry} + """ + + else if(species == "rattus_norvegicus") + """ + perl ${var_syn_script} --source_name ${source_name} --species ${species} --data_file ${input_file} --registry ${registry} + perl ${var_syn_script} --source_name "rat" --species ${species} --registry ${registry} + """ + else + """ + perl ${var_syn_script} --source_name ${source_name} --species ${species} --data_file ${input_file} --registry ${registry} + """ +} + + +workflow { + // TODO: run script to truncate tables + + run_eva(file(eva_script), command_to_run, params.output_file) + run_variant_synonyms(file(var_syn_script), params.source, params.species, params.var_syn_file, params.registry) + +} diff --git a/nextflow/EVAImport/nextflow.config b/nextflow/EVAImport/nextflow.config new file mode 100644 index 000000000..0e888b1ec --- /dev/null +++ b/nextflow/EVAImport/nextflow.config @@ -0,0 +1,37 @@ +profiles { + standard { + process.executor = 'local' + process.memory = '2GB' + process.cpus = 1 + } + + lsf { + process.executor = 'lsf' + process.memory = '2GB' + process.cpus = 1 + process.clusterOptions = '-R "select[mem>2000] rusage[mem=2000]" -M2000' + } +} + +process { + queue = 'production' + memory = '2 GB' +} + +dag { + enabled = true + overwrite = true + file = "reports/flowchart.html" +} + +timeline { + enabled = true + overwrite = true + file = "reports/timeline.html" +} + +report { + enabled = true + overwrite = true + file = "reports/report.html" +} From 9cb075f7fade030f0087c91251650643d5def6d8 Mon Sep 17 00:00:00 2001 From: dglemos Date: Thu, 23 Mar 2023 17:10:34 +0000 Subject: [PATCH 2/4] Add more options to run synonyms for rat --- nextflow/EVAImport/main.nf | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/nextflow/EVAImport/main.nf b/nextflow/EVAImport/main.nf index af546563e..a2654042c 100644 --- a/nextflow/EVAImport/main.nf +++ b/nextflow/EVAImport/main.nf @@ -30,6 +30,9 @@ params.sort_vf = true // Params for variant synonyms import params.var_syn_file = null +params.host = null +params.port = null +params.dbname = null // Check input params @@ -45,6 +48,10 @@ if(!params.release || !params.registry) { exit 1, "ERROR: release version and registry file must be provided when running EVA import" } +if( (!params.host || !params.port || !params.dbname) && params.species == "rattus_norvegicus") { + exit 1, "ERROR: please provide a host, port and db name for a previous rat database" +} + // Build command to run command_to_run = " -i ${params.input_file} --source ${params.source} --source_description '${params.description}' --version ${params.version} --registry ${params.registry} --species ${params.species} --skip_tables '${params.skip_tables}'" @@ -96,6 +103,9 @@ process run_variant_synonyms { val species val input_file val registry + val host + val port + val dbname output: @@ -110,7 +120,7 @@ process run_variant_synonyms { else if(species == "rattus_norvegicus") """ perl ${var_syn_script} --source_name ${source_name} --species ${species} --data_file ${input_file} --registry ${registry} - perl ${var_syn_script} --source_name "rat" --species ${species} --registry ${registry} + perl ${var_syn_script} --source_name "rat" --species ${species} --registry ${registry} --host ${host} --port ${port} --user 'ensro' --db_name $dbname """ else """ @@ -123,6 +133,6 @@ workflow { // TODO: run script to truncate tables run_eva(file(eva_script), command_to_run, params.output_file) - run_variant_synonyms(file(var_syn_script), params.source, params.species, params.var_syn_file, params.registry) + run_variant_synonyms(file(var_syn_script), params.source, params.species, params.var_syn_file, params.registry, params.host, params.port, params.dbname) } From 398f52bfa56fe881f9f43e61b0fc92341cdb46d2 Mon Sep 17 00:00:00 2001 From: dglemos Date: Mon, 3 Apr 2023 12:08:14 +0100 Subject: [PATCH 3/4] Update process run_eva --- nextflow/EVAImport/main.nf | 42 +++++++++++++++----------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/nextflow/EVAImport/main.nf b/nextflow/EVAImport/main.nf index a2654042c..2b72b34f8 100644 --- a/nextflow/EVAImport/main.nf +++ b/nextflow/EVAImport/main.nf @@ -30,9 +30,9 @@ params.sort_vf = true // Params for variant synonyms import params.var_syn_file = null -params.host = null -params.port = null -params.dbname = null +params.host = "" +params.port = "" +params.dbname = "" // Check input params @@ -53,28 +53,9 @@ if( (!params.host || !params.port || !params.dbname) && params.species == "rattu } -// Build command to run +// Build command to run EVA import command_to_run = " -i ${params.input_file} --source ${params.source} --source_description '${params.description}' --version ${params.version} --registry ${params.registry} --species ${params.species} --skip_tables '${params.skip_tables}'" -if(params.merge_all_types) { -command_to_run += " --merge_all_types" -} - -if(params.fork) { -command_to_run += " --fork ${params.fork}" -} - -if(params.chr_synonyms) { -command_to_run += " --chr_synonyms ${params.chr_synonyms}" -} - -if(params.remove_prefix) { -command_to_run += " --remove_prefix ${params.remove_prefix}" -} - -if(params.sort_vf) { -command_to_run += " --sort_vf" -} log.info """ Import EVA script: ${eva_script} \ @@ -86,13 +67,24 @@ process run_eva { input: path eva_script val options + val merge_all_types + val fork + val sort_vf + val chr_synonyms + val remove_prefix val output_file output: script: + def sort_vf_table = sort_vf ? " --sort_vf" : "" + def merge_all = merge_all_types ? " --merge_all_types" : "" + def use_fork = fork ? "--fork ${fork}" : "" + def chr_synonyms_file = chr_synonyms ? " --chr_synonyms ${chr_synonyms}" : "" + def rm_prefix = remove_prefix ? " --remove_prefix ${remove_prefix}" : "" + """ - perl ${eva_script} ${options} --output_file ${output_file} + perl ${eva_script} ${options} $sort_vf_table $merge_all $use_fork $chr_synonyms_file $rm_prefix --output_file ${output_file} """ } @@ -132,7 +124,7 @@ process run_variant_synonyms { workflow { // TODO: run script to truncate tables - run_eva(file(eva_script), command_to_run, params.output_file) + run_eva(file(eva_script), command_to_run, params.merge_all_types, params.fork, params.sort_vf, params.chr_synonyms, params.remove_prefix, params.output_file) run_variant_synonyms(file(var_syn_script), params.source, params.species, params.var_syn_file, params.registry, params.host, params.port, params.dbname) } From 6cfa753f82d54ce77caad6a63fdab1901ac022dc Mon Sep 17 00:00:00 2001 From: dglemos Date: Mon, 3 Apr 2023 13:09:05 +0100 Subject: [PATCH 4/4] Add list of species to copy sets --- nextflow/EVAImport/main.nf | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/nextflow/EVAImport/main.nf b/nextflow/EVAImport/main.nf index 2b72b34f8..e5aa4bb44 100644 --- a/nextflow/EVAImport/main.nf +++ b/nextflow/EVAImport/main.nf @@ -34,6 +34,18 @@ params.host = "" params.port = "" params.dbname = "" +// Params for sets import +list_species_setname = [ "mus_musculus":["MGP"], + "sus_scrofa":["PorcineHD", "PorcineLD", "PorcineSNP60", "Affy_PorcineHD"], + "ovis_aries":["OvineSNP50", "OvineHDSNP"], + "ovis_aries_rambouillet":["OvineSNP50", "OvineHDSNP"], + "gallus_gallus":["Chicken600K"], + "gallus_gallus_gca000002315v5":["Chicken600K"], + "equus_caballus":["Illumina_EquineSNP50"], + "capra_hircus":["GoatSNP50"], + "bos_taurus":["BovineHD", "BovineLD", "BovineSNP50"] + ] + // Check input params if(!params.species) {