# Genome annotation workflow:
#   EDTA (repeat annotation) -> bedtools soft-masking -> funannotate predict
#   -> eggNOG-mapper functional annotation.
#
# Required config keys: ref, busco, gene_prefix, augustus_species.
# Optional config keys: basename, short_reads, long_reads, sra_short,
#                       sra_long, opts.

from os.path import splitext, basename

fasta = config["ref"]
# NOTE: this rebinds `basename` from the imported function to a string prefix;
# the function must not be called again below this line.
basename = config.get("basename", basename(splitext(fasta)[0]))
# Read inputs are parsed from the config but not consumed by any rule in this file.
short_reads = config.get("short_reads")
long_reads = config.get("long_reads")
short_sra = config.get("sra_short")
long_sra = config.get("sra_long")
busco_db = config["busco"].lower()
gene_prefix = config["gene_prefix"]
augustus_species = config["augustus_species"]
# Per-tool extra command-line options, e.g. {"predict": "--max_intronlen 5000"}.
opts = config.get("opts", {})
max_threads = 16

# Download URL of each supported BUSCO lineage dataset.
busco_links = {
    "viridiplantae": "https://busco-data.ezlab.org/v4/data/lineages/viridiplantae_odb10.2019-11-20.tar.gz",
    "chlorophyta": "https://busco-data.ezlab.org/v4/data/lineages/chlorophyta_odb10.2019-11-20.tar.gz",
    "liliopsida": "https://busco-data.ezlab.org/v4/data/lineages/liliopsida_odb10.2019-11-20.tar.gz",
    "brassicales": "https://busco-data.ezlab.org/v4/data/lineages/brassicales_odb10.2019-11-20.tar.gz",
    "eudicots": "https://busco-data.ezlab.org/v4/data/lineages/eudicots_odb10.2019-11-20.tar.gz",
    "solanales": "https://busco-data.ezlab.org/v4/data/lineages/solanales_odb10.2019-11-20.tar.gz",
    "poales": "https://busco-data.ezlab.org/v4/data/lineages/poales_odb10.2019-11-20.tar.gz",
    "embryophyta": "https://busco-data.ezlab.org/v4/data/lineages/embryophyta_odb10.2019-11-20.tar.gz",
    "fabales": "https://busco-data.ezlab.org/v4/data/lineages/fabales_odb10.2019-11-20.tar.gz",
}
# Archive file name = last URL component.
busco_archives = {k: f.split("/")[-1] for k, f in busco_links.items()}
# Directory name = archive name up to its first dot (e.g. "poales_odb10").
busco_dirs = {k: f.split(".")[0] for k, f in busco_archives.items()}

# Fail early with an actionable message instead of a bare KeyError inside
# the params section of rule predict.
if busco_db not in busco_links:
    raise ValueError(
        f"Unsupported BUSCO lineage {busco_db!r} in config['busco']; "
        f"supported lineages: {', '.join(sorted(busco_links))}"
    )


# Default target: the eggNOG-mapper annotation table.
rule all:
    input:
        f"{basename}.emapper.annotations"
    group: "all"


# Annotate transposable elements in the reference with EDTA.
# NOTE(review): the outputs are declared next to `fasta`; confirm EDTA writes
# them there when `fasta` is not in the working directory.
rule edta:
    input:
        fasta
    output:
        fasta + ".mod.EDTA.TElib.fa",
        fasta + ".mod.EDTA.TEanno.gff",
        fasta + ".mod.EDTA.TEanno.sum"
    conda:
        "envs/edta.yml"
    threads: max_threads
    group: "all"
    shell:
        """
        echo RUNNING EDTA
        EDTA.pl --genome {input} --anno 1 --threads {threads}
        """


# Soft-mask the reference using the EDTA TE annotation (second edta output).
rule maskfasta:
    input:
        fasta,
        rules.edta.output[1]
    output:
        f"{basename}.softmasked.fasta"
    conda:
        "envs/fun.yml"
    group: "all"
    shell:
        # Kept as one literal: a line break inside this string would detach
        # -soft from the bedtools invocation.
        "bedtools maskfasta -fi {input[0]} -fo {output} -bed {input[1]} -soft"


# Download the BUSCO lineage, set up the funannotate database and run
# gene prediction on the soft-masked assembly.
rule predict:
    input:
        rules.maskfasta.output[0]
    output:
        predict_results = [
            f"{basename}/predict_results/{basename}.cds-transcripts.fa",
            f"{basename}/predict_results/{basename}.discrepency.report.txt",
            f"{basename}/predict_results/{basename}.error.summary.txt",
            f"{basename}/predict_results/{basename}.gbk",
            f"{basename}/predict_results/{basename}.gff3",
            f"{basename}/predict_results/{basename}.mrna-transcripts.fa",
            f"{basename}/predict_results/{basename}.proteins.fa",
            f"{basename}/predict_results/{basename}.scaffolds.fa",
            f"{basename}/predict_results/{basename}.validation.txt"
        ],
        logfiles = [
            f"{basename}/logfiles/busco.log",
            f"{basename}/logfiles/augustus-parallel.log",
            f"{basename}/logfiles/funannotate-EVM.log",
            f"{basename}/logfiles/funannotate-EVM_busco.log",
            f"{basename}/logfiles/funannotate-p2g.log",
            f"{basename}/logfiles/funannotate-predict.log"
        ]
    conda:
        "envs/fun.yml"
    params:
        out = basename,
        name = gene_prefix,
        busco_link = busco_links[busco_db],
        busco_archive = busco_archives[busco_db],
        busco_dir = busco_dirs[busco_db],
        predict_opt = opts.get("predict", ""),
        augustus_species = augustus_species,
        fun_db = 'fun_db/'
    threads: max_threads
    group: "all"
    shell:
        # A space precedes every trailing backslash so arguments stay
        # separated regardless of the next line's indentation.
        """
        wget {params[busco_link]}
        tar -zxf {params[busco_archive]}
        funannotate setup -i all -d {params[fun_db]}
        funannotate predict {params[predict_opt]} -i {input[0]} \
            -o {params[out]} --species {params[out]} -d $(pwd)/{params[fun_db]} \
            --busco_db $(pwd)/{params[busco_dir]} --organism other --repeats2evm \
            --name {params[name]} --augustus_species {params[augustus_species]} \
            --cpus {threads}
        """


# Functionally annotate the predicted proteins with eggNOG-mapper (diamond mode).
rule eggnogmapper:
    input:
        # Index 6 of predict_results is the *.proteins.fa file.
        rules.predict.output["predict_results"][6]
    output:
        f"{basename}.emapper.annotations"
    params:
        out = basename,
        data_dir = "eggnog_data"
    conda:
        "envs/eggnog.yml"
    group: "all"
    threads: max_threads
    shell:
        """
        mkdir -p {params[data_dir]}
        download_eggnog_data.py -y --data_dir {params[data_dir]}
        emapper.py -i {input} -o {params[out]} -m diamond --cpu {threads} --data_dir {params[data_dir]}
        """