Skip to content

Commit

Permalink
fix: add fake protein/nucleotide to the inputs to BUSCO proteome/tran…
Browse files Browse the repository at this point in the history
…scriptome runs in the absence of any models being present in the fasta
  • Loading branch information
gemygk committed Jun 8, 2023
1 parent 579348f commit 2f028e9
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 2 deletions.
6 changes: 6 additions & 0 deletions minos/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@
__copyright__ = "Copyright 2019-2023 Earlham Institute"
# Version string read from the "minos" distribution that pkg_resources resolves.
__version__ = pkg_resources.require("minos")[0].version

# Filesystem paths of the placeholder FASTA files bundled in the minos.etc
# package. minos.scripts.busco_splitter copies them into empty split FASTA
# files so that the BUSCO analysis does not fail on empty input.
DEFAULT_FAKE_PROT = pkg_resources.resource_filename(
"minos.etc", "fake_prot.fasta"
)
DEFAULT_FAKE_NUC = pkg_resources.resource_filename(
"minos.etc", "fake_nuc.fasta"
)
2 changes: 2 additions & 0 deletions minos/etc/fake_nuc.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>fake_nuc
TACAATACTATCCCGCGCGGTTGCGGATATCCCAGTCTACAGCGTTGAGTGAGACACATG
2 changes: 2 additions & 0 deletions minos/etc/fake_prot.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>fake_prot
FKEPTN
31 changes: 31 additions & 0 deletions minos/scripts/busco_splitter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script to split and check fasta for BUSCO analysis
"""

import os
from minos import (
DEFAULT_FAKE_PROT,
DEFAULT_FAKE_NUC,
)
def split_fasta(fastafile, split_files):
out = None
for line in open(fastafile):
Expand All @@ -18,3 +29,23 @@ def split_fasta(fastafile, split_files):
for f in split_files.values():
f.close()

def copy_file(_in, label, _out):
    """
    Copy the FASTA file `_in` to `_out`, prefixing every record id with `label`.

    Each header line (a line starting with '>') is rewritten from
    '>name ...' to '>{label}_name ...'. All other lines are written unchanged.

    :param _in: path of the FASTA file to read
    :param label: text inserted, followed by '_', right after the leading '>'
    :param _out: path of the file to write (created or truncated)
    """
    # Read everything before opening the output so that _in and _out may
    # refer to the same file.
    with open(_in, 'r') as fh:
        lines = fh.readlines()

    header_prefix = f'>{label}_'

    with open(_out, 'w') as fh:
        # Only a '>' at the start of a line marks a record; a '>' inside a
        # description (e.g. "a->b") must not receive the label.
        fh.writelines(
            header_prefix + line[1:] if line.startswith('>') else line
            for line in lines
        )


def check_split_fasta(format, fasta_files):
    """
    Fill empty split FASTA files with a placeholder record.

    For every entry of `fasta_files` whose file exists and has size zero, the
    bundled fake protein (format "prot") or fake nucleotide (format "nuc")
    FASTA is copied into it via copy_file, with the record id prefixed by the
    entry's label. Any other `format` value leaves the files untouched.

    :param format: "prot" or "nuc"
    :param fasta_files: mapping of label -> object whose `.name` is the
        path of the split FASTA file
    """
    for label, handle in fasta_files.items():
        fasta = handle.name

        # Nothing to do for files that are missing or already have content.
        if not os.path.exists(fasta):
            continue
        if os.stat(fasta).st_size != 0:
            continue

        if format == "prot":
            print(f"INFO: Copy fake protein '{DEFAULT_FAKE_PROT}' to empty protein fasta file '{fasta}' to prevent BUSCO analysis failure")
            copy_file(DEFAULT_FAKE_PROT, label, fasta)
        elif format == "nuc":
            print(f"INFO: Copy fake nucleotide '{DEFAULT_FAKE_NUC}' to empty nucleotide fasta file '{fasta}' to prevent BUSCO analysis failure")
            copy_file(DEFAULT_FAKE_NUC, label, fasta)
6 changes: 4 additions & 2 deletions minos/zzz/minos_run.smk
Original file line number Diff line number Diff line change
Expand Up @@ -950,9 +950,10 @@ rule split_proteins_prepare:
log:
os.path.join(BUSCO_PATH, "logs", "split_proteins_prepare.log")
run:
from minos.scripts.busco_splitter import split_fasta
from minos.scripts.busco_splitter import split_fasta, check_split_fasta
fasta_files = {tm: open(os.path.join(BUSCO_PATH, "runs", "proteins_prepare", "input", tm + ".proteins.fasta"), "w") for tm in config["data"]["transcript_models"]}
split_fasta(input[0], fasta_files)
check_split_fasta("prot", fasta_files)

rule busco_proteins_prepare:
input:
Expand Down Expand Up @@ -1013,9 +1014,10 @@ rule split_transcripts_prepare:
log:
os.path.join(BUSCO_PATH, "logs", "split_transcripts_prepare.log")
run:
from minos.scripts.busco_splitter import split_fasta
from minos.scripts.busco_splitter import split_fasta, check_split_fasta
fasta_files = {tm: open(os.path.join(BUSCO_PATH, "runs", "transcripts_prepare", "input", tm + ".cdna.fasta"), "w") for tm in config["data"]["transcript_models"]}
split_fasta(input[0], fasta_files)
check_split_fasta("nuc", fasta_files)

rule busco_transcripts_prepare:
input:
Expand Down

0 comments on commit 2f028e9

Please sign in to comment.